In [37]:
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import json

from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from xgboost import XGBClassifier
from helper import *

Load data from both cohorts and combine

In [8]:
# Load the pre-imputed network survey data of both cohorts (paths are relative
# to the notebook's working directory).
datapath = 'data/original/pre-imputed/'
C1W1net_df = pd.read_csv(datapath + 'C1W1_network_preimputed.csv')
C2W1net_df = pd.read_csv(datapath + '221114/C2W1_network_preimputed.csv')
mappings_df = pd.read_csv(datapath + 'mappings.csv')

C1W1net_vars = list(C1W1net_df.columns)
C2W1net_vars = list(C2W1net_df.columns)
# Stacking the cohorts is only meaningful when they share one schema. Fail
# loudly instead of leaving `net_vars` undefined and letting pd.concat pad the
# columns that exist in only one cohort with NaN.
if C1W1net_vars != C2W1net_vars:
    only_in_one = sorted(set(C1W1net_vars) ^ set(C2W1net_vars))
    raise ValueError(
        'C1W1 and C2W1 network columns differ; columns present in only one '
        f'cohort: {only_in_one} (an empty list means only the order differs)'
    )
net_vars = C1W1net_vars

# ignore_index=True gives the combined frame a unique 0..N-1 row index instead
# of repeating each cohort's own row labels.
net_df = pd.concat([C1W1net_df, C2W1net_df], axis=0, ignore_index=True)
print(net_df.shape)

(140, 195)


In [9]:
# Question text for each NSX* variable stem; the alter letter is written as "X".
labelings_NS = {
    "NSX2": "How long person X has been someone that you could confide in",
    "NSX3": "How often you hang out or spend time with person X",
    "NSX5": "Age of person X",
    "NSX7": "Level of education person X has finished to the best of your knowledge",
    "NSX8": "Current employment status of person X to the best of your knowledge",
    "NSX9": "Whether person X is a person you are comfortable using drugs around",
    "NSX10": "How often you have used drugs with person X in the past 30 days",
    "NSX11": "How often (you think) person X uses drugs in general",
    "NSX12": "Relationship with person X",
}

# Category code -> label lookup for every NSX* alter column, keyed by the full
# column name (variable stem + alter letter A-I).
#
# The codes are spelled out per column because they are not uniform across
# alters. Entries that deviate from their sibling columns:
#   - NSX2B uses 48-51 and NSX2F uses 8-11 where the other NSX2* columns use 4-7
#   - NSX3F uses 4-6 and NSX3G uses 8-10 where the other NSX3* columns use 7-9
#   - NSX10A uses 4-10 where the other NSX10* columns use 11-17
#   - NSX9H codes 'Yes' as 4 where the other NSX9* columns use 2
# NOTE(review): presumably these mirror the raw survey export rather than being
# typos -- confirm against the codebook before "normalising" any of them.
# NOTE(review): the NSX3* label 'Less than once a months' is spelled this way in
# every entry; it is data, so it is left untouched here.
#
# In every NSX10* entry the smallest key is the 'Never' category; the node
# counting loop in this notebook relies on that via min(...keys()).
cate_mappings_NS = {
    'NSX2A': {1: 'Less than 6 months', 4: '6 months to a year', 5: '1-2 years', 6: '3-5 years', 7: 'More than 5 years'},
    'NSX3A': {1: 'Less than once a months', 7: 'Once a month', 8: 'Once a week', 9: 'Almost daily'},
    'NSX5A': {0: 'Children (14 and below)', 1: 'Youth (15 to 24)', 2: 'Adult (25 to 64)', 3: 'Senior (65 and above)'},
    'NSX7A': {1: 'Less than high school', 4: 'High school degree', 5: 'Some college', 6: '2 year degree', 7: '4 year degree', 8: 'Graduate or professional school', 9: "Don't know"},
    'NSX8A': {1: 'Employed full-time', 4: 'Employed part-time', 5: 'Not employed', 6: 'Others', 7: "Don't know"},
    'NSX9A': {1: 'No', 2: 'Yes'},
    'NSX10A': {1: 'Never', 4: 'Less than once a month', 5: 'Once a month', 6: 'Once a week', 7: '2-6 times a week', 8: 'One time per day', 9: '2-3 times per day', 10: '4 or more times per day'},
    'NSX11A': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NSX12A': {1: 'They are my spouse/romantic partner', 4: 'They are my parent', 5: 'They are my child', 6: 'They are another relative', 7: 'They are my friend', 8: 'They are my something else'},
    'NSX2B': {1: 'Less than 6 months', 48: '6 months to a year', 49: '1-2 years', 50: '3-5 years', 51: 'More than 5 years'},
    'NSX3B': {1: 'Less than once a months', 7: 'Once a month', 8: 'Once a week', 9: 'Almost daily'},
    'NSX5B': {0: 'Children (14 and below)', 1: 'Youth (15 to 24)', 2: 'Adult (25 to 64)', 3: 'Senior (65 and above)'},
    'NSX7B': {1: 'Less than high school', 4: 'High school degree', 5: 'Some college', 6: '2 year degree', 7: '4 year degree', 8: 'Graduate or professional school', 9: "Don't know"},
    'NSX8B': {1: 'Employed full-time', 4: 'Employed part-time', 5: 'Not employed', 6: 'Others', 7: "Don't know"},
    'NSX9B': {1: 'No', 2: 'Yes'},
    'NSX10B': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NSX11B': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NSX12B': {1: 'They are my spouse/romantic partner', 4: 'They are my parent', 5: 'They are my child', 6: 'They are another relative', 7: 'They are my friend', 8: 'They are my something else'},
    'NSX2C': {1: 'Less than 6 months', 4: '6 months to a year', 5: '1-2 years', 6: '3-5 years', 7: 'More than 5 years'},
    'NSX3C': {1: 'Less than once a months', 7: 'Once a month', 8: 'Once a week', 9: 'Almost daily'},
    'NSX5C': {0: 'Children (14 and below)', 1: 'Youth (15 to 24)', 2: 'Adult (25 to 64)', 3: 'Senior (65 and above)'},
    'NSX7C': {1: 'Less than high school', 4: 'High school degree', 5: 'Some college', 6: '2 year degree', 7: '4 year degree', 8: 'Graduate or professional school', 9: "Don't know"},
    'NSX8C': {1: 'Employed full-time', 4: 'Employed part-time', 5: 'Not employed', 6: 'Others', 7: "Don't know"},
    'NSX9C': {1: 'No', 2: 'Yes'},
    'NSX10C': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NSX11C': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NSX12C': {1: 'They are my spouse/romantic partner', 4: 'They are my parent', 5: 'They are my child', 6: 'They are another relative', 7: 'They are my friend', 8: 'They are my something else'},
    'NSX2D': {1: 'Less than 6 months', 4: '6 months to a year', 5: '1-2 years', 6: '3-5 years', 7: 'More than 5 years'},
    'NSX3D': {1: 'Less than once a months', 7: 'Once a month', 8: 'Once a week', 9: 'Almost daily'},
    'NSX5D': {0: 'Children (14 and below)', 1: 'Youth (15 to 24)', 2: 'Adult (25 to 64)', 3: 'Senior (65 and above)'},
    'NSX7D': {1: 'Less than high school', 4: 'High school degree', 5: 'Some college', 6: '2 year degree', 7: '4 year degree', 8: 'Graduate or professional school', 9: "Don't know"},
    'NSX8D': {1: 'Employed full-time', 4: 'Employed part-time', 5: 'Not employed', 6: 'Others', 7: "Don't know"},
    'NSX9D': {1: 'No', 2: 'Yes'},
    'NSX10D': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NSX11D': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NSX12D': {1: 'They are my spouse/romantic partner', 4: 'They are my parent', 5: 'They are my child', 6: 'They are another relative', 7: 'They are my friend', 8: 'They are my something else'},
    'NSX2E': {1: 'Less than 6 months', 4: '6 months to a year', 5: '1-2 years', 6: '3-5 years', 7: 'More than 5 years'},
    'NSX3E': {1: 'Less than once a months', 7: 'Once a month', 8: 'Once a week', 9: 'Almost daily'},
    'NSX5E': {0: 'Children (14 and below)', 1: 'Youth (15 to 24)', 2: 'Adult (25 to 64)', 3: 'Senior (65 and above)'},
    'NSX7E': {1: 'Less than high school', 4: 'High school degree', 5: 'Some college', 6: '2 year degree', 7: '4 year degree', 8: 'Graduate or professional school', 9: "Don't know"},
    'NSX8E': {1: 'Employed full-time', 4: 'Employed part-time', 5: 'Not employed', 6: 'Others', 7: "Don't know"},
    'NSX9E': {1: 'No', 2: 'Yes'},
    'NSX10E': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NSX11E': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NSX12E': {1: 'They are my spouse/romantic partner', 4: 'They are my parent', 5: 'They are my child', 6: 'They are another relative', 7: 'They are my friend', 8: 'They are my something else'},
    'NSX2F': {1: 'Less than 6 months', 8: '6 months to a year', 9: '1-2 years', 10: '3-5 years', 11: 'More than 5 years'},
    'NSX3F': {1: 'Less than once a months', 4: 'Once a month', 5: 'Once a week', 6: 'Almost daily'},
    'NSX5F': {0: 'Children (14 and below)', 1: 'Youth (15 to 24)', 2: 'Adult (25 to 64)', 3: 'Senior (65 and above)'},
    'NSX7F': {1: 'Less than high school', 4: 'High school degree', 5: 'Some college', 6: '2 year degree', 7: '4 year degree', 8: 'Graduate or professional school', 9: "Don't know"},
    'NSX8F': {1: 'Employed full-time', 4: 'Employed part-time', 5: 'Not employed', 6: 'Others', 7: "Don't know"},
    'NSX9F': {1: 'No', 2: 'Yes'},
    'NSX10F': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NSX11F': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NSX12F': {1: 'They are my spouse/romantic partner', 4: 'They are my parent', 5: 'They are my child', 6: 'They are another relative', 7: 'They are my friend', 8: 'They are my something else'},
    'NSX2G': {1: 'Less than 6 months', 4: '6 months to a year', 5: '1-2 years', 6: '3-5 years', 7: 'More than 5 years'},
    'NSX3G': {1: 'Less than once a months', 8: 'Once a month', 9: 'Once a week', 10: 'Almost daily'},
    'NSX5G': {0: 'Children (14 and below)', 1: 'Youth (15 to 24)', 2: 'Adult (25 to 64)', 3: 'Senior (65 and above)'},
    'NSX7G': {1: 'Less than high school', 4: 'High school degree', 5: 'Some college', 6: '2 year degree', 7: '4 year degree', 8: 'Graduate or professional school', 9: "Don't know"},
    'NSX8G': {1: 'Employed full-time', 4: 'Employed part-time', 5: 'Not employed', 6: 'Others', 7: "Don't know"},
    'NSX9G': {1: 'No', 2: 'Yes'},
    'NSX10G': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NSX11G': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NSX12G': {1: 'They are my spouse/romantic partner', 4: 'They are my parent', 5: 'They are my child', 6: 'They are another relative', 7: 'They are my friend', 8: 'They are my something else'},
    'NSX2H': {1: 'Less than 6 months', 4: '6 months to a year', 5: '1-2 years', 6: '3-5 years', 7: 'More than 5 years'},
    'NSX3H': {1: 'Less than once a months', 7: 'Once a month', 8: 'Once a week', 9: 'Almost daily'},
    'NSX5H': {0: 'Children (14 and below)', 1: 'Youth (15 to 24)', 2: 'Adult (25 to 64)', 3: 'Senior (65 and above)'},
    'NSX7H': {1: 'Less than high school', 4: 'High school degree', 5: 'Some college', 6: '2 year degree', 7: '4 year degree', 8: 'Graduate or professional school', 9: "Don't know"},
    'NSX8H': {1: 'Employed full-time', 4: 'Employed part-time', 5: 'Not employed', 6: 'Others', 7: "Don't know"},
    'NSX9H': {1: 'No', 4: 'Yes'},
    'NSX10H': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NSX11H': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NSX12H': {1: 'They are my spouse/romantic partner', 4: 'They are my parent', 5: 'They are my child', 6: 'They are another relative', 7: 'They are my friend', 8: 'They are my something else'},
    'NSX2I': {1: 'Less than 6 months', 4: '6 months to a year', 5: '1-2 years', 6: '3-5 years', 7: 'More than 5 years'},
    'NSX3I': {1: 'Less than once a months', 7: 'Once a month', 8: 'Once a week', 9: 'Almost daily'},
    'NSX5I': {0: 'Children (14 and below)', 1: 'Youth (15 to 24)', 2: 'Adult (25 to 64)', 3: 'Senior (65 and above)'},
    'NSX7I': {1: 'Less than high school', 4: 'High school degree', 5: 'Some college', 6: '2 year degree', 7: '4 year degree', 8: 'Graduate or professional school', 9: "Don't know"},
    'NSX8I': {1: 'Employed full-time', 4: 'Employed part-time', 5: 'Not employed', 6: 'Others', 7: "Don't know"},
    'NSX9I': {1: 'No', 2: 'Yes'},
    'NSX10I': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NSX11I': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NSX12I': {1: 'They are my spouse/romantic partner', 4: 'They are my parent', 5: 'They are my child', 6: 'They are another relative', 7: 'They are my friend', 8: 'They are my something else'}
}

# Question text for each NDX* variable stem; the alter letter is written as "X".
labelings_ND = {
    "NDX2": "Age of person X",
    "NDX4": "Level of education person X has finished to the best of your knowledge",
    "NDX5": "Current employment status of person X to the best of your knowledge",
    "NDX6": "Relationship with person X",
    "NDX8": "How often you used drugs with person X in the past 30 days",
    "NDX9": "How often (you think) person X uses drugs in general",
    "NDX12": "How long you have used drugs with person X",
}

# Category code -> label lookup for every NDX* alter column, keyed by the full
# column name (variable stem + alter letter J-R).
#
# The codes are spelled out per column because they are not uniform across
# alters. Entries that deviate from their sibling columns include:
#   - NDX4J uses 1, 4-9 and NDX4K uses 10-16 where the other NDX4* columns use 1, 10-15
#   - NDX5J uses 1, 4-7 and NDX5O uses 1, 9-12 where the others use 1, 8-11;
#     NDX5M has no "Don't know" entry
#   - NDX6J uses 1, 4-8, NDX6O uses 9-14, and NDX6K ends at 14 instead of 13
#   - NDX8J and NDX8O use 11-18, so their 'Never' code is 11 rather than 1
#   - NDX8K/L/M and NDX9K/L/M code the two highest categories as 17 and 19
#   - NDX9Q uses 11-18
#   - NDX12J uses 8-12, NDX12M uses 1, 8, 9, 12, 15 and NDX12R uses 1, 15-18
# NOTE(review): presumably these mirror the raw survey export rather than being
# typos -- confirm against the codebook before "normalising" any of them.
#
# In every NDX8* entry the smallest key is the 'Never' category; the node
# counting loop in this notebook relies on that via min(...keys()).
cate_mappings_ND = {
    'NDX2J': {0: 'Children (14 and below)', 1: 'Youth (15 to 24)', 2: 'Adult (25 to 64)', 3: 'Senior (65 and above)'},
    'NDX4J': {1: 'Less than high school', 4: 'High school degree', 5: 'Some college', 6: '2 year degree', 7: '4 year degree', 8: 'Graduate or professional school', 9: "Don't know"},
    'NDX5J': {1: 'Employed full-time', 4: 'Employed part-time', 5: 'Not employed', 6: 'Others', 7: "Don't know"},
    'NDX6J': {1: 'They are my spouse/romantic partner', 4: 'They are my parent', 5: 'They are my child', 6: 'They are another relative', 7: 'They are my friend', 8: 'They are my something else'},
    'NDX8J': {11: 'Never', 12: 'Less than once a month', 13: 'Once a month', 14: 'Once a week', 15: '2-6 times a week', 16: 'One time per day', 17: '2-3 times per day', 18: '4 or more times per day'},
    'NDX9J': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NDX12J': {8: 'Less than 6 months', 9: '6 months to a year', 10: '1-2 years', 11: '3-5 years', 12: 'More than 5 years'},
    'NDX2K': {0: 'Children (14 and below)', 1: 'Youth (15 to 24)', 2: 'Adult (25 to 64)', 3: 'Senior (65 and above)'},
    'NDX4K': {10: 'Less than high school', 11: 'High school degree', 12: 'Some college', 13: '2 year degree', 14: '4 year degree', 15: 'Graduate or professional school', 16: "Don't know"},
    'NDX5K': {1: 'Employed full-time', 8: 'Employed part-time', 9: 'Not employed', 10: 'Others', 11: "Don't know"},
    'NDX6K': {1: 'They are my spouse/romantic partner', 9: 'They are my parent', 10: 'They are my child', 11: 'They are another relative', 12: 'They are my friend', 14: 'They are my something else'},
    'NDX8K': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 17: '2-3 times per day', 19: '4 or more times per day'},
    'NDX9K': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 17: '2-3 times per day', 19: '4 or more times per day'},
    'NDX12K': {1: 'Less than 6 months', 8: '6 months to a year', 9: '1-2 years', 10: '3-5 years', 11: 'More than 5 years'},
    'NDX2L': {0: 'Children (14 and below)', 1: 'Youth (15 to 24)', 2: 'Adult (25 to 64)', 3: 'Senior (65 and above)'},
    'NDX4L': {1: 'Less than high school', 10: 'High school degree', 11: 'Some college', 12: '2 year degree', 13: '4 year degree', 14: 'Graduate or professional school', 15: "Don't know"},
    'NDX5L': {1: 'Employed full-time', 8: 'Employed part-time', 9: 'Not employed', 10: 'Others', 11: "Don't know"},
    'NDX6L': {1: 'They are my spouse/romantic partner', 9: 'They are my parent', 10: 'They are my child', 11: 'They are another relative', 12: 'They are my friend', 13: 'They are my something else'},
    'NDX8L': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 17: '2-3 times per day', 19: '4 or more times per day'},
    'NDX9L': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 17: '2-3 times per day', 19: '4 or more times per day'},
    'NDX12L': {1: 'Less than 6 months', 8: '6 months to a year', 9: '1-2 years', 10: '3-5 years', 11: 'More than 5 years'},
    'NDX2M': {0: 'Children (14 and below)', 1: 'Youth (15 to 24)', 2: 'Adult (25 to 64)', 3: 'Senior (65 and above)'},
    'NDX4M': {1: 'Less than high school', 10: 'High school degree', 11: 'Some college', 12: '2 year degree', 13: '4 year degree', 14: 'Graduate or professional school', 15: "Don't know"},
    'NDX5M': {1: 'Employed full-time', 8: 'Employed part-time', 9: 'Not employed', 10: 'Others'},
    'NDX6M': {1: 'They are my spouse/romantic partner', 9: 'They are my parent', 10: 'They are my child', 11: 'They are another relative', 12: 'They are my friend', 13: 'They are my something else'},
    'NDX8M': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 17: '2-3 times per day', 19: '4 or more times per day'},
    'NDX9M': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 17: '2-3 times per day', 19: '4 or more times per day'},
    'NDX12M': {1: 'Less than 6 months', 8: '6 months to a year', 9: '1-2 years', 12: '3-5 years', 15: 'More than 5 years'},
    'NDX2N': {0: 'Children (14 and below)', 1: 'Youth (15 to 24)', 2: 'Adult (25 to 64)', 3: 'Senior (65 and above)'},
    'NDX4N': {1: 'Less than high school', 10: 'High school degree', 11: 'Some college', 12: '2 year degree', 13: '4 year degree', 14: 'Graduate or professional school', 15: "Don't know"},
    'NDX5N': {1: 'Employed full-time', 8: 'Employed part-time', 9: 'Not employed', 10: 'Others', 11: "Don't know"},
    'NDX6N': {1: 'They are my spouse/romantic partner', 9: 'They are my parent', 10: 'They are my child', 11: 'They are another relative', 12: 'They are my friend', 13: 'They are my something else'},
    'NDX8N': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NDX9N': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NDX12N': {1: 'Less than 6 months', 8: '6 months to a year', 9: '1-2 years', 10: '3-5 years', 11: 'More than 5 years'},
    'NDX2O': {0: 'Children (14 and below)', 1: 'Youth (15 to 24)', 2: 'Adult (25 to 64)', 3: 'Senior (65 and above)'},
    'NDX4O': {1: 'Less than high school', 10: 'High school degree', 11: 'Some college', 12: '2 year degree', 13: '4 year degree', 14: 'Graduate or professional school', 15: "Don't know"},
    'NDX5O': {1: 'Employed full-time', 9: 'Employed part-time', 10: 'Not employed', 11: 'Others', 12: "Don't know"},
    'NDX6O': {9: 'They are my spouse/romantic partner', 10: 'They are my parent', 11: 'They are my child', 12: 'They are another relative', 13: 'They are my friend', 14: 'They are my something else'},
    'NDX8O': {11: 'Never', 12: 'Less than once a month', 13: 'Once a month', 14: 'Once a week', 15: '2-6 times a week', 16: 'One time per day', 17: '2-3 times per day', 18: '4 or more times per day'},
    'NDX9O': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NDX12O': {1: 'Less than 6 months', 8: '6 months to a year', 9: '1-2 years', 10: '3-5 years', 11: 'More than 5 years'},
    'NDX2P': {0: 'Children (14 and below)', 1: 'Youth (15 to 24)', 2: 'Adult (25 to 64)', 3: 'Senior (65 and above)'},
    'NDX4P': {1: 'Less than high school', 10: 'High school degree', 11: 'Some college', 12: '2 year degree', 13: '4 year degree', 14: 'Graduate or professional school', 15: "Don't know"},
    'NDX5P': {1: 'Employed full-time', 8: 'Employed part-time', 9: 'Not employed', 10: 'Others', 11: "Don't know"},
    'NDX6P': {1: 'They are my spouse/romantic partner', 9: 'They are my parent', 10: 'They are my child', 11: 'They are another relative', 12: 'They are my friend', 13: 'They are my something else'},
    'NDX8P': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NDX9P': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NDX12P': {1: 'Less than 6 months', 8: '6 months to a year', 9: '1-2 years', 10: '3-5 years', 11: 'More than 5 years'},
    'NDX2Q': {0: 'Children (14 and below)', 1: 'Youth (15 to 24)', 2: 'Adult (25 to 64)', 3: 'Senior (65 and above)'},
    'NDX4Q': {1: 'Less than high school', 10: 'High school degree', 11: 'Some college', 12: '2 year degree', 13: '4 year degree', 14: 'Graduate or professional school', 15: "Don't know"},
    'NDX5Q': {1: 'Employed full-time', 8: 'Employed part-time', 9: 'Not employed', 10: 'Others', 11: "Don't know"},
    'NDX6Q': {1: 'They are my spouse/romantic partner', 9: 'They are my parent', 10: 'They are my child', 11: 'They are another relative', 12: 'They are my friend', 13: 'They are my something else'},
    'NDX8Q': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NDX9Q': {11: 'Never', 12: 'Less than once a month', 13: 'Once a month', 14: 'Once a week', 15: '2-6 times a week', 16: 'One time per day', 17: '2-3 times per day', 18: '4 or more times per day'},
    'NDX12Q': {1: 'Less than 6 months', 8: '6 months to a year', 9: '1-2 years', 10: '3-5 years', 11: 'More than 5 years'},
    'NDX2R': {0: 'Children (14 and below)', 1: 'Youth (15 to 24)', 2: 'Adult (25 to 64)', 3: 'Senior (65 and above)'},
    'NDX4R': {1: 'Less than high school', 10: 'High school degree', 11: 'Some college', 12: '2 year degree', 13: '4 year degree', 14: 'Graduate or professional school', 15: "Don't know"},
    'NDX5R': {1: 'Employed full-time', 8: 'Employed part-time', 9: 'Not employed', 10: 'Others', 11: "Don't know"},
    'NDX6R': {1: 'They are my spouse/romantic partner', 9: 'They are my parent', 10: 'They are my child', 11: 'They are another relative', 12: 'They are my friend', 13: 'They are my something else'},
    'NDX8R': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NDX9R': {1: 'Never', 11: 'Less than once a month', 12: 'Once a month', 13: 'Once a week', 14: '2-6 times a week', 15: 'One time per day', 16: '2-3 times per day', 17: '4 or more times per day'},
    'NDX12R': {1: 'Less than 6 months', 15: '6 months to a year', 16: '1-2 years', 17: '3-5 years', 18: 'More than 5 years'}
}

In [10]:
# Disabled export: when uncommented, this bundles the four labeling/mapping
# dicts defined above and writes them to 'saved-vars/labelings_network.json'.
# NOTE(review): json.dump turns the integer category codes into string keys --
# any reader of that file has to convert them back with int().
# net_mappings = {}
# net_mappings['labelings_NS'] = labelings_NS
# net_mappings['labelings_ND'] = labelings_ND
# net_mappings['cate_mappings_NS'] = cate_mappings_NS
# net_mappings['cate_mappings_ND'] = cate_mappings_ND

# with open('saved-vars/labelings_network.json', 'w') as f:
#     json.dump(net_mappings, f)

In [11]:
# Alter letter suffixes: A-I belong to the NSX* columns, J-R to the NDX* columns.
NS_alters = tuple('ABCDEFGHI')
ND_alters = tuple('JKLMNOPQR')
alters = NS_alters + ND_alters  # all 18 letters, NS first, then ND

# Preview every column whose name ends with one of the alter letters.
net_df.columns[net_df.columns.str.endswith(alters)]

Index(['NSX2A', 'NSX3A', 'NSX5A', 'NSX7A', 'NSX8A', 'NSX9A', 'NSX10A',
       'NSX11A', 'NSX12A', 'NSX2B',
       ...
       'NDX8Q', 'NDX9Q', 'NDX12Q', 'NDX2R', 'NDX4R', 'NDX5R', 'NDX6R', 'NDX8R',
       'NDX9R', 'NDX12R'],
      dtype='object', length=144)

In [12]:
# Per-participant network summary. Every list receives exactly one entry per
# row of net_df, in row order:
#   n_nodes        - alters for which at least one answer is recorded
#   n_codrug_nodes - of those, alters the participant reports using drugs with
#   ratio          - n_codrug_nodes / n_nodes (0 when the participant has no alters)
alters_info = {'n_nodes': [], 'n_codrug_nodes': [], 'ratio': []}

# The columns of each alter, its co-drug-use column and that column's lowest
# ('Never') category code do not depend on the participant, so resolve them
# once here instead of once per row and alter.
alter_specs = []
for a in alters:
    alter_cols = list(net_df.columns[net_df.columns.str.endswith(a)])
    codrug_var = f'NSX10{a}' if a in NS_alters else f'NDX8{a}'
    cate_mappings = cate_mappings_NS if a in NS_alters else cate_mappings_ND
    never_code = min(cate_mappings[codrug_var].keys())
    alter_specs.append((alter_cols, codrug_var, never_code))

for _, row in net_df.iterrows():  # one row per participant
    n_nodes = 0
    n_codrug_nodes = 0

    for alter_cols, codrug_var, never_code in alter_specs:
        alter_data = [row[v] for v in alter_cols]
        # An alter counts as a node unless all of its answers, ignoring the
        # last column of the alter, are NaN.
        if not np.isnan(alter_data[:-1]).all():
            n_nodes += 1
            # Any category above 'Never' means drugs were used together; a NaN
            # answer compares False and is therefore not counted.
            if row[codrug_var] > never_code:
                n_codrug_nodes += 1

    alters_info['n_nodes'].append(n_nodes)
    alters_info['n_codrug_nodes'].append(n_codrug_nodes)
    # Participants without any alter get ratio 0 rather than a division error.
    ratio = n_codrug_nodes / n_nodes if n_nodes else 0
    alters_info['ratio'].append(ratio)

In [13]:
# Co-drug-use ratio per participant; entry i belongs to the i-th row of net_df
# because the list was filled once per row, in row order.
ratios = alters_info['ratio']

Load target variable (drug use)

In [14]:
# Non-network survey data and the target ("pred") tables of both cohorts.
datapath = 'data/original/pre-imputed/'

C1W1nonet_df = pd.read_csv(f'{datapath}C1W1_nonnetwork_preimputed.csv')
C1pred_df = pd.read_csv(f'{datapath}C1_nonnetwork_pred.csv')

C2W1nonet_df = pd.read_csv(f'{datapath}221114/C2W1_nonnetwork_preimputed.csv')
C2pred_df = pd.read_csv(f'{datapath}221114/C2_nonnetwork_pred.csv')

# Labelings saved earlier for the non-network variables; this notebook reads
# nnw_labelings[var][1] as a {category code (string): label} mapping.
with open('saved-vars/labelings_non-network.json') as labelings_file:
    nnw_labelings = json.load(labelings_file)

In [15]:
def load_data_v4(cohort, drug_key):
    """Return the encoded target labels for one drug-use variable.

    Parameters
    ----------
    cohort : {1, 2, '1+2'}
        Which cohort's non-network data to use; '1+2' stacks cohort 1 on top
        of cohort 2 with a fresh 0..N-1 index.
    drug_key : str
        Column of the non-network data holding the target, and the key under
        which its category codes are stored in ``nnw_labelings``.

    Returns
    -------
    y : numpy.ndarray
        Target values of the rows whose target is not NaN, encoded with a
        ``LabelEncoder`` fitted on the integer category codes listed in
        ``nnw_labelings[drug_key][1]``.
    drop_idx : numpy.ndarray
        Positional indices of the rows that were dropped because their target
        is NaN, so callers can drop the same rows from their features.

    Raises
    ------
    ValueError
        If ``cohort`` is not one of the supported values.
    """
    if cohort == 1:
        nonet_df = C1W1nonet_df
    elif cohort == 2:
        nonet_df = C2W1nonet_df
    elif cohort == '1+2':
        nonet_df = pd.concat([C1W1nonet_df, C2W1nonet_df], ignore_index=True)
    else:
        # Previously an unknown cohort fell through and failed further down
        # with an UnboundLocalError on `nonet_df`.
        raise ValueError(f"unknown cohort {cohort!r}; expected 1, 2 or '1+2'")
    nonet_vars = list(C1W1nonet_df.columns)  # same set of columns for both cohorts

    # NOTE(review): impute_MARs comes from `helper`; whether it modifies
    # nonet_df in place is not visible here -- confirm, since for cohort 1/2
    # the notebook-level frame itself is passed in.
    df = impute_MARs(nonet_vars, nonet_df)

    y = np.array(df[drug_key])
    missing = np.isnan(y)
    drop_idx = np.argwhere(missing).flatten()
    y = y[~missing]

    # The JSON keys are strings; the encoder is fitted on their integer values.
    category_codes = list(map(int, nnw_labelings[drug_key][1].keys()))
    lbl_enc = LabelEncoder().fit(category_codes)
    y = lbl_enc.transform(y)

    return y, drop_idx

In [24]:
# Target: encoded ND1 categories for both cohorts combined.
y, drop_idx = load_data_v4(cohort='1+2', drug_key='ND1')

# Feature: the co-drug-use ratio of every participant whose target was kept.
# NOTE(review): rows are matched by position only -- this presumes the network
# and non-network files list participants in the same order; confirm.
keep_mask = np.isin(np.arange(len(ratios)), drop_idx, invert=True)
X = np.array([ratio for ratio, keep in zip(ratios, keep_mask) if keep])

Plot ratio vs drug use intensity for each drug

In [108]:
# Target variables to plot, grouped by column prefix (ND2 is deliberately skipped).
tb_al_keys = ['TB4', 'TB8', 'TB12', 'AL5']
id_keys = [f'ID{i}' for i in range(4, 12)]
nd_keys = [f'ND{i}' for i in range(1, 12) if i != 2]
drug_keys = tb_al_keys + id_keys + nd_keys

# Display names, in the same order as drug_keys.
drug_names = [
    # TB4, TB8, TB12, AL5
    'cigarettes', 'e-cigarettes', 'smokeless tobacco', 'alcohol',
    # ID4 .. ID11
    'heroin', 'opioids', 'meth', 'cocaine', 'heroin and cocaine speedball',
    'heroin and meth speedball', 'crack cocaine', 'buprenorphine',
    # ND1, ND3 .. ND11
    'marijuana', 'cocaine (non-injection)', 'Ecstasy/MDMA', 'PCP/angel dust',
    'amphetamines', 'meth (non-injection)', 'barbiturates', 'benzodiazepines',
    'opiates/opioids', 'heroin (non-injection)',
]
drugs_dict = dict(zip(drug_keys, drug_names))

In [None]:
# One scatter plot per drug: encoded use category (x) against the participant's
# co-drug-use ratio (y), saved as plots/analysis/network_pred/<drug_key>.pdf.
for drug_key, drug_name in drugs_dict.items():
    # Loop-local names on purpose: assigning to `y` / `X` here would overwrite
    # the ND1 arrays that the classification cell indexes with NumPy index
    # arrays (and would leave `X` as a plain list).
    drug_y, drug_drop_idx = load_data_v4(cohort='1+2', drug_key=drug_key)
    drug_ratios = [ratios[i] for i in range(len(ratios)) if i not in drug_drop_idx]

    # LabelEncoder numbers the classes in ascending code order, so pair the
    # labels with the codes sorted numerically instead of relying on the order
    # in which the JSON file happens to list them.
    code_labels = sorted(nnw_labelings[drug_key][1].items(), key=lambda item: int(item[0]))
    categories = {encoded: label for encoded, (_, label) in enumerate(code_labels)}

    # One tick per observed class rather than one per sample.
    ticks = np.unique(drug_y)

    fig, ax = plt.subplots(figsize=(5, 5))
    ax.scatter(drug_y, drug_ratios)
    ax.set_xticks(ticks)
    ax.set_xticklabels([categories[i] for i in ticks], rotation=90)
    ax.set_ylabel('ratio')
    ax.set_title(f'{drug_name} use')
    fig.savefig(f'plots/analysis/network_pred/{drug_key}.pdf', facecolor='white', bbox_inches='tight')
    plt.show()
    # Release the figure so the loop does not accumulate 22 open figures.
    plt.close(fig)

Ordinal classification

In [39]:
# Leave-one-out evaluation of an ordinal classifier that predicts the encoded
# target `y` from the single ratio feature `X` (a 1-D NumPy array).
# NOTE(review): LeaveOneOut and OrdinalClassifier are not imported explicitly in
# this notebook; presumably they come from `from helper import *` -- confirm.
#
# Base estimators tried so far (swap in by replacing the `clf = ...` line):
#   LogisticRegression(solver='saga', penalty='l1')
#   LogisticRegression(solver='saga', penalty='l2')
#   LogisticRegression(solver='saga', penalty='elasticnet', l1_ratio=0.5)
#   DecisionTreeClassifier(max_depth=10)
#   RandomForestClassifier()
clf = XGBClassifier()
ordinal_clf = OrdinalClassifier(clf)
preds, preds_proba = [], []
results = []
for train_idx, test_idx in LeaveOneOut().split(X):
    # The estimators expect a 2-D feature matrix, hence the (n_samples, 1) reshape.
    X_train = X[train_idx].reshape(-1, 1)
    y_train = y[train_idx]
    X_test = X[test_idx].reshape(-1, 1)
    y_test = y[test_idx]

    ordinal_clf.fit(X_train, y_train)
    pred = ordinal_clf.predict(X_test)[0]
    preds.append(pred)
    print(pred, y_test[0])  # predicted class, true class
    results.append(pred == y_test[0])
    # preds_proba.append(ordinal_clf.predict_proba(X_test)[:,1][0])

# Fraction of held-out samples whose class was predicted exactly.
print(np.mean(results))
# print(f'precision: {precision_score(y, np.array(preds)):.2f}/0.5, recall: {recall_score(y, np.array(preds)):.2f}/0.5')

2 4
7 0
6 1
0 4
6 0
6 5
2 3
5 7
5 7
4 0
5 3
4 7
7 4
0 4
6 5
0 4
0 6
6 6
1 7
5 0
2 1
2 5
5 6
6 3
4 6
5 7
2 4
6 4
3 6
4 6
5 2
2 6
7 6
5 5
5 5
7 3
5 5
6 0
6 6
5 6
7 4
6 0
6 0
2 5
6 0
6 5
5 4
2 3
4 0
5 6
7 6
4 2
6 3
6 6
5 3
7 4
6 3
6 6
5 3
5 7
5 5
6 5
7 1
6 0
5 0
0 6
4 7
1 7
6 4
6 0
6 7
5 7
0 0
5 2
7 6
6 4
4 7
5 7
6 5
6 7
6 0
4 7
7 4
5 3
5 4
5 3
5 1
6 7
2 0
6 2
5 4
6 6
6 7
5 6
0 6
6 0
0 6
2 1
5 5
0 7
6 6
7 5
5 5
5 1
6 6
4 2
5 7
0 6
7 0
6 4
5 6
0 7
0 6
5 5
7 0
2 3
0 3
4 6
5 6
6 3
3 1
5 0
4 0
6 1
3 0
0 7
4 2
6 3
0 7
7 5
2 0
2 7
5 4
7 5
0.11194029850746269
