In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set_theme()
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import classification_report
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
import shap
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBRFClassifier
from lightgbm import LGBMClassifier

In [None]:
# Load the labelled dataset; the first CSV column becomes the row index.
data = pd.read_csv(
    '../data/data_labeled.csv',
    index_col=0,
)

In [None]:
# Recode 'race': divide by 10 and truncate to an integer code.
data['race'] = (data['race'] / 10).apply(int)

# Ages: a value above 1900 is treated as a year and converted with 2018 - value,
# then every age is capped at 90.
data['age'] = (
    data['age']
    .mask(data['age'] > 1900, 2018 - data['age'])
    .clip(upper=90)
)

# Family size: cap at 12.
data['familysize'] = data['familysize'].clip(upper=12)

In [None]:
# Collapse each severity column onto the codes 0, 2 and 4: 1 -> 0 and 3 -> 4.
# The result is assigned back to the column rather than calling
# `data[col].replace(..., inplace=True)`: that form is chained assignment,
# which pandas warns about and which no longer updates `data` once
# Copy-on-Write is active.
for cat_col in ['Stress_cat', 'Anxiety_cat', 'Depression_cat']:
    data[cat_col] = data[cat_col].replace({1: 0, 3: 4})

# Participant demographics

In [None]:
# Age distribution, restricted to participants no older than `age_limit`
age_limit = 40

age_counts = data.loc[data['age'] <= age_limit, 'age'].value_counts().sort_index()

# Global styling: figure size, seaborn theme, then matplotlib's dark style
plt.rcParams['figure.figsize'] = (10, 8)
sns.set(font_scale=1.5, style='dark')
plt.style.use("dark_background")

ax = sns.barplot(x=age_counts.index, y=age_counts.values)
ax.set_xticklabels(age_counts.index, rotation=0)
plt.title('Age distribution', size=20)

# Save an opaque and a transparent copy before showing the figure
for out_path, extra_kwargs in (
    ('../presentation/age_countbars.png', {}),
    ('../presentation/age_countbars_transparent.png', {'transparent': True}),
):
    plt.savefig(out_path, dpi=200, **extra_kwargs)
plt.show()

In [None]:
# Stress: number of participants per severity class
index_meanings = {0: 'Normal-Mild', 2: 'Moderate', 4: 'Severe-Extreme'}

stress_counts = data['Stress_cat'].value_counts().sort_index()
stress_labels = stress_counts.index.map(index_meanings)

# Global styling: figure size, seaborn theme, then matplotlib's dark style
plt.rcParams['figure.figsize'] = (10, 8)
sns.set(font_scale=1.5, style='dark')
plt.style.use("dark_background")

ax = sns.barplot(x=stress_counts.index, y=stress_counts.values)
ax.set_xticklabels(stress_labels, rotation=0)
plt.title('Stress', size=20)

# Save an opaque and a transparent copy before showing the figure
for out_path, extra_kwargs in (
    ('../presentation/stress_countbars.png', {}),
    ('../presentation/stress_countbars_transparent.png', {'transparent': True}),
):
    plt.savefig(out_path, dpi=200, **extra_kwargs)
plt.show()

In [None]:
# Anxiety: number of participants per severity class
index_meanings = {0: 'Normal-Mild', 2: 'Moderate', 4: 'Severe-Extreme'}

anxiety_counts = data['Anxiety_cat'].value_counts().sort_index()
anxiety_labels = anxiety_counts.index.map(index_meanings)

# Global styling: figure size, seaborn theme, then matplotlib's dark style
plt.rcParams['figure.figsize'] = (10, 8)
sns.set(font_scale=1.5, style='dark')
plt.style.use("dark_background")

ax = sns.barplot(x=anxiety_counts.index, y=anxiety_counts.values)
ax.set_xticklabels(anxiety_labels, rotation=0)
plt.title('Anxiety', size=20)

# Save an opaque and a transparent copy before showing the figure
for out_path, extra_kwargs in (
    ('../presentation/anxiety_countbars.png', {}),
    ('../presentation/anxiety_countbars_transparent.png', {'transparent': True}),
):
    plt.savefig(out_path, dpi=200, **extra_kwargs)
plt.show()

In [None]:
# Depression: number of participants per severity class
index_meanings = {0: 'Normal-Mild', 2: 'Moderate', 4: 'Severe-Extreme'}

depression_counts = data['Depression_cat'].value_counts().sort_index()
depression_labels = depression_counts.index.map(index_meanings)

# Global styling: figure size, seaborn theme, then matplotlib's dark style
plt.rcParams['figure.figsize'] = (10, 8)
sns.set(font_scale=1.5, style='dark')
plt.style.use("dark_background")

ax = sns.barplot(x=depression_counts.index, y=depression_counts.values)
ax.set_xticklabels(depression_labels, rotation=0)
plt.title('Depression', size=20)

# Save an opaque and a transparent copy before showing the figure
for out_path, extra_kwargs in (
    ('../presentation/depression_countbars.png', {}),
    ('../presentation/depression_countbars_transparent.png', {'transparent': True}),
):
    plt.savefig(out_path, dpi=200, **extra_kwargs)
plt.show()

# Models

In [None]:
# Target (label) columns
l_cols = ['Depression_cat', 'Anxiety_cat', 'Stress_cat']

# One row per label column: the column name followed by its sorted distinct values
mylist = [
    [label, *data[label].value_counts().sort_index().index]
    for label in l_cols
]
mylist

In [None]:
# Features ready to use: questionnaire items Q1A..Q42A, TIPI1..TIPI10,
# followed by the demographic columns.
f_cols = (
    [f'Q{item}A' for item in range(1, 43)]
    + [f'TIPI{item}' for item in range(1, 11)]
    + ['education', 'age', 'urban', 'gender', 'hand', 'religion',
       'orientation', 'race', 'voted', 'married', 'familysize']
)

In [None]:
# Inspect features: one row per column, holding its name and sorted distinct values
mylist = [
    [feature, *data[feature].value_counts().sort_index().index]
    for feature in f_cols
]

In [None]:
# Questionnaire items related to depression (item numbers -> 'Q<n>A' column names)
dep_cols = [
    f'Q{item}A'
    for item in (3, 5, 10, 13, 16, 17, 21, 24, 26, 31, 34, 37, 38, 42)
]

In [None]:
# Questionnaire items related to anxiety (item numbers -> 'Q<n>A' column names)
anx_cols = [
    f'Q{item}A'
    for item in (2, 4, 7, 9, 15, 19, 20, 23, 25, 28, 30, 36, 40, 41)
]

In [None]:
# Questionnaire items related to stress (item numbers -> 'Q<n>A' column names)
stress_cols = [
    f'Q{item}A'
    for item in (1, 6, 8, 11, 12, 14, 18, 22, 27, 29, 32, 33, 35, 39)
]

In [None]:
# Split (Depression): stratified 80/20 train/test split on the depression label
X = data[f_cols]
y = data['Depression_cat']
# A fixed random_state makes the split repeatable, so the feature ranking and
# accuracy table computed from it are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.20, random_state=42
)

In [None]:
# list of descending importances of features (svm) (Depression)
# Fit a linear SVM on all features inside this cell: nothing above defines
# `svm`, so reading `svm.coef_` here failed on a fresh kernel.
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
# NOTE(review): coef_[0] is only the first row of the coefficient matrix; with
# more than two classes the remaining rows are ignored — confirm this is intended.
svm_imp = list(
    pd.Series(abs(svm.coef_[0]), index=f_cols).sort_values(ascending=False).index
)
depression_imp = svm_imp

In [None]:
# Balanced accuracy on the test split as the lowest-ranked features are dropped (Depression)
n_total = len(svm_imp)
removed, bacc = [], []
for n_dropped in range(47, n_total):
    cols = svm_imp[:n_total - n_dropped]  # keep only the top-ranked features
    svm = SVC(kernel='linear').fit(X_train[cols], y_train)
    score = metrics.balanced_accuracy_score(y_test, svm.predict(X_test[cols]))
    removed.append(n_dropped)
    bacc.append(score)
df_depression = pd.DataFrame({'features_removed': removed, 'Balanced_accuracy': bacc})

In [None]:
# Split (Anxiety): stratified 80/20 train/test split on the anxiety label
X = data[f_cols]
y = data['Anxiety_cat']
# A fixed random_state makes the split repeatable, so the feature ranking and
# accuracy table computed from it are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.20, random_state=42
)

In [None]:
# list of descending importances of features (svm) (Anxiety)
# Fit a linear SVM on all features for this target. Without this, `svm` is
# whatever model an earlier cell left behind (trained on another label and on
# a reduced feature set), so its coefficients do not line up with f_cols.
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
# NOTE(review): coef_[0] is only the first row of the coefficient matrix; with
# more than two classes the remaining rows are ignored — confirm this is intended.
svm_imp = list(
    pd.Series(abs(svm.coef_[0]), index=f_cols).sort_values(ascending=False).index
)
anxiety_imp = svm_imp

In [None]:
# Balanced accuracy on the test split as the lowest-ranked features are dropped (Anxiety)
n_total = len(svm_imp)
removed, bacc = [], []
for n_dropped in range(47, n_total):
    cols = svm_imp[:n_total - n_dropped]  # keep only the top-ranked features
    svm = SVC(kernel='linear').fit(X_train[cols], y_train)
    score = metrics.balanced_accuracy_score(y_test, svm.predict(X_test[cols]))
    removed.append(n_dropped)
    bacc.append(score)
df_anxiety = pd.DataFrame({'features_removed': removed, 'Balanced_accuracy': bacc})

In [None]:
# Split (Stress): stratified 80/20 train/test split on the stress label
X = data[f_cols]
y = data['Stress_cat']
# A fixed random_state makes the split repeatable, so the feature ranking and
# accuracy table computed from it are the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, test_size=0.20, random_state=42
)

In [None]:
# list of descending importances of features (svm) (Stress)
# Fit a linear SVM on all features for this target. Without this, `svm` is
# whatever model an earlier cell left behind (trained on another label and on
# a reduced feature set), so its coefficients do not line up with f_cols.
svm = SVC(kernel='linear')
svm.fit(X_train, y_train)
# NOTE(review): coef_[0] is only the first row of the coefficient matrix; with
# more than two classes the remaining rows are ignored — confirm this is intended.
svm_imp = list(
    pd.Series(abs(svm.coef_[0]), index=f_cols).sort_values(ascending=False).index
)
stress_imp = svm_imp

In [None]:
# Balanced accuracy on the test split as the lowest-ranked features are dropped (Stress)
n_total = len(svm_imp)
removed, bacc = [], []
for n_dropped in range(47, n_total):
    cols = svm_imp[:n_total - n_dropped]  # keep only the top-ranked features
    svm = SVC(kernel='linear').fit(X_train[cols], y_train)
    score = metrics.balanced_accuracy_score(y_test, svm.predict(X_test[cols]))
    removed.append(n_dropped)
    bacc.append(score)
df_stress = pd.DataFrame({'features_removed': removed, 'Balanced_accuracy': bacc})

In [None]:
# Display balanced accuracy per number of features removed (Depression)
df_depression

In [None]:
# Display balanced accuracy per number of features removed (Anxiety)
df_anxiety

In [None]:
# Display balanced accuracy per number of features removed (Stress)
df_stress

In [None]:
# Accumulator for the feature names retained across the three targets
keep = list()

In [None]:
# Depression: b_acc stays > 0.9 with the bottom 53 features removed,
# so keep everything ranked above them
n_kept = len(depression_imp) - 53
keep = [*keep, *depression_imp[:n_kept]]

In [None]:
# Anxiety: b_acc stays > 0.9 with the bottom 52 features removed,
# so keep everything ranked above them
n_kept = len(anxiety_imp) - 52
keep = [*keep, *anxiety_imp[:n_kept]]

In [None]:
# Stress: b_acc stays > 0.9 with the bottom 52 features removed,
# so keep everything ranked above them
n_kept = len(stress_imp) - 52
keep = [*keep, *stress_imp[:n_kept]]

In [None]:
# Drop duplicate feature names while keeping first-seen order. list(set(keep))
# returned the same names, but in an arbitrary order that can differ between
# kernel sessions.
keep = list(dict.fromkeys(keep))

In [None]:
# Number of feature names currently held in `keep`
len(keep)