### Data Classification and Feature Selection 📚

In [None]:
import shap
import xgboost
import numpy as np
import pandas as pd
from umap import UMAP
import networkx as nx
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score,roc_curve, auc, log_loss
from sklearn.metrics import roc_curve, precision_recall_curve
import json

In [None]:
# Show up to 200 columns when a DataFrame is rendered, so wide frames are not truncated
pd.set_option('display.max_columns', 200)

In [None]:
# Load the labeled dataset; the first CSV column is used as the row index
df = pd.read_csv('../data/data_labeled.csv', index_col=0)

# Load the codebook dictionary. The encoding is given explicitly because JSON
# text is UTF-8, while open() otherwise falls back to the platform's locale
# encoding (which is not UTF-8 on every system).
with open('../data/codebook_dict.json', 'r', encoding='utf-8') as f:
    code = json.load(f)

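About the question (Q) columns: each survey question `n` has three columns, distinguished by their suffix:
* `Q<n>A` holds the answer given to the question
* `Q<n>I` holds the position of the question in the survey
* `Q<n>E` holds the time (in ms) it took to answer the question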

In [None]:
# Preview the first five rows of the loaded frame
df.head(n=5)

DASS results

Calculate DASS score and categories according to:
https://www.psytoolkit.org/survey-library/depression-anxiety-stress-dass.html

The severity of depression, anxiety and stress is categorized into five levels:

    0 - Normal
    1 - Mild
    2 - Moderate
    3 - Severe
    4 - Extremely severe


In [None]:
# Map each numeric severity category (0-4) to its human-readable label
diagnosis = dict(enumerate(['Normal', 'Mild', 'Moderate', 'Severe', 'Extremely Severe']))

In [None]:
# Question numbers that belong to each DASS subscale
DASS_keys = dict(
    Depression=[3, 5, 10, 13, 16, 17, 21, 24, 26, 31, 34, 37, 38, 42],
    Anxiety=[2, 4, 7, 9, 15, 19, 20, 23, 25, 28, 30, 36, 40, 41],
    Stress=[1, 6, 8, 11, 12, 14, 18, 22, 27, 29, 32, 33, 35, 39],
)

# Score bins per subscale, stored as (lower, upper) pairs built from consecutive
# cut-off points. NOTE(review): four bins are listed for five severity levels, so
# scores past the last cut-off are presumably treated as the top category by the
# code that applies these bins -- confirm there.
DASS_bins = {
    scale: list(zip(cutoffs[:-1], cutoffs[1:]))
    for scale, cutoffs in (
        ('Depression', [0, 10, 14, 21, 28]),
        ('Anxiety', [0, 8, 10, 15, 20]),
        ('Stress', [0, 15, 19, 26, 34]),
    )
}
             

Basic EDA and Classifier

In [None]:
# Keep the columns whose name matches Q<1-2 digits>A, plus the depression category.
# The pattern is a raw string so that `\d` reaches the regex engine as written
# instead of triggering Python's invalid-escape-sequence warning.
df_Qdep = df.filter(regex=r'Q\d{1,2}A|Depression_cat')

# Rows per depression category; Q1A only serves as the column being counted
# (count() tallies its non-null entries).
counts_dep = (
    df_Qdep.groupby('Depression_cat')['Q1A']
    .count()
    .reset_index()
    .rename(columns={'Q1A': 'dep_counts'})
)
# Human-readable severity label for each category code
counts_dep['severity'] = counts_dep.Depression_cat.replace(diagnosis)
# Share of each category as a percentage of all counted rows
counts_dep['percentage'] = counts_dep.dep_counts / counts_dep.dep_counts.sum() * 100

In [None]:
# Keep the columns whose name matches Q<1-2 digits>A, plus the anxiety category.
# The pattern is a raw string so that `\d` reaches the regex engine as written
# instead of triggering Python's invalid-escape-sequence warning.
df_Qanx = df.filter(regex=r'Q\d{1,2}A|Anxiety_cat')

# Rows per anxiety category. This cell previously recomputed the depression
# counts from df_Qdep and left df_Qanx unused; it now summarises the anxiety
# labels from df_Qanx. Q1A only serves as the column being counted.
counts_anx = (
    df_Qanx.groupby('Anxiety_cat')['Q1A']
    .count()
    .reset_index()
    .rename(columns={'Q1A': 'anx_counts'})
)
# Human-readable severity label for each category code
counts_anx['severity'] = counts_anx.Anxiety_cat.replace(diagnosis)
# Share of each category as a percentage of all counted rows
counts_anx['percentage'] = counts_anx.anx_counts / counts_anx.anx_counts.sum() * 100