# Predicting depression - DASS-42

#### Objective: Implement four supervised machine learning models with the goal of predicting depression from a labelled dataset. The data comes from the DASS-42 (Depression Anxiety Stress Scales), a 42-item questionnaire that asks participants about their mental health.
## Steps

1. EDA - Understand data, clean data and analyse relationships
2. Data preprocessing - Create target variables
3. Implement - Create and use the models
4. Results - Predictions
5. Evaluation/refining - Improve accuracy

#### Note: To classify, Q1A to Q42A will be used to build the target. Other features will be dropped or used as part of the training/testing.

In [None]:
# All libraries required
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
%matplotlib inline
import pandas as pd
import sklearn as sk
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler
import IPython
!pip install nbconvert
!pip install sweetviz
import sweetviz as sv

#classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn import svm
from sklearn.svm import SVC

#metrics/other
from sklearn.metrics import accuracy_score, log_loss, confusion_matrix
from google.colab import drive
from sklearn.metrics import classification_report
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.metrics import recall_score

In [None]:
# Read the tab-separated survey export from Google Drive into a DataFrame.
data_path = "/content/drive/MyDrive/Colab Notebooks/CN6000_diss/data3.csv"
df = pd.read_csv(data_path, sep='\t')
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option("display.max_columns", None)

In [None]:
# Build the sweetviz EDA report for the raw data and write it to
# 'analyze.html'; open_browser=False keeps it from launching a browser.
report =  sv.analyze(df)
report.show_html ('analyze.html', open_browser = False)

#### Initial report generated from the sweetviz library

In [None]:
report.show_notebook()

In [None]:
df.sample(4) #random selection

In [None]:
df.tail(2) #show the last two records

In [None]:
df.info()

In [None]:
print(df.duplicated().sum())

In [None]:
# Record the dataset dimensions; shape is a (rows, columns) tuple.
df_size = df.shape
row, col = df_size

print("the number of rows:", row)
print("the number of columns:", col)

In [None]:
df.describe() #describe all numerical columns

In [None]:
df.Q1A.describe()

In [None]:
df.age.describe()

In [None]:
df['age'].unique()

Some people decided to enter their year of birth instead of age. These are considered outliers.

In [None]:
# Ages above 90 are treated as outliers (some participants typed their year
# of birth into the age field), so keep only the rows that are not above 90.
too_old = df.age > 90
df = df.loc[~too_old]
df.age.unique()

In [None]:
df['major'].unique()

In [None]:
df['major'].value_counts()

In [None]:
df.isnull().sum() # the column for "major" has 11403 null values

In [None]:
# Visualise where values are missing in df: the heatmap is drawn from the
# boolean null mask, one row per participant and one column per feature.
missing_mask = df.isnull()
plt.figure(figsize=(17, 8))
sns.heatmap(missing_mask)

#### When dealing with null values, we can either delete or replace those values. In this case, I will replace every null value with the value "No degree".

In [None]:
# Create a new dataframe in which every missing value, in any column, is
# replaced by the string 'No degree'; df itself is left unchanged.
df2 = df.fillna('No degree')
df2

In [None]:
df2.major.describe()

In [None]:
df2['major'].value_counts()

In [None]:
df2_eng = df2[df2["major"] == "English"] #show all english majors
df2_eng

In [None]:
# Fold the lower-case spelling into 'English' so both are counted as one
# major. replace() matches whole cell values across every column of df2.
df2 = df2.replace(to_replace='english', value='English')

In [None]:
# Bar chart of the 20 most frequent majors. normalize=True reports each
# major as a proportion of all rows (a value between 0 and 1).
major_share = df2.major.value_counts(normalize=True)
major_share.head(20).plot.bar()
plt.show()

In [None]:
df2.isnull().sum() # df2 has no null values

In [None]:
# Same null-mask heatmap as before, now for df2 (after the fillna step).
missing_mask = df2.isnull()
plt.figure(figsize=(17, 8))
sns.heatmap(missing_mask)

### Since I am classifying for depression, I need to drop features that have little to no bearing on depression.

In [None]:
# building a list to delete features
# Names of the columns to delete before modelling, in the order listed.
remove = (
    [f'Q{n}E' for n in range(1, 43)]       # 'Q1E' to 'Q42E'
    + [f'Q{n}I' for n in range(1, 43)]     # 'Q1I' to 'Q42I'
    + [f'VCL{n}' for n in range(1, 17)]    # 'VCL1' to 'VCL16'
    + [
        'source', 'voted', 'introelapse', 'hand', 'orientation',
        'uniquenetworklocation', 'surveyelapse', 'engnat',
        'country', 'testelapse', 'screensize',
    ]
)

In [None]:
# Drop the listed columns into a new dataframe and preview its first rows.
df3 = df2.drop(columns=remove)
df3.head()

In [None]:
df3.religion.describe()

In [None]:
df3.TIPI1.describe()

In [None]:
df3.shape #we now have 63 columns

In [None]:
# Pairwise correlation between the numeric features only. df3 still holds
# the text column 'major' at this point, and pandas >= 2.0 raises on
# non-numeric columns unless numeric_only=True is passed (the argument
# needs pandas >= 1.5).
correlation = df3.corr(numeric_only=True)

In [None]:
# Heatmap of the feature-to-feature correlations, with both axes labelled
# by column name.
feature_names = correlation.columns
plt.figure(figsize=(12, 12))
sns.heatmap(correlation, xticklabels=feature_names, yticklabels=feature_names)

In [None]:
# Null-mask heatmap for df3, the frame left after dropping unused columns.
missing_mask = df3.isnull()
plt.figure(figsize=(17, 8))
sns.heatmap(missing_mask)

In [None]:
# Show the correlation table for the numeric columns. numeric_only=True
# skips the text column 'major', which pandas >= 2.0 would otherwise
# reject with an error (the argument needs pandas >= 1.5).
df3.corr(numeric_only=True)

In [None]:
# Pie chart of the share of participants recorded under each gender code.
gender_share = df3.gender.value_counts(normalize=True)
plt.figure(figsize=(4, 4))
gender_share.plot.pie()
plt.legend()
plt.show()

2 is female, 1 is male, 0 and 3 are classified as others.

In [None]:
df3.gender.unique()

In [None]:
# Pie chart of the share of participants recorded under each marital code.
married_share = df3.married.value_counts(normalize=True)
plt.figure(figsize=(4, 4))
married_share.plot.pie()
plt.legend()
plt.show()

1 denotes participants who have never been married; 2 denotes participants who have been married.

In [None]:
df3.married.unique()

In [None]:
# Bar chart of the share of participants at each education code.
# normalize=True reports proportions (values between 0 and 1).
education_share = df3.education.value_counts(normalize=True)
education_share.head(20).plot.bar()
plt.show()

0 and 1: less than high school; 2: high school completed; 3: university degree; 4: postgraduate degree.

In [None]:
# Total score per participant: the row-wise sum of the 42 answer columns
# 'Q1A' to 'Q42A'.
sumofQ = [f'Q{i}A' for i in range(1, 43)]
# skipna=False keeps a missing answer as a missing total, exactly as adding
# the columns one by one would.
df3['score'] = df3[sumofQ].sum(axis=1, skipna=False)
sns.displot(x=df3['score'])

In [None]:
def depressionindicators(value):      #function to create indicator target
    """Map a total questionnaire score to a severity indicator from 0 to 4.

    Bands: 0 for scores up to 50, 1 for 51-70, 2 for 71-80, 3 for 81-100
    and 4 for anything above 100.

    The previous version only returned 4 above 130, so scores from 101 to
    130 matched no branch and came back as None (NaN in the indicator
    column).
    NOTE(review): 101-130 is assigned to the top band here; confirm that
    this is the intended cut-off.

    Returns None only when the score cannot be ordered (e.g. NaN).
    """
    if value <= 50:
        return 0
    if value <= 70:
        return 1
    if value <= 80:
        return 2
    if value <= 100:
        return 3
    if value > 100:
        return 4
    return None  # NaN compares False against every threshold above
# Label every participant with a severity band derived from the total
# score, then plot how many participants fall into each band.
df3['indicator'] = df3['score'].apply(depressionindicators)
plt.figure(figsize=(6, 6))
sns.countplot(x=df3['indicator']);

In [None]:
#object values in df3
df3_cat=df2.select_dtypes(object)
df3_cat

In [None]:
# Turn each distinct 'major' string into an integer code so the classifiers
# can use it. The encoder is fitted on df3's own 'major' column (nulls were
# already replaced by 'No degree'); the previous version encoded
# df.iloc[:, 1], i.e. whichever column sits second in the raw frame, and
# stored that result as 'major'.
encodedoutput_Y = LabelEncoder()
# astype(str) gives the encoder one uniform type to sort and encode.
df3['major'] = encodedoutput_Y.fit_transform(df3['major'].astype(str))
df3

In [None]:
#12355 nulls values in the indicator column initially
nan_count = df3.isna().sum()
print(nan_count )

In [None]:
# Rows whose target label is missing are removed rather than imputed:
# filling 'indicator' with the column mean invents a fractional class that
# the later astype('int') truncates into a wrong label.
df3 = df3.dropna(subset=['indicator'])
# Any remaining gaps in numeric feature columns are filled with that
# column's mean.
df3 = df3.fillna(df3.mean(numeric_only=True))
df3

## Splitting dataset

In [None]:
# Target: the severity indicator, stored as integers.
y = df3['indicator'].astype('int')
# Features: every column except the target and the score it is derived from.
x = df3.drop(columns=['score', 'indicator'])

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state makes the
# split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
# Report the shape of each split.
print(f'x_train: {x_train.shape}, y_train: {y_train.shape}')
print(f'x_test: {x_test.shape}, y_test: {y_test.shape}')

Comparison report between x_train and x_test using sweetviz library

In [None]:
# Compare the training and test feature sets with sweetviz (labelled
# 'Train' and 'Test') and write the report to 'Compare.html';
# open_browser=False keeps it from launching a browser.
compare_report = sv.compare([x_train, 'Train'], [x_test, 'Test'])
compare_report.show_html('Compare.html', open_browser = False)

In [None]:
# Display the saved comparison report inline in the notebook.
# NOTE(review): presumably this relies on IPython treating the string as a
# file path when that file exists; HTML(filename='Compare.html') would make
# the intent explicit — confirm.
IPython.display.HTML('Compare.html')

In [None]:
#normalization
scale = StandardScaler()
x_train_scaled = scale.fit_transform(x_train)
x_test_scaled = scale.transform(x_test)

## Naive bayes classifier

In [None]:
nb_classifier = MultinomialNB()

In [None]:
nb_classifier.fit(x_train, y_train)

In [None]:
y_pred = nb_classifier.predict(x_test)

In [None]:
print('Accuracy Score:', accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix for the latest predictions: rows are the actual classes,
# columns the predicted classes, and every cell is annotated with its count.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
for (actual, predicted), count in np.ndenumerate(conf_matrix):
    ax.text(x=predicted, y=actual, s=count, va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
# Text summary of the per-class metrics for the same predictions.
print(classification_report(y_test, y_pred))

In [None]:
gnb = GaussianNB()
gnb.fit(x_train, y_train)

In [None]:
y_pred = gnb.predict(x_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Confusion matrix for the latest predictions: rows are the actual classes,
# columns the predicted classes, and every cell is annotated with its count.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
for (actual, predicted), count in np.ndenumerate(conf_matrix):
    ax.text(x=predicted, y=actual, s=count, va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
# Text summary of the per-class metrics for the same predictions.
print(classification_report(y_test, y_pred))

In [None]:
bnb = BernoulliNB()
bnb.fit(x_train, y_train)

In [None]:
y_pred = bnb.predict(x_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Confusion matrix for the latest predictions: rows are the actual classes,
# columns the predicted classes, and every cell is annotated with its count.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
for (actual, predicted), count in np.ndenumerate(conf_matrix):
    ax.text(x=predicted, y=actual, s=count, va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
# Text summary of the per-class metrics for the same predictions.
print(classification_report(y_test, y_pred))

## Random forest classifier

In [None]:
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42) #100 trees

In [None]:
rf_classifier.fit(x_train, y_train)

In [None]:
y_pred = rf_classifier.predict(x_test)

In [None]:
print('Accuracy Score:', accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix for the latest predictions: rows are the actual classes,
# columns the predicted classes, and every cell is annotated with its count.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
for (actual, predicted), count in np.ndenumerate(conf_matrix):
    ax.text(x=predicted, y=actual, s=count, va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
# Text summary of the per-class metrics for the same predictions.
print(classification_report(y_test, y_pred))

## Decision tree classifier

In [None]:
rdt_classifier = DecisionTreeClassifier(random_state=42)

In [None]:
rdt_classifier.fit(x_train, y_train)
y_pred = rdt_classifier.predict(x_test)

In [None]:
print('Accuracy Score:', accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix for the latest predictions: rows are the actual classes,
# columns the predicted classes, and every cell is annotated with its count.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
for (actual, predicted), count in np.ndenumerate(conf_matrix):
    ax.text(x=predicted, y=actual, s=count, va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
# Text summary of the per-class metrics for the same predictions.
print(classification_report(y_test, y_pred))

## SVM

In [None]:
svm_classifier = SVC(kernel='linear')

In [None]:
# SVMs are sensitive to feature scale, so train and predict on the
# standardised splits from the StandardScaler cell. Those splits were
# computed earlier but never used; the raw features were passed instead.
svm_classifier.fit(x_train_scaled, y_train)
y_pred = svm_classifier.predict(x_test_scaled)


In [None]:
print('Accuracy Score:', accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix for the latest predictions: rows are the actual classes,
# columns the predicted classes, and every cell is annotated with its count.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
for (actual, predicted), count in np.ndenumerate(conf_matrix):
    ax.text(x=predicted, y=actual, s=count, va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
# Text summary of the per-class metrics for the same predictions.
print(classification_report(y_test, y_pred))

In [None]:
poly_svm = svm.SVC(kernel='poly', degree=3)

In [None]:
# SVMs are sensitive to feature scale, so train and predict on the
# standardised splits from the StandardScaler cell. Those splits were
# computed earlier but never used; the raw features were passed instead.
poly_svm.fit(x_train_scaled, y_train)
y_pred = poly_svm.predict(x_test_scaled)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# Confusion matrix for the latest predictions: rows are the actual classes,
# columns the predicted classes, and every cell is annotated with its count.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
for (actual, predicted), count in np.ndenumerate(conf_matrix):
    ax.text(x=predicted, y=actual, s=count, va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
# Text summary of the per-class metrics for the same predictions.
print(classification_report(y_test, y_pred))

In [None]:
rbf_svm = svm.SVC(kernel='rbf', gamma='scale')

In [None]:
# SVMs are sensitive to feature scale, so train and predict on the
# standardised splits from the StandardScaler cell. Those splits were
# computed earlier but never used; the raw features were passed instead.
rbf_svm.fit(x_train_scaled, y_train)
y_pred = rbf_svm.predict(x_test_scaled)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# Confusion matrix for the latest predictions: rows are the actual classes,
# columns the predicted classes, and every cell is annotated with its count.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
for (actual, predicted), count in np.ndenumerate(conf_matrix):
    ax.text(x=predicted, y=actual, s=count, va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
# Text summary of the per-class metrics for the same predictions.
print(classification_report(y_test, y_pred))

In [None]:
sig_svm = svm.SVC(kernel='sigmoid')

In [None]:
# SVMs are sensitive to feature scale, so train and predict on the
# standardised splits from the StandardScaler cell. Those splits were
# computed earlier but never used; the raw features were passed instead.
sig_svm.fit(x_train_scaled, y_train)
y_pred = sig_svm.predict(x_test_scaled)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")

In [None]:
# Confusion matrix for the latest predictions: rows are the actual classes,
# columns the predicted classes, and every cell is annotated with its count.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
for (actual, predicted), count in np.ndenumerate(conf_matrix):
    ax.text(x=predicted, y=actual, s=count, va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
# Text summary of the per-class metrics for the same predictions.
print(classification_report(y_test, y_pred))

## Refining models

In [None]:
# binarize is the numeric threshold above which a feature is mapped to 1.
# It is written as 1.0 rather than the boolean True, which only worked
# through implicit bool-to-number coercion.
bnb2 = BernoulliNB(binarize=1.0)
bnb2.fit(x_train, y_train)

In [None]:
y_pred = bnb2.predict(x_test)

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

In [None]:
# Confusion matrix for the latest predictions: rows are the actual classes,
# columns the predicted classes, and every cell is annotated with its count.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
for (actual, predicted), count in np.ndenumerate(conf_matrix):
    ax.text(x=predicted, y=actual, s=count, va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
# Text summary of the per-class metrics for the same predictions.
print(classification_report(y_test, y_pred))

In [None]:
rdt_classifier2 = DecisionTreeClassifier(random_state=42,criterion = 'entropy', max_depth = 16)

In [None]:
rdt_classifier2.fit(x_train, y_train)
y_pred = rdt_classifier2.predict(x_test)

In [None]:
print('Accuracy Score:', accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix for the latest predictions: rows are the actual classes,
# columns the predicted classes, and every cell is annotated with its count.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
for (actual, predicted), count in np.ndenumerate(conf_matrix):
    ax.text(x=predicted, y=actual, s=count, va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
# Text summary of the per-class metrics for the same predictions.
print(classification_report(y_test, y_pred))

In [None]:
rdt_classifier3 = DecisionTreeClassifier(random_state=42,criterion = 'gini', max_depth = 16)

In [None]:
rdt_classifier3.fit(x_train, y_train)
y_pred = rdt_classifier3.predict(x_test)

In [None]:
print('Accuracy Score:', accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix for the latest predictions: rows are the actual classes,
# columns the predicted classes, and every cell is annotated with its count.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
for (actual, predicted), count in np.ndenumerate(conf_matrix):
    ax.text(x=predicted, y=actual, s=count, va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
# Text summary of the per-class metrics for the same predictions.
print(classification_report(y_test, y_pred))

In [None]:
# Depth-limited decision tree using the 'log_loss' split criterion.
# NOTE(review): 'log_loss' appears to be an alias of 'entropy' in
# scikit-learn's DecisionTreeClassifier; if so, this tree duplicates
# rdt_classifier2 — confirm.
rdt_classifier4 = DecisionTreeClassifier(random_state=42,criterion = 'log_loss', max_depth = 16)

In [None]:
rdt_classifier4.fit(x_train, y_train)
y_pred = rdt_classifier4.predict(x_test)

In [None]:
print('Accuracy Score:', accuracy_score(y_test, y_pred))

In [None]:
# Confusion matrix for the latest predictions: rows are the actual classes,
# columns the predicted classes, and every cell is annotated with its count.
conf_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
fig, ax = plt.subplots(figsize=(5, 5))
ax.matshow(conf_matrix, cmap=plt.cm.Blues, alpha=0.3)
for (actual, predicted), count in np.ndenumerate(conf_matrix):
    ax.text(x=predicted, y=actual, s=count, va='center', ha='center', size='xx-large')

plt.xlabel('Predictions', fontsize=18)
plt.ylabel('Actuals', fontsize=18)
plt.title('Confusion Matrix', fontsize=18)
plt.show()
# Text summary of the per-class metrics for the same predictions.
print(classification_report(y_test, y_pred))