In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report,confusion_matrix,roc_auc_score,roc_curve,auc,f1_score,precision_score
from sklearn import metrics
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score, KFold

In [12]:
PHQ9Post = pd.DataFrame()
PHQ9Post = pd.read_csv('../../dataset/PHQ9/PHQ9PostClassified.csv')
# print(PHQ9Pre)

In [13]:
AllMerged = pd.DataFrame()
AllMerged = pd.read_csv('../../dataset/BasicFeatures/Merged/AllMerged.csv')
# print(LockData)

## Model Code

In [14]:
#Function to train and evaluate:
def TrainandEval(model, features, target, name):
    print(name)
    X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)
    model.fit(X_train, y_train)

    y_pred = model.predict(X_test)
    print("Number of mislabeled points out of a total %d points : %d" % (X_test.shape[0], (y_test != y_pred).sum()))
    print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
    tn, fp, fn, tp = metrics.confusion_matrix(y_test, y_pred).ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    balanced_accuracy = (sensitivity + specificity) / 2

    print("Sensitivity:", sensitivity)
    print("Specificity:", specificity)
    print("Balanced Accuracy:", balanced_accuracy)


In [15]:
#declare all model vars:
LR = LogisticRegression()
SVM = SVC()
RF = RandomForestClassifier() 
XGB = XGBClassifier()

In [16]:
def RunAllModels(data,PHQ9):
    data = pd.merge(data, PHQ9, on='userId')
    data = data.drop(columns=['userId','WeekId'])
    X = data.drop(['PHQ9'], axis=1)
    y= data['PHQ9']
    TrainandEval(LogisticRegression(), X, y, 'LR')
    TrainandEval(SVM, X, y, 'SVM')
    TrainandEval(RF, X, y, 'RF')
    TrainandEval(XGB, X, y, 'XGB')

In [17]:
def RunAllModelsNorm(data,PHQ9):
    data = pd.merge(data, PHQ9, on='userId')
    data = data.drop(columns=['userId','WeekId'])
    X = data.drop(['PHQ9'], axis=1)

    scaler = MinMaxScaler()
    XNorm = scaler.fit_transform(X)

    y= data['PHQ9']
    TrainandEval(LogisticRegression(), XNorm, y, 'LR')
    TrainandEval(SVM, XNorm, y, 'SVM')
    TrainandEval(RF, XNorm, y, 'RF')
    TrainandEval(XGB, XNorm, y, 'XGB')

In [18]:
def RunAllModelsOversampled(data,PHQ9):
    data = pd.merge(data, PHQ9, on='userId')
    data = data.drop(columns=['userId','WeekId'])
    X = data.drop(['PHQ9'], axis=1)
    y= data['PHQ9']

    # Normatisation
    scaler = MinMaxScaler()
    XNorm = scaler.fit_transform(X)

    # Oversampling
    oversampler = SMOTE(random_state=42)
    features_resampled, target_resampled = oversampler.fit_resample(XNorm, y)

    TrainandEval(LogisticRegression(), features_resampled, target_resampled, 'LR')
    TrainandEval(SVM, features_resampled, target_resampled, 'SVM')
    TrainandEval(RF, features_resampled, target_resampled, 'RF')
    TrainandEval(XGB, features_resampled, target_resampled, 'XGB')
    



## Normalised Data

In [19]:
RunAllModelsNorm(AllMerged,PHQ9Post)

LR
Number of mislabeled points out of a total 43 points : 8
Accuracy: 0.813953488372093
Sensitivity: 0.5
Specificity: 1.0
Balanced Accuracy: 0.75
SVM
Number of mislabeled points out of a total 43 points : 5
Accuracy: 0.8837209302325582
Sensitivity: 0.6875
Specificity: 1.0
Balanced Accuracy: 0.84375
RF
Number of mislabeled points out of a total 43 points : 5
Accuracy: 0.8837209302325582
Sensitivity: 0.6875
Specificity: 1.0
Balanced Accuracy: 0.84375
XGB
Number of mislabeled points out of a total 43 points : 6
Accuracy: 0.8604651162790697
Sensitivity: 0.6875
Specificity: 0.9629629629629629
Balanced Accuracy: 0.8252314814814814


## Oversampled Models

In [20]:
RunAllModelsOversampled(AllMerged,PHQ9Post)

LR
Number of mislabeled points out of a total 55 points : 7
Accuracy: 0.8727272727272727
Sensitivity: 0.7931034482758621
Specificity: 0.9615384615384616
Balanced Accuracy: 0.8773209549071619
SVM
Number of mislabeled points out of a total 55 points : 1
Accuracy: 0.9818181818181818
Sensitivity: 0.9655172413793104
Specificity: 1.0
Balanced Accuracy: 0.9827586206896552
RF
Number of mislabeled points out of a total 55 points : 3
Accuracy: 0.9454545454545454
Sensitivity: 0.896551724137931
Specificity: 1.0
Balanced Accuracy: 0.9482758620689655
XGB
Number of mislabeled points out of a total 55 points : 1
Accuracy: 0.9818181818181818
Sensitivity: 0.9655172413793104
Specificity: 1.0
Balanced Accuracy: 0.9827586206896552


## K-Fold

In [21]:
# Function to calculate sensitivity, specificity, and balanced accuracy
def calculate_metrics(y_true, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    balanced_accuracy = (sensitivity + specificity) / 2
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred)
    return sensitivity, specificity, balanced_accuracy, accuracy, precision

# Function to train and evaluate with k-fold cross-validation
def TrainandEvalWithCrossValidation(model, features, target, name, cv=5):
    print(name)
    # Perform k-fold cross-validation
    kfold = KFold(n_splits=cv, shuffle=True, random_state=42)
    
    accuracies = []
    sensitivities = []
    specificities = []
    balanced_accuracies = []
    precisions = []


    for train_idx, test_idx in kfold.split(features):
        X_train, X_test = features[train_idx], features[test_idx]
        y_train, y_test = target[train_idx], target[test_idx]

        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        sensitivity, specificity, balanced_accuracy, accuracy, precision = calculate_metrics(y_test, y_pred)


        sensitivities.append(sensitivity)
        specificities.append(specificity)
        balanced_accuracies.append(balanced_accuracy)
        accuracies.append(accuracy)
        precisions.append(precision)

    # Print average metrics across all folds
    print("Mean Accuracy:", sum(accuracies) / len(accuracies))
    print("Mean Balanced Accuracy:", sum(balanced_accuracies) / len(balanced_accuracies))
    print("Mean Sensitivity:", sum(sensitivities) / len(sensitivities))
    print("Mean Specificity:", sum(specificities) / len(specificities))
    print("Mean Precision:", sum(precisions) / len(precisions))

# Amend function to run all models with k-fold cross-validation
def RunAllModelsNormWithCrossValidation(data, PHQ9, cv=5):
    data = pd.merge(data, PHQ9, on='userId')
    data = data.drop(columns=['userId', 'WeekId'])
    X = data.drop(['PHQ9'], axis=1)
    y = data['PHQ9']

    scaler = MinMaxScaler()
    XNorm = scaler.fit_transform(X)

    
    TrainandEvalWithCrossValidation(LogisticRegression(), XNorm, y, 'LR', cv)
    TrainandEvalWithCrossValidation(SVM, XNorm, y, 'SVM', cv)
    TrainandEvalWithCrossValidation(RF, XNorm, y, 'RF', cv)
    TrainandEvalWithCrossValidation(XGB, XNorm, y, 'XGB', cv)


In [22]:
RunAllModelsNormWithCrossValidation(AllMerged,PHQ9Post , 10)

LR
Mean Accuracy: 0.8874458874458874
Mean Balanced Accuracy: 0.8735876623376623
Mean Sensitivity: 0.8043181818181818
Mean Specificity: 0.9428571428571428
Mean Precision: 0.88015873015873
SVM
Mean Accuracy: 0.8777056277056277
Mean Balanced Accuracy: 0.8566035353535353
Mean Sensitivity: 0.7632070707070708
Mean Specificity: 0.95
Mean Precision: 0.8928571428571427
RF
Mean Accuracy: 0.8965367965367965
Mean Balanced Accuracy: 0.8729608585858586
Mean Sensitivity: 0.7605050505050506
Mean Specificity: 0.9854166666666668
Mean Precision: 0.9708333333333332
XGB
Mean Accuracy: 0.9062770562770565
Mean Balanced Accuracy: 0.8910416666666666
Mean Sensitivity: 0.8175000000000001
Mean Specificity: 0.9645833333333333
Mean Precision: 0.9311507936507937


## K-fold oversampled

In [23]:
# Amend function to run all models with k-fold cross-validation
def RunAllModelsNormOversampledWithCrossValidation(data, PHQ9, cv=5):
    data = pd.merge(data, PHQ9, on='userId')
    data = data.drop(columns=['userId', 'WeekId'])
    X = data.drop(['PHQ9'], axis=1)
    y = data['PHQ9']

    scaler = MinMaxScaler()
    XNorm = scaler.fit_transform(X)



     # Oversampling
    oversampler = SMOTE(random_state=42)
    features_resampled, target_resampled = oversampler.fit_resample(XNorm, y)


    TrainandEvalWithCrossValidation(LogisticRegression(), features_resampled, target_resampled, 'LR', cv)
    TrainandEvalWithCrossValidation(SVM, features_resampled, target_resampled, 'SVM', cv)
    TrainandEvalWithCrossValidation(RF, features_resampled, target_resampled, 'RF', cv)
    TrainandEvalWithCrossValidation(XGB, features_resampled, target_resampled, 'XGB', cv)



In [24]:
RunAllModelsNormOversampledWithCrossValidation(AllMerged,PHQ9Post, 10)

LR
Mean Accuracy: 0.9043650793650793
Mean Balanced Accuracy: 0.9116012622997918
Mean Sensitivity: 0.9555555555555555
Mean Specificity: 0.867646969044028
Mean Precision: 0.8760401853048914
SVM
Mean Accuracy: 0.9482804232804231
Mean Balanced Accuracy: 0.9477004327371976
Mean Sensitivity: 0.9933333333333334
Mean Specificity: 0.9020675321410616
Mean Precision: 0.9116712454212454
RF
Mean Accuracy: 0.9522486772486772
Mean Balanced Accuracy: 0.9580195090856856
Mean Sensitivity: 0.9555555555555555
Mean Specificity: 0.9604834626158156
Mean Precision: 0.9534981684981686
XGB
Mean Accuracy: 0.9482804232804234
Mean Balanced Accuracy: 0.9530575755943402
Mean Sensitivity: 0.9734188034188035
Mean Specificity: 0.9326963477698772
Mean Precision: 0.9288873626373626
