In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, PredefinedSplit, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [2]:
# male/female train/test set creation

df = pd.read_json("../data/postdatalinesvectors.json",orient='records',lines=True)

df_male = df[df['gender'] == 'm']
df_female = df[df['gender'] == 'f']

# create male split
Xm = df_male[[str(i) for i in range(38)]]
ym = df_male['label']
X_trainm, X_testm, y_trainm, y_testm = train_test_split(Xm,ym, test_size=0.2, random_state=101)
# create female split
Xf = df_female[[str(i) for i in range(38)]]
yf = df_female['label']
X_trainf, X_testf, y_trainf, y_testf = train_test_split(Xf,yf, test_size=0.2, random_state=101)
# combine splits
X_train = pd.concat([X_trainm,X_trainf])
y_train = pd.concat([y_trainm,y_trainf])

X_test = pd.concat([X_testm,X_testf])
y_test = pd.concat([y_testm,y_testf])

# Scaling for SVM
scaler = StandardScaler()

X_train = scaler.fit_transform(X_train)
X_trainm = scaler.fit_transform(X_trainm)
X_trainf = scaler.fit_transform(X_trainf)

X_test = scaler.transform(X_test)
X_testm = scaler.transform(X_testm)
X_testf = scaler.transform(X_testf)

In [3]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

model = SVC(random_state=0, probability=True, class_weight='balanced', C=1000, gamma=10, kernel='rbf')
model.fit(X_train,y_train)

In [4]:
pred_male = model.predict(X_testm)
pred_female = model.predict(X_testf)

In [5]:
# Performance measures

def prediction_measures(y_test,y_pred,confusion=False):
    performance = classification_report(y_test,y_pred,output_dict=True)
    # display confusion matrix
    if confusion == True:
        confusionmatrix = confusion_matrix(y_test,y_pred)
        cm_display = ConfusionMatrixDisplay(confusion_matrix = confusionmatrix, display_labels = [False, True]) 
        cm_display.plot()
        plt.show()
    return performance['depression'],performance['normal'],performance['accuracy']

# Support measures

def percentage_depression(pred):
    return np.sum(pred == 'depression') / len(pred)

def true_positives(pred, true):
    return np.sum((pred == 'depression') & (true == "depression"))

def true_negatives(pred, true):
    return np.sum((pred == 'normal') & (true == "normal"))

def true_positive_rate(pred, true):
    return true_positives(pred,true)/(true_positives(pred,true)+true_negatives(pred,true))

def true_negative_rate(pred, true):
    return true_negatives(pred,true)/(true_positives(pred,true)+true_negatives(pred,true))

# Complete measures

def statistical_parity(predsensitive,predother):
    return percentage_depression(predsensitive)/percentage_depression(predother)

def equal_opportunity(predsensitive, truesensitive, predother, trueother):
    return true_positive_rate(predsensitive,truesensitive)/true_positive_rate(predother,trueother)

def equalised_odds(predsensitive, truesensitive, predother, trueother):
    return (true_positive_rate(predsensitive,truesensitive)+true_negative_rate(predsensitive,truesensitive))/(true_positive_rate(predother,trueother)+true_negative_rate(predsensitive,truesensitive))

def equal_accuracy(predsensitive, truesensitive, predother, trueother):
    return accuracy_score(predsensitive,truesensitive)/accuracy_score(predother,trueother)

# All measurements

def all_measures(predsensitive, truesensitive, predother, trueother, name='test',single=False):
    score_dict = {}
    score_dict['test'] = name
    depression_performance, normal_performance ,accuracy = prediction_measures(predsensitive,truesensitive)
    score_dict['depression0precision'] = depression_performance['precision']
    score_dict['depression0recall'] = depression_performance['recall']
    score_dict['depression0f1'] = depression_performance['f1-score']
    score_dict['depression0support'] = depression_performance['support']
    score_dict['normal0precision'] = normal_performance['precision']
    score_dict['normal0recall'] = normal_performance['recall']
    score_dict['normal0f1'] = normal_performance['f1-score']
    score_dict['normal0support'] = normal_performance['support']
    score_dict['accuracy0'] = accuracy
    if single == False:
        depression_performance, normal_performance ,accuracy = prediction_measures(predother,trueother)
        score_dict['depression1precision'] = depression_performance['precision']
        score_dict['depression1recall'] = depression_performance['recall']
        score_dict['depression1f1'] = depression_performance['f1-score']
        score_dict['depression1support'] = depression_performance['support']
        score_dict['normal1precision'] = normal_performance['precision']
        score_dict['normal1recall'] = normal_performance['recall']
        score_dict['normal1f1'] = normal_performance['f1-score']
        score_dict['normal1support'] = normal_performance['recall']
        score_dict['accuracy1'] = accuracy
        score_dict['statisticalParity01'] = statistical_parity(predsensitive,predother)
        score_dict['equalOpportunity01'] = equal_opportunity(predsensitive, truesensitive, predother, trueother)
        score_dict['equalisedOdds'] = equalised_odds(predsensitive, truesensitive, predother, trueother)
        score_dict['equalAccuracy'] = equal_accuracy(predsensitive, truesensitive, predother, trueother)
    return score_dict

In [6]:
df = pd.DataFrame()

df = pd.concat([df,pd.DataFrame(all_measures(pred_female,y_testf,pred_male,y_testm),index=[len(df)])])

df

Unnamed: 0,test,depression0precision,depression0recall,depression0f1,depression0support,normal0precision,normal0recall,normal0f1,normal0support,accuracy0,...,depression1support,normal1precision,normal1recall,normal1f1,normal1support,accuracy1,statisticalParity01,equalOpportunity01,equalisedOdds,equalAccuracy
0,test,0.526316,0.681818,0.594059,44.0,0.72,0.571429,0.637168,63.0,0.616822,...,19.0,0.851852,0.657143,0.741935,0.657143,0.703704,1.168716,1.151515,1.063613,0.876537
