In [23]:
import pandas as pd
import numpy as np

from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

In [24]:
# Performance measures

def prediction_measures(y_test,y_pred):
    performance = classification_report(y_test,y_pred,output_dict=True)
    return performance['depression'],performance['normal'],performance['accuracy']

# # Support measures

# def percentage_depression(pred):
#     return np.sum(pred == 'depression') / len(pred)

# def true_positives(pred, true):
#     return np.sum((pred == 'depression') & (true == "depression"))

# def true_negatives(pred, true):
#     return np.sum((pred == 'normal') & (true == "normal"))

# def true_positive_rate(pred, true):
#     return true_positives(pred,true)/(true_positives(pred,true)+true_negatives(pred,true))

# def true_negative_rate(pred, true):
#     return true_negatives(pred,true)/(true_positives(pred,true)+true_negatives(pred,true))

# def calculate_rates(pred, true):
#     tp = np.sum((pred == 'depression') & (true == 'depression'))
#     tn = np.sum((pred == 'normal') & (true == 'normal'))
#     fn = np.sum((pred == 'normal') & (true == 'depression'))
#     fp = np.sum((pred == 'depression') & (true == 'normal'))
#     tpr = tp / (tp + fn) if (tp + fn) != 0 else 0  # True Positive Rate
#     tnr = tn / (tn + fp) if (tn + fp) != 0 else 0  # True Negative Rate
#     return tpr, tnr, tp, tn, fp, fn

# # Complete measures

# def statistical_parity(predsensitive,predother):
#     return percentage_depression(predsensitive)/percentage_depression(predother)

# def equal_opportunity(predsensitive, truesensitive, predother, trueother):
#     return true_positive_rate(predsensitive,truesensitive)/true_positive_rate(predother,trueother)

# def equalised_odds(predsensitive, truesensitive, predother, trueother):
#     return (true_positive_rate(predsensitive,truesensitive)+true_negative_rate(predsensitive,truesensitive))/(true_positive_rate(predother,trueother)+true_negative_rate(predsensitive,truesensitive))

# def equal_accuracy(predsensitive, truesensitive, predother, trueother):
#     return accuracy_score(predsensitive,truesensitive)/accuracy_score(predother,trueother)



In [36]:
# All measurements

def all_measures(predsensitive, truesensitive, predother, trueother, name='test',single=False):
    score_dict = {}
    score_dict['predictor'] = name
    # Metrics for sensitive group
    depression_performance, normal_performance ,accuracy = prediction_measures(predsensitive,truesensitive)
    score_dict['depressionSensPrecision'] = depression_performance['precision']
    score_dict['depressionSensRecall'] = depression_performance['recall']
    score_dict['depressionSensF1'] = depression_performance['f1-score']
    score_dict['depressionSensSupport'] = depression_performance['support']
    score_dict['normalSensPrecision'] = normal_performance['precision']
    score_dict['normalSensRecall'] = normal_performance['recall']
    score_dict['normalSensF1'] = normal_performance['f1-score']
    score_dict['normalSensSupport'] = normal_performance['support']
    score_dict['accuracySens'] = accuracy
    if single == False:
        # Metrics for other group
        depression_performance, normal_performance ,accuracy = prediction_measures(predother,trueother)
        score_dict['depressionOtherPrecision'] = depression_performance['precision']
        score_dict['depressionOtherRecall'] = depression_performance['recall']
        score_dict['depressionOtherF1'] = depression_performance['f1-score']
        score_dict['depressionOtherSupport'] = depression_performance['support']
        score_dict['normalOtherPrecision'] = normal_performance['precision']
        score_dict['normalOtherRecall'] = normal_performance['recall']
        score_dict['normalOtherF1'] = normal_performance['f1-score']
        score_dict['normalOtherSupport'] = normal_performance['recall']
        score_dict['accuracyOther'] = accuracy
        # Metrics for total
        depression_performance, normal_performance ,accuracy = prediction_measures(pd.concat([predsensitive,predother]),pd.concat([truesensitive,trueother]))
        score_dict['depressionTotalPrecision'] = depression_performance['precision']
        score_dict['depressionTotalRecall'] = depression_performance['recall']
        score_dict['depressionTotalF1'] = depression_performance['f1-score']
        score_dict['depressionTotalSupport'] = depression_performance['support']
        score_dict['normalTotalPrecision'] = normal_performance['precision']
        score_dict['normalTotalRecall'] = normal_performance['recall']
        score_dict['normalTotalF1'] = normal_performance['f1-score']
        score_dict['normalTotalSupport'] = normal_performance['recall']
        score_dict['accuracyTotal'] = accuracy
        # Fairness metrics between groups
        TNS, FPS, FNS, TPS = confusion_matrix(truesensitive, predsensitive).ravel()
        TNO, FPO, FNO, TPO = confusion_matrix(trueother,predother).ravel()
        score_dict['statisticalParity'] = ((TPS+FPS)/len(truesensitive)) / ((TPO+FPO)/len(trueother))
        score_dict['equalOpportunity'] = (TPS / (TPS+FNS)) / (TPO / (TPO+FNO))
        score_dict['equalisedOdds'] =  ((TPS / (TPS+FNS)) + (TNS / (TNS+FPS))) / ((TPO / (TPO+FNO))+ (TNO) / (TNO+FPO))
        score_dict['equalAccuracy'] = (accuracy_score(truesensitive,predsensitive)/accuracy_score(trueother,predother))
    return score_dict

In [41]:


options = ['w2vec','ruleBased','DictionaryBased','TFIDF','blingfire','reweightedblingfire','nltk','reweightednltk','spacysm','reweightedspacysm','spacylg','reweightedspacylg','spacytrf','reweightedspacytrf']

for name in options:
    df = pd.read_json(f'data/predictionData/{name}Pred.json', lines=True, orient='records')

    pred_male = df.loc[(df['gender'] == 'm') & (df['split'] == 'test'), ['prediction']]
    pred_female = df.loc[(df['gender'] == 'f') & (df['split'] == 'test'), ['prediction']]
    y_testm = df.loc[(df['gender'] == 'm') & (df['split'] == 'test'), ['label']]
    y_testf = df.loc[(df['gender'] == 'f') & (df['split'] == 'test'), ['label']]

    score_dict = all_measures(pred_female,y_testf,pred_male,y_testm, name)

    try:
        df = pd.read_csv('data/results/testresults.csv')
    except:
        df = pd.DataFrame()

    df_new = pd.DataFrame(score_dict,index=[len(df)])

    df = pd.concat([df,df_new],ignore_index=True)

    df.to_csv('data/results/testresults.csv', index=False)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  score_dict['statisticalParity'] = ((TPS+FPS)/len(truesensitive)) / ((TPO+FPO)/len(trueother))
  score_dict['equalOpportunity'] = (TPS / (TPS+FNS)) / (TPO / (TPO+FNO))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _war