In [13]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import imblearn
from os import path
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
import sklearn.preprocessing
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
import src.lib.utility_classfier as uclf
import src.lib.optimal_threhold_related as thres
import src.lib.fairness_tests as fair

In [14]:
# Load the full cohort table. The path is specific to the author's machine.
data_path = '/Users/lifuchen/Desktop/research/data.csv'
df = pd.read_csv(data_path)
df.shape

(109490, 89)

In [15]:
# Labels come from the 'Class' column; features are every column except
# 'GRID' and the label itself.
y = df['Class'].values
X = df.drop(columns=['GRID', 'Class'])
X.shape

(109490, 87)

In [16]:
def save_prediction(classifier, characteristic, X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test, X_val_male_scaled, y_val_male, X_test_male_scaled, y_test_male, X_val_female_scaled, y_val_female, X_test_female_scaled, y_test_female):
    """Build a classifier through ``uclf`` and write its positive-class scores to CSV.

    Parameters
    ----------
    classifier : str
        Name of the attribute looked up on ``uclf``; it is called with the
        training and test data and must return an object exposing
        ``predict_proba`` (presumably a fitted estimator -- confirm in ``uclf``).
    characteristic : str
        Experiment tag. Its last character goes into the model file name and
        the whole tag goes into the prediction CSV name.
    X_*_scaled, y_* :
        Train / validation / test features and labels, overall and per group.
        Only the feature matrices, ``y_train`` and ``y_test`` are used here;
        the remaining label arguments are accepted but not read.

    Side effects
    ------------
    Writes ``<classifier><characteristic>prediction.csv`` with the columns
    ``val_score``, ``test_score``, ``val_1_score``, ``test_1_score``,
    ``val_2_score`` and ``test_2_score`` (``_1`` holds the male subset, ``_2``
    the female subset). Returns ``None``.
    """
    build_model = getattr(uclf, classifier)
    model_dir = '/Users/lifuchen/Desktop/Evaluating-and-Mitigating-Bias-in-ML-Models-for-CVD/Models/'
    model_name = str(classifier) + '_' + characteristic[-1] + '_model.sav'
    # dump_model=False: the file name is still handed to the helper even
    # though dumping is switched off.
    clf = build_model(X_train_scaled, y_train, X_test_scaled, y_test, dump_model=False, file_name=model_dir + model_name)

    # (output column, feature matrix) pairs; the order fixes both the scoring
    # order and the CSV column order.
    scored_inputs = (
        ('val_score', X_val_scaled),
        ('test_score', X_test_scaled),
        ('val_1_score', X_val_male_scaled),
        ('test_1_score', X_test_male_scaled),
        ('val_2_score', X_val_female_scaled),
        ('test_2_score', X_test_female_scaled),
    )
    scores_by_column = {
        column: clf.predict_proba(features)[:, 1]
        for column, features in scored_inputs
    }

    # The score vectors have different lengths, so the frame is built with one
    # row per vector and then transposed into columns.
    overall_prediction = pd.DataFrame.from_dict(scores_by_column, orient='index').transpose()

    csv_name = str(classifier) + str(characteristic) + "prediction.csv"
    csv_dir = '/Users/lifuchen/Desktop/research/predictions/'
    overall_prediction.to_csv(path.join(csv_dir, csv_name), index=False)

In [17]:
def get_result (classifier, characteristic, records, X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test, X_val_male_scaled, y_val_male, X_test_male_scaled, y_test_male, X_val_female_scaled, y_val_female, X_test_female_scaled, y_test_female):
    """Evaluate one classifier's stored predictions and append a metrics row to ``records``.

    Reads ``<classifier><characteristic>prediction.csv`` from the predictions
    directory, drops the NaN entries from each score column, and then, for the
    overall / male / female splits, picks a threshold on the validation scores
    via ``balance_accuracy`` and computes precision-style metrics on the test
    scores via ``thres.calculate_precision_metrics``. AUROC is computed on the
    overall test split only, and the two fairness values come from
    ``fair.get_EOD`` and ``fair.get_SP``.

    Only the label arrays (``y_*``) are used; the ``X_*_scaled`` arguments are
    accepted but not read. The function returns ``None``; its result is the
    dict appended to ``records``. Note that the ``fair.get_SP`` value is stored
    under the key ``'di'``.
    """
    csv_dir = '/Users/lifuchen/Desktop/research/predictions/'
    csv_name = str(classifier) + characteristic + "prediction.csv"
    prediction = pd.read_csv(path.join(csv_dir, csv_name))

    def stored_scores(column):
        # Columns shorter than the longest one contain NaN entries; keep only
        # the real scores.
        series = prediction[column]
        return series[series.notna()]

    val_all = stored_scores('val_score')
    test_all = stored_scores('test_score')
    val_m = stored_scores('val_1_score')
    test_m = stored_scores('test_1_score')
    val_f = stored_scores('val_2_score')
    test_f = stored_scores('test_2_score')

    # Overall split.
    thr_all, ba_val_all, ba_test_all = balance_accuracy(y_val, val_all, y_test, test_all)
    auroc = roc_auc_score(y_test, test_all)
    prec_all, rec_all, tpr_all, tnr_all, pd_all = thres.calculate_precision_metrics(y_test, test_all, thr_all)

    # Male split, with its own threshold.
    thr_m, ba_val_m, ba_test_m = balance_accuracy(y_val_male, val_m, y_test_male, test_m)
    prec_m, rec_m, tpr_m, tnr_m, pd_m = thres.calculate_precision_metrics(y_test_male, test_m, thr_m)

    # Female split, with its own threshold.
    thr_f, ba_val_f, ba_test_f = balance_accuracy(y_val_female, val_f, y_test_female, test_f)
    prec_f, rec_f, tpr_f, tnr_f, pd_f = thres.calculate_precision_metrics(y_test_female, test_f, thr_f)

    # Fairness values compare the two groups, each at its own threshold.
    group_args = (y_test_male, test_m, thr_m, y_test_female, test_f, thr_f)
    eod = fair.get_EOD(*group_args)
    sp = fair.get_SP(*group_args)

    row = {
        'auroc': auroc,
        'overall threshold': thr_all,
        'male threshold': thr_m,
        'female threshold': thr_f,
        'overall ba validation': ba_val_all,
        'overall ba test': ba_test_all,
        'male ba validation': ba_val_m,
        'male ba test': ba_test_m,
        'female ba validation': ba_val_f,
        'female ba test': ba_test_f,
        'overall precision': prec_all,
        'overall recall': rec_all,
        'overall tpr': tpr_all,
        'overall tnr': tnr_all,
        'overall pd': pd_all,
        'male precision': prec_m,
        'male recall': rec_m,
        'male tpr': tpr_m,
        'male tnr': tnr_m,
        'male pd': pd_m,
        'female precision': prec_f,
        'female recall': rec_f,
        'female tpr': tpr_f,
        'female tnr': tnr_f,
        'female pd': pd_f,
        'eod': eod,
        'di': sp,
    }
    records.append(row)

In [18]:
def balance_accuracy (y_val, y_val_score,y_test, y_test_score):
    """Choose a threshold on the validation split and score both splits with it.

    The threshold is the first element returned by
    ``thres.get_optimal_threshold_Jvalue(y_val, y_val_score)``; the second
    element is discarded. Balanced accuracy is then computed with
    ``thres.calculate_balanced_accuracy`` on the validation and test splits
    using that same threshold. Each value is printed as it is computed.

    Returns
    -------
    tuple
        ``(threshold, validation balanced accuracy, test balanced accuracy)``.
    """
    best_cutoff, _ignored = thres.get_optimal_threshold_Jvalue(y_val, y_val_score)
    print ("Optimal threshold by J value is ", best_cutoff)

    val_ba = thres.calculate_balanced_accuracy(y_val, y_val_score, best_cutoff)
    print ("Balanced accuracy score of val is ", val_ba)

    # The test split reuses the validation threshold.
    test_ba = thres.calculate_balanced_accuracy(y_test, y_test_score, best_cutoff)
    print ("Balanced accuracy score of test is ", test_ba)

    return best_cutoff, val_ba, test_ba

In [19]:
def fairness_metrics (X, y, attribute, random_state):
    """Run one resampled split and record metrics for every enabled classifier.

    Steps:
      1. Split ``X`` / ``y`` with
         ``fair.split_by_trait_balance_proportion_no_protected_trait`` into
         train, validation and test parts, plus per-group validation and test
         subsets (named male / female here).
      2. Print the row counts of each part.
      3. Fit a ``MaxAbsScaler`` on the training features only and apply it to
         every validation and test matrix.
      4. Call ``get_result`` for logistic regression, random forest and
         gradient boosting, appending to the module-level lists
         ``records_lr``, ``records_rf`` and ``records_gbt``. These lists must
         exist before this function is called.

    ``get_result`` reads prediction CSVs from disk, so the predictions must
    already have been produced, e.g. by calling ``save_prediction`` with the
    same classifier name, ``characteristic`` and split arguments. The decision
    tree classifier is currently disabled.
    """
    (X_train, y_train, X_val, y_val, X_test, y_test,
     X_val_female, X_val_male, y_val_female, y_val_male,
     X_test_female, X_test_male, y_test_female, y_test_male) = \
        fair.split_by_trait_balance_proportion_no_protected_trait(X, y, attribute, random_state)

    print("X train", X_train.shape[0])
    print("Y train", y_train.shape[0])
    # One line per part: overall, male and female row counts.
    size_report = (
        (X_val, X_val_male, X_val_female),
        (y_val, y_val_male, y_val_female),
        (X_test, X_test_male, X_test_female),
        (y_test, y_test_male, y_test_female),
    )
    for whole, male_part, female_part in size_report:
        print(whole.shape[0], male_part.shape[0], female_part.shape[0])

    # The scaler is fit on the training data only and reused for the other splits.
    max_abs_scaler = preprocessing.MaxAbsScaler()
    X_train_scaled = max_abs_scaler.fit_transform(X_train)
    (X_test_scaled, X_test_male_scaled, X_test_female_scaled,
     X_val_scaled, X_val_male_scaled, X_val_female_scaled) = (
        max_abs_scaler.transform(part)
        for part in (X_test, X_test_male, X_test_female, X_val, X_val_male, X_val_female)
    )

    characteristic = attribute + "resample-by-proportion" + str(random_state)

    # Positional arguments shared by save_prediction / get_result after the
    # classifier name, characteristic and (for get_result) the records list.
    split_args = (
        X_train_scaled, y_train,
        X_val_scaled, y_val,
        X_test_scaled, y_test,
        X_val_male_scaled, y_val_male,
        X_test_male_scaled, y_test_male,
        X_val_female_scaled, y_val_female,
        X_test_female_scaled, y_test_female,
    )

    get_result("logic_regression", characteristic, records_lr, *split_args)
    get_result("random_forest", characteristic, records_rf, *split_args)
    get_result("gradiant_boosting", characteristic, records_gbt, *split_args)

In [20]:
# Fresh per-classifier accumulators; fairness_metrics appends to these
# module-level lists. The decision tree classifier is disabled.
records_lr, records_rf, records_gbt = [], [], []

# One resampled split per seed.
for random_state in range(10):
    fairness_metrics(X, y, "GENDER", random_state)

# One row per seed for each classifier.
result_lr, result_rf, result_gbt = (
    pd.DataFrame(rows) for rows in (records_lr, records_rf, records_gbt)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train ['Class'] = y_train


(23364, 88)
(42330, 88)
0.12175917034760898 0.08538461538461538
0.12174358974358974
(67112, 87)
X train 67112
Y train 67112
21898 7782 14116
21898 7782 14116
21898 7707 14191
21898 7707 14191
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.40099999999999997
threshold:0.2, J-value:0.28700000000000003
threshold:0.30000000000000004, J-value:0.149
threshold:0.4, J-value:0.07300000000000001
threshold:0.5, J-value:0.027
threshold:0.6000000000000001, J-value:0.009999999999999998
threshold:0.7000000000000001, J-value:0.002
threshold:0.8, J-value:0.001
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.7006683988863698
Balanced accuracy score of test is  0.6898694663877558
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.38
threshold:0.2, J-value:0.246
threshold:0.30000000000000004, J-value:0.11099999999999999
threshold:0.4, J-value:0.044
threshold:0.5, J-value:0.019
threshold:0.6000000000000001, J-value:0.005
threshold:0.7000000000000001

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train ['Class'] = y_train


(23381, 88)
(42313, 88)
0.12489776280971855 0.0869274833671556
0.12489403786380333
(67172, 87)
X train 67172
Y train 67172
21898 7665 14233
21898 7665 14233
21898 7807 14091
21898 7807 14091
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.388
threshold:0.2, J-value:0.29400000000000004
threshold:0.30000000000000004, J-value:0.172
threshold:0.4, J-value:0.075
threshold:0.5, J-value:0.026
threshold:0.6000000000000001, J-value:0.009999999999999998
threshold:0.7000000000000001, J-value:0.002
threshold:0.8, J-value:0.001
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6936192970864503
Balanced accuracy score of test is  0.6944109083974054
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.384
threshold:0.2, J-value:0.298
threshold:0.30000000000000004, J-value:0.175
threshold:0.4, J-value:0.064
threshold:0.5, J-value:0.02
threshold:0.6000000000000001, J-value:0.005
threshold:0.7000000000000001, J-value:0.001
threshold:0.8, J-value:0.00

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train ['Class'] = y_train


(23370, 88)
(42324, 88)
0.12097083653108212 0.0856483262793382
0.12096960369372836
(67071, 87)
X train 67071
Y train 67071
21898 7743 14155
21898 7743 14155
21898 7740 14158
21898 7740 14158
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.37999999999999995
threshold:0.2, J-value:0.271
threshold:0.30000000000000004, J-value:0.141
threshold:0.4, J-value:0.063
threshold:0.5, J-value:0.024
threshold:0.6000000000000001, J-value:0.008
threshold:0.7000000000000001, J-value:0.002
threshold:0.8, J-value:0.001
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6901397487558733
Balanced accuracy score of test is  0.7027920406337371
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.378
threshold:0.2, J-value:0.254
threshold:0.30000000000000004, J-value:0.11199999999999999
threshold:0.4, J-value:0.049999999999999996
threshold:0.5, J-value:0.019
threshold:0.6000000000000001, J-value:0.006
threshold:0.7000000000000001, J-value:0.002
threshold:0.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train ['Class'] = y_train


(23345, 88)
(42349, 88)
0.12230181241286477 0.08478700786393094
0.1222879684418146
(67158, 87)
X train 67158
Y train 67158
21898 7751 14147
21898 7751 14147
21898 7757 14141
21898 7757 14141
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.39799999999999996
threshold:0.2, J-value:0.28600000000000003
threshold:0.30000000000000004, J-value:0.16
threshold:0.4, J-value:0.075
threshold:0.5, J-value:0.029
threshold:0.6000000000000001, J-value:0.013000000000000001
threshold:0.7000000000000001, J-value:0.001
threshold:0.8, J-value:0.001
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6991617722932525
Balanced accuracy score of test is  0.6979454399401182
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.378
threshold:0.2, J-value:0.269
threshold:0.30000000000000004, J-value:0.142
threshold:0.4, J-value:0.05600000000000001
threshold:0.5, J-value:0.021
threshold:0.6000000000000001, J-value:0.008
threshold:0.7000000000000001, J-value:0.002

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train ['Class'] = y_train


(23252, 88)
(42442, 88)
0.12775244931613153 0.08639004786648578
0.12772928558630045
(67309, 87)
X train 67309
Y train 67309
21898 7842 14056
21898 7842 14056
21898 7759 14139
21898 7759 14139
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.399
threshold:0.2, J-value:0.296
threshold:0.30000000000000004, J-value:0.157
threshold:0.4, J-value:0.07200000000000001
threshold:0.5, J-value:0.028
threshold:0.6000000000000001, J-value:0.008
threshold:0.7000000000000001, J-value:0.002
threshold:0.8, J-value:0.0
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6996271625911803
Balanced accuracy score of test is  0.6982269438462144
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.366
threshold:0.2, J-value:0.279
threshold:0.30000000000000004, J-value:0.146
threshold:0.4, J-value:0.061
threshold:0.5, J-value:0.023
threshold:0.6000000000000001, J-value:0.008
threshold:0.7000000000000001, J-value:0.0
threshold:0.8, J-value:0.0
threshold:0.9, J-

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train ['Class'] = y_train


(23261, 88)
(42433, 88)
0.12388268831231579 0.08438322557563058
0.12386598860238686
(67239, 87)
X train 67239
Y train 67239
21898 7814 14084
21898 7814 14084
21898 7778 14120
21898 7778 14120
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.39099999999999996
threshold:0.2, J-value:0.275
threshold:0.30000000000000004, J-value:0.145
threshold:0.4, J-value:0.061
threshold:0.5, J-value:0.024999999999999998
threshold:0.6000000000000001, J-value:0.011
threshold:0.7000000000000001, J-value:0.006
threshold:0.8, J-value:0.002
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6953192208599531
Balanced accuracy score of test is  0.6967844861038579
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.387
threshold:0.2, J-value:0.263
threshold:0.30000000000000004, J-value:0.131
threshold:0.4, J-value:0.045
threshold:0.5, J-value:0.015
threshold:0.6000000000000001, J-value:0.009000000000000001
threshold:0.7000000000000001, J-value:0.005
threshold:

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train ['Class'] = y_train


(23328, 88)
(42366, 88)
0.12332065295902152 0.08630769230769231
0.12330769230769231
(67137, 87)
X train 67137
Y train 67137
21898 7644 14254
21898 7644 14254
21898 7881 14017
21898 7881 14017
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.40399999999999997
threshold:0.2, J-value:0.29400000000000004
threshold:0.30000000000000004, J-value:0.152
threshold:0.4, J-value:0.063
threshold:0.5, J-value:0.026000000000000002
threshold:0.6000000000000001, J-value:0.009999999999999998
threshold:0.7000000000000001, J-value:0.003
threshold:0.8, J-value:0.001
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.7015540647898044
Balanced accuracy score of test is  0.6927785922855503
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.364
threshold:0.2, J-value:0.255
threshold:0.30000000000000004, J-value:0.11499999999999999
threshold:0.4, J-value:0.046
threshold:0.5, J-value:0.009000000000000001
threshold:0.6000000000000001, J-value:0.004
threshold:0

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train ['Class'] = y_train


(23296, 88)
(42398, 88)
0.12005384874272802 0.085263777612819
0.1200501702203906
(67053, 87)
X train 67053
Y train 67053
21898 7795 14103
21898 7795 14103
21898 7762 14136
21898 7762 14136
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.40199999999999997
threshold:0.2, J-value:0.29600000000000004
threshold:0.30000000000000004, J-value:0.163
threshold:0.4, J-value:0.07
threshold:0.5, J-value:0.024
threshold:0.6000000000000001, J-value:0.005
threshold:0.7000000000000001, J-value:0.003
threshold:0.8, J-value:0.0
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.7005935370190619
Balanced accuracy score of test is  0.6980918831582428
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.381
threshold:0.2, J-value:0.269
threshold:0.30000000000000004, J-value:0.131
threshold:0.4, J-value:0.047
threshold:0.5, J-value:0.013000000000000001
threshold:0.6000000000000001, J-value:0.001
threshold:0.7000000000000001, J-value:0.002
threshold:0.8, J-

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train ['Class'] = y_train


(23200, 88)
(42494, 88)
0.12305160228482913 0.08621967741110913
0.12302855243986606
(67134, 87)
X train 67134
Y train 67134
21898 7804 14094
21898 7804 14094
21898 7849 14049
21898 7849 14049
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.39199999999999996
threshold:0.2, J-value:0.275
threshold:0.30000000000000004, J-value:0.149
threshold:0.4, J-value:0.07
threshold:0.5, J-value:0.032
threshold:0.6000000000000001, J-value:0.009000000000000001
threshold:0.7000000000000001, J-value:0.003
threshold:0.8, J-value:0.001
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6959446542018872
Balanced accuracy score of test is  0.6957373678323
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.367
threshold:0.2, J-value:0.261
threshold:0.30000000000000004, J-value:0.139
threshold:0.4, J-value:0.063
threshold:0.5, J-value:0.030000000000000002
threshold:0.6000000000000001, J-value:0.008
threshold:0.7000000000000001, J-value:0.002
threshold:0.8,

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train ['Class'] = y_train


(23317, 88)
(42377, 88)
0.12501206214416674 0.08378302345208563
0.12500959054755634
(67306, 87)
X train 67306
Y train 67306
21898 7696 14202
21898 7696 14202
21898 7840 14058
21898 7840 14058
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.39999999999999997
threshold:0.2, J-value:0.29500000000000004
threshold:0.30000000000000004, J-value:0.16699999999999998
threshold:0.4, J-value:0.07400000000000001
threshold:0.5, J-value:0.029
threshold:0.6000000000000001, J-value:0.009000000000000001
threshold:0.7000000000000001, J-value:0.003
threshold:0.8, J-value:0.001
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.7004070820282706
Balanced accuracy score of test is  0.7035261250377529
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.393
threshold:0.2, J-value:0.276
threshold:0.30000000000000004, J-value:0.147
threshold:0.4, J-value:0.069
threshold:0.5, J-value:0.018000000000000002
threshold:0.6000000000000001, J-value:0.007
threshold:0.

In [21]:
def add_mean_sd(records, result_table, overall_records, type):
    """Append mean and standard-deviation rows and a compact summary row.

    Parameters
    ----------
    records : list of dict
        Per-run metric rows. Two dicts are appended IN PLACE: the column-wise
        means of ``result_table`` followed by the column-wise standard
        deviations.
    result_table : pandas.DataFrame
        Table whose columns are averaged; it must contain every metric column
        listed below.
    overall_records : list of dict
        Cross-classifier summary list. One dict is appended IN PLACE,
        containing ``type`` plus selected means and standard deviations.
    type : str
        Label stored under the ``'type'`` key of the summary row.

    Returns
    -------
    (pandas.DataFrame, list)
        A frame built from the extended ``records`` and the same
        ``overall_records`` list that was passed in.
    """
    # Column order matters: it fixes the key order of the appended rows.
    metric_columns = (
        'auroc',
        'overall threshold', 'male threshold', 'female threshold',
        'overall ba validation', 'overall ba test',
        'male ba validation', 'male ba test',
        'female ba validation', 'female ba test',
        'overall precision', 'overall recall', 'overall tpr', 'overall tnr', 'overall pd',
        'male precision', 'male recall', 'male tpr', 'male tnr', 'male pd',
        'female precision', 'female recall', 'female tpr', 'female tnr', 'female pd',
        'eod', 'di',
    )

    def mean_of(column):
        return result_table[column].mean()

    def std_of(column):
        return result_table[column].std()

    records.append({column: mean_of(column) for column in metric_columns})
    records.append({column: std_of(column) for column in metric_columns})

    # Compact cross-classifier summary: means for the headline metrics, with
    # standard deviations only for auroc, overall ba test, eod and di.
    overall_records.append({
        'type': type,
        'auroc': mean_of('auroc'),
        'auroc_std': std_of('auroc'),
        'overall threshold': mean_of('overall threshold'),
        'male threshold': mean_of('male threshold'),
        'female threshold': mean_of('female threshold'),
        'overall ba test': mean_of('overall ba test'),
        'ba_std': std_of('overall ba test'),
        'male ba test': mean_of('male ba test'),
        'female ba test': mean_of('female ba test'),
        'overall tpr': mean_of('overall tpr'),
        'overall pd': mean_of('overall pd'),
        'male tpr': mean_of('male tpr'),
        'male pd': mean_of('male pd'),
        'female tpr': mean_of('female tpr'),
        'female pd': mean_of('female pd'),
        'eod': mean_of('eod'),
        'eod_std': std_of('eod'),
        'di': mean_of('di'),
        'di_std': std_of('di'),
    })

    return pd.DataFrame(records), overall_records

In [22]:
# Summarise the per-seed metrics of each classifier and export them.
#
# add_mean_sd appends its mean/std rows to the list it is given, so each call
# receives a COPY of the per-seed records and a frame rebuilt from those
# records. That keeps this cell idempotent: re-running it without re-running
# the experiment loop no longer averages over previously appended mean/std
# rows. The decision tree classifier is disabled.
overall_table = []
result_lr, overall_records = add_mean_sd(list(records_lr), pd.DataFrame(records_lr), overall_table, 'lr')
result_rf, overall_records = add_mean_sd(list(records_rf), pd.DataFrame(records_rf), overall_records, 'rf')
result_gbt, overall_records = add_mean_sd(list(records_gbt), pd.DataFrame(records_gbt), overall_records, 'gbt')

# Per-classifier tables: one row per seed, then a mean row and a std row.
result_path = '/Users/lifuchen/Desktop/research/resample_data/'
result_lr.to_csv(path.join(result_path, 'gender-lr-resample-proportion-result.csv'), index=False)
result_rf.to_csv(path.join(result_path, 'gender-rf-resample-proportion-result.csv'), index=False)
result_gbt.to_csv(path.join(result_path, 'gender-gbt-resample-proportion-result.csv'), index=False)

# Cross-classifier summary. overall_records is the same list object as
# overall_table, so it already holds one row per classifier.
overall_result = pd.DataFrame(overall_table)
result_path = '/Users/lifuchen/Desktop/research/resample_result/'
overall_result.to_csv(path.join(result_path, 'gender-resample-proportion.csv'), index=False)