In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
from os import path
import imblearn
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
import sklearn.preprocessing
from sklearn import preprocessing
from sklearn.preprocessing import Normalizer
import src.lib.utility_classfier as uclf
import src.lib.optimal_threhold_related as thres
import src.lib.fairness_tests as fair

In [30]:
# Load the raw dataset.
# The location is expressed relative to the current user's home directory
# instead of spelling out one specific account name, so the notebook is not
# tied to a single machine login. For the original author the expanded path
# is unchanged.
data_path = path.expanduser('~/Desktop/research/data.csv')
df = pd.read_csv(data_path)

In [None]:
# Outcome label and feature matrix taken straight from the raw frame.
# NOTE(review): this cell has no execution count, and the following cell
# rebinds both `X` and `y`, so nothing downstream appears to use these values.
X = df.drop(columns=['GRID', 'Class'])
y = df['Class'].values
X.shape

In [41]:
# Oversample so that both values of Race_W are equally represented.
# RandomOverSampler / SMOTE are already imported in the first cell, so the
# duplicate import that used to live here is not needed.
#
# NOTE(review): the resampling target here is the sensitive attribute
# (Race_W), and the outcome column 'Class' is still part of X, so it is
# resampled as an ordinary feature. Confirm that synthetic rows keep a valid
# binary 'Class' value.
# NOTE(review): resampling happens before any train/validation/test split, so
# synthetic rows can end up in the evaluation splits - confirm this is intended.
y = df.Race_W
X = df.drop(['GRID','Race_W'], axis=1)

# A fixed random_state makes the synthetic samples (and therefore every
# downstream metric) reproducible across kernel restarts.
X_over, y_over = SMOTE(random_state=0).fit_resample(X, y)

# Sanity check: both groups should now have the same number of rows.
y_white = y_over[y_over == 1]
y_black = y_over[y_over == 0]
print (y_white.shape)
print (y_black.shape)

(94506,)
(94506,)


In [44]:
# Put the resampled Race_W values back next to the resampled features so the
# sensitive attribute is available as a column again.
X_over = X_over.assign(Race_W=y_over)
X_over.shape

(189012, 88)

In [45]:
# Final modelling inputs: every resampled column except the outcome is a
# feature (Race_W included); the outcome 'Class' becomes the target array.
X = X_over.drop(columns=['Class'])
y = X_over['Class'].values
X.shape

(189012, 87)

In [49]:
def get_result(classifier, records,
               X_train_scaled, y_train,
               X_val_scaled, y_val, X_test_scaled, y_test,
               X_val_white_scaled, y_val_white, X_test_white_scaled, y_test_white,
               X_val_black_scaled, y_val_black, X_test_black_scaled, y_test_black):
    """Score one classifier overall and per race group, then log the metrics.

    `classifier` is the name of a function in `uclf`; it is looked up with
    `getattr` and called as ``fn(X_train, y_train, X_eval, y_eval)``. Its
    return value is used as the score vector for the evaluation set.
    NOTE(review): the scoring function is called six times with the same
    training data - presumably it refits the model on every call; confirm,
    since validation and test scores would then come from separate fits.

    For each of the three populations (overall, white, black) the validation
    scores are passed to `balance_accuracy` to pick a threshold and compute
    balanced accuracy on validation and test. AUROC is computed on the overall
    validation scores only. The fairness figures come from `fair.get_EOD` and
    `fair.get_SP`, using the per-group test scores and per-group thresholds.

    One dict of results is appended to `records`; nothing is returned.
    NOTE(review): the value stored under the key 'di' is the output of
    `fair.get_SP` - confirm the label matches the metric.
    """
    score_fn = getattr(uclf, classifier)

    def _score_and_threshold(X_val_part, y_val_part, X_test_part, y_test_part):
        # Same call sequence for every population: validation scores, test
        # scores, then threshold selection + balanced accuracy.
        val_scores = score_fn(X_train_scaled, y_train, X_val_part, y_val_part)
        test_scores = score_fn(X_train_scaled, y_train, X_test_part, y_test_part)
        cutoff, ba_on_val, ba_on_test = balance_accuracy(
            y_val_part, val_scores, y_test_part, test_scores)
        return val_scores, test_scores, cutoff, ba_on_val, ba_on_test

    # Overall population.
    overall_val_scores, _, threshold, ba_val, ba_test = _score_and_threshold(
        X_val_scaled, y_val, X_test_scaled, y_test)
    auroc = roc_auc_score(y_val, overall_val_scores)

    # White subgroup.
    _, white_test_scores, threshold_white, ba_val_white, ba_test_white = _score_and_threshold(
        X_val_white_scaled, y_val_white, X_test_white_scaled, y_test_white)

    # Black subgroup.
    _, black_test_scores, threshold_black, ba_val_black, ba_test_black = _score_and_threshold(
        X_val_black_scaled, y_val_black, X_test_black_scaled, y_test_black)

    # Both fairness helpers take the same six arguments.
    fairness_args = (
        y_test_white, white_test_scores, threshold_white,
        y_test_black, black_test_scores, threshold_black,
    )
    eod = fair.get_EOD(*fairness_args)
    sp = fair.get_SP(*fairness_args)

    row = {
        'auroc': auroc,
        'overall threshold': threshold,
        'overall ba validation': ba_val,
        'overall ba test': ba_test,
        'white threshold': threshold_white,
        'white ba validation': ba_val_white,
        'white ba test': ba_test_white,
        'black threshold': threshold_black,
        'black ba validation': ba_val_black,
        'black ba test': ba_test_black,
        'eod': eod,
        'di': sp,
    }
    records.append(row)

In [50]:
def balance_accuracy(y_val, y_val_score, y_test, y_test_score):
    """Choose a threshold on validation data and report balanced accuracy.

    The threshold is the first element returned by
    `thres.get_optimal_threshold_Jvalue(y_val, y_val_score)`. That single
    threshold is then passed to `thres.calculate_balanced_accuracy` for the
    validation pair and for the test pair. Each value is printed as it is
    computed.

    Returns:
        (threshold, ba_val, ba_test)
    """
    chosen_threshold, _ignored = thres.get_optimal_threshold_Jvalue(y_val, y_val_score)
    print("Optimal threshold by J value is ", chosen_threshold)

    # Validation first, then test - the same threshold is used for both.
    splits = (
        ("Balanced accuracy score of val is ", y_val, y_val_score),
        ("Balanced accuracy score of test is ", y_test, y_test_score),
    )
    accuracies = []
    for message, labels, scores in splits:
        accuracy = thres.calculate_balanced_accuracy(labels, scores, chosen_threshold)
        print(message, accuracy)
        accuracies.append(accuracy)

    return chosen_threshold, accuracies[0], accuracies[1]

In [51]:
def fairness_metrics (X, y, attribute, random_state):
    """Split the data, scale it, and evaluate every classifier once.

    The split is delegated to `fair.split_by_trait(X, y, attribute,
    random_state)`, which returns train / validation / test sets plus the
    validation and test sets broken down into the "white" and "black"
    subgroups. Row counts of every piece are printed as a sanity check.

    A `MaxAbsScaler` is fitted on the training features only and then applied
    to every validation and test matrix.

    Results are not returned: each classifier's metrics are appended (via
    `get_result`) to the module-level lists `records_lr`, `records_rf`,
    `records_dt` and `records_gbt`, which must exist before this function is
    called.

    Args:
        X: feature table passed through to `fair.split_by_trait`.
        y: target values passed through to `fair.split_by_trait`.
        attribute: name of the sensitive attribute used for the subgroup split.
        random_state: seed forwarded to `fair.split_by_trait`.
    """
    (X_train, y_train, X_val, y_val, X_test, y_test,
     X_val_white, X_val_black, y_val_white, y_val_black,
     X_test_white, X_test_black, y_test_white, y_test_black) \
        = fair.split_by_trait(X, y, attribute, random_state)

    print("X train", X_train.shape[0])
    print("Y train", y_train.shape[0])
    print(X_val.shape[0], X_val_white.shape[0], X_val_black.shape[0])
    print(y_val.shape[0], y_val_white.shape[0], y_val_black.shape[0])
    print(X_test.shape[0], X_test_white.shape[0], X_test_black.shape[0])
    print(y_test.shape[0], y_test_white.shape[0], y_test_black.shape[0])

    # Fit the scaler on the training split only; every other split is
    # transformed with the statistics learned from training data.
    max_abs_scaler = preprocessing.MaxAbsScaler()
    X_train_scaled = max_abs_scaler.fit_transform(X_train)
    X_test_scaled = max_abs_scaler.transform(X_test)
    X_test_white_scaled = max_abs_scaler.transform(X_test_white)
    X_test_black_scaled = max_abs_scaler.transform(X_test_black)
    X_val_scaled = max_abs_scaler.transform(X_val)
    X_val_white_scaled = max_abs_scaler.transform(X_val_white)
    X_val_black_scaled = max_abs_scaler.transform(X_val_black)

    # (name of the function in `uclf`, list that collects its result rows).
    # The names must match the attribute names in `uclf` exactly.
    classifier_runs = (
        ("logic_regression", records_lr),
        ("random_forest", records_rf),
        ("decision_tree", records_dt),
        ("gradiant_boosting", records_gbt),
    )
    for classifier_name, classifier_records in classifier_runs:
        get_result(
            classifier_name, classifier_records,
            X_train_scaled, y_train,
            X_val_scaled, y_val, X_test_scaled, y_test,
            X_val_white_scaled, y_val_white, X_test_white_scaled, y_test_white,
            X_val_black_scaled, y_val_black, X_test_black_scaled, y_test_black,
        )

In [54]:
# Start from empty accumulators so re-running this cell does not stack
# results on top of a previous run. `fairness_metrics` appends to these
# module-level lists.
records_lr, records_rf, records_dt, records_gbt = [], [], [], []

# One full split / scale / evaluate pass per seed.
for random_state in range(4):
    fairness_metrics(X, y, "Race_W", random_state)

# One results table per classifier, one row per seed.
result_lr, result_rf, result_dt, result_gbt = (
    pd.DataFrame(rows)
    for rows in (records_lr, records_rf, records_dt, records_gbt)
)

X train 113406
Y train 113406
37803 18833 18970
37803 18833 18970
37803 18797 19006
37803 18797 19006


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17643428857667923
0.1741411748734776
Classification report
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     35681
           1       0.45      0.04      0.07      2122

    accuracy                           0.94     37803
   macro avg       0.70      0.52      0.52     37803
weighted avg       0.92      0.94      0.92     37803

Confusion_matrix
[[35577   104]
 [ 2036    86]]
done in 0.835639s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17643428857667923
0.1793246229826585
Classification report
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     35616
           1       0.41      0.04      0.07      2187

    accuracy                           0.94     37803
   macro avg       0.68      0.52      0.52     37803
weighted avg       0.91      0.94      0.92     37803

Confusion_matrix
[[35500   116]
 [ 2107    80]]
done in 0.808748s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.43699999999999994
threshold:0.2, J-value:0.271
threshold:0.30000000000000004, J-value:0.155
threshold:0.4, J-value:0.087
threshold:0.5, J-value:0.038
threshold:0.6000000000000001, J-value:0.016999999999999998
threshold:0.7000000000000001, J-value:0.005
threshold:0.8, J-value:0.002
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.7185032765334654
Balanced accuracy score of test is  0.7185486255156357


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17643428857667923
0.25883679056606895
Classification report
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     17145
           1       0.45      0.05      0.09      1688

    accuracy                           0.91     18833
   macro avg       0.68      0.52      0.52     18833
weighted avg       0.87      0.91      0.87     18833

Confusion_matrix
[[17042   103]
 [ 1605    83]]
done in 0.852995s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17643428857667923
0.2632369956293377
Classification report
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     17089
           1       0.40      0.04      0.08      1708

    accuracy                           0.91     18797
   macro avg       0.66      0.52      0.52     18797
weighted avg       0.87      0.91      0.87     18797

Confusion_matrix
[[16976   113]
 [ 1632    76]]
done in 0.822278s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.405
threshold:0.2, J-value:0.28200000000000003
threshold:0.30000000000000004, J-value:0.16999999999999998
threshold:0.4, J-value:0.098
threshold:0.5, J-value:0.043000000000000003
threshold:0.6000000000000001, J-value:0.020999999999999998
threshold:0.7000000000000001, J-value:0.005
threshold:0.8, J-value:0.003
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.7023468803168956
Balanced accuracy score of test is  0.7018225496138619


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17643428857667923
0.09005722493470199
Classification report
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     18536
           1       0.75      0.01      0.01       434

    accuracy                           0.98     18970
   macro avg       0.86      0.50      0.50     18970
weighted avg       0.97      0.98      0.97     18970

Confusion_matrix
[[18535     1]
 [  431     3]]
done in 1.054295s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17643428857667923
0.09633499504202764
Classification report
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     18527
           1       0.57      0.01      0.02       479

    accuracy                           0.97     19006
   macro avg       0.77      0.50      0.50     19006
weighted avg       0.96      0.97      0.96     19006

Confusion_matrix
[[18524     3]
 [  475     4]]
done in 0.830326s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.20900000000000002
threshold:0.2, J-value:0.096
threshold:0.30000000000000004, J-value:0.045
threshold:0.4, J-value:0.022
threshold:0.5, J-value:0.007
threshold:0.6000000000000001, J-value:0.002
threshold:0.7000000000000001, J-value:0.002
threshold:0.8, J-value:0.0
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.604849151433305
Balanced accuracy score of test is  0.6200359504657931
True positive rate of class 1 is  0.655
True positive ra

0.18144981765295143
0.1011845210001104
Classification report
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     18527
           1       0.25      0.00      0.01       479

    accuracy                           0.97     19006
   macro avg       0.61      0.50      0.50     19006
weighted avg       0.96      0.97      0.96     19006

Confusion_matrix
[[18521     6]
 [  477     2]]
done in 1.351923s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.33
threshold:0.2, J-value:0.14600000000000002
threshold:0.30000000000000004, J-value:0.063
threshold:0.4, J-value:0.007
threshold:0.5, J-value:0.007
threshold:0.6000000000000001, J-value:0.0
threshold:0.7000000000000001, J-value:0.0
threshold:0.8, J-value:0.0
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6650624068943434
Balanced accuracy score of test is  0.6757363540859456
True positive rate of class 1 is  0.735
True positive rate of

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17442492876414686
0.17885625765042965
Classification report
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     35603
           1       0.42      0.04      0.07      2200

    accuracy                           0.94     37803
   macro avg       0.68      0.52      0.52     37803
weighted avg       0.91      0.94      0.92     37803

Confusion_matrix
[[35493   110]
 [ 2121    79]]
done in 0.824977s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17442492876414686
0.1801415718152631
Classification report
              precision    recall  f1-score   support

           0       0.94      1.00      0.97     35594
           1       0.45      0.04      0.07      2209

    accuracy                           0.94     37803
   macro avg       0.70      0.52      0.52     37803
weighted avg       0.91      0.94      0.92     37803

Confusion_matrix
[[35489   105]
 [ 2122    87]]
done in 0.814686s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.43999999999999995
threshold:0.2, J-value:0.247
threshold:0.30000000000000004, J-value:0.137
threshold:0.4, J-value:0.06599999999999999
threshold:0.5, J-value:0.032999999999999995
threshold:0.6000000000000001, J-value:0.012
threshold:0.7000000000000001, J-value:0.005
threshold:0.8, J-value:0.001
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.7201201890545486
Balanced accuracy score of test is  0.713668559202187


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17442492876414686
0.2671928913486074
Classification report
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     17129
           1       0.41      0.04      0.08      1753

    accuracy                           0.91     18882
   macro avg       0.66      0.52      0.51     18882
weighted avg       0.86      0.91      0.87     18882

Confusion_matrix
[[17021   108]
 [ 1677    76]]
done in 0.782763s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17442492876414686
0.2625678063331771
Classification report
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     17258
           1       0.45      0.05      0.08      1732

    accuracy                           0.91     18990
   macro avg       0.68      0.52      0.52     18990
weighted avg       0.87      0.91      0.87     18990

Confusion_matrix
[[17157   101]
 [ 1651    81]]
done in 0.780674s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.403
threshold:0.2, J-value:0.254
threshold:0.30000000000000004, J-value:0.146
threshold:0.4, J-value:0.072
threshold:0.5, J-value:0.037
threshold:0.6000000000000001, J-value:0.014
threshold:0.7000000000000001, J-value:0.005
threshold:0.8, J-value:0.001
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.7017889018190445
Balanced accuracy score of test is  0.7009869506580875


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17442492876414686
0.09070170358410162
Classification report
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     18474
           1       0.60      0.01      0.01       447

    accuracy                           0.98     18921
   macro avg       0.79      0.50      0.50     18921
weighted avg       0.97      0.98      0.97     18921

Confusion_matrix
[[18472     2]
 [  444     3]]
done in 0.782595s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17442492876414686
0.09693983931671482
Classification report
              precision    recall  f1-score   support

           0       0.97      1.00      0.99     18336
           1       0.60      0.01      0.02       477

    accuracy                           0.97     18813
   macro avg       0.79      0.51      0.51     18813
weighted avg       0.97      0.97      0.96     18813

Confusion_matrix
[[18332     4]
 [  471     6]]
done in 0.780016s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.24700000000000003
threshold:0.2, J-value:0.097
threshold:0.30000000000000004, J-value:0.048999999999999995
threshold:0.4, J-value:0.020999999999999998
threshold:0.5, J-value:0.007
threshold:0.6000000000000001, J-value:0.0
threshold:0.7000000000000001, J-value:0.0
threshold:0.8, J-value:0.0
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6237269429265
Balanced accuracy score of test is  0.6095012252077228
True positive rate of class 1 is

  _warn_prf(average, modifier, msg_start, len(result))


0.17933456523301572
0.27652074118050224
Classification report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     17258
           1       0.25      0.00      0.00      1732

    accuracy                           0.91     18990
   macro avg       0.58      0.50      0.48     18990
weighted avg       0.85      0.91      0.87     18990

Confusion_matrix
[[17255     3]
 [ 1731     1]]
done in 1.346098s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.354
threshold:0.2, J-value:0.187
threshold:0.30000000000000004, J-value:0.11000000000000001
threshold:0.4, J-value:0.0
threshold:0.5, J-value:0.0
threshold:0.6000000000000001, J-value:0.0
threshold:0.7000000000000001, J-value:0.0
threshold:0.8, J-value:0.0
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6769897676225343
Balanced accuracy score of test is  0.6809103426144771
0.17933456523301572
0.09388085758428068
Classification report
 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17730718295858042
0.17686096987514907
Classification report
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     35659
           1       0.42      0.05      0.08      2144

    accuracy                           0.94     37803
   macro avg       0.68      0.52      0.53     37803
weighted avg       0.92      0.94      0.92     37803

Confusion_matrix
[[35524   135]
 [ 2047    97]]
done in 0.912558s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17730718295858042
0.17334019266520587
Classification report
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     35721
           1       0.43      0.05      0.09      2082

    accuracy                           0.94     37803
   macro avg       0.69      0.52      0.53     37803
weighted avg       0.92      0.94      0.92     37803

Confusion_matrix
[[35591   130]
 [ 1983    99]]
done in 0.884239s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.43199999999999994
threshold:0.2, J-value:0.261
threshold:0.30000000000000004, J-value:0.15100000000000002
threshold:0.4, J-value:0.083
threshold:0.5, J-value:0.040999999999999995
threshold:0.6000000000000001, J-value:0.015
threshold:0.7000000000000001, J-value:0.006
threshold:0.8, J-value:0.001
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.7161022389524656
Balanced accuracy score of test is  0.7141917866991437


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17730718295858042
0.2645900757326192
Classification report
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     17107
           1       0.41      0.05      0.10      1697

    accuracy                           0.91     18804
   macro avg       0.66      0.52      0.52     18804
weighted avg       0.87      0.91      0.87     18804

Confusion_matrix
[[16975   132]
 [ 1604    93]]
done in 0.827612s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17730718295858042
0.25999357490584357
Classification report
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     17205
           1       0.43      0.06      0.10      1668

    accuracy                           0.91     18873
   macro avg       0.68      0.53      0.53     18873
weighted avg       0.87      0.91      0.88     18873

Confusion_matrix
[[17080   125]
 [ 1572    96]]
done in 0.838952s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.383
threshold:0.2, J-value:0.269
threshold:0.30000000000000004, J-value:0.161
threshold:0.4, J-value:0.094
threshold:0.5, J-value:0.047
threshold:0.6000000000000001, J-value:0.017
threshold:0.7000000000000001, J-value:0.007
threshold:0.8, J-value:0.001
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6916814507902167
Balanced accuracy score of test is  0.6910341648215865


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17730718295858042
0.0900322890738507
Classification report
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     18552
           1       0.57      0.01      0.02       447

    accuracy                           0.98     18999
   macro avg       0.77      0.50      0.50     18999
weighted avg       0.97      0.98      0.97     18999

Confusion_matrix
[[18549     3]
 [  443     4]]
done in 0.896553s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17730718295858042
0.08694773186079195
Classification report
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     18516
           1       0.38      0.01      0.01       414

    accuracy                           0.98     18930
   macro avg       0.68      0.50      0.50     18930
weighted avg       0.97      0.98      0.97     18930

Confusion_matrix
[[18511     5]
 [  411     3]]
done in 0.902698s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.27299999999999996
threshold:0.2, J-value:0.09899999999999999
threshold:0.30000000000000004, J-value:0.057
threshold:0.4, J-value:0.018
threshold:0.5, J-value:0.009
threshold:0.6000000000000001, J-value:0.002
threshold:0.7000000000000001, J-value:0.0
threshold:0.8, J-value:0.0
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6364123262456914
Balanced accuracy score of test is  0.628801647458837
True positive rate of class 1 is  0.629
True

  _warn_prf(average, modifier, msg_start, len(result))


0.18225920825640143
0.0920400581507891
Classification report
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     18516
           1       0.00      0.00      0.00       414

    accuracy                           0.98     18930
   macro avg       0.49      0.50      0.49     18930
weighted avg       0.96      0.98      0.97     18930

Confusion_matrix
[[18514     2]
 [  414     0]]
done in 1.354095s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.252
threshold:0.2, J-value:0.055
threshold:0.30000000000000004, J-value:0.033
threshold:0.4, J-value:0.022
threshold:0.5, J-value:0.0
threshold:0.6000000000000001, J-value:0.0
threshold:0.7000000000000001, J-value:0.0
threshold:0.8, J-value:0.0
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6258063675907517
Balanced accuracy score of test is  0.6081203565424028
True positive rate of class 1 is  0.596
True positive rate of class 2 is  0.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.176576795853853
0.17465482486140174
Classification report
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     35674
           1       0.45      0.05      0.09      2129

    accuracy                           0.94     37803
   macro avg       0.70      0.52      0.53     37803
weighted avg       0.92      0.94      0.92     37803

Confusion_matrix
[[35552   122]
 [ 2028   101]]
done in 0.869481s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.176576795853853
0.1777837726366141
Classification report
              precision    recall  f1-score   support

           0       0.95      1.00      0.97     35653
           1       0.38      0.04      0.07      2150

    accuracy                           0.94     37803
   macro avg       0.66      0.52      0.52     37803
weighted avg       0.91      0.94      0.92     37803

Confusion_matrix
[[35519   134]
 [ 2067    83]]
done in 0.853035s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.44499999999999995
threshold:0.2, J-value:0.26
threshold:0.30000000000000004, J-value:0.16
threshold:0.4, J-value:0.08299999999999999
threshold:0.5, J-value:0.044
threshold:0.6000000000000001, J-value:0.016999999999999998
threshold:0.7000000000000001, J-value:0.005
threshold:0.8, J-value:0.002
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.7224187098171209
Balanced accuracy score of test is  0.7148967600495473


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.176576795853853
0.26297620814866535
Classification report
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     17196
           1       0.46      0.06      0.10      1712

    accuracy                           0.91     18908
   macro avg       0.68      0.53      0.53     18908
weighted avg       0.87      0.91      0.87     18908

Confusion_matrix
[[17079   117]
 [ 1614    98]]
done in 0.834358s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.176576795853853
0.2631313394183473
Classification report
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     17295
           1       0.38      0.05      0.08      1705

    accuracy                           0.91     19000
   macro avg       0.64      0.52      0.52     19000
weighted avg       0.87      0.91      0.87     19000

Confusion_matrix
[[17164   131]
 [ 1626    79]]
done in 0.868742s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.4
threshold:0.2, J-value:0.26
threshold:0.30000000000000004, J-value:0.174
threshold:0.4, J-value:0.091
threshold:0.5, J-value:0.05
threshold:0.6000000000000001, J-value:0.02
threshold:0.7000000000000001, J-value:0.005
threshold:0.8, J-value:0.002
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6998697534527699
Balanced accuracy score of test is  0.6963796259322657


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.176576795853853
0.0862726753405983
Classification report
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     18478
           1       0.38      0.01      0.01       417

    accuracy                           0.98     18895
   macro avg       0.68      0.50      0.50     18895
weighted avg       0.96      0.98      0.97     18895

Confusion_matrix
[[18473     5]
 [  414     3]]
done in 1.190236s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.176576795853853
0.09154201499937911
Classification report
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     18358
           1       0.57      0.01      0.02       445

    accuracy                           0.98     18803
   macro avg       0.77      0.50      0.50     18803
weighted avg       0.97      0.98      0.97     18803

Confusion_matrix
[[18355     3]
 [  441     4]]
done in 0.831437s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.27099999999999996
threshold:0.2, J-value:0.11699999999999999
threshold:0.30000000000000004, J-value:0.044
threshold:0.4, J-value:0.022
threshold:0.5, J-value:0.007
threshold:0.6000000000000001, J-value:0.0
threshold:0.7000000000000001, J-value:0.0
threshold:0.8, J-value:0.0
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6357196957013889
Balanced accuracy score of test is  0.6189523962244058
True positive rate of class 1 is  0.64
True pos

0.18166338563630635
0.09507682003239261
Classification report
              precision    recall  f1-score   support

           0       0.98      1.00      0.99     18358
           1       0.67      0.00      0.01       445

    accuracy                           0.98     18803
   macro avg       0.82      0.50      0.50     18803
weighted avg       0.97      0.98      0.96     18803

Confusion_matrix
[[18357     1]
 [  443     2]]
done in 1.321755s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.303
threshold:0.2, J-value:0.07400000000000001
threshold:0.30000000000000004, J-value:0.018
threshold:0.4, J-value:0.01
threshold:0.5, J-value:0.0
threshold:0.6000000000000001, J-value:0.0
threshold:0.7000000000000001, J-value:0.0
threshold:0.8, J-value:0.0
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6517061056209692
Balanced accuracy score of test is  0.6588039259129596
True positive rate of class 1 is  0.594
True positive rate of 

In [55]:
def add_mean_sd(records, result_table):
    """Build a results table that ends with a "mean (+/- std)" summary row.

    Parameters
    ----------
    records : iterable of dict
        Per-run metric records. The iterable is NOT modified: the summary
        row is added to a copy, so calling this function again with the
        same list does not stack extra summary rows onto it.
    result_table : pandas.DataFrame
        Table holding the numeric metric columns listed in
        ``metric_columns`` below; the mean and standard deviation of each
        column are taken from this table.

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``records`` followed by a final row whose
        cells are strings formatted as ``'%.4f (+/- %.4f)'`` (mean, std).

    Raises
    ------
    KeyError
        If ``result_table`` lacks one of the metric columns.
    """
    # Columns summarised, in the order they appear in the summary row.
    metric_columns = (
        'auroc',
        'overall ba validation',
        'overall ba test',
        'white ba validation',
        'white ba test',
        'black ba validation',
        'black ba test',
        'eod',
        'di',
    )
    summary_row = {
        column: '%.4f (+/- %.4f)' % (result_table[column].mean(), result_table[column].std())
        for column in metric_columns
    }
    # Concatenate into a new list instead of records.append(...), which
    # would leak the summary row into the caller's list.
    return pd.DataFrame(list(records) + [summary_row])

In [56]:
# Replace each classifier's per-run results table with the table returned by
# add_mean_sd, whose last row is the "mean (+/- std)" summary.
# NOTE(review): the result_* inputs are overwritten here, so this cell is
# presumably not safe to run twice without regenerating them — confirm.
result_lr = add_mean_sd(records_lr, result_lr)
result_rf = add_mean_sd(records_rf, result_rf)
result_dt = add_mean_sd(records_dt, result_dt)
result_gbt = add_mean_sd(records_gbt, result_gbt)

# Write one CSV (without the index column) per classifier.
result_path='/Users/lifuchen/Desktop/research/resample/'
_csv_targets = (
    (result_lr, 'race-lr-resample-result.csv'),
    (result_rf, 'race-rf-resample-result.csv'),
    (result_dt, 'race-dt-resample-result.csv'),
    (result_gbt, 'race-gbt-resample-result.csv'),
)
for _summary_table, _file_name in _csv_targets:
    _summary_table.to_csv(path.join(result_path, _file_name), index=False)