In [1]:
import os
from os import path

import imblearn
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn.preprocessing
from imblearn.over_sampling import RandomOverSampler, SMOTE
from sklearn import preprocessing
from sklearn.metrics import balanced_accuracy_score, roc_auc_score
from sklearn.preprocessing import Normalizer

import src.lib.fairness_tests as fair
import src.lib.optimal_threhold_related as thres
import src.lib.utility_classfier as uclf

%matplotlib inline

In [2]:
# Location of the input CSV. The original absolute path is kept as the default so
# existing runs behave exactly as before, but it can now be overridden without
# editing the notebook by setting the FAIRNESS_DATA_PATH environment variable.
data_path = os.environ.get('FAIRNESS_DATA_PATH', '/Users/lifuchen/Desktop/research/data.csv')
df = pd.read_csv(data_path)

In [3]:
# Labels: the `Class` column as an array.
y = df['Class'].values
# Features: every column of the frame except `GRID` and `Class`.
X = df.drop(columns=['GRID', 'Class'])
X.shape

(109490, 87)

In [4]:
def get_result (classifier, records, X_train_scaled, y_train, X_val_scaled, y_val, X_test_scaled, y_test, X_val_white_scaled, y_val_white, X_test_white_scaled, y_test_white, X_val_black_scaled, y_val_black, X_test_black_scaled, y_test_black):
    """Evaluate one classifier overall and per group, then append a metrics record.

    The classifier is looked up by name on the ``uclf`` module and called as
    ``fn(X_train, y_train, X_eval, y_eval)``; its return value is used as the
    score vector for the evaluation set. For each of the three evaluation
    pairs (overall, ``*_white``, ``*_black``) a threshold is chosen on the
    validation scores via ``balance_accuracy`` and balanced accuracy is
    reported on both validation and test.

    Parameters
    ----------
    classifier : str
        Attribute name of the scoring function on ``uclf``.
    records : list
        Receives exactly one dict of metrics (mutated in place).
    X_train_scaled, y_train
        Training data handed to every scoring call.
    X_val_*_scaled / y_val_*, X_test_*_scaled / y_test_*
        Validation and test features/labels for the overall set and the
        two subgroups.

    Returns
    -------
    None
        The result is delivered through ``records``.
    """
    score_with = getattr(uclf, classifier)

    def evaluate(val_features, val_labels, test_features, test_labels):
        # Score validation first, then test, then pick the threshold on validation.
        # NOTE(review): the training data is passed on every call; whether the
        # model is refit each time depends on the uclf implementation - confirm.
        val_scores = score_with(X_train_scaled, y_train, val_features, val_labels)
        test_scores = score_with(X_train_scaled, y_train, test_features, test_labels)
        cutoff, val_ba, test_ba = balance_accuracy(val_labels, val_scores, test_labels, test_scores)
        return cutoff, val_ba, test_ba, val_scores, test_scores

    # Overall population. AUROC is computed from the validation labels/scores.
    threshold, ba_val, ba_test, overall_val_scores, _ = evaluate(
        X_val_scaled, y_val, X_test_scaled, y_test)
    auroc = roc_auc_score(y_val, overall_val_scores)

    # Per-group evaluation; each group gets its own validation-derived threshold.
    threshold_white, ba_val_white, ba_test_white, _, test_scores_white = evaluate(
        X_val_white_scaled, y_val_white, X_test_white_scaled, y_test_white)
    threshold_black, ba_val_black, ba_test_black, _, test_scores_black = evaluate(
        X_val_black_scaled, y_val_black, X_test_black_scaled, y_test_black)

    # Fairness metrics compare the two groups on the test split, each group
    # thresholded with its own cutoff.
    group_args = (y_test_white, test_scores_white, threshold_white,
                  y_test_black, test_scores_black, threshold_black)
    eod = fair.get_EOD(*group_args)
    sp = fair.get_SP(*group_args)

    record = {
        'auroc': auroc,
        'overall threshold': threshold,
        'overall ba validation': ba_val,
        'overall ba test': ba_test,
        'white threshold': threshold_white,
        'white ba validation': ba_val_white,
        'white ba test': ba_test_white,
        'black threshold': threshold_black,
        'black ba validation': ba_val_black,
        'black ba test': ba_test_black,
        'eod': eod,
        # NOTE(review): stored under 'di' although the value comes from
        # fair.get_SP - confirm the intended metric name.
        'di': sp,
    }
    records.append(record)

In [5]:
def balance_accuracy (y_val, y_val_score,y_test, y_test_score):
    """Choose a threshold on validation data and score both splits with it.

    The threshold is the first value returned by
    ``thres.get_optimal_threshold_Jvalue(y_val, y_val_score)``; the same
    threshold is then passed to ``thres.calculate_balanced_accuracy`` for the
    validation and the test labels/scores. Each of the three values is also
    printed as it is computed.

    Returns
    -------
    tuple
        ``(threshold, balanced_accuracy_validation, balanced_accuracy_test)``.
    """
    # Only validation data is used to pick the cutoff; the helper's second
    # return value is not needed here.
    chosen_threshold, _ = thres.get_optimal_threshold_Jvalue(y_val, y_val_score)
    print("Optimal threshold by J value is ", chosen_threshold)

    val_balanced_acc = thres.calculate_balanced_accuracy(y_val, y_val_score, chosen_threshold)
    print("Balanced accuracy score of val is ", val_balanced_acc)

    # The test split is scored with the validation-derived cutoff.
    test_balanced_acc = thres.calculate_balanced_accuracy(y_test, y_test_score, chosen_threshold)
    print("Balanced accuracy score of test is ", test_balanced_acc)

    return chosen_threshold, val_balanced_acc, test_balanced_acc

In [6]:
def fairness_metrics (X, y, attribute, random_state):
    """Run the split / scale / evaluate pipeline once for a given seed.

    The data is split with ``fair.split_by_trait_balance_size``, a
    ``MaxAbsScaler`` is fitted on the training features only and applied to
    every validation/test subset, and ``get_result`` is then called for each
    of the four classifiers.

    Parameters
    ----------
    X, y
        Full feature table and labels, forwarded unchanged to the splitter.
    attribute : str
        Forwarded to the splitter (presumably the column used to form the
        ``*_white`` / ``*_black`` subsets - confirm against
        ``fair.split_by_trait_balance_size``).
    random_state : int
        Seed forwarded to the splitter.

    Returns
    -------
    None
        Results are appended, via ``get_result``, to the module-level lists
        ``records_lr``, ``records_rf``, ``records_dt`` and ``records_gbt``,
        which must exist before this function is called.
    """
    # The previous `global threshold` declaration was removed: nothing in this
    # function assigns `threshold`, so the declaration had no effect.
    (X_train, y_train, X_val, y_val, X_test, y_test,
     X_val_white, X_val_black, y_val_white, y_val_black,
     X_test_white, X_test_black, y_test_white, y_test_black) \
        = fair.split_by_trait_balance_size(X, y, attribute, random_state)

    # Sanity output: row counts of every split (features and labels should agree).
    print("X train", X_train.shape[0])
    print("Y train", y_train.shape[0])
    print(X_val.shape[0], X_val_white.shape[0], X_val_black.shape[0])
    print(y_val.shape[0], y_val_white.shape[0], y_val_black.shape[0])
    print(X_test.shape[0], X_test_white.shape[0], X_test_black.shape[0])
    print(y_test.shape[0], y_test_white.shape[0], y_test_black.shape[0])

    # Fit the scaler on training data only, then reuse it for every other subset
    # so no validation/test statistics influence the scaling.
    max_abs_scaler = preprocessing.MaxAbsScaler()
    X_train_scaled = max_abs_scaler.fit_transform(X_train)
    X_test_scaled = max_abs_scaler.transform(X_test)
    X_test_white_scaled = max_abs_scaler.transform(X_test_white)
    X_test_black_scaled = max_abs_scaler.transform(X_test_black)
    X_val_scaled = max_abs_scaler.transform(X_val)
    X_val_white_scaled = max_abs_scaler.transform(X_val_white)
    X_val_black_scaled = max_abs_scaler.transform(X_val_black)

    # Every classifier is evaluated on exactly the same data; build the argument
    # tail once instead of repeating it in each call.
    evaluation_args = (
        X_train_scaled, y_train,
        X_val_scaled, y_val,
        X_test_scaled, y_test,
        X_val_white_scaled, y_val_white,
        X_test_white_scaled, y_test_white,
        X_val_black_scaled, y_val_black,
        X_test_black_scaled, y_test_black,
    )

    # Classifier names are attribute names on `uclf` and must match them exactly.
    get_result("logic_regression", records_lr, *evaluation_args)
    get_result("random_forest", records_rf, *evaluation_args)
    get_result("decision_tree", records_dt, *evaluation_args)
    get_result("gradiant_boosting", records_gbt, *evaluation_args)

In [7]:
# One list of metric records per classifier. fairness_metrics() refers to these
# module-level lists by name, so the names must stay exactly as they are.
records_lr, records_rf, records_dt, records_gbt = [], [], [], []

# Repeat the full split / train / evaluate pipeline for seeds 0 through 3;
# each run appends one record to every list above.
for random_state in range(4):
    fairness_metrics(X, y, "Race_W", random_state)

# Tabulate the collected records: one row per seed, one column per metric.
result_lr, result_rf, result_dt, result_gbt = (
    pd.DataFrame(seed_records)
    for seed_records in (records_lr, records_rf, records_dt, records_gbt)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train ['Class'] = y_train


(56639,)
(56639,)
(113278, 87)
X train 113278
Y train 113278
21898 18899 2999
21898 18899 2999
21898 18968 2930
21898 18968 2930


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.1745291292673417
0.266569928409999
Classification report
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     19904
           1       0.45      0.05      0.08      1994

    accuracy                           0.91     21898
   macro avg       0.68      0.52      0.52     21898
weighted avg       0.87      0.91      0.87     21898

Confusion_matrix
[[19793   111]
 [ 1904    90]]
done in 0.847969s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.1745291292673417
0.26554223404242555
Classification report
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     19934
           1       0.41      0.05      0.08      1964

    accuracy                           0.91     21898
   macro avg       0.66      0.52      0.52     21898
weighted avg       0.87      0.91      0.87     21898

Confusion_matrix
[[19808   126]
 [ 1875    89]]
done in 0.802813s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.39
threshold:0.2, J-value:0.241
threshold:0.30000000000000004, J-value:0.152
threshold:0.4, J-value:0.082
threshold:0.5, J-value:0.039
threshold:0.6000000000000001, J-value:0.016
threshold:0.7000000000000001, J-value:0.006
threshold:0.8, J-value:0.001
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6947815915592437
Balanced accuracy score of test is  0.6894222165324798


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.1745291292673417
0.263776487871698
Classification report
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     17180
           1       0.45      0.05      0.09      1719

    accuracy                           0.91     18899
   macro avg       0.68      0.52      0.52     18899
weighted avg       0.87      0.91      0.87     18899

Confusion_matrix
[[17073   107]
 [ 1631    88]]
done in 0.895574s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.1745291292673417
0.2633775057741107
Classification report
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     17260
           1       0.41      0.05      0.09      1708

    accuracy                           0.91     18968
   macro avg       0.66      0.52      0.52     18968
weighted avg       0.87      0.91      0.87     18968

Confusion_matrix
[[17136   124]
 [ 1622    86]]
done in 0.793188s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.404
threshold:0.2, J-value:0.26
threshold:0.30000000000000004, J-value:0.16699999999999998
threshold:0.4, J-value:0.093
threshold:0.5, J-value:0.045
threshold:0.6000000000000001, J-value:0.019000000000000003
threshold:0.7000000000000001, J-value:0.007
threshold:0.8, J-value:0.001
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.7019928099356572
Balanced accuracy score of test is  0.695846144243842


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.1745291292673417
0.28417354052515437
Classification report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      2724
           1       0.33      0.01      0.01       275

    accuracy                           0.91      2999
   macro avg       0.62      0.50      0.48      2999
weighted avg       0.86      0.91      0.87      2999

Confusion_matrix
[[2720    4]
 [ 273    2]]
done in 0.878246s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.1745291292673417
0.2795560790231069
Classification report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      2674
           1       0.60      0.01      0.02       256

    accuracy                           0.91      2930
   macro avg       0.76      0.51      0.49      2930
weighted avg       0.89      0.91      0.87      2930

Confusion_matrix
[[2672    2]
 [ 253    3]]
done in 0.826548s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.301
threshold:0.2, J-value:0.12399999999999999
threshold:0.30000000000000004, J-value:0.055
threshold:0.4, J-value:0.022000000000000002
threshold:0.5, J-value:0.006
threshold:0.6000000000000001, J-value:-0.001
threshold:0.7000000000000001, J-value:0.0
threshold:0.8, J-value:0.0
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6503610999866507
Balanced accuracy score of test is  0.6442858311518325
True positive rate of class 1 is  0.642
True po

  _warn_prf(average, modifier, msg_start, len(result))


Classification report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      2674
           1       0.00      0.00      0.00       256

    accuracy                           0.91      2930
   macro avg       0.46      0.50      0.48      2930
weighted avg       0.83      0.91      0.87      2930

Confusion_matrix
[[2674    0]
 [ 256    0]]
done in 37.520637s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.405
threshold:0.2, J-value:0.21000000000000002
threshold:0.30000000000000004, J-value:0.061000000000000006
threshold:0.4, J-value:-0.001
threshold:0.5, J-value:0.0
threshold:0.6000000000000001, J-value:0.0
threshold:0.7000000000000001, J-value:0.0
threshold:0.8, J-value:0.0
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.7024909891870244
Balanced accuracy score of test is  0.68387422868362


  _warn_prf(average, modifier, msg_start, len(result))


True positive rate of class 1 is  0.713
True positive rate of class 2 is  0.617
Positive prediction rate of class 1 is  0.377
Positive prediction rate of class 2 is  0.282
0.17930149121056152
0.27906950849927226
Classification report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     19904
           1       0.25      0.00      0.01      1994

    accuracy                           0.91     21898
   macro avg       0.58      0.50      0.48     21898
weighted avg       0.85      0.91      0.87     21898

Confusion_matrix
[[19877    27]
 [ 1985     9]]
done in 1.381484s
0.17930149121056152
0.27642847879951965
Classification report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     19934
           1       0.14      0.00      0.00      1964

    accuracy                           0.91     21898
   macro avg       0.53      0.50      0.48     21898
weighted avg       0.84      0.91      

True positive rate of class 2 is  0.441
Positive prediction rate of class 1 is  0.303
Positive prediction rate of class 2 is  0.147


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train ['Class'] = y_train


(56798,)
(56798,)
(113596, 87)
X train 113596
Y train 113596
21898 18825 3073
21898 18825 3073
21898 18883 3015
21898 18883 3015


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17554712104469375
0.2602976043015669
Classification report
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     19980
           1       0.39      0.05      0.08      1918

    accuracy                           0.91     21898
   macro avg       0.65      0.52      0.52     21898
weighted avg       0.87      0.91      0.88     21898

Confusion_matrix
[[19843   137]
 [ 1830    88]]
done in 0.917382s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17554712104469375
0.26133257842196905
Classification report
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     19972
           1       0.47      0.05      0.09      1926

    accuracy                           0.91     21898
   macro avg       0.69      0.52      0.52     21898
weighted avg       0.88      0.91      0.88     21898

Confusion_matrix
[[19860   112]
 [ 1827    99]]
done in 0.931157s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.373
threshold:0.2, J-value:0.26599999999999996
threshold:0.30000000000000004, J-value:0.163
threshold:0.4, J-value:0.079
threshold:0.5, J-value:0.039
threshold:0.6000000000000001, J-value:0.016
threshold:0.7000000000000001, J-value:0.007
threshold:0.8, J-value:0.001
threshold:0.9, J-value:0.001
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6866846251882748
Balanced accuracy score of test is  0.6922006489251099


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17554712104469375
0.256188399095871
Classification report
              precision    recall  f1-score   support

           0       0.92      0.99      0.95     17178
           1       0.39      0.05      0.09      1647

    accuracy                           0.91     18825
   macro avg       0.65      0.52      0.52     18825
weighted avg       0.87      0.91      0.88     18825

Confusion_matrix
[[17044   134]
 [ 1563    84]]
done in 0.886296s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17554712104469375
0.26189537331994234
Classification report
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     17196
           1       0.47      0.06      0.10      1687

    accuracy                           0.91     18883
   macro avg       0.69      0.53      0.53     18883
weighted avg       0.87      0.91      0.88     18883

Confusion_matrix
[[17085   111]
 [ 1590    97]]
done in 0.853197s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.4
threshold:0.2, J-value:0.28800000000000003
threshold:0.30000000000000004, J-value:0.178
threshold:0.4, J-value:0.088
threshold:0.5, J-value:0.043
threshold:0.6000000000000001, J-value:0.018000000000000002
threshold:0.7000000000000001, J-value:0.008
threshold:0.8, J-value:0.001
threshold:0.9, J-value:0.001
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6997937167483042
Balanced accuracy score of test is  0.6972633625525739


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17554712104469375
0.28547033062672944
Classification report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      2802
           1       0.57      0.01      0.03       271

    accuracy                           0.91      3073
   macro avg       0.74      0.51      0.49      3073
weighted avg       0.88      0.91      0.87      3073

Confusion_matrix
[[2799    3]
 [ 267    4]]
done in 0.817512s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17554712104469375
0.2578077837093885
Classification report
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      2776
           1       0.67      0.01      0.02       239

    accuracy                           0.92      3015
   macro avg       0.79      0.50      0.49      3015
weighted avg       0.90      0.92      0.88      3015

Confusion_matrix
[[2775    1]
 [ 237    2]]
done in 0.850035s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.21599999999999997
threshold:0.2, J-value:0.128
threshold:0.30000000000000004, J-value:0.074
threshold:0.4, J-value:0.033999999999999996
threshold:0.5, J-value:0.013999999999999999
threshold:0.6000000000000001, J-value:0.003
threshold:0.7000000000000001, J-value:-0.001
threshold:0.8, J-value:0.0
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6076873134898373
Balanced accuracy score of test is  0.6463869931149241
True positive rate of class 1

  _warn_prf(average, modifier, msg_start, len(result))


Classification report
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      2776
           1       0.00      0.00      0.00       239

    accuracy                           0.92      3015
   macro avg       0.46      0.50      0.48      3015
weighted avg       0.85      0.92      0.88      3015

Confusion_matrix
[[2776    0]
 [ 239    0]]
done in 38.407535s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.39
threshold:0.2, J-value:0.20700000000000002
threshold:0.30000000000000004, J-value:0.05600000000000001
threshold:0.4, J-value:0.009999999999999998
threshold:0.5, J-value:0.0
threshold:0.6000000000000001, J-value:0.0
threshold:0.7000000000000001, J-value:0.0
threshold:0.8, J-value:0.0
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6952269201492871
Balanced accuracy score of test is  0.7029673049328977


  _warn_prf(average, modifier, msg_start, len(result))


True positive rate of class 1 is  0.731
True positive rate of class 2 is  0.632
Positive prediction rate of class 1 is  0.376
Positive prediction rate of class 2 is  0.258
0.18076069224942903
0.27312989204339694
Classification report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     19980
           1       0.00      0.00      0.00      1918

    accuracy                           0.91     21898
   macro avg       0.46      0.50      0.48     21898
weighted avg       0.83      0.91      0.87     21898

Confusion_matrix
[[19979     1]
 [ 1918     0]]
done in 1.391174s
0.18076069224942903
0.27070593841040863
Classification report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     19972
           1       0.00      0.00      0.00      1926

    accuracy                           0.91     21898
   macro avg       0.46      0.50      0.48     21898
weighted avg       0.83      0.91      

  _warn_prf(average, modifier, msg_start, len(result))


0.18076069224942903
0.26899444403083433
Classification report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     17196
           1       0.00      0.00      0.00      1687

    accuracy                           0.91     18883
   macro avg       0.46      0.50      0.48     18883
weighted avg       0.83      0.91      0.87     18883

Confusion_matrix
[[17196     0]
 [ 1687     0]]
done in 1.385925s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.344


  _warn_prf(average, modifier, msg_start, len(result))


threshold:0.2, J-value:0.196
threshold:0.30000000000000004, J-value:0.11400000000000002
threshold:0.4, J-value:0.0
threshold:0.5, J-value:0.0
threshold:0.6000000000000001, J-value:0.0
threshold:0.7000000000000001, J-value:0.0
threshold:0.8, J-value:0.0
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6717951711438424
Balanced accuracy score of test is  0.6796937102175511
0.18076069224942903
0.3062534533607046
Classification report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      2802
           1       0.00      0.00      0.00       271

    accuracy                           0.91      3073
   macro avg       0.46      0.50      0.48      3073
weighted avg       0.83      0.91      0.87      3073

Confusion_matrix
[[2802    0]
 [ 271    0]]
done in 1.330447s


  _warn_prf(average, modifier, msg_start, len(result))


0.18076069224942903
0.2814250589309726
Classification report
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      2776
           1       0.00      0.00      0.00       239

    accuracy                           0.92      3015
   macro avg       0.46      0.50      0.48      3015
weighted avg       0.85      0.92      0.88      3015

Confusion_matrix
[[2775    1]
 [ 239    0]]
done in 1.348367s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.28500000000000003
threshold:0.2, J-value:0.08399999999999999
threshold:0.30000000000000004, J-value:0.027999999999999997
threshold:0.4, J-value:0.004
threshold:0.5, J-value:0.004
threshold:0.6000000000000001, J-value:0.0
threshold:0.7000000000000001, J-value:0.0
threshold:0.8, J-value:0.0
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6426511374321452
Balanced accuracy score of test is  0.656499825160069
True positive rate of class 1 is  0.

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train ['Class'] = y_train


(56741,)
(56741,)
(113482, 87)
X train 113482
Y train 113482
21898 18936 2962
21898 18936 2962
21898 18829 3069
21898 18829 3069


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17560485253966385
0.2638883796366986
Classification report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     19950
           1       0.46      0.04      0.08      1948

    accuracy                           0.91     21898
   macro avg       0.69      0.52      0.52     21898
weighted avg       0.87      0.91      0.88     21898

Confusion_matrix
[[19851    99]
 [ 1865    83]]
done in 0.817653s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17560485253966385
0.2655876031000081
Classification report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     19883
           1       0.44      0.04      0.07      2015

    accuracy                           0.91     21898
   macro avg       0.68      0.52      0.51     21898
weighted avg       0.87      0.91      0.87     21898

Confusion_matrix
[[19786    97]
 [ 1938    77]]
done in 0.816254s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.367
threshold:0.2, J-value:0.239
threshold:0.30000000000000004, J-value:0.149
threshold:0.4, J-value:0.082
threshold:0.5, J-value:0.038
threshold:0.6000000000000001, J-value:0.015000000000000001
threshold:0.7000000000000001, J-value:0.005
threshold:0.8, J-value:0.002
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6835566843185994
Balanced accuracy score of test is  0.6983375201504484


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17560485253966385
0.2642075118749599
Classification report
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     17239
           1       0.46      0.05      0.09      1697

    accuracy                           0.91     18936
   macro avg       0.69      0.52      0.52     18936
weighted avg       0.87      0.91      0.87     18936

Confusion_matrix
[[17143    96]
 [ 1616    81]]
done in 0.813050s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17560485253966385
0.26382007830111964
Classification report
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     17079
           1       0.45      0.04      0.08      1750

    accuracy                           0.91     18829
   macro avg       0.68      0.52      0.51     18829
weighted avg       0.87      0.91      0.87     18829

Confusion_matrix
[[16988    91]
 [ 1677    73]]
done in 0.832235s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.379
threshold:0.2, J-value:0.255
threshold:0.30000000000000004, J-value:0.158
threshold:0.4, J-value:0.08700000000000001
threshold:0.5, J-value:0.042
threshold:0.6000000000000001, J-value:0.017
threshold:0.7000000000000001, J-value:0.005
threshold:0.8, J-value:0.002
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6893664661020804
Balanced accuracy score of test is  0.7077272004884863


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17560485253966385
0.26184817434847524
Classification report
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      2711
           1       0.40      0.01      0.02       251

    accuracy                           0.91      2962
   macro avg       0.66      0.50      0.49      2962
weighted avg       0.87      0.91      0.88      2962

Confusion_matrix
[[2708    3]
 [ 249    2]]
done in 0.804620s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17560485253966385
0.2764317622522639
Classification report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      2804
           1       0.40      0.02      0.03       265

    accuracy                           0.91      3069
   macro avg       0.66      0.51      0.49      3069
weighted avg       0.87      0.91      0.87      3069

Confusion_matrix
[[2798    6]
 [ 261    4]]
done in 0.810896s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.27999999999999997
threshold:0.2, J-value:0.124
threshold:0.30000000000000004, J-value:0.07999999999999999
threshold:0.4, J-value:0.045
threshold:0.5, J-value:0.007
threshold:0.6000000000000001, J-value:0.004
threshold:0.7000000000000001, J-value:0.004
threshold:0.8, J-value:0.004
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.639629604047844
Balanced accuracy score of test is  0.6307888999542433
True positive rate of class 1 is  0.654
True 

  _warn_prf(average, modifier, msg_start, len(result))


Classification report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      2804
           1       0.00      0.00      0.00       265

    accuracy                           0.91      3069
   macro avg       0.46      0.50      0.48      3069
weighted avg       0.83      0.91      0.87      3069

Confusion_matrix
[[2804    0]
 [ 265    0]]
done in 40.009588s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.43400000000000005
threshold:0.2, J-value:0.21500000000000002
threshold:0.30000000000000004, J-value:0.059
threshold:0.4, J-value:0.011
threshold:0.5, J-value:0.0
threshold:0.6000000000000001, J-value:0.0
threshold:0.7000000000000001, J-value:0.0
threshold:0.8, J-value:0.0
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.7170660772623265
Balanced accuracy score of test is  0.6944775657416629


  _warn_prf(average, modifier, msg_start, len(result))


True positive rate of class 1 is  0.75
True positive rate of class 2 is  0.634
Positive prediction rate of class 1 is  0.374
Positive prediction rate of class 2 is  0.279
0.17975406810788852
0.2744824453890574
Classification report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     19950
           1       0.14      0.00      0.00      1948

    accuracy                           0.91     21898
   macro avg       0.53      0.50      0.48     21898
weighted avg       0.84      0.91      0.87     21898

Confusion_matrix
[[19938    12]
 [ 1946     2]]
done in 1.418652s
0.17975406810788852
0.27899827218275947
Classification report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     19883
           1       0.20      0.00      0.00      2015

    accuracy                           0.91     21898
   macro avg       0.55      0.50      0.48     21898
weighted avg       0.84      0.91      0.

Positive prediction rate of class 1 is  0.309
Positive prediction rate of class 2 is  0.136


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_train ['Class'] = y_train


(56692,)
(56692,)
(113384, 87)
X train 113384
Y train 113384
21898 18932 2966
21898 18932 2966
21898 18882 3016
21898 18882 3016


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17416229161394378
0.2670773643603481
Classification report
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     19908
           1       0.44      0.05      0.09      1990

    accuracy                           0.91     21898
   macro avg       0.68      0.52      0.52     21898
weighted avg       0.87      0.91      0.87     21898

Confusion_matrix
[[19790   118]
 [ 1896    94]]
done in 0.840106s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17416229161394378
0.2645640946037155
Classification report
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     19918
           1       0.41      0.04      0.08      1980

    accuracy                           0.91     21898
   macro avg       0.66      0.52      0.51     21898
weighted avg       0.87      0.91      0.87     21898

Confusion_matrix
[[19799   119]
 [ 1897    83]]
done in 0.870933s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.376
threshold:0.2, J-value:0.248
threshold:0.30000000000000004, J-value:0.159
threshold:0.4, J-value:0.08600000000000001
threshold:0.5, J-value:0.041
threshold:0.6000000000000001, J-value:0.022
threshold:0.7000000000000001, J-value:0.007
threshold:0.8, J-value:0.002
threshold:0.9, J-value:0.001
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6881091967775386
Balanced accuracy score of test is  0.6974904684965936


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17416229161394378
0.264447139772899
Classification report
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     17204
           1       0.43      0.05      0.09      1728

    accuracy                           0.91     18932
   macro avg       0.67      0.52      0.52     18932
weighted avg       0.87      0.91      0.87     18932

Confusion_matrix
[[17087   117]
 [ 1638    90]]
done in 0.832192s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17416229161394378
0.2642994926504671
Classification report
              precision    recall  f1-score   support

           0       0.91      0.99      0.95     17156
           1       0.41      0.05      0.08      1726

    accuracy                           0.91     18882
   macro avg       0.66      0.52      0.52     18882
weighted avg       0.87      0.91      0.87     18882

Confusion_matrix
[[17041   115]
 [ 1646    80]]
done in 0.803114s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.392
threshold:0.2, J-value:0.26599999999999996
threshold:0.30000000000000004, J-value:0.173
threshold:0.4, J-value:0.097
threshold:0.5, J-value:0.045
threshold:0.6000000000000001, J-value:0.023
threshold:0.7000000000000001, J-value:0.007
threshold:0.8, J-value:0.001
threshold:0.9, J-value:0.001
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6960911464388126
Balanced accuracy score of test is  0.7037973667851172


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17416229161394378
0.28386610741145607
Classification report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95      2704
           1       0.80      0.02      0.03       262

    accuracy                           0.91      2966
   macro avg       0.86      0.51      0.49      2966
weighted avg       0.90      0.91      0.87      2966

Confusion_matrix
[[2703    1]
 [ 258    4]]
done in 0.814017s


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.17416229161394378
0.26622066425929786
Classification report
              precision    recall  f1-score   support

           0       0.92      1.00      0.96      2762
           1       0.43      0.01      0.02       254

    accuracy                           0.92      3016
   macro avg       0.67      0.51      0.49      3016
weighted avg       0.88      0.92      0.88      3016

Confusion_matrix
[[2758    4]
 [ 251    3]]
done in 0.830086s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.267
threshold:0.2, J-value:0.125
threshold:0.30000000000000004, J-value:0.07300000000000001
threshold:0.4, J-value:0.019
threshold:0.5, J-value:0.015
threshold:0.6000000000000001, J-value:0.011
threshold:0.7000000000000001, J-value:0.008
threshold:0.8, J-value:0.004
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.6331615023262117
Balanced accuracy score of test is  0.6480868023285649
True positive rate of class 1 is  0.647
True positive rat

  _warn_prf(average, modifier, msg_start, len(result))


0.1786415439706444
0.27347313543890395
Classification report
              precision    recall  f1-score   support

           0       0.91      1.00      0.95     17156
           1       1.00      0.00      0.00      1726

    accuracy                           0.91     18882
   macro avg       0.95      0.50      0.48     18882
weighted avg       0.92      0.91      0.87     18882

Confusion_matrix
[[17156     0]
 [ 1725     1]]
done in 1.336143s
threshold:0.0, J-value:0.0
threshold:0.1, J-value:0.37
threshold:0.2, J-value:0.17700000000000002
threshold:0.30000000000000004, J-value:0.068
threshold:0.4, J-value:0.059
threshold:0.5, J-value:0.0
threshold:0.6000000000000001, J-value:0.0
threshold:0.7000000000000001, J-value:0.0
threshold:0.8, J-value:0.0
threshold:0.9, J-value:0.0
Optimal threshold by J value is  0.1
Balanced accuracy score of val is  0.68522178304787
Balanced accuracy score of test is  0.6820844411327909
0.1786415439706444
0.3072382906894386
Classification report
     

In [8]:
def add_mean_sd(records, result_table, metrics=None):
    """Return the per-run rows plus one trailing "mean (+/- sd)" summary row.

    For every metric column, the mean and standard deviation of that column
    in ``result_table`` are formatted as ``'%.4f (+/- %.4f)'`` and collected
    into a single summary row, which is placed after the rows of ``records``.

    Parameters
    ----------
    records : sequence
        Per-run rows in a form accepted by ``pd.DataFrame`` (presumably dicts
        keyed by metric name -- confirm against the cell that builds them).
        The sequence is copied; the caller's object is left untouched, so
        calling this function twice does not stack up summary rows.
    result_table : pandas.DataFrame
        Table whose columns are summarised. It must contain every name in
        ``metrics``.
    metrics : iterable of str, optional
        Columns to summarise. Defaults to the nine metrics reported by this
        notebook (AUROC, overall/white/black balanced accuracy on validation
        and test, ``eod`` and ``di``).

    Returns
    -------
    pandas.DataFrame
        One row per entry of ``records`` followed by the summary row.
    """
    if metrics is None:
        metrics = (
            'auroc',
            'overall ba validation',
            'overall ba test',
            'white ba validation',
            'white ba test',
            'black ba validation',
            'black ba test',
            'eod',
            'di',
        )

    summary_row = {}
    for metric in metrics:
        column = result_table[metric]
        # Series.std() uses pandas' default (sample) standard deviation.
        summary_row[metric] = '%.4f (+/- %.4f)' % (column.mean(), column.std())

    # Build from a copy so the caller's list is not mutated as a side effect.
    return pd.DataFrame(list(records) + [summary_row])

In [9]:
# Replace each classifier's per-run table with the version returned by
# add_mean_sd, i.e. the run records followed by a "mean (+/- sd)" row.
# NOTE(review): this rebinds result_*; re-running the cell feeds the
# summarised frame back into add_mean_sd -- restart the kernel before re-running.
result_lr = add_mean_sd(records_lr, result_lr)
result_rf = add_mean_sd(records_rf, result_rf)
result_dt = add_mean_sd(records_dt, result_dt)
result_gbt = add_mean_sd(records_gbt, result_gbt)

# Write one CSV per classifier (logistic regression, random forest,
# decision tree, gradient-boosted trees -- presumably; names come from the
# variable suffixes) into the resample results directory.
result_path = '/Users/lifuchen/Desktop/research/resample/'
csv_targets = (
    (result_lr, 'race-lr-resample-size-result.csv'),
    (result_rf, 'race-rf-resample-size-result.csv'),
    (result_dt, 'race-dt-resample-size-result.csv'),
    (result_gbt, 'race-gbt-resample-size-result.csv'),
)
for summary_frame, csv_name in csv_targets:
    summary_frame.to_csv(path.join(result_path, csv_name), index=False)