In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV

from util import load_data, fdr, plot_report

%matplotlib inline

In [2]:
# Read the 30-variable feature file through the project helper and unpack the
# six objects it returns into feature/label pairs for three splits.
# NOTE(review): "oot" presumably denotes an out-of-time holdout -- confirm
# against util.load_data.
(x_train, x_test, x_oot,
 y_train, y_test, y_oot) = load_data('var_30.csv')

# Preview the first five training rows.
x_train.head(n=5)

Unnamed: 0,ssn_count_3,fulladdress_count_0,address_count_0,name_dob_count_3,homephone_count_14,name_dob_count_7,name_day_since,ssn_firstname_count_0_by_14,name_count_7,fulladdress_homephone_count_0_by_14,...,ssn_count_0_by_14,ssn_firstname_count_7,ssn_count_7,fulladdress_count_1_by_14,fulladdress_homephone_count_7,ssn_name_dob_count_7,name_count_14,ssn_firstname_count_0_by_30,ssn_lastname_count_0_by_30,ssn_count_14
33556,-0.047591,-0.041351,-0.042158,-0.046415,0.708788,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
450170,-0.047591,-0.041351,-0.042158,-0.046415,0.135423,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
130443,-0.047591,-0.041351,-0.042158,-0.046415,-0.437943,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
173023,-0.047591,-0.041351,-0.042158,-0.046415,-0.437943,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421
207718,-0.047591,-0.041351,-0.042158,-0.046415,-1.011308,-0.055801,0.598116,0.123528,-0.115648,0.120387,...,0.124781,-0.057185,-0.058092,0.123861,-0.057263,-0.055635,-0.159376,0.171472,0.171458,-0.072421


In [4]:
# Cross-validated grid search over the regularisation strength C and the
# class weighting of a logistic regression.
lr = LogisticRegression(max_iter=1000)
params = {
    'C': [0.001, 0.01, 0.1, 1, 10],
    'class_weight': ['balanced', None],
}

# Two metrics are tracked per candidate; the one keyed 'FDR' (the project's
# `fdr` scorer from util) decides which candidate is refit on the full
# training data.
clf = GridSearchCV(
    estimator=lr,
    param_grid=params,
    scoring={'FDR': fdr, 'ROC': 'roc_auc'},
    refit='FDR',
    cv=5,
    n_jobs=8,
    verbose=1,
)
clf.fit(x_train, y_train)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:  2.2min
[Parallel(n_jobs=8)]: Done  50 out of  50 | elapsed:  4.2min finished


GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=1000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=8,
             param_grid={'C': [0.001, 0.01, 0.1, 1, 10],
                         'class_weight': ['balanced', None]},
             pre_dispatch='2*n_jobs', refit='FDR', return_train_score=False,
             scoring={'FDR': <function fdr at 0x7efcac4fd950>,
                      'ROC': 'roc_auc'},
             verbose=1)

In [5]:
# Summarise the winning candidate of the grid search; because the search was
# configured with refit='FDR', best_score_ is the mean cross-validated FDR.
best_summary = (
    f"The best parameters are {clf.best_params_} "
    f"with a score of {clf.best_score_:0.5f}"
)
print(best_summary)

The best parameters are {'C': 0.001, 'class_weight': 'balanced'} with a score of 0.53327


In [8]:
# Columns worth showing from the (much wider) cv_results_ dictionary:
# the candidate settings plus mean/std of both tracked metrics.
report_columns = [
    'params',
    'mean_test_FDR', 'std_test_FDR', 'rank_test_FDR',
    'mean_test_ROC', 'std_test_ROC',
]

# Tabulate every candidate, best FDR rank first.
cv_table = pd.DataFrame(clf.cv_results_)
cv_table.sort_values(by='rank_test_FDR').filter(items=report_columns)

Unnamed: 0,params,mean_test_FDR,std_test_FDR,rank_test_FDR,mean_test_ROC,std_test_ROC
0,"{'C': 0.001, 'class_weight': 'balanced'}",0.533271,0.012146,1,0.775996,0.004343
2,"{'C': 0.01, 'class_weight': 'balanced'}",0.532946,0.012247,2,0.775563,0.004212
4,"{'C': 0.1, 'class_weight': 'balanced'}",0.532404,0.011765,3,0.77535,0.004275
6,"{'C': 1, 'class_weight': 'balanced'}",0.532187,0.011683,4,0.775418,0.004336
8,"{'C': 10, 'class_weight': 'balanced'}",0.532187,0.011683,4,0.775436,0.004354
1,"{'C': 0.001, 'class_weight': None}",0.526118,0.011745,6,0.775012,0.004297
3,"{'C': 0.01, 'class_weight': None}",0.520917,0.012804,7,0.773762,0.003487
9,"{'C': 10, 'class_weight': None}",0.520376,0.012696,8,0.771808,0.004101
5,"{'C': 0.1, 'class_weight': None}",0.520159,0.013316,9,0.771966,0.003726
7,"{'C': 1, 'class_weight': None}",0.520159,0.012929,10,0.771875,0.004118


In [7]:
# Fit a class-balanced logistic regression with a fixed C=1 on the training
# split, then print the project's `fdr` score for each split in turn
# (train, test, oot).
lr = LogisticRegression(C=1, class_weight='balanced', max_iter=1000)
lr.fit(x_train, y_train)

for features, labels in ((x_train, y_train), (x_test, y_test), (x_oot, y_oot)):
    print(fdr(lr, features, labels))

0.5330515821413091
0.5138146167557932
0.5169462512838069


In [None]:
# Same fixed-hyperparameter fit (C=1, balanced class weights), this time on
# the 40-variable feature file.
# NOTE(review): this rebinds x_train/x_test/x_oot and the matching labels, so
# any cell executed afterwards sees the var_40 data under these names.
(x_train, x_test, x_oot,
 y_train, y_test, y_oot) = load_data('var_40.csv')

lr = LogisticRegression(C=1, class_weight='balanced', max_iter=1000)
lr.fit(x_train, y_train)

# Print the project's `fdr` score for train, test and oot, in that order.
for features, labels in ((x_train, y_train), (x_test, y_test), (x_oot, y_oot)):
    print(fdr(lr, features, labels))