In [1]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle
from sklearn.model_selection import GridSearchCV

from util import load_data, fdr, plot_report

%matplotlib inline

In [2]:
# Load the data set identified by 'var_30.csv'. load_data (from util) returns
# six objects, unpacked here as feature/label pairs for the train, test and
# "oot" splits (oot presumably means out-of-time -- confirm in util.load_data).
(x_train, x_test, x_oot,
 y_train, y_test, y_oot) = load_data('var_30.csv')

# Preview the first rows of the training features.
x_train.head()

Unnamed: 0,fulladdress_count_1_by_7,address_count_0,ssn_count_3,fulladdress_count_0,homephone_count_7,address_count_1_by_7,name_dob_count_3,name_count_7,homephone_count_3,fulladdress_homephone_count_0_by_14,...,ssn_dob_count_7,ssn_name_count_7,name_day_since,ssn_firstname_count_14,ssn_count_7,name_count_14,fulladdress_count_1_by_14,fulladdress_homephone_count_0_by_30,ssn_lastname_count_14,ssn_name_count_14
457690,0.09144,-0.042158,-0.047591,-0.041351,-0.769214,0.103441,-0.046415,-0.115648,-0.521454,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962
783518,0.09144,-0.042158,-0.047591,-0.041351,-0.769214,0.103441,-0.046415,-0.115648,-0.521454,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962
170533,0.09144,-0.042158,-0.047591,-0.041351,-0.769214,0.103441,-0.046415,-0.115648,-0.521454,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962
787095,0.09144,-0.042158,-0.047591,-0.041351,0.055132,0.103441,-0.046415,-0.115648,-0.521454,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962
333760,0.09144,-0.042158,-0.047591,-0.041351,0.055132,0.103441,-0.046415,-0.115648,0.639149,0.120387,...,-0.055708,-0.057109,0.598116,-0.071088,-0.058092,-0.159376,0.123861,0.166392,-0.071051,-0.070962


In [3]:
# Grid search over logistic-regression hyper-parameters, scored with the
# project's `fdr` scorer (imported from util).
lr = LogisticRegression(max_iter=1000)

# 6 values of C x 2 class-weight settings = 12 candidates.
params = dict(
    C=[1e-4, 1e-3, 1e-2, 0.1, 1, 10],
    class_weight=['balanced', None],
)

clf = GridSearchCV(
    estimator=lr,
    param_grid=params,
    scoring={'FDR': fdr},
    refit='FDR',  # the best candidate is chosen by the 'FDR' scorer
    cv=10,
    n_jobs=8,
    verbose=1,
)
clf.fit(x_train, y_train)

Fitting 10 folds for each of 12 candidates, totalling 120 fits


[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:   32.3s
[Parallel(n_jobs=8)]: Done 120 out of 120 | elapsed: 10.3min finished


GridSearchCV(cv=10, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=1000, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=8,
             param_grid={'C': [0.0001, 0.001, 0.01, 0.1, 1, 10],
                         'class_weight': ['balanced', None]},
             pre_dispatch='2*n_jobs', refit='FDR', return_train_score=False,
             scoring={'FDR': <function fdr at 0x7f60e362e510>}, verbose=1)

In [4]:
# Report the winning grid-search candidate together with its best score
# (formatted to five decimal places).
best_summary = (f"The best parameters are {clf.best_params_} "
                f"with a score of {clf.best_score_:0.5f}")
print(best_summary)

The best parameters are {'C': 0.0001, 'class_weight': 'balanced'} with a score of 0.53417


In [5]:
# Per-candidate summary of the grid search: fit/score timings plus the mean,
# standard deviation and rank of the FDR score across the CV folds.
#
# Only FDR columns are requested because 'FDR' is the only scorer configured
# in the search, so cv_results_ contains no '*_test_ROC' entries.
# Explicit column selection is used instead of DataFrame.filter(items=...)
# because filter silently skips names that are absent, which would hide a
# typo or a missing scorer; indexing raises KeyError instead.
pd.DataFrame(clf.cv_results_)[[
    'mean_fit_time', 'mean_score_time', 'params',
    'mean_test_FDR', 'std_test_FDR', 'rank_test_FDR',
]]

Unnamed: 0,mean_fit_time,mean_score_time,params,mean_test_FDR,std_test_FDR,rank_test_FDR
0,5.219435,0.387736,"{'C': 0.0001, 'class_weight': 'balanced'}",0.534166,0.014937,1
1,2.912182,0.418131,"{'C': 0.0001, 'class_weight': None}",0.531875,0.015249,2
2,10.126874,0.419696,"{'C': 0.001, 'class_weight': 'balanced'}",0.530674,0.015912,3
3,5.982173,0.419448,"{'C': 0.001, 'class_weight': None}",0.523798,0.015852,8
4,21.13365,0.437839,"{'C': 0.01, 'class_weight': 'balanced'}",0.529036,0.015682,5
5,14.855013,0.476416,"{'C': 0.01, 'class_weight': None}",0.518886,0.014844,9
6,44.248722,0.452354,"{'C': 0.1, 'class_weight': 'balanced'}",0.528927,0.014743,6
7,30.011435,0.456462,"{'C': 0.1, 'class_weight': None}",0.518449,0.01491,10
8,84.022432,0.451482,"{'C': 1, 'class_weight': 'balanced'}",0.528927,0.014686,6
9,62.679485,0.434214,"{'C': 1, 'class_weight': None}",0.518013,0.015162,11


In [6]:
# Refit logistic regression with the hyper-parameters selected by the grid
# search. Taking them from clf.best_params_ (rather than re-typing the values)
# keeps this cell in step with the search if the grid is ever changed.
lr = LogisticRegression(max_iter=1000, **clf.best_params_)
lr.fit(x_train, y_train)

# FDR on each split, printed one per line in the order: train, test, oot.
for split_x, split_y in ((x_train, y_train), (x_test, y_test), (x_oot, y_oot)):
    print(fdr(lr, split_x, split_y))

0.5350360183366077
0.5320346320346321
0.5203697363916467


In [7]:
# Repeat the fit/evaluation on the data set identified by 'var_40.csv'.
# NOTE(review): this rebinds x_train/x_test/x_oot and the y_* names, so any
# earlier cell re-run after this one will see the var_40 data, not var_30.
(x_train, x_test, x_oot,
 y_train, y_test, y_oot) = load_data('var_40.csv')

# Reuse the hyper-parameters chosen by the grid search (which was run on the
# var_30 data) instead of hard-coding a copy of them here.
lr = LogisticRegression(max_iter=1000, **clf.best_params_)
lr.fit(x_train, y_train)

# FDR on each split, printed one per line in the order: train, test, oot.
for split_x, split_y in ((x_train, y_train), (x_test, y_test), (x_oot, y_oot)):
    print(fdr(lr, split_x, split_y))

0.5336918349429324
0.5385593220338983
0.5224238274563505
