In [1]:
import pandas as pd 
import numpy as np
import csv

import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score

In [2]:
### load data
# Columns removed from the frame before the feature/label arrays are built.
dropped_columns = ['PersonID', 'screening_date', 'fta_risk_score_raw', 'nca_risk_score_raw',
                   'pvf_risk_score_raw', 'fta_calc', 'nca_calc', 'pvf_calc', 'Race']
data = (
    pd.read_csv("~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/KY Recidivism/KY data/kentucky_data.csv")
    .drop(columns=dropped_columns)
)

# Features: every remaining column up to and including 'current_violence'
# (label-based .loc slices are inclusive of the end label).
X = data.loc[:, :'current_violence'].values
# Labels: the 'recid_two_year' column as a plain array.
Y = data['recid_two_year'].values

In [3]:
# Display the dimensions of the feature matrix and the label vector side by side.
X.shape, Y.shape

((146003, 36), (146003,))

In [4]:
## outer loop CV setup
# Containers for the row indices of each outer fold (training side / held-out side).
train_outer, test_outer = [], []
# Shuffled 5-fold splitter; the fixed seed makes the fold assignment repeatable.
outer_cv = KFold(shuffle=True, random_state=816, n_splits=5)

In [5]:
## save index
# Reset both lists here so the cell is idempotent: without this, running the
# cell a second time would append another copy of every fold.
train_outer, test_outer = [], []
for train, test in outer_cv.split(X, Y):
    # `train` / `test` are the row-index arrays for one outer fold.
    train_outer.append(train)
    test_outer.append(test)

### Logistic Regression

- automatic parameter tuning

In [46]:
## model setup
# Values of LogisticRegression's `C` parameter tried by the grid search.
c_grid = {"C": [0.001, 0.01, 0.1]}
# Shuffled 5-fold splitter used inside GridSearchCV (seeded for repeatability).
inner_cv = KFold(shuffle=True, random_state=816, n_splits=5)
# Base estimator handed to the grid search; `C` is filled in per candidate.
lr = LogisticRegression(solver='liblinear', class_weight='balanced', random_state=816)

In [47]:
# Nested cross-validation with an automatic sanity check.
# For each outer fold: grid-search C on the training part (inner CV), then
# refit with the selected C and score ROC AUC on the held-out part.
# If the inner-CV train/validation AUC gap is too large, the selected C is
# discarded and the same fold is retried with the remaining candidates.
holdout_auc = []   # held-out ROC AUC, one entry per accepted outer fold
best_params = []   # GridSearchCV.best_params_ for each accepted outer fold
index = []         # outer-fold numbers that produced a result
i = 0

# Largest tolerated difference between mean train AUC and mean validation AUC.
MAX_AUC_GAP = 0.2

# Work on a private copy of the candidate list. The original code called
# `c_grid.remove(...)` on the dict itself, which raises AttributeError; removing
# from `c_grid['C']` instead would silently shrink the grid for every later
# cell that reuses `c_grid`. Rejected values stay rejected for subsequent folds.
candidate_cs = list(c_grid['C'])

while i < len(train_outer):

    if len(candidate_cs) == 0:
        print ("No parameters!")
        break

    print(i)
    train_x, test_x = X[train_outer[i]], X[test_outer[i]]
    train_y, test_y = Y[train_outer[i]], Y[test_outer[i]]

    ## GridSearch: inner CV
    clf = GridSearchCV(estimator=lr, param_grid={"C": list(candidate_cs)}, scoring='roc_auc',
                       cv=inner_cv, return_train_score=True).fit(train_x, train_y)

    ## best parameter & scores
    train_score = clf.cv_results_['mean_train_score']
    test_score = clf.cv_results_['mean_test_score']
    best_param = clf.best_params_
    # NOTE(review): the gap is averaged over every candidate in the grid, not
    # taken at the selected candidate only -- confirm this is the intended check.
    auc_diff = np.mean(train_score) - np.mean(test_score)

    ## sanity check
    if auc_diff > MAX_AUC_GAP:
        # Reject the selected C and retry this fold (i is deliberately not advanced).
        candidate_cs.remove(best_param['C'])
    else:
        ## train model on best param
        best_model = LogisticRegression(class_weight = 'balanced', solver='liblinear',
                                        random_state=816, C=best_param['C']).fit(train_x, train_y)
        prob = best_model.predict_proba(test_x)[:,1]
        pred = best_model.predict(test_x)

        ## store results
        holdout_auc.append(roc_auc_score(test_y, prob))
        best_params.append(best_param)
        index.append(i)
        i += 1

0
1
2
3
4


In [48]:
# Held-out ROC AUC of each outer fold, in the order the folds were processed.
holdout_auc

[0.7235678695035542,
 0.7268969344189598,
 0.7275984005748546,
 0.7267346330719366,
 0.7264735231656887]

In [49]:
# The parameter setting GridSearchCV selected for each outer fold.
best_params

[{'C': 0.1}, {'C': 0.1}, {'C': 0.1}, {'C': 0.1}, {'C': 0.1}]

### Manually Specify Parameters

In [50]:
# Nested CV without the automatic sanity check: every outer fold is scored,
# and the inner-CV train/validation AUC gap is recorded for manual inspection.
holdout_auc, best_params, auc_diff, index = [], [], [], []

for i, train_idx in enumerate(train_outer):
    print(i)
    test_idx = test_outer[i]
    train_x, train_y = X[train_idx], Y[train_idx]
    test_x, test_y = X[test_idx], Y[test_idx]

    ## GridSearch: inner CV
    clf = GridSearchCV(lr, c_grid, scoring='roc_auc', cv=inner_cv,
                       return_train_score=True)
    clf.fit(train_x, train_y)

    ## best parameter & scores
    best_param = clf.best_params_
    train_score = clf.cv_results_['mean_train_score']
    test_score = clf.cv_results_['mean_test_score']
    # Gap between the grid-averaged train AUC and grid-averaged validation AUC.
    auc_diff.append(train_score.mean() - test_score.mean())

    ## train model on best param
    best_model = LogisticRegression(C=best_param['C'], solver='liblinear',
                                    class_weight='balanced', random_state=816)
    best_model.fit(train_x, train_y)
    pred = best_model.predict(test_x)
    prob = best_model.predict_proba(test_x)[:, 1]

    ## store results
    index.append(i)
    best_params.append(best_param)
    holdout_auc.append(roc_auc_score(test_y, prob))

0
1
2
3
4


In [51]:
# Held-out ROC AUC of each outer fold from the loop above.
holdout_auc

[0.7235678695035542,
 0.7268969344189598,
 0.7275984005748546,
 0.7267346330719366,
 0.7264735231656887]

In [52]:
# Mean inner-CV train AUC minus mean inner-CV validation AUC, one entry per outer fold.
auc_diff

[0.0004437469642566416,
 0.0003579192572692458,
 0.0004714136295144167,
 0.0005590508592319132,
 0.000578855086546981]

In [53]:
# NOTE(review): `best_param` is overwritten on every loop iteration, so this shows
# only the last outer fold's selection; the per-fold list is `best_params` -- confirm
# which one was meant to be displayed here.
best_param

{'C': 0.1}