In [9]:
import pandas as pd 
import numpy as np
import csv
import baseline_functions as base

# Restore `summary_violent2_ky_model` from IPython's %store database
# (it must have been saved by an earlier `%store summary_violent2_ky_model`).
# The model cell below reassigns this name and stores it again, so the
# restored value is only in effect until that cell is run.
%store -r summary_violent2_ky_model

In [10]:
# Column selectors shared by both states.
# A label-based slice in `.loc` is inclusive, so the feature frame holds every
# column from the first one up to and including 'current_violence'.
# NOTE(review): assumes the label column sits after 'current_violence' in the
# cleaned files, otherwise it would leak into the features -- confirm.
feature_cols = slice(None, 'current_violence')
label_col = 'recid_violence_two_year'

# Kentucky: cleaned table -> feature frame + label array
KY_data = pd.read_csv("~/Documents/Duke/Cynthia Research/psa-analysis - test/kentucky/models/ky_fl_combined/KY_cleaned")
KY_X = KY_data.loc[:, feature_cols]
KY_Y = KY_data[label_col].values

# Florida: same layout, same split
FL_data = pd.read_csv("~/Documents/Duke/Cynthia Research/psa-analysis - test/kentucky/models/ky_fl_combined/FL_cleaned")
FL_X = FL_data.loc[:, feature_cols]
FL_Y = FL_data[label_col].values

### Model

In [11]:
#### Logistic
c = [1e-4, 1e-3, 0.01, 0.1, 1]
logistic_summary = base.Logistic(KY_X, KY_Y, FL_X, FL_Y,C=c,seed=816)

#### Lasso
c = [1e-4, 1e-3, 0.01, 0.1, 1]
lasso_summary = base.Lasso(KY_X, KY_Y, FL_X, FL_Y,C=c,seed=816)

#### LinearSVM
c = [1e-4, 1e-3, 0.01, 0.1, 1]
svm_summary = base.LinearSVM(KY_X, KY_Y, FL_X, FL_Y,C=c,seed=816)

#### Random Forest
n_estimators =  [40,60,80]
depth = [8,9]
rf_summary = base.RF(KY_X, KY_Y, FL_X, FL_Y, 
                     depth=depth, 
                     estimators=n_estimators,
                     seed=816)

#### XGBoost
learning_rate = [0.01]
depth = [8]
n_estimators = [200]
xgb_summary = base.XGB(KY_X, KY_Y, FL_X, FL_Y,
                       learning_rate=learning_rate, 
                       depth=depth, 
                       estimators=n_estimators,
                       seed=816)

#### save results
summary_violent2_ky_model = {"Logistic": logistic_summary,
                                    "Lasso": lasso_summary,
                                    "LinearSVM": svm_summary,
                                    "RF": rf_summary,
                                    "XGBoost": xgb_summary}
%store summary_violent2_ky_model

Stored 'summary_violent2_ky_model' (dict)


In [12]:
# Fields pulled from each model's summary dict, in output-column order.
summary_fields = ('best_auc', 'auc_diff', 'best_param', 'FL_score')

# One row per model: [name, best_auc, auc_diff, best_param, FL_score].
results = [
    [name] + [summary[field] for field in summary_fields]
    for name, summary in summary_violent2_ky_model.items()
]
results

[['Logistic',
  0.8088695642322072,
  0.0013879706550848914,
  {'C': 0.1},
  0.6481043851889297],
 ['Lasso',
  0.8088581197695939,
  0.0013429270660689774,
  {'C': 0.1},
  0.6482623530898521],
 ['LinearSVM',
  0.8074313640876848,
  0.0011975361513377747,
  {'C': 1},
  0.6382598256034373],
 ['RF',
  0.8128474159060761,
  0.019971523479936915,
  {'max_depth': 9, 'n_estimators': 60},
  0.6456795779097687],
 ['XGBoost',
  0.813487657098685,
  0.024543791330624454,
  {'learning_rate': 0.01, 'max_depth': 8, 'n_estimators': 200},
  0.6146507329710602]]

In [13]:
# Folder holding the two-year KY-model result tables.
path = "C:\\Users\\binha\\Documents\\Duke\\Cynthia Research\\KY-analysis-mytrials\\KY Recidivism\\KY Results\\KY_FL\\KY Model\\Two Year\\"

# A single row: the outcome label followed by each model's 'FL_score',
# rounded to 3 decimals, in the order Logistic, Lasso, LinearSVM, RF, XGBoost.
results = [["Violent"] + [round(summary['FL_score'], 3)
                          for summary in (logistic_summary,
                                          lasso_summary,
                                          svm_summary,
                                          rf_summary,
                                          xgb_summary)]]

# The file is opened in append mode, so re-running this cell adds the row
# again rather than replacing it.
# newline='' is required for files handed to csv.writer: without it the
# writer's '\r\n' row terminator is translated a second time on Windows and
# every row is followed by a blank line.
with open(path + 'Two Year KY_model.csv', 'a', newline='') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerows(results)