In [1]:
import pandas as pd 
import numpy as np
import csv
import baseline_functions as base

# Restore the results dict persisted by a previous run of this notebook
# (it is written by the `%store summary_property2_fl_model` call after the
# models are fitted). If nothing has been stored yet, IPython only prints a
# "no stored variable" notice and the name stays undefined until that cell runs.
%store -r summary_property2_fl_model

no stored variable summary_property2_fl_model


In [2]:
# Read the KY and FL input files (CSV format, no file extension).
KY_data = pd.read_csv(
    "~/Documents/Duke/Cynthia Research/psa-analysis - test/kentucky/models/ky_fl_combined/KY_cleaned"
)
FL_data = pd.read_csv(
    "~/Documents/Duke/Cynthia Research/psa-analysis - test/kentucky/models/ky_fl_combined/FL_cleaned"
)


def _features_and_label(frame, last_feature='current_violence', label='recid_property_two_year'):
    """Split `frame` into (X, y).

    X is every column from the first one up to and including `last_feature`
    (label-based `.loc` slices are inclusive); y is the `label` column as a
    NumPy array.
    """
    return frame.loc[:, :last_feature], frame[label].values


## split x and y
KY_X, KY_Y = _features_and_label(KY_data)
FL_X, FL_Y = _features_and_label(FL_data)

### Model

In [3]:
#### Logistic, Lasso and LinearSVM
# The three linear baselines share one grid of C values and one seed.
# Each call gets its own copy of the grid, and the FL arrays are passed
# before the KY arrays, exactly as in the individual calls they replace.
c = [1e-5, 1e-4, 1e-3, 1e-2]
fl_then_ky = (FL_X, FL_Y, KY_X, KY_Y)

logistic_summary = base.Logistic(*fl_then_ky, C=list(c), seed=816)
lasso_summary = base.Lasso(*fl_then_ky, C=list(c), seed=816)
svm_summary = base.LinearSVM(*fl_then_ky, C=list(c), seed=816)

#### Random Forest
# Hyperparameter grid, keyed by the keyword names base.RF accepts.
rf_grid = {
    'depth': [1, 2, 3],
    'estimators': [60, 80, 100],
    'impurity': [0.001, 0.003, 0.005],
}
rf_summary = base.RF(FL_X, FL_Y, KY_X, KY_Y, **rf_grid, seed=816)

#### XGBoost
# Hyperparameter grid, keyed by the keyword names base.XGB accepts.
xgb_grid = {
    'learning_rate': [0.05, 0.07],
    'depth': [1, 2],
    'estimators': [20, 40, 60],
    'gamma': [8, 10, 12],
    'child_weight': [8, 10, 12],
    'subsample': [0.5],
}
xgb_summary = base.XGB(FL_X, FL_Y, KY_X, KY_Y, **xgb_grid, seed=816)

#### save results
summary_property2_fl_model = {"Logistic": logistic_summary,
                                    "Lasso": lasso_summary,
                                    "LinearSVM": svm_summary,
                                    "RF": rf_summary,
                                    "XGBoost": xgb_summary}
%store summary_property2_fl_model

Stored 'summary_property2_fl_model' (dict)


In [4]:
# Build one row per model: its name followed by the summary entries listed
# in `summary_fields`, in that order.
summary_fields = ('best_auc', 'auc_diff', 'best_param', 'KY_score')
results = [
    [model_name] + [model_summary[field] for field in summary_fields]
    for model_name, model_summary in summary_property2_fl_model.items()
]
results

[['Logistic',
  0.7438481818204741,
  0.03550244267044955,
  {'C': 0.01},
  0.6103477372929486],
 ['Lasso',
  0.7365869533914763,
  0.000637974997832802,
  {'C': 0.001},
  0.6433822812618463],
 ['LinearSVM',
  0.7214261256842208,
  0.04109983892998448,
  {'C': 0.01},
  0.5434996213207377],
 ['RF',
  0.7288539873376069,
  0.05347247750893225,
  {'max_depth': 3, 'min_impurity_decrease': 0.001, 'n_estimators': 100},
  0.6109765240343843],
 ['XGBoost',
  0.7349651037342368,
  0.029003625296001867,
  {'gamma': 8,
   'learning_rate': 0.07,
   'max_depth': 2,
   'min_child_weight': 10,
   'n_estimators': 60,
   'subsample': 0.5},
  0.618746999275112]]

In [6]:
# Output folder for the two-year FL-model results.
# NOTE(review): machine-specific absolute Windows path -- adjust before running elsewhere.
path = "C:\\Users\\binha\\Documents\\Duke\\Cynthia Research\\KY-analysis-mytrials\\KY Recidivism\\KY Results\\KY_FL\\FL Model\\Two Year\\"

# A single CSV row: the label "Property" followed by each model's 'KY_score'
# rounded to 3 decimals, in the order Logistic, Lasso, LinearSVM, RF, XGBoost.
results = [["Property"] + [round(summary['KY_score'], 3)
                           for summary in (logistic_summary,
                                           lasso_summary,
                                           svm_summary,
                                           rf_summary,
                                           xgb_summary)]]

# Append mode: every run of this cell adds another row to the file.
# newline='' is required for files handed to csv.writer; without it the
# writer's own '\r\n' terminator is translated again on Windows, leaving a
# blank line after every row.
with open(path + 'Two Year FL_model.csv', 'a', newline='') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerows(results)