In [1]:
import os 
# Step two directory levels up from wherever the kernel was started, then
# report the resulting location. The move is relative, so re-running this
# cell in the same kernel climbs two more levels each time.
os.chdir('../../')
working_dir = os.getcwd()
print("Current working directory is now: ", working_dir)

import pandas as pd
import numpy as np
import csv
import ky_utils.ky_baseline_functions as base

Current working directory is now:  C:\Users\binha\Documents\Duke\Cynthia Research\interpretable-machine-learning\ky-fl-combined\ky-model


In [2]:
# Read the cleaned Kentucky (KY) and Florida (FL) tables and order each one
# by 'person_id'.
KY_data = (
    pd.read_csv("~/Documents/Duke/Cynthia Research/data/ky-fl-data/KY_cleaned")
    .sort_values('person_id')
)
FL_data = (
    pd.read_csv("~/Documents/Duke/Cynthia Research/data/ky-fl-data/FL_cleaned")
    .sort_values('person_id')
)

## split x and y
# A label slice in .loc is inclusive, so X holds every column from the first
# one up to and including 'current_violence20'.
# NOTE(review): whether 'person_id' ends up inside X depends on the column
# order of the CSV files -- confirm against the data.
# Y is the 'general_six_month' column as a plain array.
KY_X = KY_data.loc[:, :'current_violence20']
FL_X = FL_data.loc[:, :'current_violence20']
KY_Y = KY_data['general_six_month'].values
FL_Y = FL_data['general_six_month'].values

### Model

In [3]:
# Each baseline below is run through the project helper module `base`
# (ky_utils.ky_baseline_functions), always with the KY arrays first, the FL
# arrays second, a hyper-parameter grid, and seed=816.
# NOTE(review): presumably each helper tunes on KY and scores on FL -- confirm
# in ky_baseline_functions. The summaries are later indexed with the keys
# 'KY_validation', 'auc_diff' and 'FL_score'.

#### Logistic
c = [1e-4, 1e-3, 1e-2, 1e-1, 1]
logistic_summary = base.Logistic(
    KY_X, KY_Y, FL_X, FL_Y,
    C=c,
    seed=816,
)

#### Lasso
c = [1e-4, 1e-3, 1e-2, 1e-1, 1]
lasso_summary = base.Lasso(
    KY_X, KY_Y, FL_X, FL_Y,
    C=c,
    seed=816,
)

#### LinearSVM
c = [1e-4, 1e-3, 1e-2, 1e-1, 1]
svm_summary = base.LinearSVM(
    KY_X, KY_Y, FL_X, FL_Y,
    C=c,
    seed=816,
)

#### Random Forest
n_estimators = [100, 150, 200]
depth = [7, 8, 9]
rf_summary = base.RF(
    KY_X, KY_Y, FL_X, FL_Y,
    depth=depth,
    estimators=n_estimators,
    seed=816,
)

#### XGBoost
# `depth` and `n_estimators` are rebound here with the XGBoost grids.
learning_rate = [0.1]
depth = [4, 5, 6]
n_estimators = [100, 150]
xgb_summary = base.XGB(
    KY_X, KY_Y, FL_X, FL_Y,
    learning_rate=learning_rate,
    depth=depth,
    estimators=n_estimators,
    seed=816,
)

#### save results
# Gather the five summaries under their display names, in the order fitted.
summary_general6_ky_model = dict(
    Logistic=logistic_summary,
    Lasso=lasso_summary,
    LinearSVM=svm_summary,
    RF=rf_summary,
    XGBoost=xgb_summary,
)



In [4]:
# One row per model: its name followed by the mean of each metric list,
# in the order KY validation score, AUC difference, FL score.
metric_keys = ('KY_validation', 'auc_diff', 'FL_score')
results = []
for model_name, model_summary in summary_general6_ky_model.items():
    row = [model_name] + [np.mean(model_summary[key]) for key in metric_keys]
    results.append(row)
results

[['Logistic', 0.759555002820291, 0.000608301731505212, 0.5706450198130265],
 ['Lasso', 0.7595225481912151, 0.0005213343071691412, 0.5703947985996229],
 ['LinearSVM', 0.7621655330000447, 0.0005288347819071948, 0.5667256569076289],
 ['RF', 0.7753122149644962, 0.015840716232166718, 0.5745905436078944],
 ['XGBoost', 0.7817230605205759, 0.0170716915212618, 0.5680372407955988]]

In [5]:
def _mean_std_cell(scores):
    """Format a sequence of scores as "mean (std)", both rounded to 3 decimals.

    Uses the built-in ``str``: the ``np.str`` alias was deprecated in
    NumPy 1.20 and removed in NumPy 1.24, where it raises AttributeError.
    """
    return str(round(np.mean(scores), 3)) + " (" + str(round(np.std(scores), 3)) + ")"


# Write a two-row table: a header with the model names, and a "General" row
# holding the mean (std) of each model's 'FL_score' values.
path = "./results/baselines/six-month/"
results = [["", "Logistic", "Lasso", "Linear SVM", "Random Forest", "XGBoost"],
           ["General"] + [_mean_std_cell(summary['FL_score'])
                          for summary in (logistic_summary,
                                          lasso_summary,
                                          svm_summary,
                                          rf_summary,
                                          xgb_summary)]]
# newline='' is required by the csv module; without it the writer's '\r\n'
# row terminator is translated again on Windows, producing blank rows.
with open(path + 'six-month-ky-baseline.csv', 'w', newline='') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerows(results)