In [1]:
import pandas as pd 
import numpy as np
import csv

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import roc_curve, auc
from sklearn.utils import shuffle

import matplotlib.pyplot as plt
from tabulate import tabulate 

In [2]:
## load data sets
# Both splits live in the same folder; keep the location in one place.
DATA_DIR = "~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/broward/data/Without Traffic Data/"
train_pd = pd.read_csv(DATA_DIR + "train_recid_use.csv")
test_pd = pd.read_csv(DATA_DIR + "test_recid_use.csv")

## drop the identifier / date columns so they are not part of the feature matrix
train_pd = train_pd.drop(['person_id', 'screening_date'], axis=1)
test_pd = test_pd.drop(['person_id', 'screening_date'], axis=1)

## get rid of the records with 'p_age_first_offense' == 0
# NOTE(review): this filter is applied to the test set only -- confirm that the
# training CSV does not need the same cleaning.
test_pd = test_pd[test_pd['p_age_first_offense'] != 0]

## separate features (every column but the last) from the label (last column)
# presumably the last column of both CSVs is the recidivism label -- verify against the data files
x_train, y_train = train_pd.values[:, :-1], train_pd.values[:, -1]
x_test, y_test = test_pd.values[:, :-1], test_pd.values[:, -1]

### CART

In [3]:
from sklearn.tree import DecisionTreeClassifier

### Cross Validation

In [4]:
def crossvalidation(X, Y, nfold, depth, min_sample_split=2, min_impurity_decrease=0, seed = 816):
    
    """
    Stratified n-fold cross validation of a CART (decision tree) classifier.
    
    @parameters:
    - X: training set -- features; must support indexing with an array of row positions (e.g. a numpy array)
    - Y: training set -- response variable, indexable the same way as X
    - nfold: n-folds cross validation
    - depth: max split depth (passed as max_depth)
    - min_sample_split: passed to the tree as min_samples_split
    - min_impurity_decrease: passed to the tree unchanged
    - seed: random state, used for both the fold shuffling and the tree
    
    @returns:
    - train_acc, test_acc, train_auc, test_auc: four lists holding one value per fold,
      in fold order ("test" here means the held-out validation fold)
    
    """
    
    ## n-folds cross validation set up (stratified on Y, shuffled with a fixed seed)
    cv = StratifiedKFold(n_splits=nfold, random_state=seed, shuffle=True)
    
    ## classifier: CART decision tree
    classifier = DecisionTreeClassifier(max_depth=depth, min_samples_split= min_sample_split, 
                                        min_impurity_decrease=min_impurity_decrease, random_state=seed)
    train_acc, test_acc = [], []
    train_auc, test_auc = [], []

    for train, test in cv.split(X, Y):
    
        ## data & classifier (the same estimator object is re-fitted on every fold)
        X_train, Y_train = X[train], Y[train]
        X_test, Y_test = X[test], Y[test]
        fit_model = classifier.fit(X_train, Y_train)
    
        ## accuracy & probability
        train_acc.append(fit_model.score(X_train, Y_train))
        test_acc.append(fit_model.score(X_test, Y_test))
    
        ## column 1 of predict_proba is used as the score -- presumably the positive class
        train_prob = fit_model.predict_proba(X_train)[:,1]
        test_prob = fit_model.predict_proba(X_test)[:,1]
    
        ## compute AUC (the thresholds returned by roc_curve are not needed here)
        train_fpr, train_tpr, _ = roc_curve(Y_train, train_prob)
        test_fpr, test_tpr, _ = roc_curve(Y_test, test_prob)
        train_auc.append(auc(train_fpr, train_tpr))
        test_auc.append(auc(test_fpr, test_tpr))

    return train_acc, test_acc, train_auc, test_auc

### Tune Parameters 
-- Goal: prevent overfitting while getting the best possible validation performance.

-- Criteria: the gap between the average train accuracy and the average test accuracy, and the gap between the average train AUC and the average test AUC, must both be smaller than 3%.

In [18]:
# Hyper-parameter grid explored during tuning.
DEPTH = list(range(1, 8))                       # tree depths 1..7
IMPURITY = [0.005, 0.007, 0.009, 0.01, 0.012]   # minimum impurity decrease candidates
SPLIT = list(range(2, 10))                      # minimum samples to split: 2..9

In [19]:
# Exhaustive grid search: 5-fold CV for every (depth, impurity, min split) combination,
# visited in the same nested order DEPTH -> IMPURITY -> SPLIT.
grid = [(d, i, s) for d in DEPTH for i in IMPURITY for s in SPLIT]

results = []
for d, i, s in grid:
    train_acc, test_acc, train_auc, test_auc = crossvalidation(
        x_train, y_train, 5,
        depth=d, min_impurity_decrease=i, min_sample_split=s,
    )
    # Row layout: parameters, mean validation AUC, mean train/validation AUC gap.
    mean_test_auc = np.mean(test_auc)
    auc_diff = np.mean(train_auc) - mean_test_auc
    results.append([d, i, s, mean_test_auc, auc_diff])

In [20]:
# Collect the grid-search rows into a labelled frame (one row per parameter combination).
result_columns = ['Depth', 'Impurity Decrease', 'Min Split', 'Validation AUC', 'AUC Diff']
table = pd.DataFrame(results, columns=result_columns)

In [21]:
# Rank the parameter combinations by mean validation AUC, best first.
table.sort_values('Validation AUC', ascending=False)

Unnamed: 0,Depth,Impurity Decrease,Min Split,Validation AUC,AUC Diff
41,2,0.005,3,0.607048,0.028010
45,2,0.005,7,0.607048,0.028010
47,2,0.005,9,0.607048,0.028010
46,2,0.005,8,0.607048,0.028010
40,2,0.005,2,0.607048,0.028010
44,2,0.005,6,0.607048,0.028010
43,2,0.005,5,0.607048,0.028010
42,2,0.005,4,0.607048,0.028010
48,2,0.007,2,0.604697,0.027326
49,2,0.007,3,0.604697,0.027326


### Best Parameter
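depth: 2 / Impurity Decrease: 0.005 / Min Split: 3 (Min Split values 2–9 all tie at the top validation AUC in the table above; 3 is the value used below)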

In [22]:
# Re-run the 5-fold CV with the selected parameters to get the per-fold scores.
train_acc, test_acc, train_auc, test_auc = crossvalidation(
    x_train,
    y_train,
    nfold=5,
    depth=2,
    min_sample_split=3,
    min_impurity_decrease=0.005,
)

In [23]:
# Fold averages, in order: train accuracy, validation accuracy, train AUC, validation AUC.
tuple(np.mean(fold_scores) for fold_scores in (train_acc, test_acc, train_auc, test_auc))

(0.6103620413197064,
 0.5911918816550387,
 0.6350572460815437,
 0.6070476607354436)

#### Heldout Test Set 
-- using 0.5 as threshold

In [24]:
# Fit the final tree on the full training set, then score accuracy on the held-out test set.
cart = DecisionTreeClassifier(
    max_depth=2,
    min_samples_split=3,
    min_impurity_decrease=0.005,
    random_state=816,
)
cart.fit(x_train, y_train)
heldout_test_acc = cart.score(x_test, y_test)
heldout_test_acc

0.6263440860215054

In [25]:
# Held-out ROC AUC: use column 1 of predict_proba as the score, build the ROC
# curve against the held-out labels, and integrate it.
# fpr / tpr / thresholds are kept as notebook-level names for the threshold selection below.
prob = cart.predict_proba(x_test)[:,1]
fpr,tpr,thresholds = roc_curve(y_test, prob)
heldout_test_auc = auc(fpr, tpr)
heldout_test_auc

0.6228013602251408

-- using optimal threshold

# Pick the ROC point with the largest |TPR - FPR| gap and take its threshold.
# Alternative criterion (point closest to the top-left corner), kept for reference:
#optimal_index = np.argmin(np.sqrt(np.square(1-tpr) + np.square(fpr)))
tpr_fpr_gap = np.abs(tpr - fpr)
optimal_index = tpr_fpr_gap.argmax()
optimal_threshold = thresholds[optimal_index]
optimal_threshold

# Held-out accuracy at the threshold selected from the ROC curve.
# roc_curve defines point i as "score >= thresholds[i]", so the comparison must be
# inclusive to land on the selected operating point. A strict ">" would drop every
# sample scored exactly at the threshold, which matters here because a depth-2 tree
# emits only a handful of distinct probabilities.
# NOTE(review): the threshold is chosen on the same held-out set it is evaluated on,
# and this overwrites the 0.5-threshold heldout_test_acc -- confirm that is intended.
prediction = cart.predict_proba(x_test)[:,1]
heldout_test_acc = np.mean((prediction >= optimal_threshold) == y_test)
heldout_test_acc

#### Log Results

In [26]:
#log model results to the model performance folder, as per standards
path = "C:\\Users\\binha\\Documents\\Duke\\Cynthia Research\\KY-analysis-mytrials\\broward\\broward models\\Baseline Model Results\\Without Traffic\\Recidivism\\"

train_auc_mean, train_auc_std = np.mean(train_auc), np.std(train_auc)
test_auc_mean, test_auc_std = np.mean(test_auc), np.std(test_auc)

results = ["CART", train_auc_mean, train_auc_std, test_auc_mean, test_auc_std, heldout_test_auc, heldout_test_acc ]

with open(path + 'Recidivism Summary.csv', 'a') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerow(results)