In [1]:
import pandas as pd 
import numpy as np
import csv

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import roc_curve, auc
from sklearn.utils import shuffle

import matplotlib.pyplot as plt
from tabulate import tabulate 

In [2]:
## load data sets and drop the 'person_id' and 'screening_date' columns
train_pd = (
    pd.read_csv("~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/broward/data/Without Traffic Data/train_recid_violent.csv")
    .drop(columns=['person_id', 'screening_date'])
)

## same for the test set; additionally get rid of the records with
## 'p_age_first_offense' == 0 (only the test set is filtered here)
test_pd = (
    pd.read_csv("~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/broward//data/Without Traffic Data/test_recid_violent.csv")
    .drop(columns=['person_id', 'screening_date'])
    .loc[lambda frame: frame['p_age_first_offense'] != 0]
)

## split features and response: the last column is the response
x_train = train_pd.values[:, :-1]
y_train = train_pd.values[:, -1]
x_test = test_pd.values[:, :-1]
y_test = test_pd.values[:, -1]

## Lasso

In [3]:
from sklearn.linear_model import Lasso

### Cross Validation

In [4]:
def crossvalidation(X, Y, nfold, alpha, seed=816):

    """
    Run stratified n-fold cross validation of a Lasso model and collect the
    per-fold accuracy and AUC on both the training and the validation split.

    @parameters:
    - X: training set -- features (array-like, one row per sample)
    - Y: training set -- response variable (also used to stratify the folds);
         predictions are thresholded at 0.5, so labels are presumably 0/1
    - nfold: n-folds cross validation
    - alpha: Lasso regularization strength. Larger -> stronger regularization
    - seed: random state

    @returns:
    - train_acc, test_acc, train_auc, test_auc: four lists, each holding one
      value per fold

    """

    ## the folds are row positions, so make sure we index plain arrays
    ## (a DataFrame / Series / list would not support X[train])
    X, Y = np.asarray(X), np.asarray(Y)

    ## n-folds cross validation set up
    cv = StratifiedKFold(n_splits=nfold, random_state=seed, shuffle=True)

    ## model -- Lasso is a regressor, its continuous output is used as a score
    classifier = Lasso(alpha=alpha, random_state=seed)
    train_acc, test_acc = [], []
    train_auc, test_auc = [], []

    for train, test in cv.split(X, Y):

        ## data & model fit
        X_train, Y_train = X[train], Y[train]
        X_test, Y_test = X[test], Y[test]
        fit_model = classifier.fit(X_train, Y_train)

        ## continuous scores for both splits
        train_score = fit_model.predict(X_train)
        test_score = fit_model.predict(X_test)

        ## accuracy, using 0.5 as the decision threshold
        train_acc.append(np.mean((train_score > 0.5) == Y_train))
        test_acc.append(np.mean((test_score > 0.5) == Y_test))

        ## compute ROC curve and AUC
        train_fpr, train_tpr, _ = roc_curve(Y_train, train_score)
        test_fpr, test_tpr, _ = roc_curve(Y_test, test_score)
        train_auc.append(auc(train_fpr, train_tpr))
        test_auc.append(auc(test_fpr, test_tpr))

    return train_acc, test_acc, train_auc, test_auc

### Tune Parameters 
-- Goal: prevent overfitting while achieving the best possible performance.

-- Criteria: the difference between the average train accuracy and the average test accuracy, and the difference between the average train AUC and the average test AUC, must both be smaller than 3%.

In [5]:
# candidate Lasso penalties to try, in increasing order
Alpha = [
    0.001, 0.005, 0.01, 0.05,
    0.1, 0.5, 1, 2,
]

In [6]:
# one row per penalty: [penalty, mean validation AUC, mean train AUC - mean validation AUC]
results = []
for a in Alpha:
    train_acc, test_acc, train_auc, test_auc = crossvalidation(x_train, y_train, 5, a)
    mean_validation_auc = np.mean(test_auc)
    auc_diff = np.mean(train_auc) - mean_validation_auc
    results.append([a, mean_validation_auc, auc_diff])



In [7]:
# tabulate the tuning results collected per penalty
table = pd.DataFrame(
    data=results,
    columns=['Penalty', 'Validation AUC', 'AUC Diff'],
)

In [8]:
# best validation AUC first
table.sort_values('Validation AUC', ascending=False)

Unnamed: 0,Penalty,Validation AUC,AUC Diff
1,0.005,0.683439,0.035833
0,0.001,0.681764,0.04861
2,0.01,0.68156,0.025688
3,0.05,0.669445,0.01092
4,0.1,0.66768,0.009552
5,0.5,0.561545,-8.6e-05
6,1.0,0.5,0.0
7,2.0,0.5,0.0


### Best Parameter: 

alpha = 0.05

In [9]:
# re-run the 5-fold cross validation with the selected penalty
train_acc, test_acc, train_auc, test_auc = crossvalidation(
    X=x_train, Y=y_train, nfold=5, alpha=0.05
)

In [10]:
# fold averages: (train acc, test acc, train AUC, test AUC)
tuple(np.mean(fold_values) for fold_values in (train_acc, test_acc, train_auc, test_auc))

(0.7882274760383388,
 0.7869410229805943,
 0.6803645948569221,
 0.6694445078557617)

#### Heldout Test Set
-- use 0.5 as threshold

In [11]:
# fit on the full training set, then score the held-out test set at threshold 0.5
la = Lasso(alpha=0.05, random_state=816)
la.fit(x_train, y_train)

heldout_predictions = la.predict(x_test) > 0.5
heldout_test_acc = np.mean(heldout_predictions == y_test)
heldout_test_acc

0.8091397849462365

In [12]:
# ROC curve and AUC on the held-out test set, using the raw model output as the score
prob = la.predict(x_test)
fpr, tpr, thresholds = roc_curve(y_true=y_test, y_score=prob)
heldout_test_auc = auc(x=fpr, y=tpr)
heldout_test_auc

0.6736518448438977

-- use optimal threshold

# pick the ROC threshold with the largest |tpr - fpr| gap
# (alternative kept for reference: closest point to the top-left corner)
#optimal_index = np.argmin(np.sqrt(np.square(1-tpr) + np.square(fpr)))
tpr_fpr_gap = np.abs(tpr - fpr)
optimal_index = np.argmax(tpr_fpr_gap)
optimal_threshold = thresholds[optimal_index]
optimal_threshold

# NOTE: this reassigns heldout_test_acc, replacing the value computed with the 0.5 threshold
optimal_predictions = la.predict(x_test) > optimal_threshold
heldout_test_acc = np.mean(optimal_predictions == y_test)
heldout_test_acc

#### Log Results

In [13]:
#log model results to the model performance folder, as per standards
path = "C:\\Users\\binha\\Documents\\Duke\\Cynthia Research\\KY-analysis-mytrials\\broward\\broward models\\Baseline Model Results\\Without Traffic\\Violent\\"

# mean and standard deviation of the per-fold AUCs from the last cross validation run
train_auc_mean, train_auc_std = np.mean(train_auc), np.std(train_auc)
test_auc_mean, test_auc_std = np.mean(test_auc), np.std(test_auc)

# one summary row: model name, CV train/test AUC (mean, std), held-out AUC and accuracy
results = ["Lasso", train_auc_mean, train_auc_std, test_auc_mean, test_auc_std, heldout_test_auc, heldout_test_acc ]

# append the row to the shared summary file; newline='' lets the csv module
# control line endings, otherwise Windows writes a blank line after every row
with open(path + 'Violent Summary.csv', 'a', newline='') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerow(results)