In [2]:
import pandas as pd 
import numpy as np
import csv

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.utils import shuffle

import matplotlib.pyplot as plt
from tabulate import tabulate 

In [4]:
from sklearn.linear_model import LogisticRegression, Lasso  ## logistic & lasso
from sklearn.tree import DecisionTreeClassifier             ## decision tree
from sklearn.ensemble import RandomForestClassifier         ## RF
from sklearn.svm import SVC                                 ## SVM
import xgboost as xgb                                       ## XGBoost

#### Load Data

In [5]:
## read the train / test csv files and drop the two identifier columns
## ('person_id', 'screening_date') before the feature matrices are built
train_pd = pd.read_csv(
    "~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/broward/data/Without Traffic Data/train_recid_use.csv"
).drop(columns=['person_id', 'screening_date'])
test_pd = pd.read_csv(
    "~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/broward/data/Without Traffic Data/test_recid_use.csv"
).drop(columns=['person_id', 'screening_date'])

## keep only the test records whose 'p_age_first_offense' is non-zero
## (note: the train set is not filtered here)
test_pd = test_pd.loc[test_pd['p_age_first_offense'] != 0]

## features = every column except the last one, response = the last column
x_train = train_pd.values[:, :-1]
y_train = train_pd.values[:, -1]
x_test = test_pd.values[:, :-1]
y_test = test_pd.values[:, -1]

#### Cross Validation

In [6]:
def crossvalidation(X, Y, nfold, classifier, seed = 816):
    """
    Run shuffled, stratified n-fold cross validation and collect the
    accuracy and the ROC AUC of every fold, on both the fitting part and the
    held-out part of the fold.

    @parameters:
    - X: training set -- features (numpy array, pandas DataFrame, list of
         rows, or anything else that supports indexing with an integer array)
    - Y: training set -- response variable; the AUC is computed from the
         score of the second class, so Y is expected to be binary
    - nfold: n-folds cross validation
    - classifier: specific classifier; must provide fit, score and either
         predict_proba or decision_function. It is refit in place on every
         fold, so after the call it holds the fit of the last fold.
    - seed: random state used to shuffle the data before splitting

    @returns:
    - train_acc, test_acc: lists (one entry per fold) of classifier.score on
         the fitting part / held-out part of the fold
    - train_auc, test_auc: lists (one entry per fold) of the ROC AUC on the
         fitting part / held-out part of the fold
    """

    def _positional(data):
        ## the fold indices are positions: pandas objects would interpret
        ## data[index_array] as a label/column lookup and plain sequences
        ## reject array indices, so work on the underlying array instead
        if hasattr(data, "iloc"):
            return data.values
        if isinstance(data, (list, tuple)):
            return np.asarray(data)
        return data

    X, Y = _positional(X), _positional(Y)

    ## n-folds cross validation set up
    cv = StratifiedKFold(n_splits=nfold, random_state=seed, shuffle=True)

    train_acc, test_acc = [], []
    train_auc, test_auc = [], []

    for train, test in cv.split(X, Y):

        ## data & classifier
        X_train, Y_train = X[train], Y[train]
        X_test, Y_test = X[test], Y[test]
        fit_model = classifier.fit(X_train, Y_train)

        ## accuracy
        train_acc.append(fit_model.score(X_train, Y_train))
        test_acc.append(fit_model.score(X_test, Y_test))

        ## ranking score of the positive class: probabilities when the
        ## classifier offers them, otherwise the signed decision value
        ## (e.g. SVC without probability=True) -- roc_curve accepts both
        if hasattr(fit_model, "predict_proba"):
            train_score = fit_model.predict_proba(X_train)[:, 1]
            test_score = fit_model.predict_proba(X_test)[:, 1]
        else:
            train_score = fit_model.decision_function(X_train)
            test_score = fit_model.decision_function(X_test)

        ## compute AUC
        train_fpr, train_tpr, _ = roc_curve(Y_train, train_score)
        test_fpr, test_tpr, _ = roc_curve(Y_test, test_score)
        train_auc.append(auc(train_fpr, train_tpr))
        test_auc.append(auc(test_fpr, test_tpr))

    return train_acc, test_acc, train_auc, test_auc