In [4]:
import pandas as pd 
import numpy as np
import csv

from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.metrics import roc_curve, auc, confusion_matrix
from sklearn.utils import shuffle

import matplotlib.pyplot as plt

In [5]:
## load data sets
train_pd = pd.read_csv("~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/broward/data/Without Minor Offenses/train_recid_violent.csv")
test_pd = pd.read_csv("~/Documents/Duke/Cynthia Research/KY-analysis-mytrials/broward//data/Without Minor Offenses/test_recid_violent.csv")

## get rid of the record with 'p_age_first_offense' == 0
train_pd = train_pd.drop(['person_id', 'screening_date'], axis=1)
test_pd = test_pd.drop(['person_id', 'screening_date'], axis=1)
test_pd = test_pd[test_pd['p_age_first_offense'] != 0]

## split train and test
x_train, y_train = train_pd.values[:, :-1], train_pd.values[:, -1]
x_test, y_test = test_pd.values[:, :-1], test_pd.values[:, -1]

#### convert variable types

In [6]:
variables = ['sex', 'current_violent', 'current_violent20', 'six_month', 'one_year', 'three_year', 'five_year', 'recid_violent']
for i in variables:
    train_pd[i] = train_pd[i].astype('category')
    test_pd[i] = test_pd[i].astype('category')

### XGBoost

In [7]:
import xgboost as xgb

### Cross Validaion

In [8]:
## prepare data
trainx = xgb.DMatrix(x_train, label=y_train)
testx = xgb.DMatrix(x_test, label=y_test)

In [9]:
def crossvalidation(X, parameters, setups, seed = 816):
    
    crossvalidation = xgb.cv(dtrain = X, 
                             params=parameters, 
                             num_boost_round = setup['nrounds'], 
                             nfold=setup['nfolds'], 
                             verbose_eval = False, 
                             metrics='auc', 
                             maximize=True, 
                             seed=seed)
    
    iterations = crossvalidation.index
    train_auc = crossvalidation['train-auc-mean'].values
    test_auc = crossvalidation['test-auc-mean'].values
    
    best_index = np.where(test_auc == np.max(test_auc))[0][0]
    best_iterations = iterations[best_index]
    best_test_auc = test_auc[best_index]
    best_train_auc = train_auc[best_index]
    
    return best_iterations, best_train_auc, best_test_auc

#### parameters

In [10]:
ETA = [0.01, 0.03, 0.05]
GAMMA = [12,14,16,18]
DEPTH = [1,2,3]
CHILD_WEIGHT = [12,14,16,18]
SUB_SAMPLE = [0.1, 0.3, 0.5, 0.7]
setup = {'nfolds': 5, 'nrounds': 100}

In [11]:
results = []

for e in ETA:
    for g in GAMMA:
        for d in DEPTH:
            for c in CHILD_WEIGHT:
                for s in SUB_SAMPLE:
                    parameters = {'objective': "binary:logistic", 
                                  'eta': e, 
                                  'gamma': g, 
                                  'max_depth': d, 
                                  'min_child_weight': c, 
                                  'subsample': s, 
                                  'colsample_bytree': 1, 
                                  'early_stopping_rounds': 10}
            
                    ite, train_auc, test_auc = crossvalidation(trainx, parameters, setup, seed=816)
                    auc_diff = train_auc - test_auc
                    results.append([e, g,d,c,s,ite, test_auc, auc_diff])

In [12]:
table = pd.DataFrame(results, columns=['Learning Rate', 'Gamma', 'Depth', 'Min Child Weight', 'Subsample', 'Iteration', 'Validation AUC', 'AUC Diff'])

In [13]:
table = table[table['AUC Diff'] <= 0.02]

In [14]:
table.sort_values(by = 'Validation AUC', axis=0, ascending = False)

Unnamed: 0,Learning Rate,Gamma,Depth,Min Child Weight,Subsample,Iteration,Validation AUC,AUC Diff
181,0.01,18,3,14,0.3,92,0.631203,0.014929
165,0.01,18,2,14,0.3,92,0.631203,0.014929
149,0.01,18,1,14,0.3,92,0.631203,0.014929
169,0.01,18,2,16,0.3,97,0.630652,0.012580
185,0.01,18,3,16,0.3,97,0.630652,0.012580
153,0.01,18,1,16,0.3,97,0.630652,0.012580
145,0.01,18,1,12,0.3,29,0.628250,0.005253
177,0.01,18,3,12,0.3,29,0.628250,0.005253
161,0.01,18,2,12,0.3,29,0.628250,0.005253
189,0.01,18,3,18,0.3,82,0.627366,0.015928


### Best Parameter:
- learning rate: 0.01 / depth: 3 / gamma: 18 / min_child_weight: 14 / subsample: 0.3 / iteration: 93

In [15]:
parameters = {'objective': "binary:logistic", 'eta': 0.01, 'gamma': 18, 'max_depth': 3, 'min_child_weight': 14, 'subsample': 0.3, 
              'colsample_bytree': 1, 'early_stopping_rounds': 10}

In [16]:
cv = xgb.cv(parameters,trainx, num_boost_round=93, nfold=5, metrics='auc', verbose_eval=False, maximize=True, seed=816)

In [17]:
train_auc = cv['train-auc-mean'].values
test_auc = cv['test-auc-mean'].values
best_index = np.where(test_auc == np.max(test_auc))[0][0]

test_auc_mean = test_auc[best_index]
train_auc_mean = train_auc[best_index]
train_auc_std = np.std(train_auc)
test_auc_std = np.std(test_auc)

In [18]:
train_auc_mean, test_auc_mean, train_auc_std, test_auc_std

(0.6461318, 0.6312026000000001, 0.015907731484299262, 0.012420355363758108)

#### model
-- use 0.5 as threshold

In [19]:
xgboost = xgb.train(parameters, trainx, num_boost_round=93, verbose_eval=False, maximize=True)

In [20]:
pred = xgboost.predict(testx)
heldout_test_acc = np.mean((pred > 0.5) == y_test)
heldout_test_acc

0.8118279569892473

In [21]:
fpr,tpr,thresholds = roc_curve(y_test, pred)
heldout_test_auc = auc(fpr, tpr)
heldout_test_auc

0.6238174077578051

-- use optimal threshold

#optimal_index = np.argmin(np.sqrt(np.square(1-tpr) + np.square(fpr)))
optimal_index = np.argmax(abs(tpr - fpr))
optimal_threshold = thresholds[optimal_index]
optimal_threshold

heldout_test_acc = np.mean((pred > optimal_threshold) == y_test)
heldout_test_acc

#### Log Results

In [22]:
#log model results to the model performance folder, as per standards
path = "C:\\Users\\binha\\Documents\\Duke\\Cynthia Research\\KY-analysis-mytrials\\broward\\broward models\\Baseline Model Results\\Without Minor Offenses\\Violent\\"
                  
results = ["XGBoost", train_auc_mean, train_auc_std, test_auc_mean, test_auc_std, heldout_test_auc, heldout_test_acc ]

with open(path + 'Violent Summary.csv', 'a') as writeFile:
    writer = csv.writer(writeFile)
    writer.writerow(results)

### Appendix -- old method

In [17]:
def crossvalidation(X, Y, nfold, learning_rate, depth, N, min_child_weight = 1, gamma = 0, subsample = 1, colsample_bytree = 1,
                    reg_alpha=0, reg_lambda=1, seed=816):
    
    """
    
    @parameters:
    - X: training set -- features
    - Y: training set -- response variable
    - nfold: n-folds cross validation
    - learning_rate: learning rate
    - depth: max split depth
    - N: number of estimators
    - reg_alpha: L1 regularization term on weights
    - reg_lambda: L2 regularization term on weights
    - seed: random state
    
    """
    
    ## nfolds cross validation set up
    cv = KFold(n_splits=nfold, random_state=seed, shuffle=True)
    
    ## classifier
    classifier = xgb.XGBClassifier(learning_rate=learning_rate, 
                                   max_depth=depth, 
                                   n_estimators=N, 
                                   min_child_weight=min_child_weight,
                                   gamma = gamma, 
                                   subsample=subsample,
                                   colsample_bylevel=colsample_bytree,
                                   reg_alpha=reg_alpha, 
                                   reg_lambda=reg_lambda,
                                   random_state=seed)
    train_acc, test_acc = [], []
    train_auc, test_auc = [], []

    i = 0
    for train, test in cv.split(X, Y):
    
        ## data & classifier
        X_train, Y_train = X[train], Y[train]
        X_test, Y_test = X[test], Y[test]
        fit_model = classifier.fit(X_train, Y_train)
    
        ## accuracy & probability
        train_acc.append(fit_model.score(X_train, Y_train))
        test_acc.append(fit_model.score(X_test, Y_test))
    
        train_prob = fit_model.predict_proba(X_train)[:,1]
        test_prob = fit_model.predict_proba(X_test)[:,1]
    
        ## compute ROC curve and AUC
    
        train_fpr, train_tpr, train_thresholds = roc_curve(Y_train, train_prob)
        test_fpr, test_tpr, test_thresholds = roc_curve(Y_test, test_prob)
        train_auc.append(auc(train_fpr, train_tpr))
        test_auc.append(auc(test_fpr, test_tpr))
        i += 1
    
    return train_acc, test_acc, train_auc, test_auc

### Tune Parameters 
-- To prevent overfitting and get as good performance as possible.

-- criteria: difference between the avg. train accuracy and test accuracy and the difference between avg. train auc and avg. test auc are both smaller than 3%.

In [20]:
Depth = [1, 2]
child_weight = [2,3,4,5,6]
gamma = [2,3,4,5,6]

In [None]:
train_ACC = []
train_AUC = []
test_ACC = []
test_AUC = []
LR = []
DEPTH = []
Estimator = []
Child_weight = []
Gamma = []

for i in Depth:
    for j in child_weight:
        for k in gamma:
            train_acc, test_acc, train_auc, test_auc = crossvalidation(x_train, y_train, 10, learning_rate=0.1, depth=i, N=100, min_child_weight=j, gamma=k)
            
            acc_diff = str(round((np.mean(train_acc) - np.mean(test_acc))*100, 2)) + "%"
            auc_diff = str(round((np.mean(train_auc) - np.mean(test_auc))*100, 2)) + "%"
            

In [None]:
Depth = [1, 2]
lr = [0.01, 0.05, 0.1, 0.2, 0.3, 0.4, 0.5]
num_estimator = [10, 20]

In [19]:
print("Learning Rate, ", "Depth, ", "Number of Estimators, ", "Avg. ACC Diff, ", "Avg. AUC Diff, ", "Avg. Test AUC, ", "Avg. Test ACC")

for k in lr:
    for i in Depth:
        for j in num_estimator:
            train_acc, test_acc, train_auc, test_auc = crossvalidation(x_train, y_train, 10, k, i, j)
            acc_diff = str(round((np.mean(train_acc) - np.mean(test_acc))*100, 2)) + "%"
            auc_diff = str(round((np.mean(train_auc) - np.mean(test_auc))*100, 2)) + "%"
            print(k, "             ", i, "          ", j, "                ", acc_diff, "          ", auc_diff, "       ", round(np.mean(test_auc), 3), "         ", round(np.mean(test_acc), 3))

Learning Rate,  Depth,  Number of Estimators,  Avg. ACC Diff,  Avg. AUC Diff,  Avg. Test AUC,  Avg. Test ACC
0.01               1            10                  2.6%            2.45%         0.598           0.571
0.01               1            20                  2.67%            2.62%         0.611           0.569
0.01               2            10                  4.77%            4.1%         0.615           0.566
0.01               2            20                  4.24%            4.43%         0.617           0.577
0.05               1            10                  1.92%            2.38%         0.622           0.576
0.05               1            20                  2.06%            2.69%         0.628           0.58
0.05               2            10                  5.09%            4.75%         0.623           0.577
0.05               2            20                  5.12%            5.63%         0.627           0.587
0.1               1            10                  2.0

In [46]:
train_acc, test_acc, train_auc, test_auc = crossvalidation(x_train, y_train, 10, learning_rate=0.05, depth=1, N=20)

In [47]:
np.mean(train_acc), np.mean(test_acc), np.mean(train_auc), np.mean(test_auc)

(0.6001974985871293, 0.579593336599706, 0.6544994997113768, 0.6275627839190581)

In [48]:
xgboost = xgb.XGBClassifier(learning_rate=0.05, max_depth=1, n_estimators=20).fit(x_train, y_train)
heldout_test_acc=xgboost.score(x_test, y_test)
heldout_test_acc

0.6263440860215054

In [49]:
prob = xgboost.predict_proba(x_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, prob)
heldout_test_auc = auc(fpr, tpr)
heldout_test_auc

0.6551653377110693

- learning rate: 0.1 / depth: 1 / N: 10

In [29]:
train_acc, test_acc, train_auc, test_auc = crossvalidation(x_train, y_train, 10, 0.1, 1, 10)

In [30]:
np.mean(train_acc), np.mean(test_acc), np.mean(train_auc), np.mean(test_auc)

(0.5996286601942533,
 0.5789523109586804,
 0.6536408805176906,
 0.6279567183245163)

In [31]:
xgboost = xgb.XGBClassifier(learning_rate=0.1, max_depth=1, n_estimators=20).fit(x_train, y_train)
heldout_test_acc=xgboost.score(x_test, y_test)
heldout_test_acc

0.6263440860215054

In [32]:
prob = xgboost.predict_proba(x_test)[:,1]
fpr, tpr, thresholds = roc_curve(y_test, prob)
heldout_test_auc = auc(fpr, tpr)
heldout_test_auc

0.6574372654784241