In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix

In [2]:
dataset = pd.read_csv('fullEDfullmodel.csv')
dataset['class'] = dataset['class'].map({'T': 1, 'F': 0})
dataset.head()

Unnamed: 0,TRAUMATYPE,SYSBP,RR,GCS,EDMOTOR,SI,SIRANK,AGE,SEX,RTS,...,PhysiologicalAMPT,LungAMPT,AMPT,Mechanism,RTSCode,AgeGroups,SBPCode,MotorCode,AMPT2,class
0,0,100,21,15,6,0.72,2,80.0,0,7.8408,...,0,0,0,0,0,5,0,0,0,1
1,0,103,22,14,6,0.650485,1,80.0,1,7.8408,...,0,0,0,1,0,5,0,0,0,1
2,0,96,12,15,6,0.791667,2,67.0,0,7.8408,...,0,0,0,1,0,5,0,0,0,1
3,1,141,20,15,6,0.553191,1,63.0,1,7.8408,...,0,0,0,0,0,5,0,0,0,1
4,0,105,24,15,6,0.695238,1,66.0,0,7.8408,...,0,1,2,0,0,5,0,0,1,1


In [3]:
X = dataset.iloc[:, :-1] #removes ['class']
y = dataset['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify = y)

In [6]:
def adjusted_thresh(y_pred, thresh):
    
    c1_scores = y_pred[:, 1]
    
    return [1 if y >= thresh else 0 for y in c1_scores]

In [7]:
#index 0,1,2,3 are min_samples_split, max_depth, max_features, and n_estimators respectively
import itertools  
import time

param_grid = {
    'min_samples_split': [25, 27], 
    'n_estimators' : [400, 450, 500],
    'max_depth': [5, 10, 17, 50],
    'max_features': [5, 15, 20],
    'max_leaf_nodes': [75, 95, 100],
}

def optimize(p_grid):
    start = time.time()
    best_param = []
    best_spec = 0
    
    
    #partition p_grid and get an itertools form list of all possible combinations
    param_names = []
    param_lists = []
    for param in p_grid.keys():
        param_names.append(param)
        param_lists.append(param_grid[param])
        
    combinations = itertools.product(param_lists[0], param_lists[1], param_lists[2], param_lists[3], param_lists[4])
    
    #Test each combination of parameters in p_grid and record the highest parameter pair with 95% sen and highest spec
    for pair in combinations:
        pair = list(pair) #convert from itertools tuple to lsit
        
        rf = RandomForestClassifier(min_samples_split = pair[0], n_estimators = pair[1], max_depth = pair[2], max_features = pair[3], max_leaf_nodes = pair[4])
        rf.fit(X_train, y_train)

        pred = rf.predict_proba(X_test)
        adj_pred = adjusted_thresh(pred, 0.13) #adjust threshold for about 95% sen

        tp, fp, fn, tn = confusion_matrix(y_test, adj_pred).ravel()

        sen = tp/(tp+fn)
        spec = tn/(tn+fp)
        
        if sen >= 0.95 and spec > best_spec:
            best_spec = spec
            best_param = pair
            best_param.extend([sen, spec]) #add sensitivity and spec to best param
            best_model = rf
    
    end = time.time()
    print(f'Function runtime: {(end-start)/60} min')
    
    return best_param[:-2], best_param[-2:], best_model

In [None]:
best_param, stats, clf = optimize(param_grid)
print(best_param)
print(stats)


In [174]:
#Used if you want to adjust the thresholding from the best model 

pred = clf.predict_proba(X_test)
adj_pred = adjusted_thresh(pred, 0.13)

tp, fp, fn, tn = confusion_matrix(y_test, adj_pred).ravel()

print(f'Sensitivity: {tp/(tp+fn)}')
print(f'Specificity: {tn/(tn+fp)}')
print(tp, fp, fn, tn)

Sensitivity: 0.9524000803374172
Specificity: 0.507579309267073
3248 3151 237 4742
