In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import confusion_matrix

In [22]:
dataset = pd.read_csv('fullEDfullmodel.csv')
dataset.head()

Unnamed: 0,TRAUMATYPE,SYSBP,RR,GCS,EDMOTOR,SI,SIRANK,AGE,SEX,RTS,...,PhysiologicalAMPT,LungAMPT,AMPT,Mechanism,RTSCode,AgeGroups,SBPCode,MotorCode,AMPT2,class
0,0,100,21,15,6,0.72,2,80.0,0,7.8408,...,0,0,0,0,0,5,0,0,0,T
1,0,103,22,14,6,0.650485,1,80.0,1,7.8408,...,0,0,0,1,0,5,0,0,0,T
2,0,96,12,15,6,0.791667,2,67.0,0,7.8408,...,0,0,0,1,0,5,0,0,0,T
3,1,141,20,15,6,0.553191,1,63.0,1,7.8408,...,0,0,0,0,0,5,0,0,0,T
4,0,105,24,15,6,0.695238,1,66.0,0,7.8408,...,0,1,2,0,0,5,0,0,1,T


In [23]:
X = dataset.iloc[:, :-1] #removes ['class']
y = dataset['class']

In [24]:
#parameters: clf- a classifier that has not been fit, but has been initialized; sens_target- the target sensitivity for tuning specificitiy
#Tunes a classifier using 10 runs of stratified sampling to tune the decision threshold to result in a sensitivity within 1-percent of the target
#Retruns the average sensitivity, specificity, and descision threshold for the 10 runs
def treeThresholdTuner(clf, sens_target):
    sensArr = []
    specArr = []
    thresholds = []
    for i in range(1,11):
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=i, stratify = y)
        clf.fit(X_train, y_train)
        threshold = 0.00
        sensitivity = 0
        specificity = 0
        while((sensitivity < (sens_target - 0.01) or sensitivity > (sens_target + 0.01)) and threshold < 0.5):
            threshold = threshold + 0.01
            clf_pred = (clf.predict_proba(X_test) >= threshold)
            clf_pred = [item[1] for item in clf_pred]
            for i in range(len(clf_pred)):
                if clf_pred[i] == False:
                    clf_pred[i] = 'F'
                else:
                    clf_pred[i] = 'T'   
            tp, fp, fn, tn = confusion_matrix(y_test, clf_pred).ravel()
            if (tp + fn) != 0:
                sensitivity = tp/(tp+fn)
            if (tn + fp) != 0:
                specificity = tn/(tn+fp)
        if sensitivity > (sens_target - 0.01):
            if sensitivity < (sens_target + 0.01):
                sensArr.append(sensitivity)
                specArr.append(specificity)
                thresholds.append(threshold)
    if len(sensArr) == 0:
        return -1, -1, -1
    sensArr = np.array(sensArr)
    sensAvg = sensArr.sum(axis=0) / len(sensArr)
    specArr = np.array(specArr)
    specAvg = specArr.sum(axis=0) / len(sensArr)
    thresholds = np.array(thresholds)
    thresholdAvg = thresholds.sum(axis=0) / len(thresholds)
    return sensAvg, specAvg, thresholdAvg

In [28]:
#Parameters: max_depth, max_leaf_nodes, and splitter, see DecisionTreeClassfier's Documentation for info
#Initializes a gini DecisionTreeClassifier with the passed in parameters and calls tuneClfSpec, printing the results
def treeTuner(max_depth, max_leaf_nodes, splitter):
    tre = tree.DecisionTreeClassifier(random_state = 42, criterion='gini', max_depth = max_depth, max_leaf_nodes = max_leaf_nodes, splitter = splitter)
    sensitivity, specificity, threshold = treeThresholdTuner(tre, 0.95)
    print(f'Parameters: Max Depth: {max_depth}, Max Leaf Nodes: {max_leaf_nodes}, Splitter: {splitter}')
    print(f'Sensitivity: {sensitivity}, Specificity: {specificity}, Threshold: {threshold}\n')

In [29]:
#Parameters: params- a 2-d array of size 3,x, where x can be any length, containing hyperparameters to be tuned
#Loops through all permutations of given hyperparameters and calls treeTunerSpec with those parameters
def gridSearchTree(params):
    for i in range(0, len(params[0])):
        for j in range(0, len(params[1])):
            for k in range(0, len(params[2])):
                treeTuner(params[0][i], params[1][j], params[2][k])

In [30]:
params = [[4, 6, 8, None], [20, 25, 30, None], ['best', 'random']]
gridSearchTree(params)

Parameters: Max Depth: 4, Max Leaf Nodes: 20, Splitter: best
Sensitivity: 0.9516352687269162, Specificity: 0.4908853324332916, Threshold: 0.13799999999999996

Parameters: Max Depth: 4, Max Leaf Nodes: 20, Splitter: random
Sensitivity: 0.9544358389612618, Specificity: 0.4716004893088243, Threshold: 0.05

Parameters: Max Depth: 4, Max Leaf Nodes: 25, Splitter: best
Sensitivity: 0.9516352687269162, Specificity: 0.4908853324332916, Threshold: 0.13799999999999996

Parameters: Max Depth: 4, Max Leaf Nodes: 25, Splitter: random
Sensitivity: 0.9544358389612618, Specificity: 0.4716004893088243, Threshold: 0.05

Parameters: Max Depth: 4, Max Leaf Nodes: 30, Splitter: best
Sensitivity: 0.9516352687269162, Specificity: 0.4908853324332916, Threshold: 0.13799999999999996

Parameters: Max Depth: 4, Max Leaf Nodes: 30, Splitter: random
Sensitivity: 0.9544358389612618, Specificity: 0.4716004893088243, Threshold: 0.05

Parameters: Max Depth: 4, Max Leaf Nodes: None, Splitter: best
Sensitivity: 0.9516352