# In [13]:
#Import statements
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import confusion_matrix
from statistics import mean
from itertools import product
from os import listdir
from datetime import date
import time
import numpy as np
import csv

# In [1]:
class HyperParameterTuning:
    """Grid search over decision-tree parameters at a fixed sensitivity target.

    Every parameter combination is evaluated on ten stratified train/test
    splits.  For each split the decision threshold is raised until the
    sensitivity falls inside a narrow band around the target; the combination
    with the highest average specificity over the runs that hit the band is
    remembered and exposed through ``get_best``.  A CSV summary of all
    combinations is written into ``path``.
    """

    def __init__(self, path):  # TODO: make this take in classifier type
        """Remember the output directory and pick today's next version prefix.

        Args:
            path: Existing directory that is scanned for earlier outputs and
                that receives the CSV written by ``gridSearchCustom``.

        Raises:
            OSError: Propagated from ``listdir`` when ``path`` cannot be listed.
        """
        import re  # function-scope import so the class is self-contained

        self.path = path

        today = str(date.today())
        # Outputs are named '<date>-v<N>...'.  Capture the whole number (the
        # previous code read a single digit, so v10 was parsed as 1) and keep
        # the highest one instead of whichever file happened to be listed last.
        version_pattern = re.compile(re.escape(today) + r'-v([0-9]+)')

        latest = 0
        for name in listdir(self.path):
            found = version_pattern.search(name)
            if found:
                latest = max(latest, int(found.group(1)))

        self.prefix = f'{today}-v{latest + 1}'
        self._reset_best()

    def _reset_best(self):
        """Clear the best-candidate state so ``get_best`` is always callable."""
        self.best_clf = None
        self.best_params = None
        self.best_sensArr = None
        self.best_specArr = None
        self.best_thresholds = None
        self.best_conf_matrices = None
        self.best_y_data = None

    @staticmethod
    def _mean_or_nan(values):
        """Return the mean of ``values``, or NaN when ``values`` is empty."""
        # statistics.mean raises StatisticsError on empty input, which happens
        # whenever no run of a parameter combination reached the target band.
        return mean(values) if len(values) else float('nan')

    def get_prefix(self):
        """Return the '<date>-v<N>' prefix chosen at construction time."""
        return self.prefix

    def gridSearchCustom(self, param_grid, X, y, display = False):
        """Evaluate every combination in ``param_grid`` and write a CSV summary.

        Args:
            param_grid: Mapping of ``DecisionTreeClassifier`` keyword name to
                an iterable of candidate values; the Cartesian product of the
                values is searched.
            X: Feature data forwarded to ``thresholdTuner``.
            y: Target labels forwarded to ``thresholdTuner``.
            display: When true, print each parameter set and its average
                specificity.

        Returns:
            Tuple ``(clf_arrs, param_arrs, sen_arrs, spec_arrs, thresh_arrs,
            conf_matrix)`` of parallel lists with one entry per combination.

        Side effects:
            Updates the ``best_*`` attributes, prints the total runtime and
            writes ``<prefix>-hp_tuning`` inside ``self.path``.
        """
        import os  # function-scope import: only ``listdir`` is imported at file level

        start = time.time()
        # Start from a clean slate so a stale winner from an earlier search is
        # never reported for this grid.
        self._reset_best()

        clf_arrs = []
        param_arrs = []
        sen_arrs = []
        spec_arrs = []
        thresh_arrs = []
        conf_matrix = []
        output = []

        best_specificity = 0
        param_names = list(param_grid)

        for values in product(*param_grid.values()):
            params = dict(zip(param_names, values))
            if display:
                print(params)
            clf = tree.DecisionTreeClassifier(random_state = 42, **params)
            sensArr, specArr, thresholds, conf_matrices, y_data = self.thresholdTuner(clf, 0.95, X, y, display)

            clf_arrs.append(clf)
            param_arrs.append(params)
            sen_arrs.append(sensArr)
            spec_arrs.append(specArr)
            thresh_arrs.append(thresholds)
            conf_matrix.append(conf_matrices)

            # A combination that never reached the target band ranks last.
            if len(specArr) == 0:
                specAvg = 0
            else:
                specAvg = specArr.sum(axis=0) / len(specArr)

            if specAvg > best_specificity:
                best_specificity = specAvg
                self.best_clf = clf
                self.best_params = params
                self.best_sensArr = sensArr
                self.best_specArr = specArr
                self.best_thresholds = thresholds
                self.best_conf_matrices = conf_matrices
                self.best_y_data = y_data
            if display:
                print(f'Specificity: {round(specAvg, 4)}\n')

            output.append([
                params,
                self._mean_or_nan(sensArr),
                self._mean_or_nan(specArr),
                self._mean_or_nan(thresholds),
            ])

        end = time.time()
        print(f'\nFunction runtime: {(end-start)/60} min')

        # os.path.join keeps the file inside self.path on every platform; the
        # hard-coded backslash only worked as a separator on Windows.
        out_file = os.path.join(self.path, f'{self.prefix}-hp_tuning')
        with open(out_file, "w", newline = "") as f:
            header = ['Parameters', 'AVG Sensitivity', 'AVG Specificity', 'Threshold']
            writer = csv.writer(f)

            writer.writerow(header)
            writer.writerows(output)

        return clf_arrs, param_arrs, sen_arrs, spec_arrs, thresh_arrs, conf_matrix

    def thresholdTuner(self, clf, sens_target, X, y, display = False):
        """Find, on ten splits, the threshold at which sensitivity hits the target.

        For each of ten stratified 80/20 splits (``random_state`` 0..9) the
        classifier is refitted and the threshold applied to the class-1
        probability is raised from 0.10 in steps of 0.005 until the
        sensitivity lies within +/-0.005 of ``sens_target`` or the threshold
        reaches 0.5.  Only runs that end inside the band are recorded.

        Args:
            clf: Estimator providing ``fit`` and ``predict_proba``.
            sens_target: Desired sensitivity (true-positive rate).
            X: Feature data.
            y: Labels; the code treats 1 as the positive class and 0 as the
                negative class.
            display: When true, print a progress line for every run.

        Returns:
            Tuple ``(sensArr, specArr, thresholds, conf_matrices, y_data)``.
            The first three are NumPy arrays of values rounded to 3 decimals,
            one per recorded run.  ``conf_matrices`` holds ``[tp, fp, fn, tn]``
            per recorded run.  ``y_data`` is ``[y_test, clf_pred]`` from the
            last split only, whether or not that run was recorded.
        """
        VARIANCE = 0.005
        lower = sens_target - VARIANCE
        upper = sens_target + VARIANCE

        sensArr = []
        specArr = []
        thresholds = []
        conf_matrices = []
        for i in range(10):
            if display:
                print(f'Run: {i+1} / 10', end='\r')

            X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = i, stratify = y)
            clf.fit(X_train, y_train)

            # The probabilities do not depend on the threshold, so compute
            # them once per split rather than on every step of the search.
            positive_proba = clf.predict_proba(X_test)[:, 1]

            threshold = 0.10
            sensitivity = 0
            specificity = 0
            while not (lower <= sensitivity <= upper) and threshold < 0.5:
                threshold = threshold + 0.005
                clf_pred = [1 if hit else 0 for hit in positive_proba >= threshold]

                # scikit-learn flattens the binary matrix as tn, fp, fn, tp.
                # Fixing the labels guarantees a 2x2 matrix even when a class
                # is missing from this split's labels and predictions.
                tn, fp, fn, tp = confusion_matrix(y_test, clf_pred, labels=[0, 1]).ravel()
                conf_matrix = [tp, fp, fn, tn]
                if (tp + fn) != 0 and (tn + fp) != 0:
                    sensitivity = tp/(tp+fn)
                    specificity = tn/(tn+fp)

            # Same inclusive band as the loop's exit test, so a run that
            # stopped exactly on a band edge is recorded rather than dropped.
            if lower <= sensitivity <= upper:
                sensArr.append(round(sensitivity, 3))
                specArr.append(round(specificity, 3))
                thresholds.append(round(threshold, 3))
                conf_matrices.append(conf_matrix)

        y_data = [y_test, clf_pred]
        sensArr = np.array(sensArr)
        specArr = np.array(specArr)
        thresholds = np.array(thresholds)
        return sensArr, specArr, thresholds, conf_matrices, y_data

    def get_best(self):
        """Return the state of the best combination seen by the last search.

        Returns:
            Tuple ``(best_clf, best_params, best_sensArr, best_specArr,
            best_thresholds, best_conf_matrices, best_y_data)``.  Every entry
            is ``None`` when no search has run yet or when no combination
            achieved an average specificity above zero.
        """
        return self.best_clf, self.best_params, self.best_sensArr, self.best_specArr, self.best_thresholds, self.best_conf_matrices, self.best_y_data
    
    