In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

from sklearn import svm
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
import train
import pickle

In [2]:
def eval(tnTotal, fpTotal, fnTotal, tpTotal, y, y_pred, size):
    avgaccuracy = 0
    avgPrecision = 0
    avgTpr = 0
    avgFar = 0
    avgFrr = 0
    
    avgaccuracy = (tpTotal+tnTotal)/(tpTotal+tnTotal+fpTotal+fnTotal)
    avgPrecision = tpTotal/(tpTotal+fpTotal)
    avgTpr = tpTotal/(tpTotal+fnTotal)
    avgFar = fpTotal/(fpTotal+tnTotal)
    avgFrr = fnTotal/(fnTotal+tpTotal)
    
    avgaccuracy = avgaccuracy / size
    avgPrecision = avgPrecision / size
    avgTpr = avgTpr / size
    avgFar = avgFar / size
    avgFrr = avgFrr / size
    
    fprROC, tprROC, threshold = roc_curve(y, y_pred, pos_label=1) #fprROC == avgFar, tprROC == avgTpr
    fnrROC = 1 - tprROC #fnrROC == avgFrr
    EER = fprROC[np.nanargmin(np.absolute((fnrROC - fprROC)))]

    print("average acc = "+str(avgaccuracy))
    print("average precision = "+str(avgPrecision))
    print("average TPR = "+str(avgTpr))
    print("average FAR = "+str(avgFar))
    print("average FRR = "+str(avgFrr))
    print("average EER = "+str(EER))

In [3]:
dfList = train.addNewData()

for df in dfList:
    scaler = MinMaxScaler(feature_range=(0,1)).fit(df)
    df = scaler.transform(df)
    
tempList = train.transform("temp.json")

for tempdf in tempList:
    scaler = MinMaxScaler(feature_range=(0,1)).fit(tempdf)
    tempdf = scaler.transform(tempdf)

110
119
133
107
114
81
118
92
106
117
106
117
108
159
136
123
110
78
67
80
56
85


In [4]:
#Truly train the RF models that are going to be used
from imblearn.pipeline import Pipeline
from sklearn.model_selection import RepeatedStratifiedKFold
from numpy import mean
from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import FloatTensorType
import onnxruntime as rt
import json

userOrder = []
userList = ["1092954", "dama0623", "1094908", "4109034029", "611034", "1094841", "D1186959",
                "411411159", "1094845", "1094842", "110", "pomiii5093", "1092574", "anyu5471", "px",
                "wardlin", "lenny", "1092960", "1092923", "1092950", "1092942", "1092928", "1092922"]

with open("evaluationData.json", 'r') as f:
    data = json.load(f)
    numSubject = len(data)
    for i in data.keys():
        userOrder.append(userList.index(i))
        
topSMOTE = -1
topScore = 0
topRS = -1
curSMOTE = 0.1
curRS = 1
c=0
for j in range(10):#SMOTE+=0.1
    curRS = 1
    for k in range(10):#RS-=0.1
        try:
            precisionList = []
            accList = []
            tprList = []
            farList = []
            frrList = []
            tnTotal, fpTotal, fnTotal, tpTotal = 0, 0, 0, 0
            i=0
            for df in dfList:
                X = df.drop(columns=['label'])
                X = X.drop(columns=['pressureMedian'])
                X = X.drop(columns=['sizeMedian'])
                y = df['label']
                predicted_targets = np.array([])
                actual_targets = np.array([])
                
                X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2, random_state=42, stratify=y) #stratify=y means to keep the same ratio of labels in test set as the training set
                X_re, y_re = SMOTE(sampling_strategy=curSMOTE, random_state=42, k_neighbors=2).fit_resample(X_train, y_train.astype('int'))
                X_re, y_re = RandomUnderSampler(sampling_strategy=curRS, random_state=42).fit_resample(X_re, y_re.astype('int'))
                
                RF_model = RandomForestClassifier(n_estimators=22)
                RF_model.fit(X_re,y_re)
                initial_types = [
                    ('input_data_type', FloatTensorType([None, 54]))
                ]

                converted_model = convert_sklearn(RF_model, initial_types=initial_types)
                with open(f"./models/model{i}.onnx", "wb") as f:
                    f.write(converted_model.SerializeToString())
            
                i+=1
            i=0
            for df in tempList:
                y = df['label']
                X = df.drop(columns=['label'])
                X = X.drop(columns=['pressureMedian'])
                X = X.drop(columns=['sizeMedian'])

                predicted_targets = np.array([])
                actual_targets = np.array([])

                sess = rt.InferenceSession(
                        f"./models/model{userOrder[i]}.onnx", providers=rt.get_available_providers())
                        #f"./models/RandomSamplingRF-newData/model{userOrder[i]}.onnx", providers=rt.get_available_providers())
                
                input_name = sess.get_inputs()[0].name
                    #print(X_test.astype(np.float32).to_dict('split')['data'])
                    #break
                y_pred = sess.run(None, {input_name: X.astype(np.float32).to_dict('split')['data']})[0]

                predicted_targets = np.append(predicted_targets, y_pred)
                actual_targets = np.append(actual_targets, y)

                #print("NOW SHOWING: " + str(userList[userOrder[i]]))
                cm = confusion_matrix(actual_targets.astype('int'), predicted_targets.astype('int'), labels=[False, True])

                tn, fp, fn, tp = cm.ravel()
                tnTotal += tn
                fpTotal += fp
                fnTotal += fn
                tpTotal += tp

                accuracy = (tp+tn)/(tp+tn+fp+fn)
                precision = tp/(tp+fp)
                tpr = tp/(tp+fn)
                far = fp/(fp+tn)
                frr = fn/(fn+tp)

                accList.append(accuracy)
                precisionList.append(precision)
                tprList.append(tpr)
                farList.append(far)
                frrList.append(frr)

                #print("acc = "+str(accuracy))
                #print("precision = "+str(precision))
                #print("TPR = "+str(tpr))
                #print("FAR = "+str(far))
                #print("FRR = "+str(frr))
                i+=1
                
            if (mean(precisionList) > topScore):
                topSMOTE = curSMOTE
                topScore = mean(precisionList)
                topRS = curRS
            eval(tnTotal, fpTotal, fnTotal, tpTotal, actual_targets.astype('int'), predicted_targets, len(dfList))
            print(topSMOTE, topRS, topScore)
        except:
            i=0
            print("err", curSMOTE, curRS, topScore)
        c+=1
        curRS -= 0.1
    curSMOTE += 0.1
print(c)

average acc = 0.9517852948085506
average precision = 0.5556202120780284
average TPR = 0.9606746628771569
average FAR = 0.048943829371126825
average FRR = 0.039325337122843196
average EER = 0.007599463567277604
0.1 1 0.5556202120780284
average acc = 0.953155586876517
average precision = 0.5742650771152662
average TPR = 0.9560451962202432
average FAR = 0.0473396163041967
average FRR = 0.0439548037797568
average EER = 0.002682163611980331
0.1 0.9 0.5742650771152662
average acc = 0.956444287839637
average precision = 0.5862259121055432
average TPR = 0.9561800909559828
average FAR = 0.043815421192275904
average FRR = 0.043819909044017384
average EER = 0.004023245417970496
0.1 0.8 0.5862259121055431
average acc = 0.9600070472163496
average precision = 0.59551852710953
average TPR = 0.9576286101149178
average FAR = 0.0402114672686974
average FRR = 0.04237138988508207
average EER = 0.002235136343316942
0.1 0.7000000000000001 0.5955185271095301
average acc = 0.9624344217367472
average precision