In [1]:
# ----- IMPORTs -----
from sklearn.ensemble import RandomForestClassifier
import joblib
import os
import pandas
import sklearn.metrics as metric
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler # Necessario per SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
import time
from sklearn.neighbors import KNeighborsClassifier
import numpy
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
# ----- CONSTANTS -----
# Models evaluated by the main loop. All names handled by the main loop:
#models = ["tree", "forest", "knn", "svm-rbf", "svm-linear", "svm-poly", "svm-sigmoid", "gb"]
models = ["svm-rbf"]

# When True, trainAndTest does not apply StandardScaler to svm/knn inputs.
onlyMinMax = False

# Input directory with one tab-separated feature file per subject ("f<ID>.csv").
# Alternative datasets used previously:
#datasetDirectory = "../../04. Features/10. EcgStrict/Analysis/WSMeanMFrequency&Time/"
#datasetDirectory = "../../04. Features/07. ConcatenatiCutMonotoni/"
datasetDirectory = "datasets/features_post_avgM/"

# Per-test-subject output directory template and shared summary directory.
out_dir = "datasets/train_test_metrics/{}/"
out_dir2 = "datasets/train_test_metrics/model/"

# Subject identifiers are zero-padded two-digit strings "01".."40".
allSubjects = [str(subjectNumber).zfill(2) for subjectNumber in range(1, 41)]
# Subjects excluded from the experiment (use [] to keep everyone).
#subjectsToRemove = []
subjectsToRemove = ["01", "02", "40"]
subjects = [candidate for candidate in allSubjects if candidate not in subjectsToRemove]

# Columns listed for the feature files (ECG/HRV set).
# Alternative EDA set:
#columns = ["Timestamp", "Label", "SCL_mean", "EDA_Max-min", "SCL_derivative_mean"]
columns = [
    "Timestamp", "Label",
    "MaxRR", "MeanRR", "MeanHR", "minRR", "minHR", "maxHR",
    "RMSSD", "RangeRR", "NN50", "PNN50", "StdRR", "StdHR",
    "LowFrequenciesPower", "HighFrequenciesPower", "VeryHighFrequenciesPower",
    "TotalSignalPower",
    "%lowFrequencyPower", "%highFrequencyPower",
    "lowFrequencyPower/highFrequencyPower",
]

# Features actually fed to the classifiers; everything else is dropped.
# Alternative EDA selection:
#featuresNamesTaken = [ "SCL_mean", "EDA_Max-min", "SCL_derivative_mean" ]
featuresNamesTaken = [
    "maxHR",
    "MaxRR",
    "StdHR",
    "lowFrequencyPower/highFrequencyPower",
    "%lowFrequencyPower",
    "VeryHighFrequenciesPower",
    "PNN50",
    "HighFrequenciesPower",
    "StdRR",
]
columnsToDrop = [name for name in columns if name not in featuresNamesTaken]

# Class labels, in the order used for per-class metrics and the confusion matrix.
labels = [0, 1]

# Row names of the metrics table, in the order trainAndTest fills them.
metricsNames = [
    "Recall-0", "Recall-1",
    "Precision-0", "Precision-1",
    "Fscore-0", "Fscore-1",
    "Accuracy",
    "TN", "FP", "FN", "TP",
    "%TN", "%FP", "%FN", "%TP",
    "CPU-Time",
]


In [3]:
# ----- FUNCTIONS -----
def trainAndTest(modelName, testSubject, classifier, metricsDF, scoresDF=None):
    """Train `classifier` on all subjects except `testSubject` and evaluate it on `testSubject`.

    The training set is the concatenation of the tab-separated files
    ``datasetDirectory + "f<subject>.csv"`` for every entry of `subjects`
    other than `testSubject`. The "Label" column is the target; the columns
    in `columnsToDrop` are removed before fitting. For model names containing
    "svm" or "knn" (and when `onlyMinMax` is False) a StandardScaler is fitted
    on the training data only and reused for the test data.

    Side effects: the fitted classifier (and scaler, when used) are dumped
    with joblib into ``out_dir.format(testSubject)``, and one line
    ``<testSubject>\\t<label>\\t<label>...`` is appended to the prediction
    file in that directory. The file is opened in append mode, so running
    the same fold again adds another line instead of replacing the old one.

    Parameters
    ----------
    modelName : str
        Model identifier; used in output file names and to decide whether
        scaling is applied and whether feature importances are recorded.
    testSubject : str
        Identifier of the held-out subject (e.g. "03").
    classifier : estimator
        Unfitted object exposing ``fit`` and ``predict`` (and
        ``feature_importances_`` for "tree"/"forest" models).
    metricsDF : pandas.DataFrame
        Receives a new column ``"Subject <testSubject>"`` whose rows follow
        the order of `metricsNames`. The last value ("CPU-Time") is the
        wall-clock time measured with ``time.time()`` from before the
        training data is loaded until after the metrics are computed.
    scoresDF : pandas.DataFrame, optional
        When given and `modelName` contains "forest" or "tree", receives a
        column ``"Subject <testSubject>"`` with the feature importances.
        When omitted, feature importances are simply not recorded.

    Raises
    ------
    ValueError
        If no training subject remains once `testSubject` is excluded.
    """
    foldDirectory = out_dir.format(testSubject)
    modelFilename = "%s%s_Classifier_tp%s.pkl" % (foldDirectory, modelName, testSubject)
    scalerFilename = "%s%s_Scaler_tp%s.pkl" % (foldDirectory, modelName, testSubject)
    predictionFilename = "%s%s_PredictedLabels.csv" % (foldDirectory, modelName)
    # Distance/margin based models are standardised unless explicitly disabled
    useScaler = not onlyMinMax and ("svm" in modelName or "knn" in modelName)
    start = time.time()

    # Build the training set: read every non-test subject, then concatenate
    # once (concatenating inside the loop re-copies the growing frame each time)
    trainingFrames = [
        pandas.read_csv(datasetDirectory+"f"+subject+".csv", sep = '\t')
        for subject in subjects
        if subject != testSubject
    ]
    if not trainingFrames:
        raise ValueError("No training subjects left after excluding test subject "+testSubject)
    trainingSet = pandas.concat(trainingFrames)
    # Extract the target label
    trainLabels = trainingSet["Label"]
    # Remove the columns that are not used for training
    trainingSet = trainingSet.drop(columns=columnsToDrop)

    # Fit the scaler on training data only, so no test statistics leak in
    if useScaler:
        scaler = StandardScaler()
        trainingSet = scaler.fit_transform( trainingSet.to_numpy() )
        joblib.dump(scaler, scalerFilename)
    # Build the model and save it
    classifier.fit(trainingSet, trainLabels)
    joblib.dump(classifier, modelFilename)

    # Load the held-out subject
    testSet = pandas.read_csv(datasetDirectory+"f"+testSubject+".csv", sep = '\t')
    # Extract the true label
    trueLabels = testSet["Label"]
    # Remove the columns that are not used for testing
    testSet = testSet.drop(columns=columnsToDrop)
    # Prediction (test data goes through the scaler fitted on the training data)
    if useScaler:
        testSet = scaler.transform( testSet.to_numpy() )
    predictedLabels = classifier.predict(testSet)

    # Append the predicted labels as one tab-separated line; the context
    # manager closes the file even if a write fails
    predictionLine = testSubject + "".join( "\t"+str(int(label)) for label in predictedLabels ) + "\n"
    with open(predictionFilename, "a+") as predictionFile:
        predictionFile.write( predictionLine )

    # Evaluate results (per-class values are ordered as in `labels`)
    recall = metric.recall_score(trueLabels, predictedLabels, labels=labels, average=None)
    precision = metric.precision_score(trueLabels, predictedLabels, labels=labels, average=None)
    fmeasure = metric.f1_score(trueLabels, predictedLabels, labels=labels, average=None)
    accuracy = metric.accuracy_score(trueLabels, predictedLabels)
    confusionMatrix = metric.confusion_matrix(trueLabels, predictedLabels, labels=labels)
    end = time.time()

    # Record feature importances for tree-based models when a table was supplied
    if scoresDF is not None and ("forest" in modelName or "tree" in modelName):
        scoresDF["Subject "+testSubject] = list(classifier.feature_importances_)

    tn, fp, fn, tp = confusionMatrix.ravel()
    # Factor turning window counts into percentages; the float literal keeps
    # this a true division regardless of interpreter version
    totWindows100 = 100.0/(tn+fp+fn+tp)
    metricsDF["Subject "+testSubject] = [
        recall[0], recall[1],
        precision[0], precision[1],
        fmeasure[0], fmeasure[1],
        accuracy,
        tn, fp, fn, tp,
        tn*totWindows100, fp*totWindows100, fn*totWindows100, tp*totWindows100,
        end-start,
    ]


In [4]:
# ----- MAIN -----
# Leave-one-subject-out evaluation: for every configured model, each subject
# is used once as the test subject while the remaining subjects form the
# training set. Per-model summary tables are written to out_dir2.

# The shared summary directory must exist before any CSV is written into it
if not os.path.isdir(out_dir2):
    os.makedirs(out_dir2)

# Choose the model to build
for model in models:
    print( "Model: "+model )
    # First column holds the row names; trainAndTest adds one column per subject
    metricsDF = pandas.DataFrame()
    metricsDF[""] = metricsNames
    scoresDF = pandas.DataFrame()
    scoresDF[""] = featuresNamesTaken
    # Choose the subject for the test
    for testSubject in subjects:
        print( "    Test subject: "+testSubject )
        if not os.path.isdir(out_dir.format(testSubject)):
            os.makedirs(out_dir.format(testSubject))
        # A fresh, unfitted classifier is created for every fold
        if model=="tree":
            classifier = DecisionTreeClassifier()
        elif model=="forest":
            classifier = RandomForestClassifier()
        elif model.startswith("svm-"):
            # The kernel name follows the dash, e.g. "svm-rbf" -> "rbf"
            kernel = model.split("-", 1)[1]
            classifier = SVC(C=1, gamma=0.1, kernel=kernel)
        elif model=="knn":
            classifier = KNeighborsClassifier()
        elif model=="gb":
            classifier = GradientBoostingClassifier()
        else:
            # Fail loudly instead of silently writing an empty metrics table
            raise ValueError("Unknown model: "+model)
        # scoresDF is only filled by trainAndTest for "tree"/"forest" models
        trainAndTest(model, testSubject, classifier, metricsDF, scoresDF)
    # Compute the mean value of each metric across subjects and store it as
    # the second column (skipna=False keeps NaN propagation like numpy.mean)
    metricsDF.insert(1, "Mean", metricsDF.iloc[:, 1:].mean(axis=1, skipna=False))
    # Write out the metrics DF
    metricsDF.to_csv(out_dir2 + model+"_metrics.csv", sep='\t', index=False)
    # Feature-importance scores exist only when trainAndTest added subject
    # columns; otherwise there is nothing to average or write
    if scoresDF.shape[1] > 1:
        scoresDF.insert(1, "Mean", scoresDF.iloc[:, 1:].mean(axis=1, skipna=False))
        # Write out the score DF
        scoresDF.to_csv(out_dir2 + model+"_scores.csv", sep='\t', index=False)


Model: svm-rbf
    Test subject: 03


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


    Test subject: 04
    Test subject: 05


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


    Test subject: 06


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


    Test subject: 07
    Test subject: 08
    Test subject: 09
    Test subject: 10
    Test subject: 11
    Test subject: 12
    Test subject: 13
    Test subject: 14
    Test subject: 15
    Test subject: 16


  'recall', 'true', average, warn_for)
  'recall', 'true', average, warn_for)


    Test subject: 17
    Test subject: 18
    Test subject: 19
    Test subject: 20
    Test subject: 21
    Test subject: 22
    Test subject: 23
    Test subject: 24
    Test subject: 25
    Test subject: 26
    Test subject: 27
    Test subject: 28
    Test subject: 29
    Test subject: 30
    Test subject: 31
    Test subject: 32
    Test subject: 33
    Test subject: 34
    Test subject: 35
    Test subject: 36
    Test subject: 37
    Test subject: 38
    Test subject: 39
