In [2]:
import warnings 
warnings.filterwarnings('ignore')

In [14]:
import math
from time import time
import pickle
import pandas as pd
import numpy as np
from time import time

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.metrics import accuracy_score, f1_score

from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [4]:
import sys
sys.path.append('../src')
from preprocessing import *
from utils import *
from plotting import *

# Modified dataset

In [25]:
# Varía ciertos hiperparámetros e imprime los resultados más relevantes
def hyper_sim(df,num_val,n_hid_layers,n_neur,alpha,features,over_dict=None,under_dict=None):
    errs_acc = []
    errs_f1 = []
    rec_ban = []
    loss = []
    for i in range(num_val):
        df_train, df_test = split_series_byID(0.75, df)
        df_train, df_test = norm_train_test(df_train,df_test,features_to_norm=features)
        xtrain, ytrain = df_train[features].values, df_train['class'].values
        if over_dict and under_dict:
            over_sampling = SMOTE(sampling_strategy=over_dict)
            under_sampling = RandomUnderSampler(sampling_strategy=under_dict)
            xtrain, ytrain = over_sampling.fit_resample(xtrain, ytrain)
            xtrain, ytrain = under_sampling.fit_resample(xtrain, ytrain)
        xtest, ytest = df_test[features].values, df_test['class'].values
        
        tup = []
        for i in range(n_hid_layers):
            tup.append(n_neur)
        tup = tuple(tup)

        clf_nn = MLPClassifier(
                hidden_layer_sizes=tup,
                max_iter=2000,
                early_stopping=True,
                shuffle=True,
                alpha=alpha,
                learning_rate='adaptive'
            )

        clf_nn.fit(xtrain, ytrain)
        ypred = clf_nn.predict(xtest)
        errs_acc.append(accuracy_score(ytest,ypred))
        errs_f1.append(f1_score(ytest,ypred,average='weighted'))
        rec_ban.append(np.sum(np.logical_and(ytest=='banana',ypred=='banana'))/np.sum(ytest=='banana'))
        loss.append(clf_nn.loss_)

    errs_acc = np.array(errs_acc)
    errs_f1 = np.array(errs_f1)
    rec_ban = np.array(rec_ban)
    loss = np.array(loss)
    print('Train loss:',np.mean(loss),'+-',np.std(loss))
    print('Accuracy:',np.mean(errs_acc),'+-',np.std(errs_acc))
    print('F1-score:',np.mean(errs_f1),'+-',np.std(errs_f1))
    print('Recall bananas:',np.mean(rec_ban),'+-',np.std(rec_ban))

In [9]:
features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']

df_db = group_datafiles_byID('../datasets/preprocessed/HT_Sensor_prep_metadata.dat', '../datasets/preprocessed/HT_Sensor_prep_dataset.dat')
df_db = reclassify_series_samples(df_db)
df_db.head()

Unnamed: 0,time,R1,R2,R3,R4,R5,R6,R7,R8,Temp.,Humidity,id,date,class,t0,dt,t0_delay,dt_delay
0,12.49025,12.8621,10.3683,10.4383,11.6699,13.4931,13.3423,8.04169,8.73901,26.2257,59.0528,0,07-04-15,background,13.49,1.64,0.0,0.0
1,12.490528,12.8617,10.3682,10.4375,11.6697,13.4927,13.3412,8.04133,8.73908,26.2308,59.0299,0,07-04-15,background,13.49,1.64,0.0,0.0
2,12.490806,12.8607,10.3686,10.437,11.6696,13.4924,13.3405,8.04101,8.73915,26.2365,59.0093,0,07-04-15,background,13.49,1.64,0.0,0.0
3,12.491084,12.8602,10.3686,10.437,11.6697,13.4921,13.3398,8.04086,8.73936,26.2416,58.9905,0,07-04-15,background,13.49,1.64,0.0,0.0
4,12.491373,12.8595,10.3688,10.4374,11.6699,13.4919,13.339,8.04087,8.73986,26.2462,58.9736,0,07-04-15,background,13.49,1.64,0.0,0.0


In [11]:
# Validación simple de 5 ejecuciones para decidir la constante de aprendizaje, el número de capas ocultas y
# el número de neuronas
for alpha in [0.1,0.01,0.001]:
    print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<>>>>>>>>>>>>>>>>>')
    print('Alpha:',alpha)
    for n_hid_layers in range(1,4):
        print('##############################################')
        print('\t Hidden layers:',n_hid_layers)
        for n_neur in [4,8,16]:
            print('==============================================')
            print('\t \t Neurons per layer:',n_neur)
            hyper_sim(df_db,5,n_hid_layers,n_neur,alpha,features)
            print('==============================================')

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<>>>>>>>>>>>>>>>>>
Alpha: 0.1
##############################################
	 Hidden layers: 1
	 	 Neurons per layer: 4
Train loss: 0.36439495614577544 +- 0.02154308323919161
Accuracy: 0.8414925093983957 +- 0.03205808447781048
F1-score: 0.8248495700474063 +- 0.040662127331983974
Recall bananas: 0.20894162557954427 +- 0.10269828337888062
	 	 Neurons per layer: 8
Train loss: 0.29394989438794517 +- 0.02299163910422099
Accuracy: 0.8420864215354106 +- 0.050105374962669945
F1-score: 0.8215395564119155 +- 0.0595370460638927
Recall bananas: 0.20786677266283388 +- 0.09736478307023012
	 	 Neurons per layer: 16
Train loss: 0.22796668428559053 +- 0.021724370601683172
Accuracy: 0.7926713150481643 +- 0.04742558095032711
F1-score: 0.7785714316435802 +- 0.05049925052450031
Recall bananas: 0.2145449389813968 +- 0.09986174503858439
##############################################
	 Hidden layers: 2
	 	 Neurons per layer: 4
Train loss: 0.3053438411410784 +- 0.010913760999942861

Train loss: 0.1669481749987911 +- 0.0115103516657133
Accuracy: 0.7810960421718576 +- 0.015640244446935764
F1-score: 0.7767980712383297 +- 0.013167999840908913
Recall bananas: 0.2136602859010403 +- 0.10486630301074099
	 	 Neurons per layer: 16
Train loss: 0.06250807849206433 +- 0.012097463956786951
Accuracy: 0.7562773702105744 +- 0.033389186034020385
F1-score: 0.763713857861914 +- 0.02966348795820711
Recall bananas: 0.2969017997507465 +- 0.11205908391079757
##############################################
	 Hidden layers: 3
	 	 Neurons per layer: 4
Train loss: 0.27363762460100227 +- 0.0207385507502707
Accuracy: 0.8227726926585787 +- 0.04966026398541586
F1-score: 0.8124863456187688 +- 0.05884403393734762
Recall bananas: 0.334669057520933 +- 0.19765387789816402
	 	 Neurons per layer: 8
Train loss: 0.1398741629690068 +- 0.017248251009375704
Accuracy: 0.7916713387089158 +- 0.025552402033975928
F1-score: 0.7894495940925385 +- 0.027894053815988983
Recall bananas: 0.33623865105676676 +- 0.102284

Elegimos 3 capas ocultas, 4 neuronas por capa y un alpha 0.01

Ahora probamos a hacer bagging con 20 estimadores.

In [13]:
errs_acc = []
errs_f1 = []
rec_ban = []
for i in range(5):
    df_train, df_test = split_series_byID(0.75, df_db)
    df_train, df_test = norm_train_test(df_train,df_test,features_to_norm=features)
    xtrain, ytrain = df_train[features].values, df_train['class'].values
    xtest, ytest = df_test[features].values, df_test['class'].values

    clf_nn = MLPClassifier(
        hidden_layer_sizes=(4,4,4),
        max_iter=2000,
        early_stopping=True,
        shuffle=True,
        alpha=0.01,
        learning_rate='adaptive'
        )

    bag = BaggingClassifier(base_estimator=clf_nn,n_estimators=20,n_jobs=3)

    bag.fit(xtrain, ytrain)
    ypred = bag.predict(xtest)
    metric_report(ytest, ypred)
    errs_acc.append(accuracy_score(ytest,ypred))
    errs_f1.append(f1_score(ytest,ypred,average='weighted'))
    rec_ban.append(np.sum(np.logical_and(ytest=='banana',ypred=='banana'))/np.sum(ytest=='banana'))

errs_acc = np.array(errs_acc)
errs_f1 = np.array(errs_f1)
rec_ban = np.array(rec_ban)
print('Accuracy:',np.mean(errs_acc),'+-',np.std(errs_acc))
print('F1-score:',np.mean(errs_f1),'+-',np.std(errs_f1))
print('Recall bananas:',np.mean(rec_ban),'+-',np.std(rec_ban))

TEST SET PROPORTIONS:
	Real background percentage: 0.7741905663284396
	Real banana percentage: 0.11463473686527414
	Real wine percentage: 0.11117469680628618
------------------------------------------
Accuracy: 0.8511610503378602
Recall on background: 0.9603341123222399
Recall on banana: 0.21235743635037058
Recall on wine: 0.7495941397743021
F1-score: 0.828924712641002
TEST SET PROPORTIONS:
	Real background percentage: 0.7683214404684678
	Real banana percentage: 0.10617230920279819
	Real wine percentage: 0.12550625032873397
------------------------------------------
Accuracy: 0.8505049353928152
Recall on background: 0.949272923310381
Recall on banana: 0.1778062172315568
Recall on wine: 0.8149402807850806
F1-score: 0.8289392454642864
TEST SET PROPORTIONS:
	Real background percentage: 0.7268808410219711
	Real banana percentage: 0.0831863759892998
	Real wine percentage: 0.189932782988729
------------------------------------------
Accuracy: 0.8643690205399511
Recall on background: 0.974191

# Windows

In [6]:
features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity',
            'R1_mean', 'R2_mean', 'R3_mean', 'R4_mean', 'R5_mean', 'R6_mean', 'R7_mean',
            'R8_mean', 'Temp._mean', 'Humidity_mean', 'R1_std', 'R2_std', 'R3_std', 'R4_std',
            'R5_std', 'R6_std', 'R7_std', 'R8_std', 'Temp._std', 'Humidity_std']

win_df = pd.read_pickle('../datasets/preprocessed/window120_dataset.pkl')

In [6]:
# Validación simple de 5 ejecuciones para decidir la constante de aprendizaje, el número de capas ocultas y
# el número de neuronas
for alpha in [0.01,0.001]:
    print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<>>>>>>>>>>>>>>>>>')
    print('Alpha:',alpha)
    for n_hid_layers in range(2,5):
        print('##############################################')
        print('\t Hidden layers:',n_hid_layers)
        for n_neur in [2,4,8]:
            print('==============================================')
            print('\t \t Neurons per layer:',n_neur)
            hyper_sim(win_df,5,n_hid_layers,n_neur,alpha,features)
            print('==============================================')

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<>>>>>>>>>>>>>>>>>
Alpha: 0.01
##############################################
	 Hidden layers: 2
	 	 Neurons per layer: 2
Train loss: 0.3369292638080534 +- 0.01898777366073932
Accuracy: 0.8614995653739118 +- 0.013802793278683207
F1-score: 0.8477053916986094 +- 0.02269791360032949
Recall bananas: 0.27286655706901736 +- 0.13537520341231715
	 	 Neurons per layer: 4
Train loss: 0.25103889475688124 +- 0.02103813588063515
Accuracy: 0.8550185358754737 +- 0.01473824536252829
F1-score: 0.8413171659320327 +- 0.011710756562335249
Recall bananas: 0.32072884775245203 +- 0.14954677863898094
	 	 Neurons per layer: 8
Train loss: 0.15008099671145567 +- 0.006708852855497587
Accuracy: 0.8055775903424107 +- 0.021140383711052996
F1-score: 0.8088981076874298 +- 0.022004382367473173
Recall bananas: 0.33186436122857177 +- 0.06900749905851701
##############################################
	 Hidden layers: 3
	 	 Neurons per layer: 2
Train loss: 0.36515380967254024 +- 0.01634684196598

In [9]:
hyper_sim(win_df,10,3,4,0.01,features)

Train loss: 0.24018066820164438 +- 0.023418848195325886
Accuracy: 0.8637574268900602 +- 0.02871614724926834
F1-score: 0.8553617572371873 +- 0.031009063536019333
Recall bananas: 0.39207973741325436 +- 0.092974568441552


Al igual que antes, elegimos 3 capas ocultas, 4 neuronas por capa y un alpha 0.01. Aunque hay mejores resultados, tienen demasiada desviación típica como para ser fiables.

Ahora probamos a hacer bagging con 20 estimadores.

In [7]:
errs_acc = []
errs_f1 = []
rec_ban = []
for i in range(10):
    df_train, df_test = split_series_byID(0.75, win_df)
    df_train, df_test = norm_train_test(df_train,df_test,features_to_norm=features)
    xtrain, ytrain = df_train[features].values, df_train['class'].values
    xtest, ytest = df_test[features].values, df_test['class'].values

    clf_nn = MLPClassifier(
        hidden_layer_sizes=(4,4,4),
        max_iter=2000,
        early_stopping=True,
        shuffle=True,
        alpha=0.01,
        learning_rate='adaptive'
        )

    bag = BaggingClassifier(base_estimator=clf_nn,n_estimators=20,n_jobs=3)

    bag.fit(xtrain, ytrain)
    ypred = bag.predict(xtest)
    metric_report(ytest, ypred)
    errs_acc.append(accuracy_score(ytest,ypred))
    errs_f1.append(f1_score(ytest,ypred,average='weighted'))
    rec_ban.append(np.sum(np.logical_and(ytest=='banana',ypred=='banana'))/np.sum(ytest=='banana'))

errs_acc = np.array(errs_acc)
errs_f1 = np.array(errs_f1)
rec_ban = np.array(rec_ban)
print('Accuracy:',np.mean(errs_acc),'+-',np.std(errs_acc))
print('F1-score:',np.mean(errs_f1),'+-',np.std(errs_f1))
print('Recall bananas:',np.mean(rec_ban),'+-',np.std(rec_ban))

TEST SET PROPORTIONS:
	Real background percentage: 0.7869345926926833
	Real banana percentage: 0.09259234484327131
	Real wine percentage: 0.12047306246404538
------------------------------------------
Accuracy: 0.895500822775496
Recall on background: 0.9620033661447442
Recall on banana: 0.6426335307999808
Recall on wine: 0.65545067555062
F1-score: 0.8939343748402855
TEST SET PROPORTIONS:
	Real background percentage: 0.7070396595801938
	Real banana percentage: 0.09927509418729817
	Real wine percentage: 0.19368524623250807
------------------------------------------
Accuracy: 0.818205900161464
Recall on background: 0.9609517635934368
Recall on banana: 0.16950444726810673
Recall on wine: 0.6296159607494084
F1-score: 0.792719333808237
TEST SET PROPORTIONS:
	Real background percentage: 0.7244155558003745
	Real banana percentage: 0.12466674011237193
	Real wine percentage: 0.1509177040872535
------------------------------------------
Accuracy: 0.8511093973779883
Recall on background: 0.9761904

# Smote

In [24]:
features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity',
            'R1_mean', 'R2_mean', 'R3_mean', 'R4_mean', 'R5_mean', 'R6_mean', 'R7_mean',
            'R8_mean', 'Temp._mean', 'Humidity_mean', 'R1_std', 'R2_std', 'R3_std', 'R4_std',
            'R5_std', 'R6_std', 'R7_std', 'R8_std', 'Temp._std', 'Humidity_std']

win_df = pd.read_pickle('../datasets/preprocessed/window120_dataset.pkl')

over_dict = {'banana': 175000, 'wine': 175000}
under_dict = {'background': 500000}

In [26]:
# Validación simple de 5 ejecuciones para decidir la constante de aprendizaje, el número de capas ocultas y
# el número de neuronas
for alpha in [0.01,0.001]:
    print('<<<<<<<<<<<<<<<<<<<<<<<<<<<<<>>>>>>>>>>>>>>>>>')
    print('Alpha:',alpha)
    for n_hid_layers in range(2,5):
        print('##############################################')
        print('\t Hidden layers:',n_hid_layers)
        for n_neur in [2,4,8]:
            print('==============================================')
            print('\t \t Neurons per layer:',n_neur)
            hyper_sim(win_df,5,n_hid_layers,n_neur,alpha,features,over_dict=over_dict,under_dict=under_dict)
            print('==============================================')

<<<<<<<<<<<<<<<<<<<<<<<<<<<<<>>>>>>>>>>>>>>>>>
Alpha: 0.01
##############################################
	 Hidden layers: 2
	 	 Neurons per layer: 2
Train loss: 0.5488535924240698 +- 0.21046260621523777
Accuracy: 0.8271824672424748 +- 0.0444991777822927
F1-score: 0.8038085542601353 +- 0.07510036911176725
Recall bananas: 0.3496213514627672 +- 0.2131246020306895
	 	 Neurons per layer: 4
Train loss: 0.3185648758281586 +- 0.022205933514169364
Accuracy: 0.8121943421500776 +- 0.029241303750929024
F1-score: 0.819819580056623 +- 0.02658073479470294
Recall bananas: 0.4106811750623657 +- 0.09157163857353492
	 	 Neurons per layer: 8
Train loss: 0.1440066255811115 +- 0.016734622110107483
Accuracy: 0.7561186546743115 +- 0.01949760318613811
F1-score: 0.7584008535355601 +- 0.006767609476800042
Recall bananas: 0.34648036125721304 +- 0.13061643798541048
##############################################
	 Hidden layers: 3
	 	 Neurons per layer: 2
Train loss: 0.4749120460553636 +- 0.07229212674257438
Accur

In [19]:
hyper_sim(win_df,10,2,4,0.001,features,over_dict=over_dict)

Train loss: 0.2549249821288277 +- 0.02396171263553367
Accuracy: 0.8450009759161426 +- 0.038346372438755875
F1-score: 0.833654747589921 +- 0.04890055706443827
Recall bananas: 0.338290871841337 +- 0.1905058852897362


In [20]:
hyper_sim(win_df,10,3,4,0.01,features,over_dict=over_dict)

Train loss: 0.2503299377997425 +- 0.027380070003056072
Accuracy: 0.8248288805993831 +- 0.02328144346620624
F1-score: 0.8104826957935496 +- 0.028213516288456884
Recall bananas: 0.3091208926005941 +- 0.13019222637709468


In [27]:
# Mejor opción
hyper_sim(win_df,10,3,4,0.01,features,over_dict=over_dict,under_dict=under_dict)

Train loss: 0.3174721772856896 +- 0.027246001164953758
Accuracy: 0.8224637333552776 +- 0.034269443248349775
F1-score: 0.8301484949130975 +- 0.03307461316084157
Recall bananas: 0.47073848708416133 +- 0.1181386744211724


In [29]:
errs_acc = []
errs_f1 = []
rec_ban = []
for i in range(5):
    df_train, df_test = split_series_byID(0.75, win_df)
    df_train, df_test = norm_train_test(df_train,df_test,features_to_norm=features)
    xtrain, ytrain = df_train[features].values, df_train['class'].values
    over_sampling = SMOTE(sampling_strategy=over_dict)
    under_sampling = RandomUnderSampler(sampling_strategy=under_dict)
    xtrain, ytrain = over_sampling.fit_resample(xtrain, ytrain)
    xtrain, ytrain = under_sampling.fit_resample(xtrain, ytrain)
    xtest, ytest = df_test[features].values, df_test['class'].values

    clf_nn = MLPClassifier(
        hidden_layer_sizes=(4,4,4),
        max_iter=2000,
        early_stopping=True,
        shuffle=True,
        alpha=0.01,
        learning_rate='adaptive'
        )

    bag = BaggingClassifier(base_estimator=clf_nn,n_estimators=20,n_jobs=3)

    bag.fit(xtrain, ytrain)
    ypred = bag.predict(xtest)
    metric_report(ytest, ypred)
    errs_acc.append(accuracy_score(ytest,ypred))
    errs_f1.append(f1_score(ytest,ypred,average='weighted'))
    rec_ban.append(np.sum(np.logical_and(ytest=='banana',ypred=='banana'))/np.sum(ytest=='banana'))

errs_acc = np.array(errs_acc)
errs_f1 = np.array(errs_f1)
rec_ban = np.array(rec_ban)
print('Accuracy:',np.mean(errs_acc),'+-',np.std(errs_acc))
print('F1-score:',np.mean(errs_f1),'+-',np.std(errs_f1))
print('Recall bananas:',np.mean(rec_ban),'+-',np.std(rec_ban))

TEST SET PROPORTIONS:
	Real background percentage: 0.7261303338023632
	Real banana percentage: 0.1202765016020479
	Real wine percentage: 0.15359316459558886
------------------------------------------
Accuracy: 0.8245588010917659
Recall on background: 0.8768101138744754
Recall on banana: 0.6473801050072236
Recall on wine: 0.716280353200883
F1-score: 0.8326610440983933
TEST SET PROPORTIONS:
	Real background percentage: 0.7762145091408189
	Real banana percentage: 0.10652743536457862
	Real wine percentage: 0.11725805549460241
------------------------------------------
Accuracy: 0.873000916448311
Recall on background: 0.9137284029998836
Recall on banana: 0.4742921765822529
Recall on wine: 0.9656184640223094
F1-score: 0.8733368939127423
TEST SET PROPORTIONS:
	Real background percentage: 0.7684564355994835
	Real banana percentage: 0.11653087574017186
	Real wine percentage: 0.1150126886603446
------------------------------------------
Accuracy: 0.862664173456213
Recall on background: 0.9539231