In [18]:
%load_ext autoreload
%autoreload 2

In [19]:
import math
import time
import pickle
import pandas as pd
import numpy as np
from time import time

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import AdaBoostClassifier

In [20]:
import sys
sys.path.append('../src')
from preprocessing import *

# Splitting the dataset

In [3]:
df_db = group_datafiles_byID('../datasets/raw/HT_Sensor_metadata.dat', '../datasets/raw/HT_Sensor_dataset.dat')
df_db = reclassify_series_samples(df_db)
df_db.head()

Unnamed: 0,time,R1,R2,R3,R4,R5,R6,R7,R8,Temp.,Humidity,id,date,class,t0,dt
0,12.49025,12.8621,10.3683,10.4383,11.6699,13.4931,13.3423,8.04169,8.73901,26.2257,59.0528,0,07-04-15,background,13.49,1.64
1,12.490528,12.8617,10.3682,10.4375,11.6697,13.4927,13.3412,8.04133,8.73908,26.2308,59.0299,0,07-04-15,background,13.49,1.64
2,12.490806,12.8607,10.3686,10.437,11.6696,13.4924,13.3405,8.04101,8.73915,26.2365,59.0093,0,07-04-15,background,13.49,1.64
3,12.491084,12.8602,10.3686,10.437,11.6697,13.4921,13.3398,8.04086,8.73936,26.2416,58.9905,0,07-04-15,background,13.49,1.64
4,12.491373,12.8595,10.3688,10.4374,11.6699,13.4919,13.339,8.04087,8.73986,26.2462,58.9736,0,07-04-15,background,13.49,1.64


In [7]:
df_train, df_test = split_series_byID(100, 0.75, df_db)
features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']
xtrain, ytrain = df_train[features].values, df_train['class'].values
xtest, ytest = df_test[features].values, df_test['class'].values

# Basic Neural Network

In [11]:
def printResults(n_hid_layers,n_neur,accuracy,elapsed):
    print('========================================')
    print('Number of hidden layers:', n_hid_layers)
    print('Number of neurons per layer:', n_neur)
    print('Accuracy:', accuracy)
    print('Time (minutes):', (elapsed)/60)
    
def printScores(xtest,ytest,clf):
    xback, yback = xtest[ytest=='background'], ytest[ytest=='background']
    print('Score del background:', clf.score(xback,yback))
    xrest, yrest = xtest[ytest!='background'], ytest[ytest!='background']
    print('Score del resto:', clf.score(xrest,yrest))
    num_back = len(yback)
    num_wine = len(yrest[yrest=='wine'])
    num_banana = len(yrest[yrest=='banana'])
    func = lambda x: 1/num_back if x=='background' else (1/num_wine if x=='wine' else 1/num_banana)
    weights = np.array([func(x) for x in ytest])
    # Score donde las tres clases ponderan igual
    print('Score con pesos:', clf.score(xtest,ytest,weights))
    print('========================================')

In [10]:
# NN with 2 hidden layers and 15 neurons per layer
start = time()

clf = MLPClassifier(hidden_layer_sizes=(15,15))
clf.fit(xtrain,ytrain)
score = clf.score(xtest,ytest)

final = time()

printResults(2,15,score,final-start)



Number of hidden layers: 2
Number of neurons per layer: 15
Accuracy: 0.8353240753965501
Time (minutes): 14.562885828812917


In [38]:
# Adding early stopping and more iterations
start = time()

clf = MLPClassifier(hidden_layer_sizes=(15,15),early_stopping=True,max_iter=2000)
clf.fit(xtrain,ytrain)
score = clf.score(xtest,ytest)

final = time()

printResults(2,15,score,final-start)

Number of hidden layers: 2
Number of neurons per layer: 15
Accuracy: 0.8043631733802351
Time (minutes): 8.405658360322317


In [47]:
# Análisis del score
print('Proporcion de background:',len(ytest[ytest=='background'])/len(ytest))
printScores(xtest,ytest,clf)

Proporcion de background: 0.8182092707232372
Score del background: 0.9399310733505234
Score del resto: 0.19419502559037444
Score con pesos: 0.44563333558968055


Demasiado sesgo hacia el background, hay que reducirlo aunque el score baje

# Removing excess of background

In [31]:
# prop: ejemplos que no son background que habrá por cada ejemplo de background
def remove_bg(df,prop=2):
    new_df = df[df['class']!='background'].copy()
    useful_samples = new_df.shape[0]
    new_df = new_df.append(df[df['class']=='background'].sample(n=int(useful_samples/2)).copy())
    return new_df

In [93]:
# Para evitar el sesgo quitamos elementos clasificados como background, pero solo en el train set
df_train, df_test = split_series_byID(100, 0.75, df_db)
df_train = remove_bg(df_train)

In [94]:
features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']
xtrain, ytrain = df_train[features].values, df_train['class'].values
xtest, ytest = df_test[features].values, df_test['class'].values

In [95]:
start = time()

clf = MLPClassifier(hidden_layer_sizes=(15,15),early_stopping=True,max_iter=2000)
clf.fit(xtrain,ytrain)
score = clf.score(xtest,ytest)

final = time()

printResults(2,15,score,final-start)

Number of hidden layers: 2
Number of neurons per layer: 15
Accuracy: 0.6814555008652668
Time (minutes): 2.026431759198507


In [96]:
# Análisis del score
printScores(xtest,ytest,clf)

Score del background: 0.7280450951862517
Score del resto: 0.4577521557515206
Score con pesos: 0.5598611811734319


Aunque se ponga la misma cantidad de background que de bananas o wine sigue habiendo un sesgo hacia el background.

# Hyperparameter analysis

In [28]:
def hyper_analysis(df_db,
                   hid_layers,
                   neurons,
                   clf_nn,
                   printScores=printScores,
                   features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity'],
                   rem_bg=True):
    df_train, df_test = split_series_byID(100, 0.75, df_db)
    if rem_bg:
        df_train = remove_bg(df_train)
    xtrain, ytrain = df_train[features].values, df_train['class'].values
    xtest, ytest = df_test[features].values, df_test['class'].values

    start_total = time()

    for n_hid_layers in hid_layers:
        for n_neur in neurons:
            tup = []
            for i in range(n_hid_layers):
                tup.append(n_neur)
            tup = tuple(tup)

            start = time()

            clf_nn.fit(xtrain, ytrain)
            score = clf_nn.score(xtest, ytest)
            final = time()

            printResults(n_hid_layers,n_neur,score,final-start)
            printScores(xtest,ytest,clf_nn)


    end_total = time()
    print('\n====> Total tiempo transcurrido (horas):', (end_total-start_total)/(60*60))

In [97]:
df_train, df_test = split_series_byID(100, 0.75, df_db)
df_train = remove_bg(df_train)
features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']
xtrain, ytrain = df_train[features].values, df_train['class'].values
xtest, ytest = df_test[features].values, df_test['class'].values

start_total = time()

for n_hid_layers in range(1,5):
    for n_neur in [5,10,15,20,40]:
        tup = []
        for i in range(n_hid_layers):
            tup.append(n_neur)
        tup = tuple(tup)

        start = time()

        clf_nn = MLPClassifier(
                    hidden_layer_sizes = tup,
                    max_iter=2000,
                    early_stopping=True
                )

        clf_nn.fit(xtrain, ytrain)
        score = clf_nn.score(xtest, ytest)
        final = time()
        
        printResults(n_hid_layers,n_neur,score,final-start)
        printScores(xtest,ytest,clf_nn)
        

end_total = time()
print('\n====> Total tiempo transcurrido (horas):', (end_total-start_total)/(60*60))

Number of hidden layers: 1
Number of neurons per layer: 5
Accuracy: 0.6464538000436101
Time (minutes): 0.7238489230473836
Score del background: 0.6517030942511409
Score del resto: 0.6271659070366737
Score con pesos: 0.6350169014100797
Number of hidden layers: 1
Number of neurons per layer: 10
Accuracy: 0.7221131969900808
Time (minutes): 0.8069314082463582
Score del background: 0.7455093581208391
Score del resto: 0.636146848977865
Score con pesos: 0.6533235110759776
Number of hidden layers: 1
Number of neurons per layer: 15
Accuracy: 0.7306583066941492
Time (minutes): 1.698828891913096
Score del background: 0.7619541096177197
Score del resto: 0.6156656858785746
Score con pesos: 0.6508640130517963
Number of hidden layers: 1
Number of neurons per layer: 20
Accuracy: 0.6364687344433336
Time (minutes): 0.6529813011487325
Score del background: 0.6559948498932295
Score del resto: 0.5647223985076636
Score con pesos: 0.5929886370444567
Number of hidden layers: 1
Number of neurons per layer: 40


# Two Neural Networks

## 1. Classify background

In [24]:
def printScoresBack(xtest,ytest,clf):
    xback, yback = xtest[ytest=='background'], ytest[ytest=='background']
    print('Score del background:', clf.score(xback,yback))
    xrest, yrest = xtest[ytest!='background'], ytest[ytest!='background']
    print('Score del resto:', clf.score(xrest,yrest))
    num_back = len(yback)
    num_rest = len(ytest)-num_back
    func = lambda x: 1/num_back if x=='background' else 1/num_rest
    weights = np.array([func(x) for x in ytest])
    # Score donde las tres clases ponderan igual
    print('Score con pesos:', clf.score(xtest,ytest,weights))
    print('========================================')

In [7]:
df_db = group_datafiles_byID('../datasets/raw/HT_Sensor_metadata.dat', '../datasets/raw/HT_Sensor_dataset.dat')
df_db = reclassify_series_samples(df_db)
df_db.loc[df_db['class']!='background','class'] = 'not-background'
df_db[df_db['class']!='background'].head()

Unnamed: 0,time,R1,R2,R3,R4,R5,R6,R7,R8,Temp.,Humidity,id,date,class,t0,dt
3429,13.490021,12.8102,10.3665,10.4529,11.6742,13.4941,13.2749,8.30531,9.04553,26.4234,59.4725,0,07-04-15,not-background,13.49,1.64
3430,13.490309,12.8097,10.3655,10.4523,11.6734,13.4934,13.274,8.30527,9.04545,26.4241,59.4745,0,07-04-15,not-background,13.49,1.64
3431,13.490587,12.8088,10.3645,10.4516,11.6731,13.493,13.273,8.30523,9.04538,26.4246,59.4763,0,07-04-15,not-background,13.49,1.64
3432,13.490865,12.808,10.3638,10.4508,11.6727,13.4922,13.2719,8.3052,9.04516,26.4251,59.4779,0,07-04-15,not-background,13.49,1.64
3433,13.491144,12.8078,10.3628,10.4503,11.6722,13.4914,13.2708,8.30517,9.04511,26.4256,59.4793,0,07-04-15,not-background,13.49,1.64


In [25]:
# Primero probamos a no quitar el exceso de background

df_train, df_test = split_series_byID(100, 0.75, df_db)
features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']
xtrain, ytrain = df_train[features].values, df_train['class'].values
xtest, ytest = df_test[features].values, df_test['class'].values

start_total = time.time()

for n_hid_layers in range(1,5):
    for n_neur in [5,10,15,20,40]:
        tup = []
        for i in range(n_hid_layers):
            tup.append(n_neur)
        tup = tuple(tup)

        start = time.time()

        clf_nn = MLPClassifier(
                    hidden_layer_sizes = tup,
                    max_iter=2000,
                    early_stopping=True
                )

        clf_nn.fit(xtrain, ytrain)
        score = clf_nn.score(xtest, ytest)
        final = time.time()
        
        printResults(n_hid_layers,n_neur,score,final-start)
        printScoresBack(xtest,ytest,clf_nn)
        

end_total = time.time()
print('\n====> Total tiempo transcurrido (horas):', (end_total-start_total)/(60*60))
# En más de la mitad de ocasiones aquellos datos que no son background son clasificados erroneamente.
# Veamos si es cuestión de quitar background.

Number of hidden layers: 1
Number of neurons per layer: 5
Accuracy: 0.8569467252918013
Time (minutes): 1.2730913956960042
Score del background: 0.9394004860923576
Score del resto: 0.4879634553100981
Score con pesos: 0.7136819707012279
Number of hidden layers: 1
Number of neurons per layer: 10
Accuracy: 0.8372843495611944
Time (minutes): 1.3523353179295858
Score del background: 0.908695652173913
Score del resto: 0.5177164402765021
Score con pesos: 0.7132060462252077
Number of hidden layers: 1
Number of neurons per layer: 15
Accuracy: 0.849018205576451
Time (minutes): 1.8269388278325398
Score del background: 0.9274858223062382
Score del resto: 0.4978730603760816
Score con pesos: 0.7126794413411598
Number of hidden layers: 1
Number of neurons per layer: 20
Accuracy: 0.879668379509456
Time (minutes): 5.245651348431905
Score del background: 0.9582068593032677
Score del resto: 0.5282061197853725
Score con pesos: 0.7432064895443201
Number of hidden layers: 1
Number of neurons per layer: 40
Ac

In [32]:
# Ahora, lo mismo quitando el exceso de background
df_train, df_test = split_series_byID(100, 0.75, df_db)
df_train = remove_bg(df_train,prop=1)
features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']
xtrain, ytrain = df_train[features].values, df_train['class'].values
xtest, ytest = df_test[features].values, df_test['class'].values

start_total = time.time()

for n_hid_layers in range(1,5):
    for n_neur in [5,10,15,20,40]:
        tup = []
        for i in range(n_hid_layers):
            tup.append(n_neur)
        tup = tuple(tup)

        start = time.time()

        clf_nn = MLPClassifier(
                    hidden_layer_sizes = tup,
                    max_iter=2000,
                    early_stopping=True,
                    shuffle=True
                )

        clf_nn.fit(xtrain, ytrain)
        score = clf_nn.score(xtest, ytest)
        final = time.time()
        
        printResults(n_hid_layers,n_neur,score,final-start)
        printScoresBack(xtest,ytest,clf_nn)
        

end_total = time.time()
print('\n====> Total tiempo transcurrido (horas):', (end_total-start_total)/(60*60))

Number of hidden layers: 1
Number of neurons per layer: 5
Accuracy: 0.5801633486905403
Time (minutes): 0.4300204555193583
Score del background: 0.5223014632760336
Score del resto: 0.7499749841566326
Score con pesos: 0.6361382237163332
Number of hidden layers: 1
Number of neurons per layer: 10
Accuracy: 0.566574975523127
Time (minutes): 0.21988988320032757
Score del background: 0.5181815598806648
Score del resto: 0.7085987792268437
Score con pesos: 0.6133901695537544
Number of hidden layers: 1
Number of neurons per layer: 15
Accuracy: 0.5998508076308506
Time (minutes): 0.598010802268982
Score del background: 0.5622503196476772
Score del resto: 0.7101997932023615
Score con pesos: 0.6362250564250194
Number of hidden layers: 1
Number of neurons per layer: 20
Accuracy: 0.6004441863717856
Time (minutes): 0.21585826476415
Score del background: 0.5434919732916608
Score del resto: 0.7675861378873287
Score con pesos: 0.6555390555894948
Number of hidden layers: 1
Number of neurons per layer: 40
A

## 2. Classify wine and bananas

In [34]:
df_db = group_datafiles_byID('../datasets/raw/HT_Sensor_metadata.dat', '../datasets/raw/HT_Sensor_dataset.dat')
df_db = reclassify_series_samples(df_db)
df_db = df_db[df_db['class']!='background']
df_db.head()

Unnamed: 0,time,R1,R2,R3,R4,R5,R6,R7,R8,Temp.,Humidity,id,date,class,t0,dt
3429,13.490021,12.8102,10.3665,10.4529,11.6742,13.4941,13.2749,8.30531,9.04553,26.4234,59.4725,0,07-04-15,banana,13.49,1.64
3430,13.490309,12.8097,10.3655,10.4523,11.6734,13.4934,13.274,8.30527,9.04545,26.4241,59.4745,0,07-04-15,banana,13.49,1.64
3431,13.490587,12.8088,10.3645,10.4516,11.6731,13.493,13.273,8.30523,9.04538,26.4246,59.4763,0,07-04-15,banana,13.49,1.64
3432,13.490865,12.808,10.3638,10.4508,11.6727,13.4922,13.2719,8.3052,9.04516,26.4251,59.4779,0,07-04-15,banana,13.49,1.64
3433,13.491144,12.8078,10.3628,10.4503,11.6722,13.4914,13.2708,8.30517,9.04511,26.4256,59.4793,0,07-04-15,banana,13.49,1.64


In [36]:
df_train, df_test = split_series_byID(100, 0.75, df_db)
features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']
xtrain, ytrain = df_train[features].values, df_train['class'].values
xtest, ytest = df_test[features].values, df_test['class'].values

start_total = time.time()

for n_hid_layers in range(1,5):
    for n_neur in [5,10,15,20,40]:
        tup = []
        for i in range(n_hid_layers):
            tup.append(n_neur)
        tup = tuple(tup)

        start = time.time()

        clf_nn = MLPClassifier(
                    hidden_layer_sizes = tup,
                    max_iter=2000,
                    early_stopping=True,
                    shuffle=True
                )

        clf_nn.fit(xtrain, ytrain)
        score = clf_nn.score(xtest, ytest)
        final = time.time()
        
        printResults(n_hid_layers,n_neur,score,final-start)
        

end_total = time.time()
print('\n====> Total tiempo transcurrido (horas):', (end_total-start_total)/(60*60))

Number of hidden layers: 1
Number of neurons per layer: 5
Accuracy: 0.7908191826833364
Time (minutes): 0.12984011173248292
Number of hidden layers: 1
Number of neurons per layer: 10
Accuracy: 0.7634353424146296
Time (minutes): 0.47473315795262655
Number of hidden layers: 1
Number of neurons per layer: 15
Accuracy: 0.7635053181563725
Time (minutes): 0.5810888727506002
Number of hidden layers: 1
Number of neurons per layer: 20
Accuracy: 0.7582804627729054
Time (minutes): 0.370017139116923
Number of hidden layers: 1
Number of neurons per layer: 40
Accuracy: 0.7931517074080985
Time (minutes): 1.0173205216725667
Number of hidden layers: 2
Number of neurons per layer: 5
Accuracy: 0.6941593580891957
Time (minutes): 0.43400735457738243
Number of hidden layers: 2
Number of neurons per layer: 10
Accuracy: 0.6807473409218138
Time (minutes): 0.7764243205388387
Number of hidden layers: 2
Number of neurons per layer: 15
Accuracy: 0.7795297630154879
Time (minutes): 1.0063043196996053
Number of hidden