In [1]:
# Enable IPython's autoreload extension. Mode 2 reloads all modules before each
# cell execution, so edits to the helper modules imported from ../src are
# picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import math
import time
import pickle
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.metrics import confusion_matrix, f1_score, accuracy_score
from sklearn.model_selection import cross_val_score

import sys
sys.path.append('../src')
from preprocessing import *
from plotting import *
from utils import *

In [3]:
# Resampling utilities from imbalanced-learn (SMOTE over-sampling, random under-sampling)
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

# Raw dataset

In [13]:
# Build the raw data frame from the metadata file and the sensor readings
# file, then pass it through reclassify_series_samples.
# NOTE(review): both helpers come from ../src; presumably the second one
# assigns per-sample class labels -- confirm against its implementation.
raw_metadata_path = '../datasets/raw/HT_Sensor_metadata.dat'
raw_readings_path = '../datasets/raw/HT_Sensor_dataset.dat'
df_db_raw = reclassify_series_samples(
    group_datafiles_byID(raw_metadata_path, raw_readings_path)
)

In [19]:
# Random forest on the raw sensor readings.
# Seven train/test splits are drawn; on each split the forest is scored twice:
# once trained on the split as drawn, and once after re-balancing the training
# data with SMOTE followed by random under-sampling. The test data is never
# resampled.
# NOTE(review): split_series_byID is a project helper; 0.8 is presumably the
# training fraction and the split is presumably made per series ID -- confirm.
features = ['R' + str(k) for k in range(1, 9)] + ['Temp.', 'Humidity']

# Despite the "errs_" prefix these lists collect scores (accuracy / weighted
# F1), not error rates.
errs_acc, errs_f1 = [], []
errs_acc_smote, errs_f1_smote = [], []

# Per-class sample counts requested from the resamplers.
over_dict = dict(banana=175000, wine=175000)
under_dict = dict(background=500000)

# One set of hyper-parameters shared by the baseline and the re-balanced forest.
rfc_kwargs = dict(n_estimators=400, criterion='gini', max_depth=6, n_jobs=-1)

for _ in range(7):
    df_train_raw, df_test_raw = split_series_byID(0.8, df_db_raw)
    xtrain = df_train_raw[features].values
    ytrain = df_train_raw['class'].values
    xtest = df_test_raw[features].values
    ytest = df_test_raw['class'].values

    # Baseline: train on the split exactly as drawn.
    rfc = RandomForestClassifier(**rfc_kwargs)
    rfc.fit(xtrain, ytrain)
    y_pred = rfc.predict(xtest)
    errs_acc.append(accuracy_score(ytest, y_pred))
    errs_f1.append(f1_score(ytest, y_pred, average='weighted'))

    # Re-balanced: over-sample first, then under-sample, then refit.
    oversample = SMOTE(sampling_strategy=over_dict)
    undersample = RandomUnderSampler(sampling_strategy=under_dict)
    xtrain, ytrain = oversample.fit_resample(xtrain, ytrain)
    xtrain, ytrain = undersample.fit_resample(xtrain, ytrain)
    rfc = RandomForestClassifier(**rfc_kwargs)
    rfc.fit(xtrain, ytrain)
    y_pred = rfc.predict(xtest)
    errs_acc_smote.append(accuracy_score(ytest, y_pred))
    errs_f1_smote.append(f1_score(ytest, y_pred, average='weighted'))

# Convert to arrays so mean/std over the seven runs can be reported.
errs_acc, errs_f1 = np.asarray(errs_acc), np.asarray(errs_f1)
errs_acc_smote, errs_f1_smote = np.asarray(errs_acc_smote), np.asarray(errs_f1_smote)

for label, scores in (('Accuracy:', errs_acc),
                      ('f1-score:', errs_f1),
                      ('Accuracy (smote):', errs_acc_smote),
                      ('f1-score (smote):', errs_f1_smote)):
    print(label, scores.mean(), '+-', scores.std())

Accuracy: 0.8572143593668403 +- 0.025515978212783497
f1-score: 0.8190647327201015 +- 0.03829195828833234
Accuracy (smote): 0.8458565971888294 +- 0.031579337552137786
f1-score (smote): 0.8221617082189414 +- 0.03645957483160785


# Raw dataset normalized

In [20]:
# Random forest on the raw sensor readings, with each train/test split passed
# through norm_train_test before fitting.
# Seven splits are drawn; on each split the forest is scored twice: once
# trained on the normalized split as-is, and once after re-balancing the
# training data with SMOTE followed by random under-sampling.
# NOTE(review): split_series_byID and norm_train_test are project helpers;
# presumably the normalization statistics come from the training part only --
# confirm in ../src.
features = ['R' + str(k) for k in range(1, 9)] + ['Temp.', 'Humidity']

# Despite the "errs_" prefix these lists collect scores (accuracy / weighted
# F1), not error rates.
errs_acc, errs_f1 = [], []
errs_acc_smote, errs_f1_smote = [], []

# Per-class sample counts requested from the resamplers.
over_dict = dict(banana=175000, wine=175000)
under_dict = dict(background=500000)

# One set of hyper-parameters shared by the baseline and the re-balanced forest.
rfc_kwargs = dict(n_estimators=400, criterion='gini', max_depth=6, n_jobs=-1)

for _ in range(7):
    df_train_raw, df_test_raw = split_series_byID(0.8, df_db_raw)
    df_train_raw, df_test_raw = norm_train_test(df_train_raw, df_test_raw, features)
    xtrain = df_train_raw[features].values
    ytrain = df_train_raw['class'].values
    xtest = df_test_raw[features].values
    ytest = df_test_raw['class'].values

    # Baseline: train on the normalized split exactly as drawn.
    rfc = RandomForestClassifier(**rfc_kwargs)
    rfc.fit(xtrain, ytrain)
    y_pred = rfc.predict(xtest)
    errs_acc.append(accuracy_score(ytest, y_pred))
    errs_f1.append(f1_score(ytest, y_pred, average='weighted'))

    # Re-balanced: over-sample first, then under-sample, then refit.
    oversample = SMOTE(sampling_strategy=over_dict)
    undersample = RandomUnderSampler(sampling_strategy=under_dict)
    xtrain, ytrain = oversample.fit_resample(xtrain, ytrain)
    xtrain, ytrain = undersample.fit_resample(xtrain, ytrain)
    rfc = RandomForestClassifier(**rfc_kwargs)
    rfc.fit(xtrain, ytrain)
    y_pred = rfc.predict(xtest)
    errs_acc_smote.append(accuracy_score(ytest, y_pred))
    errs_f1_smote.append(f1_score(ytest, y_pred, average='weighted'))

# Convert to arrays so mean/std over the seven runs can be reported.
errs_acc, errs_f1 = np.asarray(errs_acc), np.asarray(errs_f1)
errs_acc_smote, errs_f1_smote = np.asarray(errs_acc_smote), np.asarray(errs_f1_smote)

for label, scores in (('Accuracy:', errs_acc),
                      ('f1-score:', errs_f1),
                      ('Accuracy (smote):', errs_acc_smote),
                      ('f1-score (smote):', errs_f1_smote)):
    print(label, scores.mean(), '+-', scores.std())

Accuracy: 0.8201521580939907 +- 0.01508803526206297
f1-score: 0.7765723760844815 +- 0.023235452515250485
Accuracy (smote): 0.7955777021534507 +- 0.026829736189149724
f1-score (smote): 0.770908003822002 +- 0.028783269484552905


# Preprocessed dataset (not normalized)

In [4]:
# Load the pickled, preprocessed data frame.
# NOTE: pickle can execute arbitrary code while loading; only open files that
# were produced by this project.
window_dataset_path = '../datasets/preprocessed/window120_dataset.pkl'
with open(window_dataset_path, 'rb') as pkl_file:
    df_db = pickle.load(pkl_file)

In [5]:
# Random forest on the preprocessed dataset (readings plus their "_mean" and
# "_std" companion columns).
# Seven train/test splits are drawn; on each split the forest is scored twice:
# once trained on the split as drawn, and once after re-balancing the training
# data with SMOTE followed by random under-sampling. The test data is never
# resampled.
# NOTE(review): split_series_byID is a project helper; 0.8 is presumably the
# training fraction and the split is presumably made per series ID -- confirm.
sensor_cols = ['R' + str(k) for k in range(1, 9)] + ['Temp.', 'Humidity']
features = (sensor_cols
            + [col + '_mean' for col in sensor_cols]
            + [col + '_std' for col in sensor_cols])

# Despite the "errs_" prefix these lists collect scores (accuracy / weighted
# F1), not error rates.
errs_acc, errs_f1 = [], []
errs_acc_smote, errs_f1_smote = [], []

# Per-class sample counts requested from the resamplers.
over_dict = dict(banana=175000, wine=175000)
under_dict = dict(background=500000)

# One set of hyper-parameters shared by the baseline and the re-balanced forest.
rfc_kwargs = dict(n_estimators=500, criterion='entropy', max_depth=7, n_jobs=-1)

for _ in range(7):
    df_train, df_test = split_series_byID(0.8, df_db)
    xtrain = df_train[features].values
    ytrain = df_train['class'].values
    xtest = df_test[features].values
    ytest = df_test['class'].values

    # Baseline: train on the split exactly as drawn.
    rfc = RandomForestClassifier(**rfc_kwargs)
    rfc.fit(xtrain, ytrain)
    y_pred = rfc.predict(xtest)
    errs_acc.append(accuracy_score(ytest, y_pred))
    errs_f1.append(f1_score(ytest, y_pred, average='weighted'))

    # Re-balanced: over-sample first, then under-sample, then refit.
    oversample = SMOTE(sampling_strategy=over_dict)
    undersample = RandomUnderSampler(sampling_strategy=under_dict)
    xtrain, ytrain = oversample.fit_resample(xtrain, ytrain)
    xtrain, ytrain = undersample.fit_resample(xtrain, ytrain)
    rfc = RandomForestClassifier(**rfc_kwargs)
    rfc.fit(xtrain, ytrain)
    y_pred = rfc.predict(xtest)
    errs_acc_smote.append(accuracy_score(ytest, y_pred))
    errs_f1_smote.append(f1_score(ytest, y_pred, average='weighted'))

# Convert to arrays so mean/std over the seven runs can be reported.
errs_acc, errs_f1 = np.asarray(errs_acc), np.asarray(errs_f1)
errs_acc_smote, errs_f1_smote = np.asarray(errs_acc_smote), np.asarray(errs_f1_smote)

for label, scores in (('Accuracy:', errs_acc),
                      ('f1-score:', errs_f1),
                      ('Accuracy (smote):', errs_acc_smote),
                      ('f1-score (smote):', errs_f1_smote)):
    print(label, scores.mean(), '+-', scores.std())

Accuracy: 0.842697679663286 +- 0.03612150086276967
f1-score: 0.7997203574301185 +- 0.044495086486672615
Accuracy (smote): 0.8373088255373812 +- 0.03845074698631557
f1-score (smote): 0.821415832735829 +- 0.03494529439940311


# Preprocessed dataset normalized

In [6]:
# Random forest on the preprocessed dataset (readings plus their "_mean" and
# "_std" companion columns), with each train/test split passed through
# norm_train_test before fitting.
# Seven splits are drawn; on each split the forest is scored twice: once
# trained on the normalized split as-is, and once after re-balancing the
# training data with SMOTE followed by random under-sampling.
# NOTE(review): split_series_byID and norm_train_test are project helpers;
# presumably the normalization statistics come from the training part only --
# confirm in ../src.
sensor_cols = ['R' + str(k) for k in range(1, 9)] + ['Temp.', 'Humidity']
features = (sensor_cols
            + [col + '_mean' for col in sensor_cols]
            + [col + '_std' for col in sensor_cols])

# Despite the "errs_" prefix these lists collect scores (accuracy / weighted
# F1), not error rates.
errs_acc, errs_f1 = [], []
errs_acc_smote, errs_f1_smote = [], []

# Per-class sample counts requested from the resamplers.
over_dict = dict(banana=175000, wine=175000)
under_dict = dict(background=500000)

# One set of hyper-parameters shared by the baseline and the re-balanced forest.
rfc_kwargs = dict(n_estimators=500, criterion='entropy', max_depth=7, n_jobs=-1)

for _ in range(7):
    df_train, df_test = split_series_byID(0.8, df_db)
    df_train, df_test = norm_train_test(df_train, df_test, features)
    xtrain = df_train[features].values
    ytrain = df_train['class'].values
    xtest = df_test[features].values
    ytest = df_test['class'].values

    # Baseline: train on the normalized split exactly as drawn.
    rfc = RandomForestClassifier(**rfc_kwargs)
    rfc.fit(xtrain, ytrain)
    y_pred = rfc.predict(xtest)
    errs_acc.append(accuracy_score(ytest, y_pred))
    errs_f1.append(f1_score(ytest, y_pred, average='weighted'))

    # Re-balanced: over-sample first, then under-sample, then refit.
    oversample = SMOTE(sampling_strategy=over_dict)
    undersample = RandomUnderSampler(sampling_strategy=under_dict)
    xtrain, ytrain = oversample.fit_resample(xtrain, ytrain)
    xtrain, ytrain = undersample.fit_resample(xtrain, ytrain)
    rfc = RandomForestClassifier(**rfc_kwargs)
    rfc.fit(xtrain, ytrain)
    y_pred = rfc.predict(xtest)
    errs_acc_smote.append(accuracy_score(ytest, y_pred))
    errs_f1_smote.append(f1_score(ytest, y_pred, average='weighted'))

# Convert to arrays so mean/std over the seven runs can be reported.
errs_acc, errs_f1 = np.asarray(errs_acc), np.asarray(errs_f1)
errs_acc_smote, errs_f1_smote = np.asarray(errs_acc_smote), np.asarray(errs_f1_smote)

for label, scores in (('Accuracy:', errs_acc),
                      ('f1-score:', errs_f1),
                      ('Accuracy (smote):', errs_acc_smote),
                      ('f1-score (smote):', errs_f1_smote)):
    print(label, scores.mean(), '+-', scores.std())

Accuracy: 0.8588394806584356 +- 0.029749994213194787
f1-score: 0.820860741656366 +- 0.040412007031441224
Accuracy (smote): 0.8568151494744827 +- 0.020655782451296067
f1-score (smote): 0.8451657203060917 +- 0.022927579949338098


In [7]:
def run_rf_experiment(df, features, rfc_params, title, normalize,
                      over_dict, under_dict, n_runs=7, train_frac=0.8):
    """Score a random forest over repeated train/test splits, with and without re-balancing.

    For each of ``n_runs`` splits the forest is trained and scored twice on the
    same test data: once on the training data as drawn (baseline), and once
    after the training data has been over-sampled with SMOTE and then
    under-sampled with RandomUnderSampler. A summary (mean and standard
    deviation of every score over the runs) is printed under a
    ``==== title ====`` header.

    Parameters
    ----------
    df : data frame holding the ``features`` columns and a ``'class'`` column.
    features : list of column names used as model inputs.
    rfc_params : dict of keyword arguments for ``RandomForestClassifier``.
    title : text shown in the printed header.
    normalize : when true, every split is passed through ``norm_train_test``
        before the arrays are extracted.
    over_dict : ``sampling_strategy`` handed to ``SMOTE``.
    under_dict : ``sampling_strategy`` handed to ``RandomUnderSampler``.
    n_runs : number of splits to evaluate.
    train_frac : first argument of ``split_series_byID``.
        NOTE(review): presumably the fraction of series used for training --
        confirm against the helper in ../src.

    Returns
    -------
    tuple of four numpy arrays, one entry per run:
    ``(accuracy, f1, accuracy_smote, f1_smote)``.
    """
    acc_base, f1_base, acc_smote, f1_smote = [], [], [], []

    for _ in range(n_runs):
        df_train, df_test = split_series_byID(train_frac, df)
        if normalize:
            df_train, df_test = norm_train_test(df_train, df_test, features)
        xtrain, ytrain = df_train[features].values, df_train['class'].values
        xtest, ytest = df_test[features].values, df_test['class'].values

        # Baseline: train on the split exactly as drawn.
        y_pred = RandomForestClassifier(**rfc_params).fit(xtrain, ytrain).predict(xtest)
        acc_base.append(accuracy_score(ytest, y_pred))
        f1_base.append(f1_score(ytest, y_pred, average='weighted'))

        # Re-balanced: only the training data is resampled (over-sampling
        # first, then under-sampling); the test data stays untouched.
        xtrain, ytrain = SMOTE(sampling_strategy=over_dict).fit_resample(xtrain, ytrain)
        xtrain, ytrain = RandomUnderSampler(sampling_strategy=under_dict).fit_resample(xtrain, ytrain)
        y_pred = RandomForestClassifier(**rfc_params).fit(xtrain, ytrain).predict(xtest)
        acc_smote.append(accuracy_score(ytest, y_pred))
        f1_smote.append(f1_score(ytest, y_pred, average='weighted'))

    results = tuple(np.asarray(scores)
                    for scores in (acc_base, f1_base, acc_smote, f1_smote))

    print('====', title, '====')
    labels = ('Accuracy:', 'f1-score:', 'Accuracy (smote):', 'f1-score (smote):')
    for label, scores in zip(labels, results):
        print(label, scores.mean(), '+-', scores.std())
    return results


# Per-class sample counts requested from the resamplers (shared by all runs).
over_dict = {'banana': 175000, 'wine': 175000}
under_dict = {'background': 500000}

# ---- Raw dataset ---------------------------------------------------------
df_db_raw = group_datafiles_byID('../datasets/raw/HT_Sensor_metadata.dat',
                                 '../datasets/raw/HT_Sensor_dataset.dat')
df_db_raw = reclassify_series_samples(df_db_raw)

features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']
rfc_params = dict(n_estimators=400, criterion='gini', max_depth=6, n_jobs=-1)

# NOTE: the "errs_" names hold scores (accuracy / weighted F1), not error
# rates; they are kept so the notebook namespace matches the earlier cells.
errs_acc, errs_f1, errs_acc_smote, errs_f1_smote = run_rf_experiment(
    df_db_raw, features, rfc_params, 'Raw not norm', False, over_dict, under_dict)

errs_acc, errs_f1, errs_acc_smote, errs_f1_smote = run_rf_experiment(
    df_db_raw, features, rfc_params, 'Raw normalized', True, over_dict, under_dict)

# ---- Preprocessed dataset ------------------------------------------------
# NOTE: pickle can execute arbitrary code while loading; only open files that
# were produced by this project.
with open('../datasets/preprocessed/window120_dataset.pkl', 'rb') as f:
    df_db = pickle.load(f)

features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity',
            'R1_mean', 'R2_mean', 'R3_mean', 'R4_mean', 'R5_mean', 'R6_mean', 'R7_mean',
            'R8_mean', 'Temp._mean', 'Humidity_mean', 'R1_std', 'R2_std', 'R3_std', 'R4_std',
            'R5_std', 'R6_std', 'R7_std', 'R8_std', 'Temp._std', 'Humidity_std']
rfc_params = dict(n_estimators=500, criterion='entropy', max_depth=7, n_jobs=-1)

errs_acc, errs_f1, errs_acc_smote, errs_f1_smote = run_rf_experiment(
    df_db, features, rfc_params, 'Prep not norm', False, over_dict, under_dict)

errs_acc, errs_f1, errs_acc_smote, errs_f1_smote = run_rf_experiment(
    df_db, features, rfc_params, 'Prep normalized', True, over_dict, under_dict)

==== Raw not norm ====
Accuracy: 0.829126960359107 +- 0.02553156301512779
f1-score: 0.787510519299962 +- 0.03160476024504506
Accuracy (smote): 0.8060539195963409 +- 0.04602192047852723
f1-score (smote): 0.788859341214864 +- 0.043065273085567284
==== Raw normalized ====
Accuracy: 0.7996186298608433 +- 0.015432418256137817
f1-score: 0.7477898508041246 +- 0.027788567354873974
Accuracy (smote): 0.775782840576741 +- 0.02855181319680453
f1-score (smote): 0.75583040659745 +- 0.02624287692244901
==== Prep not norm ====
Accuracy: 0.8668739546129747 +- 0.02584366360676435
f1-score: 0.8381532252632597 +- 0.038280920355707855
Accuracy (smote): 0.8568541110253781 +- 0.01836804417585253
f1-score (smote): 0.8507988530266319 +- 0.026391353991821787
==== Prep normalized ====
Accuracy: 0.8424621905317415 +- 0.0318680433490166
f1-score: 0.8002417362853895 +- 0.042669954691796005
Accuracy (smote): 0.8412930561527828 +- 0.04199533281853157
f1-score (smote): 0.8243814680710216 +- 0.04586247920934379
