In [6]:
%load_ext autoreload
%autoreload 2

In [12]:
import math
import time
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC

# Testing how to split correctly the data

In [17]:
df_meta = pd.read_csv('../datasets/raw/HT_Sensor_metadata.dat', delimiter='\t+')
df_meta.shape

(100, 5)

In [77]:
len_indices = df_meta.shape[0]
test_perc = 0.3
n_test = math.floor(test_perc * len_indices)
test_indices = np.random.choice(np.arange(100), size=n_test, replace=False)
print(test_indices)

[58 22 94 39 89 50 35 88  0 73 65 29 62 43 27  4 41 23 98 47 75 36 18 26
 78 10 96 25 13 51]


In [108]:
df_db = pd.read_csv('../datasets/raw/HT_Sensor_dataset.dat', delimiter='\s+')
df_db.shape

(928991, 12)

In [109]:
df_db.set_index('id', inplace=True)
df_db = df_db.join(df_meta, how='inner')
df_db['time'] += df_db['t0']
df_db.set_index(np.arange(df_db.shape[0]), inplace=True)
df_db.head()

Unnamed: 0,time,R1,R2,R3,R4,R5,R6,R7,R8,Temp.,Humidity,id,date,class,t0,dt
0,12.49025,12.8621,10.3683,10.4383,11.6699,13.4931,13.3423,8.04169,8.73901,26.2257,59.0528,0,07-04-15,banana,13.49,1.64
1,12.490528,12.8617,10.3682,10.4375,11.6697,13.4927,13.3412,8.04133,8.73908,26.2308,59.0299,0,07-04-15,banana,13.49,1.64
2,12.490806,12.8607,10.3686,10.437,11.6696,13.4924,13.3405,8.04101,8.73915,26.2365,59.0093,0,07-04-15,banana,13.49,1.64
3,12.491084,12.8602,10.3686,10.437,11.6697,13.4921,13.3398,8.04086,8.73936,26.2416,58.9905,0,07-04-15,banana,13.49,1.64
4,12.491373,12.8595,10.3688,10.4374,11.6699,13.4919,13.339,8.04087,8.73986,26.2462,58.9736,0,07-04-15,banana,13.49,1.64


In [132]:
bool_list = []
for item in df_db.id:
    if item in test_indices:
        bool_list.append(True)
    else:
        bool_list.append(False)

print(len(bool_list))
bool_list = np.asarray(bool_list)

928991


In [133]:
df_test = df_db[bool_list]
df_test.head()

Unnamed: 0,time,R1,R2,R3,R4,R5,R6,R7,R8,Temp.,Humidity,id,date,class,t0,dt
0,12.49025,12.8621,10.3683,10.4383,11.6699,13.4931,13.3423,8.04169,8.73901,26.2257,59.0528,0,07-04-15,banana,13.49,1.64
1,12.490528,12.8617,10.3682,10.4375,11.6697,13.4927,13.3412,8.04133,8.73908,26.2308,59.0299,0,07-04-15,banana,13.49,1.64
2,12.490806,12.8607,10.3686,10.437,11.6696,13.4924,13.3405,8.04101,8.73915,26.2365,59.0093,0,07-04-15,banana,13.49,1.64
3,12.491084,12.8602,10.3686,10.437,11.6697,13.4921,13.3398,8.04086,8.73936,26.2416,58.9905,0,07-04-15,banana,13.49,1.64
4,12.491373,12.8595,10.3688,10.4374,11.6699,13.4919,13.339,8.04087,8.73986,26.2462,58.9736,0,07-04-15,banana,13.49,1.64


In [136]:
df_train = df_db[~bool_list]
df_train.head()

Unnamed: 0,time,R1,R2,R3,R4,R5,R6,R7,R8,Temp.,Humidity,id,date,class,t0,dt
12815,18.610202,12.9677,10.3815,10.4593,11.6892,12.3805,14.1287,7.18476,7.77536,25.7545,58.9859,1,07-05-15,wine,19.61,0.54
12816,18.610491,12.9671,10.3814,10.459,11.6886,12.3812,14.128,7.1853,7.77512,25.756,58.9812,1,07-05-15,wine,19.61,0.54
12817,18.610769,12.9668,10.3813,10.4592,11.6881,12.382,14.1274,7.18579,7.77504,25.7573,58.9749,1,07-05-15,wine,19.61,0.54
12818,18.611047,12.9659,10.3815,10.4593,11.6881,12.3828,14.1274,7.18636,7.77498,25.7585,58.9683,1,07-05-15,wine,19.61,0.54
12819,18.611325,12.9648,10.3816,10.4592,11.6878,12.3833,14.1269,7.18663,7.77478,25.7596,58.9623,1,07-05-15,wine,19.61,0.54


# Building the final splitting function

In [170]:
def split_series_byID(n_ids, train_perc, joint_df):
    # Sampling test indices
    n_train = math.floor(train_perc * n_ids)
    train_indices = np.random.choice(np.arange(len_indices), size=n_train, replace=False)

    # Selecting train/test examples
    bool_list = []
    for id in joint_df.id:
        if id in train_indices:
            bool_list.append(True)
        else:
            bool_list.append(False)

    # We will use the fact it is a numpy array later
    bool_list = np.asarray(bool_list)

    df_train = joint_df[bool_list]
    # The fact that bool_list is a np array allows us to just choose its complementary
    df_test = joint_df[~bool_list]

    return df_train, df_test

In [171]:
df_meta = pd.read_csv('../datasets/raw/HT_Sensor_metadata.dat', delimiter='\t+')
df_meta.shape

(100, 5)

In [172]:
df_db = pd.read_csv('../datasets/raw/HT_Sensor_dataset.dat', delimiter='\s+')
df_db.shape

(928991, 12)

In [173]:
df_db.set_index('id', inplace=True)
df_db = df_db.join(df_meta, how='inner')
df_db['time'] += df_db['t0']
df_db.set_index(np.arange(df_db.shape[0]), inplace=True)
df_db.head()

Unnamed: 0,time,R1,R2,R3,R4,R5,R6,R7,R8,Temp.,Humidity,id,date,class,t0,dt
0,12.49025,12.8621,10.3683,10.4383,11.6699,13.4931,13.3423,8.04169,8.73901,26.2257,59.0528,0,07-04-15,banana,13.49,1.64
1,12.490528,12.8617,10.3682,10.4375,11.6697,13.4927,13.3412,8.04133,8.73908,26.2308,59.0299,0,07-04-15,banana,13.49,1.64
2,12.490806,12.8607,10.3686,10.437,11.6696,13.4924,13.3405,8.04101,8.73915,26.2365,59.0093,0,07-04-15,banana,13.49,1.64
3,12.491084,12.8602,10.3686,10.437,11.6697,13.4921,13.3398,8.04086,8.73936,26.2416,58.9905,0,07-04-15,banana,13.49,1.64
4,12.491373,12.8595,10.3688,10.4374,11.6699,13.4919,13.339,8.04087,8.73986,26.2462,58.9736,0,07-04-15,banana,13.49,1.64


In [174]:
n_ids = 100
train_perc = 0.75
df_train, df_test = split_series_byID(n_ids, train_perc, df_db)

In [175]:
print(df_train.shape)
df_train.head()

(710889, 16)


Unnamed: 0,time,R1,R2,R3,R4,R5,R6,R7,R8,Temp.,Humidity,id,date,class,t0,dt
0,12.49025,12.8621,10.3683,10.4383,11.6699,13.4931,13.3423,8.04169,8.73901,26.2257,59.0528,0,07-04-15,banana,13.49,1.64
1,12.490528,12.8617,10.3682,10.4375,11.6697,13.4927,13.3412,8.04133,8.73908,26.2308,59.0299,0,07-04-15,banana,13.49,1.64
2,12.490806,12.8607,10.3686,10.437,11.6696,13.4924,13.3405,8.04101,8.73915,26.2365,59.0093,0,07-04-15,banana,13.49,1.64
3,12.491084,12.8602,10.3686,10.437,11.6697,13.4921,13.3398,8.04086,8.73936,26.2416,58.9905,0,07-04-15,banana,13.49,1.64
4,12.491373,12.8595,10.3688,10.4374,11.6699,13.4919,13.339,8.04087,8.73986,26.2462,58.9736,0,07-04-15,banana,13.49,1.64


In [176]:
print(df_test.shape)
df_test.head()

(218102, 16)


Unnamed: 0,time,R1,R2,R3,R4,R5,R6,R7,R8,Temp.,Humidity,id,date,class,t0,dt
31227,5.490119,11.5751,8.32428,8.43391,9.4093,11.1259,11.9891,7.8941,5.69867,26.3565,65.0541,3,07-09-15,banana,6.49,0.72
31228,5.490397,11.5752,8.32434,8.43402,9.40951,11.1259,11.9892,7.89354,5.69871,26.3559,65.052,3,07-09-15,banana,6.49,0.72
31229,5.490675,11.5744,8.3241,8.43397,9.40937,11.1258,11.9895,7.89248,5.69865,26.3553,65.05,3,07-09-15,banana,6.49,0.72
31230,5.490953,11.5737,8.32389,8.43393,9.40957,11.1258,11.9897,7.89151,5.69859,26.3548,65.0483,3,07-09-15,banana,6.49,0.72
31231,5.491239,11.5737,8.32413,8.43404,9.40992,11.126,11.99,7.89078,5.69853,26.3544,65.0467,3,07-09-15,banana,6.49,0.72


# Testing training with SVC

In [162]:
features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']
xtrain, ytrain = df_train[features].values, df_train['class'].values
xtest, ytest = df_test[features].values, df_test['class'].values

In [168]:
print(xtrain.shape)
print(ytrain.shape)
print(xtest.shape)
print(ytest.shape)

(703127, 11)
(703127,)
(225864, 11)
(225864,)


In [165]:
C = 1e-2
clf_svm = SVC(C=C)

In [166]:
start_t = time.time()

clf_svm.fit(xtrain, ytrain)

end_t = time.time()

print('==> Tiempo transcurrido de entrenamiento (horas):', (end_t-start_t)/(60*60))

==> Tiempo transcurrido de entrenamiento (horas): 2.160339769191212


In [167]:
clf_svm.score(xtest, ytest)

0.3754338894201821

# Testing training with Logistic Regression

In [177]:
clf_lr = LogisticRegressionCV()
###
# Default: k-fold: 5-fold cross validation
#          Cs: A grid of Cs values are chosen in a logarithmic scale between 1e-4 and 1e4
#          penalty: l2
#          solver: lbfgs
#          tol: 1e-4
#          max_iter: 100
###

In [178]:
clf_lr.fit(xtrain, ytrain)

LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='warn', n_jobs=None, penalty='l2',
           random_state=None, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0)

In [179]:
clf_lr.score(xtest, ytest)

0.552509474728155

# Testing what we got reclassifying the series before and after stimulus

In [3]:
import sys
sys.path.append('../src')

In [4]:
from preprocessing import *

In [16]:
df_db = group_datafiles_byID('../datasets/raw/HT_Sensor_metadata.dat', '../datasets/raw/HT_Sensor_dataset.dat')
df_db = reclassify_series_samples(df_db)
df_db.head()

Unnamed: 0,time,R1,R2,R3,R4,R5,R6,R7,R8,Temp.,Humidity,id,date,class,t0,dt
0,12.49025,12.8621,10.3683,10.4383,11.6699,13.4931,13.3423,8.04169,8.73901,26.2257,59.0528,0,07-04-15,background,13.49,1.64
1,12.490528,12.8617,10.3682,10.4375,11.6697,13.4927,13.3412,8.04133,8.73908,26.2308,59.0299,0,07-04-15,background,13.49,1.64
2,12.490806,12.8607,10.3686,10.437,11.6696,13.4924,13.3405,8.04101,8.73915,26.2365,59.0093,0,07-04-15,background,13.49,1.64
3,12.491084,12.8602,10.3686,10.437,11.6697,13.4921,13.3398,8.04086,8.73936,26.2416,58.9905,0,07-04-15,background,13.49,1.64
4,12.491373,12.8595,10.3688,10.4374,11.6699,13.4919,13.339,8.04087,8.73986,26.2462,58.9736,0,07-04-15,background,13.49,1.64


In [17]:
df_train, df_test = split_series_byID(100, 0.75, df_db)
features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']
xtrain, ytrain = df_train[features].values, df_train['class'].values
xtest, ytest = df_test[features].values, df_test['class'].values
print(xtrain.shape)
print(ytrain.shape)
print(xtest.shape)
print(ytest.shape)

(703885, 11)
(703885,)
(225106, 11)
(225106,)


In [18]:
clf_lr = LogisticRegressionCV()
###
# Default: k-fold: 5-fold cross validation
#          Cs: A grid of Cs values are chosen in a logarithmic scale between 1e-4 and 1e4
#          penalty: l2
#          solver: lbfgs
#          tol: 1e-4
#          max_iter: 100
###

In [19]:
clf_lr.fit(xtrain, ytrain)

LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='warn', n_jobs=None, penalty='l2',
           random_state=None, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0)

In [20]:
clf_lr.score(xtest, ytest)

0.8426119250486438

In [21]:
with open('lr_good_split_test_model.pkl', 'wb') as f:
    pickle.dump(clf_lr, f)


# Removing excess of background

In [7]:
import sys
sys.path.append('../src')

from preprocessing import *
from sklearn.linear_model import LogisticRegressionCV

In [4]:
df_db = group_datafiles_byID('../datasets/raw/HT_Sensor_metadata.dat', '../datasets/raw/HT_Sensor_dataset.dat')
df_db = reclassify_series_samples(df_db)
df_db = remove_excess_bg(df_db)
df_db.head()

Unnamed: 0,time,R1,R2,R3,R4,R5,R6,R7,R8,Temp.,Humidity,id,date,class,t0,dt
1647,12.990107,13.1748,10.6763,10.7443,12.012,13.8965,13.6949,8.21363,8.92817,26.109,58.1348,0,07-04-15,background,13.49,1.64
1648,12.990385,13.1748,10.6765,10.7446,12.0124,13.8968,13.6943,8.21372,8.92807,26.1091,58.1344,0,07-04-15,background,13.49,1.64
1649,12.990663,13.175,10.6765,10.7447,12.0134,13.8966,13.6943,8.21394,8.92814,26.1092,58.1341,0,07-04-15,background,13.49,1.64
1650,12.990941,13.1749,10.6763,10.7447,12.0137,13.8968,13.6943,8.21414,8.92836,26.1093,58.1338,0,07-04-15,background,13.49,1.64
1651,12.991219,13.1746,10.6757,10.7446,12.0136,13.8971,13.6947,8.21418,8.92824,26.1093,58.1335,0,07-04-15,background,13.49,1.64


In [5]:
print(df_db.shape)

(594517, 16)


In [21]:
df_train, df_test = split_series_byID(100, 0.75, df_db)
features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']
xtrain, ytrain = df_train[features].values, df_train['class'].values
xtest, ytest = df_test[features].values, df_test['class'].values
print(xtrain.shape)
print(ytrain.shape)
print(xtest.shape)
print(ytest.shape)

(442948, 11)
(442948,)
(151569, 11)
(151569,)


In [22]:
clf_lr = LogisticRegressionCV()
###
# Default: k-fold: 5-fold cross validation
#          Cs: A grid of Cs values are chosen in a logarithmic scale between 1e-4 and 1e4
#          penalty: l2
#          solver: lbfgs
#          tol: 1e-4
#          max_iter: 100
###

In [23]:
clf_lr.fit(xtrain, ytrain)

LogisticRegressionCV(Cs=10, class_weight=None, cv='warn', dual=False,
           fit_intercept=True, intercept_scaling=1.0, max_iter=100,
           multi_class='warn', n_jobs=None, penalty='l2',
           random_state=None, refit=True, scoring=None, solver='lbfgs',
           tol=0.0001, verbose=0)

In [24]:
clf_lr.score(xtest, ytest)

0.8234005634397535

# Testing with a 5-layer neural network

In [44]:
from sklearn.neural_network import MLPClassifier
import time

In [48]:
start_total = time.time()

for n_hid_layers in [1, 2, 3, 4]:
    for n_neur in [5, 10, 30, 50]:
        tup = []
        for i in range(n_hid_layers):
            tup.append(n_neur)
        tup = tuple(tup)

        start_t = time.time()

        clf_nn = MLPClassifier(
                    hidden_layer_sizes = tup,
                    activation='relu',
                    solver='adam',
                    max_iter=1000,
                )
        df_train, df_test = split_series_byID(100, 0.75, df_db)
        features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']
        xtrain, ytrain = df_train[features].values, df_train['class'].values
        xtest, ytest = df_test[features].values, df_test['class'].values

        clf_nn.fit(xtrain, ytrain)

        accuracy = clf_nn.score(xtest, ytest)

        end_t = time.time()

        print('========================================')
        print('Number of hidden layers:', n_hid_layers)
        print('Number of neurons per layer:', n_neur)
        print('Accuracy:', accuracy)
        print('Time (minutes):', (end_t-start_t)/60)
        print('========================================')

end_total = time.time()
print('\n====> Total tiempo transcurrido (horas):', (end_total-start_total)/(60*60))

Number of hidden layers: 1
Number of neurons per layer: 5
Accuracy: 0.7474020679082815
Time (minutes): 2.09681693315506
Number of hidden layers: 1
Number of neurons per layer: 10
Accuracy: 0.7315296110267492
Time (minutes): 6.165274548530578
Number of hidden layers: 1
Number of neurons per layer: 30
Accuracy: 0.6855147340221968
Time (minutes): 9.630225416024526
Number of hidden layers: 1
Number of neurons per layer: 50
Accuracy: 0.7086005398203821
Time (minutes): 18.78674513498942
Number of hidden layers: 2
Number of neurons per layer: 5
Accuracy: 0.7360287350444831
Time (minutes): 1.6581427017847696
Number of hidden layers: 2
Number of neurons per layer: 10
Accuracy: 0.7085657778080884
Time (minutes): 5.473576414585113
Number of hidden layers: 2
Number of neurons per layer: 30
Accuracy: 0.7908143945913569
Time (minutes): 25.004579945405325
Number of hidden layers: 2
Number of neurons per layer: 50
Accuracy: 0.571268169612359
Time (minutes): 52.15484143098195
Number of hidden layers: 3

In [None]:
# Testing it with deleted excess background dataframe
df_train, df_test = split_series_byID(100, 0.75, df_db)
features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']
xtrain, ytrain = df_train[features].values, df_train['class'].values
xtest, ytest = df_test[features].values, df_test['class'].values
print(xtrain.shape)
print(ytrain.shape)
print(xtest.shape)
print(ytest.shape)

In [42]:
clf_nn.fit(xtrain, ytrain)

MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20, 20, 20), learning_rate='constant',
       learning_rate_init=0.001, max_iter=1000, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=None, shuffle=True, solver='lbfgs', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False)

In [43]:
clf_nn.score(xtest, ytest)

0.6840381991814461