In [4]:
%load_ext autoreload
%autoreload 2

In [9]:
import math
import time
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, plot_confusion_matrix
import sys
sys.path.append('../src')
from preprocessing import *
from plotting import *

In [6]:
df_db = group_datafiles_byID('../datasets/raw/HT_Sensor_metadata.dat', '../datasets/raw/HT_Sensor_dataset.dat')
df_db = reclassify_series_samples(df_db)

# Testing sklearn DecisionTree Classifier

In [4]:
df_train, df_test = split_series_byID(100, 0.75, df_db)

In [5]:
features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']
xtrain, ytrain = df_train[features].values, df_train['class'].values
xtest, ytest = df_test[features].values, df_test['class'].values
print(xtrain.shape)
print(ytrain.shape)
print(xtest.shape)
print(ytest.shape)

(701805, 11)
(701805,)
(227186, 11)
(227186,)


In [6]:
clf_tree = DecisionTreeClassifier(
                criterion='entropy',
                splitter='best',
                max_features='sqrt',
                random_state=0)
            # Default input args:
            #    max_depth=None -> If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples
            #    min_samples_split=2
            #    min_samples_leaf=1
            #    ccp_alpha=0.0 -> By default, no pruning is performed

In [7]:
start_t = time.time()

clf_tree.fit(xtrain, ytrain)

end_t = time.time()
print('Training time (mins):', (end_t-start_t)/60)


Training time (mins): 0.07512848774592082


In [8]:
clf_tree.score(xtest, ytest)

0.7461287227205902

In [11]:
print(clf_tree.tree_.max_depth)

180


# Testing RandomForest (ensembles)

## Using whole dataset

In [7]:
df_db = group_datafiles_byID('../datasets/raw/HT_Sensor_metadata.dat', '../datasets/raw/HT_Sensor_dataset.dat')
df_db = reclassify_series_samples(df_db)

In [12]:
max_depth_list = [2, 4, 6, 8, 10, 14, 18, 22, 26, 32, 36, 40]
n_estimators_list = [400, 500, 700, 1000]
criterions = ['entropy', 'gini']

In [13]:
main_start = time.time()

for d in max_depth_list:
    for nest in n_estimators_list:
        for crit in criterions:

            df_train, df_test = split_series_byID(100, 0.77, df_db)
            features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']
            xtrain, ytrain = df_train[features].values, df_train['class'].values
            xtest, ytest = df_test[features].values, df_test['class'].values
            
            clf_randFor = RandomForestClassifier(
                        n_estimators=nest,
                        criterion=crit,
                        max_depth=d,
                        max_features='sqrt',
                        random_state=0)

            start_t = time.time()

            clf_randFor.fit(xtrain, ytrain)

            end_t = time.time()
            print('==========================================')
            print('Number of estimators:',nest)
            print('Max depth:', d)
            print('Criterion:', crit)
            print('Training time (mins):', (end_t-start_t)/60)
            print('Precision (score):', clf_randFor.score(xtest, ytest))
            print('==========================================')

main_end = time.time()

print('==================================================')
print('OVERALL TIME (hours):', (main_end-main_start)/(60*60))
print('==================================================')
print('==================================================')

        

9139
Number of estimators: 1000
Max depth: 2
Criterion: gini
Training time (mins): 6.274376936753591
Precision (score): 0.82572384867891
Number of estimators: 400
Max depth: 4
Criterion: entropy
Training time (mins): 5.355758766333262
Precision (score): 0.8250252487058514
Number of estimators: 400
Max depth: 4
Criterion: gini
Training time (mins): 4.3957205533981325
Precision (score): 0.8006146586511463
Number of estimators: 500
Max depth: 4
Criterion: entropy
Training time (mins): 6.587088612715403
Precision (score): 0.830885825237377
Number of estimators: 500
Max depth: 4
Criterion: gini
Training time (mins): 5.570846013228098
Precision (score): 0.8388239826545058
Number of estimators: 700
Max depth: 4
Criterion: entropy
Training time (mins): 9.296311287085215
Precision (score): 0.7938500736418873
Number of estimators: 700
Max depth: 4
Criterion: gini
Training time (mins): 8.30868130127589
Precision (score): 0.8073012898149703
Number of estimators: 1000
Max depth: 4
Criterion: entrop

In [14]:
df_train, df_test = split_series_byID(100, 0.77, df_db)
features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']
xtrain, ytrain = df_train[features].values, df_train['class'].values
xtest, ytest = df_test[features].values, df_test['class'].values

clf_randFor = RandomForestClassifier(
            n_estimators=5000,
            criterion='entropy',
            max_depth=7,
            max_features='sqrt',
            random_state=0)

start_t = time.time()

clf_randFor.fit(xtrain, ytrain)

end_t = time.time()
print('==========================================')
print('Number of estimators:',5000)
print('Max depth:', 7)
print('Criterion:', 'entropy')
print('Training time (mins):', (end_t-start_t)/60)
print('Precision (score):', clf_randFor.score(xtest, ytest))
print('==========================================')

Number of estimators: 5000
Max depth: 7
Criterion: entropy
Training time (mins): 132.90388695001602
Precision (score): 0.8471366322797279


## Deleting excess background

In [7]:
df_db = group_datafiles_byID('../datasets/raw/HT_Sensor_metadata.dat', '../datasets/raw/HT_Sensor_dataset.dat')
df_db = reclassify_series_samples(df_db)
df_db = remove_excess_bg(df_db)

In [8]:
max_depth_list = [2, 4, 6, 10, 20, 30, 50, 70, 80, 100]
n_estimators_list = [100, 200, 300, 400]
criterions = ['entropy', 'gini']

In [9]:
main_start = time.time()

for d in max_depth_list:
    for nest in n_estimators_list:
        for crit in criterions:

            df_train, df_test = split_series_byID(100, 0.85, df_db)
            features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']
            xtrain, ytrain = df_train[features].values, df_train['class'].values
            xtest, ytest = df_test[features].values, df_test['class'].values
            
            clf_randFor = RandomForestClassifier(
                        n_estimators=nest,
                        criterion=crit,
                        max_depth=d,
                        max_features='sqrt',
                        oob_score=True,
                        random_state=0)

            start_t = time.time()

            clf_randFor.fit(xtrain, ytrain)
            oob_error = clf_randFor.oob_score_

            end_t = time.time()
            print('==========================================')
            print('Number of estimators:',nest)
            print('Max depth:', d)
            print('Criterion:', crit)
            print('Training time (mins):', (end_t-start_t)/60)
            print('Out-of-bag estimate:', oob_error)
            print('Precision on test set (score):', clf_randFor.score(xtest, ytest))
            print('==========================================')

main_end = time.time()

print('==================================================')
print('OVERALL TIME (hours):', (main_end-main_start)/(60*60))
print('==================================================')
print('==================================================')

g estimate: 0.7714623631607841
Precision on test set (score): 0.7386642435256702
Number of estimators: 100
Max depth: 4
Criterion: entropy
Training time (mins): 1.3233341852823892
Out-of-bag estimate: 0.7977355781143606
Precision on test set (score): 0.7995242742132227
Number of estimators: 100
Max depth: 4
Criterion: gini
Training time (mins): 0.9155715505282084
Out-of-bag estimate: 0.8013998709052768
Precision on test set (score): 0.7543262756057798
Number of estimators: 200
Max depth: 4
Criterion: entropy
Training time (mins): 2.856734116872152
Out-of-bag estimate: 0.8114948888605821
Precision on test set (score): 0.638909702991801
Number of estimators: 200
Max depth: 4
Criterion: gini
Training time (mins): 3.2667909224828082
Out-of-bag estimate: 0.8011770606397818
Precision on test set (score): 0.8161678142057008
Number of estimators: 300
Max depth: 4
Criterion: entropy
Training time (mins): 3.670349482695262
Out-of-bag estimate: 0.8125563563993758
Precision on test set (score): 0.

# Boosting

In [9]:
df_db = group_datafiles_byID('../datasets/raw/HT_Sensor_metadata.dat', '../datasets/raw/HT_Sensor_dataset.dat')
df_db = reclassify_series_samples(df_db)

In [14]:
losses = ['deviance', 'exponential']
estimators = [500, 1000]
learning_rates = [0.1, 0.01]
depths = [5, 7]

In [15]:
main_start = time.time()

for l in losses:
    for estim in estimators:
        for lr in learning_rates:
            for d in depths:
                
                df_train, df_test = split_series_byID(100, 0.8, df_db)
                features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']
                xtrain, ytrain = df_train[features].values, df_train['class'].values
                xtest, ytest = df_test[features].values, df_test['class'].values

                clf_boost = GradientBoostingClassifier(
                                loss=l,
                                learning_rate=lr,
                                n_estimators=estim,
                                max_depth=d,
                                random_state=0)

                start_t = time.time()

                clf_boost.fit(xtrain, ytrain)

                end_t = time.time()
                print('==========================================')
                print('Loss function:', l)
                print('Number of estimators:', estim)
                print('Learning rate:', lr)
                print('Max depth:', d)
                print('Training time (mins):', (end_t-start_t)/60)
                print('Precision (score):', clf_boost.score(xtest, ytest))
                print('==========================================')
                
main_end = time.time()
print('==============================================================')
print('Total time (hours):', (main_end-main_start)/(60*60))
print('==============================================================')

Loss function: deviance
Number of estimators: 500
Learning rate: 0.1
Max depth: 5
Training time (mins): 111.22014693419139
Precision (score): 0.7613230575135076
Loss function: deviance
Number of estimators: 500
Learning rate: 0.1
Max depth: 7
Training time (mins): 152.49901788234712
Precision (score): 0.7380312111885091
Loss function: deviance
Number of estimators: 500
Learning rate: 0.01
Max depth: 5
Training time (mins): 117.28765278259912
Precision (score): 0.8705405887202798
Loss function: deviance
Number of estimators: 500
Learning rate: 0.01
Max depth: 7
Training time (mins): 157.71072766780853
Precision (score): 0.9183072450085292
Loss function: deviance
Number of estimators: 1000
Learning rate: 0.1
Max depth: 5
Training time (mins): 233.82789334456126
Precision (score): 0.8355979223553905
Loss function: deviance
Number of estimators: 1000
Learning rate: 0.1
Max depth: 7
Training time (mins): 305.2583548863729
Precision (score): 0.8155505567883596
Loss function: deviance
Number 

ValueError: ExponentialLoss requires 2 classes; got 3 class(es)

# Boosting with AdaBoost classifier

This time we are gonna store classifiers to study them later.

In [11]:
base_estim = DecisionTreeClassifier(max_depth=5)

In [10]:
n_estimators = [500, 1000]
learning_rates = [0.1, 0.01, 0.001]

In [12]:
clfs = []

main_start = time.time()

for nest in n_estimators:
    for lr in learning_rates:

        df_train, df_test = split_series_byID(100, 0.8, df_db)
        features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']
        xtrain, ytrain = df_train[features].values, df_train['class'].values
        xtest, ytest = df_test[features].values, df_test['class'].values
        
        clf_adaBoost = AdaBoostClassifier(base_estimator=base_estim, n_estimators=nest, learning_rate=lr)
        
        start_t = time.time()
        
        clf_adaBoost.fit(xtrain, ytrain)

        end_t = time.time()
        
        print('==========================================')
        print('Number of estimators:', nest)
        print('Learning rate:', lr)
        print('Training time (mins):', (end_t-start_t)/60)
        print('Precision (score):', clf_adaBoost.score(xtest, ytest))
        print('==========================================')
        
        clfs.append(clf_adaBoost)
        

main_end = time.time()
print('==============================================================')
print('Total time (hours):', (main_end-main_start)/(60*60))
print('==============================================================')     

Number of estimators: 500
Learning rate: 0.1
Training time (mins): 43.583362050851186
Precision (score): 0.8719164179104477
Number of estimators: 500
Learning rate: 0.01
Training time (mins): 40.7228889465332
Precision (score): 0.7880553532410779
Number of estimators: 500
Learning rate: 0.001
Training time (mins): 42.52133306662242
Precision (score): 0.8655926945044344
Number of estimators: 1000
Learning rate: 0.1
Training time (mins): 83.51484416325887
Precision (score): 0.7885299402295479
Number of estimators: 1000
Learning rate: 0.01
Training time (mins): 83.1132468978564
Precision (score): 0.8277351931081416
Number of estimators: 1000
Learning rate: 0.001
Training time (mins): 83.40477879842122
Precision (score): 0.8240472648355809
Total time (hours): 6.32070310221778
