In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import math
import time
import pickle
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import sys
sys.path.append('../src')
from preprocessing import *

In [3]:
df_db = group_datafiles_byID('../datasets/raw/HT_Sensor_metadata.dat', '../datasets/raw/HT_Sensor_dataset.dat')
df_db = reclassify_series_samples(df_db)

# Testing sklearn DecisionTree Classifier

In [4]:
df_train, df_test = split_series_byID(100, 0.75, df_db)

In [5]:
features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']
xtrain, ytrain = df_train[features].values, df_train['class'].values
xtest, ytest = df_test[features].values, df_test['class'].values
print(xtrain.shape)
print(ytrain.shape)
print(xtest.shape)
print(ytest.shape)

(701805, 11)
(701805,)
(227186, 11)
(227186,)


In [6]:
clf_tree = DecisionTreeClassifier(
                criterion='entropy',
                splitter='best',
                max_features='sqrt',
                random_state=0)
            # Default input args:
            #    max_depth=None -> If None, then nodes are expanded until all leaves are pure or until all leaves contain less than min_samples_split samples
            #    min_samples_split=2
            #    min_samples_leaf=1
            #    ccp_alpha=0.0 -> By default, no pruning is performed

In [7]:
start_t = time.time()

clf_tree.fit(xtrain, ytrain)

end_t = time.time()
print('Training time (mins):', (end_t-start_t)/60)


Training time (mins): 0.07512848774592082


In [8]:
clf_tree.score(xtest, ytest)

0.7461287227205902

In [11]:
print(clf_tree.tree_.max_depth)

180


# Testing tree ensembles

In [4]:
from sklearn.ensemble import RandomForestClassifier

In [5]:
max_depth_list = [2, 4, 6, 10, 20, 30, 50, 70, 80, 100]
n_estimators_list = [100, 200, 300, 400]
criterions = ['entropy', 'gini']

In [10]:
main_start = time.time()

for d in max_depth_list:
    for nest in n_estimators_list:
        for crit in criterions:

            df_train, df_test = split_series_byID(100, 0.77, df_db)
            features = ['R1', 'R2', 'R3', 'R4', 'R5', 'R5', 'R6', 'R7', 'R8', 'Temp.', 'Humidity']
            xtrain, ytrain = df_train[features].values, df_train['class'].values
            xtest, ytest = df_test[features].values, df_test['class'].values
            
            clf_randFor = RandomForestClassifier(
                        n_estimators=nest,
                        criterion=crit,
                        max_depth=d,
                        max_features='sqrt',
                        random_state=0)

            start_t = time.time()

            clf_randFor.fit(xtrain, ytrain)

            end_t = time.time()
            print('==========================================')
            print('Number of estimators:',nest)
            print('Max depth:', d)
            print('Criterion:', crit)
            print('Training time (mins):', (end_t-start_t)/60)
            print('Precision (score):', clf_randFor.score(xtest, ytest))
            print('==========================================')

main_end = time.time()

print('==================================================')
print('OVERALL TIME (hours):', (main_end-main_start)/(60*60))
print('==================================================')
print('==================================================')

        

Number of estimators: 100
Criterion: entropy
Max depth: 5
Training time (mins): 1.718788222471873
Precision (score): 0.857314231211713
Number of estimators: 100
Criterion: gini
Max depth: 5
Training time (mins): 1.4157062689463298
Precision (score): 0.8247485223597636
Number of estimators: 200
Criterion: entropy
Max depth: 5
Training time (mins): 3.264460519949595
Precision (score): 0.8343315933822879
Number of estimators: 200
Criterion: gini
Max depth: 5
Training time (mins): 2.9093011657396954
Precision (score): 0.7486812659846548
Number of estimators: 300
Criterion: entropy
Max depth: 5
Training time (mins): 5.01885005235672
Precision (score): 0.8699591314195677
Number of estimators: 300
Criterion: gini
Max depth: 5
Training time (mins): 4.32206263144811
Precision (score): 0.8683720952707699
Number of estimators: 400
Criterion: entropy
Max depth: 5
Training time (mins): 7.576039183139801
Precision (score): 0.8405624566984455
Number of estimators: 400
Criterion: gini
Max depth: 5
Tra

KeyboardInterrupt: 