### Import libraries.

In [1]:
import pandas as pd
import numpy as np

import pickle
from functions import get_scores
from functions import run_model

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, GradientBoostingClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn import metrics

### Load data.

In [2]:
df = pickle.load(open("../datasets/occupancy.p", "rb"))
# df = pd.read_csv('../datasets/occupancy.csv')

In [149]:
df

Unnamed: 0_level_0,temperature,humidity,light,co2,humidity_ratio,occupancy
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2015-02-04 17:51:00,23.180,27.2720,426.00,721.25,0.004793,1
2015-02-04 17:52:00,23.150,27.2675,429.50,714.00,0.004783,1
2015-02-04 17:53:00,23.150,27.2450,426.00,713.50,0.004779,1
2015-02-04 17:54:00,23.150,27.2000,426.00,708.25,0.004772,1
2015-02-04 17:55:00,23.100,27.2000,426.00,704.50,0.004757,1
...,...,...,...,...,...,...
2015-02-18 09:15:00,20.815,27.7175,429.75,1505.25,0.004213,1
2015-02-18 09:16:00,20.865,27.7450,423.50,1514.50,0.004230,1
2015-02-18 09:17:00,20.890,27.7450,423.50,1521.50,0.004237,1
2015-02-18 09:18:00,20.890,28.0225,418.75,1632.00,0.004279,1


### Createa a dataframe for storing model scores

In [12]:
column_names = ['Model name', 'Features', 'Best score', 'Train score', 'Test score', 
                'Sensitivity', 'Specificity', 'Precision', 'Accuracy', 'F1-score']
scores_df = pd.DataFrame(columns=column_names)
scores_df.head()

Unnamed: 0,Model name,Features,Best score,Train score,Test score,Sensitivity,Specificity,Precision,Accuracy,F1-score


### Features and target variables.

In [5]:
features_list = [['temperature', 'humidity', 'light', 'co2', 'humidity_ratio'],
                 ['temperature', 'humidity', 'light', 'co2'],
                 ['temperature', 'humidity', 'co2', 'humidity_ratio'],
                 ['temperature', 'humidity', 'light', 'humidity_ratio'],
                 ['temperature', 'humidity', 'humidity_ratio'],
                 ['temperature', 'humidity'],
                 ['temperature', 'light'],
                 ['humidity', 'light'],
                 ['temperature', 'co2']
                ]
target = 'occupancy'

### Initialize common grid search parameters for all models.

In [152]:
cv_folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
verbose = 1
n_jobs = 4

### Random Forest (RF)

In [153]:
model_name = 'rf'
params = { 
    'n_estimators' : [75,100,125],
    'max_features' : [None, 'auto'],
    'max_depth' : [None, 5, 6]
}

In [154]:
for i, features in enumerate(features_list):
    model = RandomForestClassifier(random_state=0)
    scores = run_model(df, features, target, params, model, model_name+str(i))
    scores_series = pd.Series(scores, index=scores_df.columns)
    scores_df = scores_df.append(scores_series, ignore_index=True)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   11.2s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:   18.8s finished


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    7.4s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:   13.8s finished


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   10.5s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:   19.0s finished


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    7.5s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:   14.2s finished


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    8.6s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:   14.8s finished


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.8s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:   10.8s finished


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    3.8s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:    7.6s finished


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    4.6s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:    9.0s finished


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    6.2s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:   11.3s finished


### Linear Discriminant Analysis (LDA)

In [155]:
# https://machinelearningmastery.com/linear-discriminant-analysis-with-python/

In [156]:
model_name = 'lda'
params = { 
    'solver' : ['svd', 'lsqr', 'eigen']
}

In [157]:
for i, features in enumerate(features_list):
    model = LinearDiscriminantAnalysis()
    scores = run_model(df, features, target, params, model, model_name+str(i))
    scores_series = pd.Series(scores, index=scores_df.columns)
    scores_df = scores_df.append(scores_series, ignore_index=True)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits
Fitting 5 folds for each of 3 candidates, totalling 15 fits


[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    0.1s finished


### Classification and Regression Trees (CART)

In [158]:
# https://www.datacamp.com/community/tutorials/decision-tree-classification-python

In [159]:
model_name = 'cart'
params = { 
    'max_depth' : [None, 2, 5],
    'max_features' : ['auto', 'sqrt', 'log2']
}

In [160]:
for i, features in enumerate(features_list):
    model = DecisionTreeClassifier(random_state=0)
    scores = run_model(df, features, target, params, model, model_name+str(i))
    scores_series = pd.Series(scores, index=scores_df.columns)
    scores_df = scores_df.append(scores_series, ignore_index=True)

[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 9 candidates, totalling 45 fits
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:    0.2s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:    0.2s finished


Fitting 5 folds for each of 9 candidates, totalling 45 fits
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 9 candidates, totalling 45 fits
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Fitting 5 folds for each of 9 candidates, totalling 45 fits
Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Done  38 out of  45 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:    0.1s finished
[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:    0.1s finished


### Gradient Boosting Machine (GBM)

In [161]:
# https://machinelearningmastery.com/gradient-boosting-machine-ensemble-in-python/

In [162]:
model_name = 'gbm'
params = { 
    'learning_rate' : [0.1, 0.2, 0.3],
    'n_estimators' : [100, 200],
    'max_depth' : [3, 5]
}

In [163]:
for i, features in enumerate(features_list):
    model = GradientBoostingClassifier(random_state=0)
    scores = run_model(df, features, target, params, model, model_name+str(i))
    scores_series = pd.Series(scores, index=scores_df.columns)
    scores_df = scores_df.append(scores_series, ignore_index=True)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   16.5s
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed:   25.3s finished


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   12.5s
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed:   19.3s finished


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   15.2s
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed:   23.6s finished


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   12.5s
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed:   19.4s finished


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   11.3s
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed:   17.4s finished


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    8.2s
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed:   12.5s finished


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.9s
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed:    8.9s finished


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    7.4s
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed:   11.8s finished


Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    8.6s
[Parallel(n_jobs=4)]: Done  60 out of  60 | elapsed:   13.1s finished


### K-Nearest Neighbor (KNN)

In [17]:
model_name = 'knn'
params = { 
    'n_neighbors' : [3, 5, 10],
    'p' : [1,2],
    'leaf_size' : [1, 5, 10]
}

In [18]:
for i, features in enumerate(features_list):
    model = KNeighborsClassifier()    
    scores = run_model(df, features, target, params, model, model_name+str(i))
    scores_series = pd.Series(scores, index=scores_df.columns)
    scores_df = scores_df.append(scores_series, ignore_index=True)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    1.8s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:    2.0s finished


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:    1.9s finished


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:    1.7s finished


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    2.0s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:    2.3s finished


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    1.4s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:    1.6s finished


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:    1.5s finished


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    2.0s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:    2.3s finished


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    1.7s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:    2.0s finished


Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    1.3s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:    1.5s finished


### Model summary table.

In [19]:
scores_df

Unnamed: 0,Model name,Features,Best score,Train score,Test score,Sensitivity,Specificity,Precision,Accuracy,F1-score
0,knn0,"temperature, humidity, light, co2, humidity_ratio",0.9906,0.9931,0.9926,0.9915,0.9929,0.974,0.9926,0.9827
1,knn1,"temperature, humidity, light, co2",0.9906,0.9931,0.9926,0.9915,0.9929,0.974,0.9926,0.9827
2,knn2,"temperature, humidity, co2, humidity_ratio",0.9271,0.951,0.9363,0.8392,0.9623,0.8564,0.9363,0.8477
3,knn3,"temperature, humidity, light, humidity_ratio",0.9902,0.9913,0.9913,0.9915,0.9912,0.968,0.9913,0.9796
4,knn4,"temperature, humidity, humidity_ratio",0.954,0.9691,0.9528,0.8857,0.9708,0.8904,0.9528,0.8881
5,knn5,"temperature, humidity",0.9537,0.9686,0.9535,0.8825,0.9725,0.8958,0.9535,0.8891
6,knn6,"temperature, light",0.9907,0.9908,0.9913,0.9915,0.9912,0.968,0.9913,0.9796
7,knn7,"humidity, light",0.9901,0.9911,0.9911,0.9915,0.9909,0.967,0.9911,0.9791
8,knn8,"temperature, co2",0.9046,0.9318,0.9122,0.7651,0.9515,0.8087,0.9122,0.7863
9,knn0,"temperature, humidity, light, co2, humidity_ratio",0.9908,0.9945,0.9926,0.9894,0.9935,0.976,0.9926,0.9827


In [22]:
scores_df.sort_values(by='Accuracy', ascending=False).head()

Unnamed: 0,Model name,Features,Best score,Train score,Test score,Sensitivity,Specificity,Precision,Accuracy,F1-score
0,knn0,"temperature, humidity, light, co2, humidity_ratio",0.9906,0.9931,0.9926,0.9915,0.9929,0.974,0.9926,0.9827
1,knn1,"temperature, humidity, light, co2",0.9906,0.9931,0.9926,0.9915,0.9929,0.974,0.9926,0.9827
10,knn1,"temperature, humidity, light, co2",0.9908,0.9945,0.9926,0.9894,0.9935,0.976,0.9926,0.9827
9,knn0,"temperature, humidity, light, co2, humidity_ratio",0.9908,0.9945,0.9926,0.9894,0.9935,0.976,0.9926,0.9827
6,knn6,"temperature, light",0.9907,0.9908,0.9913,0.9915,0.9912,0.968,0.9913,0.9796


In [166]:
scores_df.to_csv('../models/scores.csv', index=False)
scores_df.to_pickle('../models/scores.p')