### Random Forest
##### Features: temperature, humidity, light, co2, humidity ratio

In [220]:
features = ['temperature', 'humidity', 'light', 'co2', 'humidity_ratio']
X = df[features]
y = df['occupancy']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

In [221]:
model = RandomForestClassifier(random_state=0)

In [222]:
params = { 
    'n_estimators' : [75,100,125],
    'max_features' : [None, 'auto'],
    'max_depth' : [None, 5, 6]
}

In [223]:
gs = GridSearchCV(model, param_grid=params, cv=cv_folds, verbose=verbose, n_jobs=n_jobs)

In [224]:
%time gs.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:   10.6s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:   18.3s finished


CPU times: user 1.1 s, sys: 100 ms, total: 1.2 s
Wall time: 19.2 s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             estimator=RandomForestClassifier(random_state=0), n_jobs=4,
             param_grid={'max_depth': [None, 5, 6],
                         'max_features': [None, 'auto'],
                         'n_estimators': [75, 100, 125]},
             verbose=1)

In [225]:
filename = '../models/random_forest.sav'

In [226]:
pickle.dump(gs, open(filename, 'wb')) 

In [227]:
gs = pickle.load(open(filename, 'rb'))

In [228]:
gs.best_params_

{'max_depth': None, 'max_features': 'auto', 'n_estimators': 125}

In [229]:
scores = get_scores(gs, X_train, y_train, X_test, y_test)
scores.insert(0, 'Random Forest')
scores.insert(1, ', '.join(features))
scores_series = pd.Series(scores, index=scores_df.columns)
scores_df = scores_df.append(scores_series, ignore_index=True)

### Random Forest
##### Features: temperature, humidity, light, co2

In [230]:
features = ['temperature', 'humidity', 'light', 'co2']
X = df[features]
y = df['occupancy']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

In [231]:
model = RandomForestClassifier(random_state=0)

In [232]:
params = { 
    'n_estimators' : [75,100,125],
    'max_features' : [None, 'auto'],
    'max_depth' : [None, 5, 6]
}

In [233]:
gs = GridSearchCV(model, param_grid=params, cv=cv_folds, verbose=verbose, n_jobs=n_jobs)

In [234]:
%time gs.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    7.8s
[Parallel(n_jobs=4)]: Done  90 out of  90 | elapsed:   14.6s finished


CPU times: user 977 ms, sys: 33.6 ms, total: 1.01 s
Wall time: 15.5 s


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             estimator=RandomForestClassifier(random_state=0), n_jobs=4,
             param_grid={'max_depth': [None, 5, 6],
                         'max_features': [None, 'auto'],
                         'n_estimators': [75, 100, 125]},
             verbose=1)

In [235]:
filename = '../models/random_forest1.sav'

In [236]:
pickle.dump(gs, open(filename, 'wb')) 

In [237]:
gs = pickle.load(open(filename, 'rb'))

In [238]:
gs.best_params_

{'max_depth': None, 'max_features': 'auto', 'n_estimators': 125}

In [239]:
scores = get_scores(gs, X_train, y_train, X_test, y_test)
scores.insert(0, 'Random Forest')
scores.insert(1, ', '.join(features))
scores_series = pd.Series(scores, index=scores_df.columns)
scores_df = scores_df.append(scores_series, ignore_index=True)

### LDA

In [240]:
# https://machinelearningmastery.com/linear-discriminant-analysis-with-python/
features = ['temperature', 'humidity', 'light', 'co2', 'humidity_ratio']
X = df[features]
y = df['occupancy']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

In [241]:
model = LinearDiscriminantAnalysis()

In [242]:
params = { 
    'solver' : ['svd', 'lsqr', 'eigen']
}

In [243]:
gs = GridSearchCV(model, param_grid=params, cv=cv_folds, verbose=verbose, n_jobs=n_jobs)

In [244]:
%time gs.fit(X_train, y_train)

Fitting 5 folds for each of 3 candidates, totalling 15 fits
CPU times: user 47.1 ms, sys: 7.67 ms, total: 54.8 ms
Wall time: 130 ms


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  15 out of  15 | elapsed:    0.1s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             estimator=LinearDiscriminantAnalysis(), n_jobs=4,
             param_grid={'solver': ['svd', 'lsqr', 'eigen']}, verbose=1)

In [245]:
filename = '../models/lda.sav'

In [246]:
pickle.dump(gs, open(filename, 'wb')) 

In [247]:
gs = pickle.load(open(filename, 'rb'))

In [248]:
gs.best_params_

{'solver': 'svd'}

In [249]:
scores = get_scores(gs, X_train, y_train, X_test, y_test)
scores.insert(0, 'LDA')
scores.insert(1, ', '.join(features))
scores_series = pd.Series(scores, index=scores_df.columns)
scores_df = scores_df.append(scores_series, ignore_index=True)

### Classification and Regression Trees (CART)

In [250]:
# https://www.datacamp.com/community/tutorials/decision-tree-classification-python
features = ['temperature', 'humidity', 'light', 'co2', 'humidity_ratio']
X = df[features]
y = df['occupancy']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=0)

In [251]:
model = DecisionTreeClassifier(random_state=0)

In [252]:
params = { 
    'max_depth' : [None, 2, 5],
    'max_features' : ['auto', 'sqrt', 'log2']
}

In [253]:
gs = GridSearchCV(model, param_grid=params, cv=cv_folds, verbose=verbose, n_jobs=n_jobs)

In [254]:
%time gs.fit(X_train, y_train)

Fitting 5 folds for each of 9 candidates, totalling 45 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


CPU times: user 615 ms, sys: 56.4 ms, total: 672 ms
Wall time: 228 ms


[Parallel(n_jobs=4)]: Done  45 out of  45 | elapsed:    0.2s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             estimator=DecisionTreeClassifier(random_state=0), n_jobs=4,
             param_grid={'max_depth': [None, 2, 5],
                         'max_features': ['auto', 'sqrt', 'log2']},
             verbose=1)

In [255]:
filename = '../models/cart.sav'

In [256]:
pickle.dump(gs, open(filename, 'wb')) 

In [257]:
gs = pickle.load(open(filename, 'rb'))

In [258]:
gs.best_params_

{'max_depth': None, 'max_features': 'auto'}

In [259]:
scores = get_scores(gs, X_train, y_train, X_test, y_test)
scores.insert(0, 'CART')
scores.insert(1, ', '.join(features))
scores_series = pd.Series(scores, index=scores_df.columns)
scores_df = scores_df.append(scores_series, ignore_index=True)