# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [119]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import StackingClassifier
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import roc_auc_score
from joblib import dump, load
from tqdm.notebook import trange, tqdm
import numpy as np

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [52]:
df = pd.read_csv("../data/dayofweek-not-scaled.csv")
X = df.drop(['dayofweek'], axis=1).values
y = df[['dayofweek']].values[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=21, stratify=y_train)
df.head()

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [53]:
svc = SVC(C=10, class_weight=None, gamma='auto', kernel='rbf', random_state=21, probability=True)
svc.fit(X_train, y_train)
predict = svc.predict(X_valid)
print("accuracy is", accuracy_score(y_valid, predict))
print("precision is", precision_score(y_valid, predict, average='weighted'))
print("recall is", recall_score(y_valid, predict, average='weighted'))

accuracy is 0.8777777777777778
precision is 0.8816152211617203
recall is 0.8777777777777778


In [54]:
dtc = DecisionTreeClassifier(max_depth=21, class_weight='balanced', criterion='gini', random_state=21)
dtc.fit(X_train, y_train)
predict = dtc.predict(X_valid)
print("accuracy is", accuracy_score(y_valid, predict))
print("precision is", precision_score(y_valid, predict, average='weighted'))
print("recall is", recall_score(y_valid, predict, average='weighted'))

accuracy is 0.8666666666666667
precision is 0.8716971333998339
recall is 0.8666666666666667


In [55]:
rfc = RandomForestClassifier(max_depth=24, class_weight='balanced', criterion='entropy',n_estimators=100, random_state=21)
rfc.fit(X_train, y_train)
predict = rfc.predict(X_valid)
print("accuracy is", accuracy_score(y_valid, predict))
print("precision is", precision_score(y_valid, predict, average='weighted'))
print("recall is", recall_score(y_valid, predict, average='weighted'))

accuracy is 0.8962962962962963
precision is 0.8969799473598222
recall is 0.8962962962962963


## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameteres.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [56]:
vc = VotingClassifier(estimators=[('svc', svc), ('dtc', dtc), ('rfc', rfc)], voting='hard')
vc = vc.fit(X_train, y_train)
predict = vc.predict(X_valid)
print("accuracy is", accuracy_score (y_valid, predict))
print("precision is", precision_score(y_valid, predict, average='weighted'))
print("recall is", recall_score(y_valid, predict, average='weighted'))

accuracy is 0.9
precision is 0.8999260615927283
recall is 0.9


In [73]:
best_acc = 0
best_prec = 0
with tqdm(total=(5 * 5 * 5)) as pbar:
    for i in range(5):
        for j in range(5):
            for k in range(5):
                vc = VotingClassifier(estimators=[('svc', svc), ('dtc', dtc), ('rfc', rfc)], voting='hard', weights=[i,j,k], n_jobs=8)
                vc = vc.fit(X_train, y_train)
                predict = vc.predict(X_test)
                acc = accuracy_score(y_test, predict)
                if acc == best_acc:
                    prec = precision_score(y_test, predict, average='weighted')
                    if best_prec < prec:
                        best_prec = prec
                        weights = [i, j, k]
                if acc > best_acc:
                    best_acc = acc
                    weights = [i, j, k]
                pbar.update(1)

  0%|          | 0/125 [00:00<?, ?it/s]

In [74]:
print("best weights:", weights)
vc = VotingClassifier(estimators=[('svc', svc), ('dtc', dtc), ('rfc', rfc)], voting='hard', weights=weights, n_jobs=8)
vc = vc.fit(X_train, y_train)
predict = vc.predict(X_test)
print("accuracy is", accuracy_score (y_test, predict))
print("precision is", precision_score(y_test, predict, average='weighted'))
print("recall is", recall_score(y_test, predict, average='weighted'))

best weights: [4, 2, 3]
accuracy is 0.9201183431952663
precision is 0.9228244006008212
recall is 0.9201183431952663


In [75]:
predict = rfc.predict(X_valid)
print("accuracy is", accuracy_score(y_valid, predict))
print("precision is", precision_score(y_valid, predict, average='weighted'))
print("recall is", recall_score(y_valid, predict, average='weighted'))

accuracy is 0.8962962962962963
precision is 0.8969799473598222
recall is 0.8962962962962963


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [88]:
best_acc = 0
best_prec = 0

with tqdm(total=(5)) as pbar:
    for n_estimators in (5, 10, 25, 50, 100):
        bc = BaggingClassifier(base_estimator=SVC(C=10,
                                class_weight=None,gamma='auto',
                                kernel='rbf', random_state=21, probability=True),
                                   random_state=21, n_jobs=8, n_estimators=n_estimators)
        bc.fit(X_train, y_train)
        predict = bc.predict(X_test)
        acc = accuracy_score(y_test, predict)
        if acc == best_acc:
            prec = precision_score(y_test, predict, average='weighted')
            if best_prec < prec:
                best_prec = prec
                weights = [i, j, k]
        if acc > best_acc:
            best_acc = acc
            params = [n_estimators]
        bc.fit(X_train, y_train)
        pbar.update(1)

  0%|          | 0/5 [00:00<?, ?it/s]

In [89]:
bc = BaggingClassifier(base_estimator=SVC(C=10,
                                    class_weight=None,gamma='auto',
                                    kernel='rbf', random_state=21, probability=True),
                                       random_state=21, n_jobs=8, n_estimators=params[0])
bc.fit(X_train, y_train)
predict = bc.predict(X_test)
print("accuracy is", accuracy_score (y_test, predict))
print("precision is", precision_score(y_test, predict, average='weighted'))
print("recall is", recall_score(y_test, predict, average='weighted'))

accuracy is 0.8846153846153846
precision is 0.8894129703464362
recall is 0.8846153846153846


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself,
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [131]:
def crossval(n_splits, model, passthrough):
    prec_arr = []
    acc_arr = []
    rec_arr = []
    estimators=[(type(model).__name__, model)]
    kf = StratifiedKFold(n_splits=n_splits, random_state=21, shuffle=True)
    for train_index, valid_index in kf.split(X_train, y_train):
        X_train_set = X_train[train_index]
        y_train_set = y_train[train_index]
        clf = StackingClassifier(estimators=estimators, final_estimator=LogisticRegression(solver='liblinear'), passthrough=passthrough, n_jobs=8)
        clf.fit(X_train_set, y_train_set)
        predict = clf.predict(X_valid)
        acc = accuracy_score (y_valid, predict)
        prec = precision_score(y_valid, predict, average='weighted')
        rec = recall_score(y_valid, predict, average='weighted')
        prec_arr += [prec]
        acc_arr += [acc]
        rec_arr += [rec]
    return np.array(acc_arr).mean(), np.array(prec).mean(), np.array(rec_arr).mean()

In [133]:
svc1 = SVC(C=10, class_weight=None, gamma='auto', kernel='rbf', random_state=21, probability=True)
dtc1 = DecisionTreeClassifier(max_depth=21, class_weight='balanced', criterion='gini', random_state=21)
rfc1 = RandomForestClassifier(max_depth=24, class_weight='balanced', criterion='entropy',n_estimators=100, random_state=21)

with tqdm(total=(6 * 2 * 3)) as pbar:
    svc_acc = 0
    svc_prec = 0
    svc_params = []

    for n_splits in [2, 3, 4, 5, 6, 7]:
        for passthrough in (False, True):
            acc, prec, rec = crossval(n_splits, svc1, passthrough)
            if acc == svc_acc:
                if svc_prec < prec:
                    svc_prec = prec
                    svc_params = [n_splits, passthrough]
            if acc > svc_acc:
                svc_acc = acc
                svc_prec = prec
                svc_params = [n_splits, passthrough]
            pbar.update(1)
    dtc_acc = 0
    dtc_prec = 0
    dtc_params = []
    for n_splits in [2, 3, 4, 5, 6, 7]:
        for passthrough in (False, True):
            acc, prec, rec = crossval(n_splits, dtc1, passthrough)
            if acc == dtc_acc:
                if dtc_prec < prec:
                    dtc_prec = prec
                    dtc_params = [n_splits, passthrough]
            if acc > dtc_acc:
                dtc_acc = acc
                dtc_prec = prec
                dtc_params = [n_splits, passthrough]
            pbar.update(1)
    rfc_acc = 0
    rfc_prec = 0
    rfc_params = []    
    for n_splits in [2, 3, 4, 5, 6, 7]:
        for passthrough in (False, True):
            acc, prec, rec = crossval(n_splits, rfc1, passthrough)
            if acc == rfc_acc:
                if rfc_prec < prec:
                    rfc_prec = prec
                    rfc_params = [n_splits, passthrough]
            if acc > rfc_acc:
                rfc_acc = acc
                rfc_prec = prec
                rfc_params = [n_splits, passthrough]
            pbar.update(1)
acc, prec, rec = crossval(svc_params[0], svc1, svc_params[1])
print("SVC accuracy is", acc)
print("SVC precision is", prec)
print("SVC recall is", rec)
acc, prec, rec = crossval(dtc_params[0], dtc1, dtc_params[1])
print("DecisionTreeClassifier accuracy is", acc)
print("DecisionTreeClassifier precision is", prec)
print("DecisionTreeClassifier recall is", rec)
acc, prec, rec = crossval(rfc_params[0], rfc1, rfc_params[1])
print("RandomForestClassifier accuracy is", acc)
print("RandomForestClassifier precision is", prec)
print("RandomForestClassifier recall is", rec)

  0%|          | 0/36 [00:00<?, ?it/s]

SVC accuracy is 0.865079365079365
SVC precision is 0.8624797266317733
SVC recall is 0.865079365079365
DecisionTreeClassifier accuracy is 0.8449735449735449
DecisionTreeClassifier precision is 0.8753737499381271
DecisionTreeClassifier recall is 0.8449735449735449
RandomForestClassifier accuracy is 0.8878306878306879
RandomForestClassifier precision is 0.8850338782100021
RandomForestClassifier recall is 0.8878306878306879


## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [138]:
rfc = RandomForestClassifier(max_depth=24, class_weight='balanced', criterion='entropy',n_estimators=100, random_state=21)
rfc.fit(X_train, y_train)

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=24, random_state=21)

In [139]:
worst_day = 1
worst_day_acc  = 1
for day in range(7):
    indexes = np.where(y_test == day)
    predict = rfc.predict(X_test[indexes])
    predict = np.round(predict, 0)
    error = accuracy_score(predict, y_test[indexes])
    if error < worst_day_acc:
        worst_day_acc = error  
        worst_day = day
print("Worst day:", worst_day, "Error:", worst_day_acc)

worst_user = -1
worst_user_acc  = 1
for user in range(2, 33):
    indexes = np.where(X[:,user] == 1)
    if len(indexes[0]) == 0:
        continue
    predict = rfc.predict(X[indexes])
    predict = np.round(predict, 0)
    error = accuracy_score(predict, y[indexes])
    if error < worst_user_acc:
        worst_user_acc = error  
        worst_user = user
print("Worst user:", df.columns[worst_user + 1], "Acc:", worst_user_acc)

worst_lab = -1
worst_lab_acc  = 1
for lab in range(32, len(df.columns) - 1):
    indexes = np.where(X[:,lab] == 1)
    if len(indexes[0]) == 0:
        continue
    predict = rfc.predict(X[indexes])
    predict = np.round(predict, 0)
    error = accuracy_score(predict, y[indexes])
    if error < worst_lab_acc:
        worst_lab_acc = error  
        worst_lab = lab
print("Worst lab:", df.columns[worst_lab + 1], "Acc:", worst_lab_acc)

Worst day: 0 Error: 0.7037037037037037
Worst user: uid_user_22 Acc: 0.7142857142857143
Worst lab: labname_lab03 Acc: 0.0
