# Day 09. Exercise 03
# Ensembles

## 0. Imports

In [194]:
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
from itertools import product
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.ensemble import StackingClassifier, BaggingClassifier
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import ConfusionMatrixDisplay

## 1. Preprocessing

1. Create the same dataframe as in the previous exercise.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test` and then get `X_train`, `y_train`, `X_valid`, `y_valid` from the previous `X_train`, `y_train`. Use the additional parameter `stratify`.

In [152]:
df = pd.read_csv('data/dayofweek-not-scaled.csv')

In [154]:
X = df.drop(['dayofweek'], axis=1)
y = df['dayofweek']

In [156]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y,
                                                    random_state=21)

In [158]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, stratify=y_train,
                                                    random_state=21)

## 2. Individual classifiers

1. Train SVM, decision tree and random forest again with the best parameters that you got from the 01 exercise with `random_state=21` for all of them.
2. Evaluate `accuracy`, `precision`, and `recall` for them on the validation set.
3. The result of each cell of the section should look like this:

```
accuracy is 0.87778
precision is 0.88162
recall is 0.87778
```

In [162]:
def scores_calc(X_train, y_train, X_valid, y_valid, model):
    model.fit(X_train, y_train)
    
    y_pred = model.predict(X_valid)
    
    accuracy = accuracy_score(y_valid, y_pred)
    precision = precision_score(y_valid, y_pred, average='weighted')
    recall = recall_score(y_valid, y_pred, average='weighted')
    
    print(f"Accuracy: {accuracy:.5f}")
    print(f"Precision: {precision:.5f}")
    print(f"Recall: {recall:.5f}")
    
    return accuracy, precision, recall

In [164]:
svm = SVC(kernel='rbf', C=10, gamma='auto', probability=True, random_state=21)
_ = scores_calc(X_train, y_train, X_valid, y_valid, svm)

Accuracy: 0.87778
Precision: 0.88162
Recall: 0.87778


In [165]:
dtc = DecisionTreeClassifier(max_depth=22, random_state=21)
_ = scores_calc(X_train, y_train, X_valid, y_valid, dtc)

Accuracy: 0.86296
Precision: 0.86414
Recall: 0.86296


In [166]:
rfc = RandomForestClassifier(n_estimators=100, max_depth = 28, 
                                criterion = "gini", random_state=21)
_ = scores_calc(X_train, y_train, X_valid, y_valid, rfc)

Accuracy: 0.88889
Precision: 0.88940
Recall: 0.88889


## 3. Voting classifiers

1. Using `VotingClassifier` and the three models that you have just trained, calculate the `accuracy`, `precision`, and `recall` on the validation set.
2. Play with the other parameteres.
3. Calculate the `accuracy`, `precision` and `recall` on the test set for the model with the best weights in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).

In [172]:
estimators=[
    ('svm', svm),
    ('dt', dtc),
    ('rfc', rfc)]
weights = [
    [1, 1, 1],  # Все модели имеют одинаковый вес
    [2, 1, 1],  # Первая модель имеет больший вес
    [1, 2, 1],  # Вторая модель имеет больший вес
    [1, 1, 2],   # Третья модель имеет больший вес
    [4, 1, 1],
    [1, 4, 1],
    [1, 1, 4],
    [4, 1, 4]

]

In [174]:
for w in weights:
    voting_clf = VotingClassifier(estimators=estimators, voting='soft', weights=w)
    print("Weight:", w, ':')
    _ = scores_calc(X_train, y_train, X_valid, y_valid, voting_clf)

Weight: [1, 1, 1] :
Accuracy: 0.88148
Precision: 0.88314
Recall: 0.88148
Weight: [2, 1, 1] :
Accuracy: 0.90000
Precision: 0.90125
Recall: 0.90000
Weight: [1, 2, 1] :
Accuracy: 0.86296
Precision: 0.86414
Recall: 0.86296
Weight: [1, 1, 2] :
Accuracy: 0.90000
Precision: 0.90066
Recall: 0.90000
Weight: [4, 1, 1] :
Accuracy: 0.90000
Precision: 0.90352
Recall: 0.90000
Weight: [1, 4, 1] :
Accuracy: 0.86296
Precision: 0.86414
Recall: 0.86296
Weight: [1, 1, 4] :
Accuracy: 0.89630
Precision: 0.89674
Recall: 0.89630
Weight: [4, 1, 4] :
Accuracy: 0.90741
Precision: 0.90929
Recall: 0.90741


In [175]:
for w in weights:
    voting_clf = VotingClassifier(estimators=estimators, voting='hard', weights=w)
    print("Weight:", w, ':')
    _ = scores_calc(X_train, y_train, X_valid, y_valid, voting_clf)

Weight: [1, 1, 1] :
Accuracy: 0.88889
Precision: 0.88879
Recall: 0.88889
Weight: [2, 1, 1] :
Accuracy: 0.88148
Precision: 0.88865
Recall: 0.88148
Weight: [1, 2, 1] :
Accuracy: 0.87407
Precision: 0.87439
Recall: 0.87407
Weight: [1, 1, 2] :
Accuracy: 0.89630
Precision: 0.89641
Recall: 0.89630
Weight: [4, 1, 1] :
Accuracy: 0.87778
Precision: 0.88162
Recall: 0.87778
Weight: [1, 4, 1] :
Accuracy: 0.86296
Precision: 0.86414
Recall: 0.86296
Weight: [1, 1, 4] :
Accuracy: 0.88889
Precision: 0.88940
Recall: 0.88889
Weight: [4, 1, 4] :
Accuracy: 0.88889
Precision: 0.88879
Recall: 0.88889


In [176]:
voting_clf = VotingClassifier(estimators=estimators, voting='soft', weights = [4, 1, 4])
_ = scores_calc(X_train, y_train, X_test, y_test, voting_clf)

Accuracy: 0.90237
Precision: 0.90544
Recall: 0.90237


## 4. Bagging classifiers

1. Using `BaggingClassifier` and `SVM` with the best parameters create an ensemble, try different values of the `n_estimators`, use `random_state=21`.
2. Play with the other parameters.
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision)

In [179]:
def gridSearch(X_train, y_train, X_valid, y_valid, model, param_grid):
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=21)

    grid_search = GridSearchCV(
        estimator=model,
        param_grid=param_grid,
        scoring='accuracy',
        cv=skf,
        return_train_score=True,
        n_jobs=-1,  # Использовать все ядра процессора
        verbose = 1
    )

    grid_search.fit(X_train, y_train)

    best_params = grid_search.best_params_
    best_score = grid_search.best_score_

    best_model = grid_search.best_estimator_
    best_model.fit(X_train, y_train)

    y_pred = best_model.predict(X_valid)
    valid_score = accuracy_score(y_valid, y_pred)

    results_df = pd.DataFrame(grid_search.cv_results_)
    results_df_sorted = results_df.sort_values(by='rank_test_score')

    return best_params, best_score, valid_score, results_df_sorted

In [180]:
bagging_clf = BaggingClassifier(svm)

In [181]:
param_grid = {'n_estimators': range(10, 60, 10), 'random_state': [21]}
best_params, best_score, valid_score, results_df_sorted = gridSearch(X_train, y_train, X_valid, y_valid, bagging_clf, param_grid)

Fitting 3 folds for each of 5 candidates, totalling 15 fits


In [183]:
print(best_params, best_score, valid_score)

{'n_estimators': 20, 'random_state': 21} 0.784798308057361 0.8851851851851852


In [184]:
best_bagging_clf = BaggingClassifier(svm, n_estimators = 30, random_state=21)
_ = scores_calc(X_train, y_train, X_test, y_test, best_bagging_clf)

Accuracy: 0.87278
Precision: 0.87840
Recall: 0.87278


## 5. Stacking classifiers

1. To achieve reproducibility in this case you will have to create an object of cross-validation generator: `StratifiedKFold(n_splits=n, shuffle=True, random_state=21)`, where `n` you will try to optimize (the details are below).
2. Using `StackingClassifier` and the three models that you have recently trained, calculate the `accuracy`, `precision` and `recall` on the validation set, try different values of `n_splits` `[2, 3, 4, 5, 6, 7]` in the cross-validation generator and parameter `passthrough` in the classifier itself,
3. Calculate the `accuracy`, `precision`, and `recall` for the model with the best parameters (in terms of accuracy) on the test set (if there are several of them with equal values, choose the one with the higher precision). Use `final_estimator=LogisticRegression(solver='liblinear')`.

In [212]:
results = []

n_splits_values = [2, 3, 4, 5, 6, 7]
passthrough_values = [True, False]

for n_splits in n_splits_values:
    for passthrough in passthrough_values:
        cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=21)
        
        stacking_clf = StackingClassifier(estimators=[
                                                        ('svm', svm),
                                                        ('dt', dtc),
                                                        ('rfc', rfc)
                                                    ], cv = cv,
                                           passthrough=passthrough,
            final_estimator=LogisticRegression(max_iter=1000, solver='liblinear'))
        
        accuracy = cross_val_score(stacking_clf, X_train, y_train, cv=cv, scoring='accuracy').mean()
        
        results.append((n_splits, passthrough, accuracy))

In [213]:
results

[(2, True, 0.8413729128014842),
 (2, False, 0.8283858998144713),
 (3, True, 0.8654776642938202),
 (3, False, 0.8682657587950068),
 (4, True, 0.8812370921106981),
 (4, False, 0.8821733443480656),
 (5, True, 0.8979715762273901),
 (5, False, 0.8998277347114556),
 (6, True, 0.8970411752534658),
 (6, False, 0.8942530519346162),
 (7, True, 0.8979591836734694),
 (7, False, 0.901669758812616)]

In [220]:
cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=21)
best_stacking_clf = StackingClassifier(estimators=[
                                                        ('svm', svm),
                                                        ('dt', dtc),
                                                        ('rfc', rfc)
                                                    ], 
                                           passthrough=False, cv = cv,
                                           final_estimator = LogisticRegression(solver='liblinear', max_iter=1000))
_ = scores_calc(X_train, y_train, X_test, y_test, best_stacking_clf)

Accuracy: 0.89941
Precision: 0.90327
Recall: 0.89941


## 6. Predictions

1. Choose the best model in terms of accuracy (if there are several of them with equal values, choose the one with the higher precision).
2. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your full dataset), for which labname and for which users.
3. Save the model.

In [222]:
best_model = VotingClassifier(estimators=estimators, voting='soft', weights = [4, 1, 4]).fit(X_train, y_train)

In [224]:
df_forecast = pd.DataFrame({"predict":best_model.predict(X_test)}, index= y_test.index)
df_fit = pd.DataFrame({"predict":best_model.predict(X_train)}, index= y_train.index)
df['forecast'] = pd.concat([df_fit, df_forecast])

In [226]:
df['Error'] = (df["forecast"]!=df.dayofweek)*1
error_analysis = df.groupby(['dayofweek']).agg(
    total_samples=('Error', 'size'),
    total_errors=('Error', 'sum')
).reset_index()
error_analysis['perc_error'] = (error_analysis.total_errors/error_analysis.total_samples).round(2)

In [228]:
error_analysis.sort_values(by = 'perc_error', ascending=False)

Unnamed: 0,dayofweek,total_samples,total_errors,perc_error
0,0,136,32,0.24
4,4,104,20,0.19
5,5,271,52,0.19
1,1,274,50,0.18
6,6,356,64,0.18
2,2,149,26,0.17
3,3,396,68,0.17


In [230]:
((df[df.Error == 1].iloc[:,2:31].sum()/df.iloc[:,2:31].sum()).round(2)).sort_values(ascending=False)

uid_user_23    0.50
uid_user_6     0.33
uid_user_17    0.32
uid_user_18    0.29
uid_user_22    0.29
uid_user_16    0.28
uid_user_29    0.25
uid_user_3     0.24
uid_user_15    0.24
uid_user_14    0.23
uid_user_27    0.22
uid_user_2     0.21
uid_user_4     0.21
uid_user_20    0.19
uid_user_24    0.18
dayofweek      0.18
uid_user_26    0.17
uid_user_25    0.16
uid_user_31    0.15
uid_user_1     0.15
uid_user_10    0.14
uid_user_19    0.13
uid_user_28    0.13
uid_user_30    0.13
uid_user_13    0.12
uid_user_12    0.12
uid_user_21    0.09
uid_user_0     0.00
uid_user_11    0.00
dtype: float64

In [232]:
((df[df.Error == 1].iloc[:,33:-2].sum()/df.iloc[:,33:-2].sum()).round(2)).sort_values(ascending=False)

labname_lab03       1.00
labname_lab03s      1.00
labname_lab05s      0.25
labname_laba06      0.23
labname_laba04s     0.22
labname_laba04      0.21
labname_laba06s     0.20
labname_code_rvw    0.18
labname_project1    0.18
labname_laba05      0.16
labname_lab02       0.00
dtype: float64

In [234]:
joblib.dump(best_model, 'data/best_ensemle_ex03.joblib')

['data/best_ensemle_ex03.joblib']