# Day 09. Exercise 00
# Regularization

## 0. Imports

In [1]:
import pandas as pd
import numpy as np


from sklearn.metrics import confusion_matrix
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedKFold

import joblib

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv('../data/dayofweek.csv')

In [3]:
X = df.drop(columns = ['dayofweek'])
y = df.dayofweek

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [5]:
def crossval(n_splits, X, y, model_class):

    skf = StratifiedKFold(n_splits=n_splits)
   
    scor = []
    for train_index, test_index in skf.split(X, y):
        
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]   

        model_class.fit(X_train, y_train)

        y_pred_train= model_class.predict(X_train)
        y_pred_test= model_class.predict(X_test)

        accuracy_train =  accuracy_score(y_train, y_pred_train)
        accuracy_test = accuracy_score(y_test, y_pred_test)
        scor.append(accuracy_test)
        print('train - {:.5f}   |   valid - {:.5f}'.format(accuracy_train, accuracy_test))       
        
    print('Average accuracy on crossval is {:.5f}'.format(np.mean(scor)))
    print('Std is {:.5f}'.format(np.std(scor)))

In [6]:
%%time
crossval(10, X_train, y_train, LogisticRegression(random_state=21, fit_intercept=False))

train - 0.62902   |   valid - 0.59259
train - 0.64633   |   valid - 0.62963
train - 0.63479   |   valid - 0.56296
train - 0.65622   |   valid - 0.61481
train - 0.63397   |   valid - 0.57778
train - 0.64056   |   valid - 0.59259
train - 0.64138   |   valid - 0.65926
train - 0.65952   |   valid - 0.56296
train - 0.64333   |   valid - 0.59701
train - 0.63674   |   valid - 0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
CPU times: total: 1.19 s
Wall time: 569 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [7]:
%%time
crossval(10, X_train, y_train, LogisticRegression(random_state=21, 
                                                  fit_intercept=False,
                                                  penalty='none',
                                                  solver='newton-cg'))

train - 0.66694   |   valid - 0.63704
train - 0.65787   |   valid - 0.65926
train - 0.66694   |   valid - 0.57778
train - 0.66529   |   valid - 0.62963
train - 0.66694   |   valid - 0.62222
train - 0.65952   |   valid - 0.57778
train - 0.65045   |   valid - 0.69630
train - 0.68425   |   valid - 0.61481
train - 0.66474   |   valid - 0.62687
train - 0.65651   |   valid - 0.60448
Average accuracy on crossval is 0.62462
Std is 0.03379
CPU times: total: 11.2 s
Wall time: 3.7 s


In [8]:
%%time
crossval(10, X_train, y_train, LogisticRegression(random_state=21, 
                                                  fit_intercept=False,
                                                  penalty='l1',
                                                  solver='liblinear'))

train - 0.61830   |   valid - 0.54815
train - 0.62737   |   valid - 0.62222
train - 0.60511   |   valid - 0.54074
train - 0.63644   |   valid - 0.62222
train - 0.62407   |   valid - 0.55556
train - 0.62325   |   valid - 0.58519
train - 0.61253   |   valid - 0.63704
train - 0.64716   |   valid - 0.58519
train - 0.63015   |   valid - 0.59701
train - 0.61367   |   valid - 0.59701
Average accuracy on crossval is 0.58903
Std is 0.03129
CPU times: total: 156 ms
Wall time: 130 ms


In [9]:
%%time
crossval(10, X_train, y_train, LogisticRegression(random_state=21, 
                                                  fit_intercept=False,
                                                  penalty='l2',
                                                  solver='sag'))

train - 0.62902   |   valid - 0.59259
train - 0.64633   |   valid - 0.62963
train - 0.63479   |   valid - 0.56296
train - 0.65622   |   valid - 0.61481
train - 0.63397   |   valid - 0.57778
train - 0.64056   |   valid - 0.59259
train - 0.64221   |   valid - 0.65926
train - 0.65952   |   valid - 0.56296
train - 0.64333   |   valid - 0.59701
train - 0.63674   |   valid - 0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
CPU times: total: 375 ms
Wall time: 377 ms


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [10]:
%%time
crossval(10, X_train, y_train, SVC(kernel='linear', probability=True, random_state=21))

train - 0.70486   |   valid - 0.65926
train - 0.69662   |   valid - 0.75556
train - 0.69415   |   valid - 0.62222
train - 0.70239   |   valid - 0.65185
train - 0.69085   |   valid - 0.65185
train - 0.68920   |   valid - 0.64444
train - 0.69250   |   valid - 0.72593
train - 0.70074   |   valid - 0.62222
train - 0.69605   |   valid - 0.61940
train - 0.71087   |   valid - 0.63433
Average accuracy on crossval is 0.65871
Std is 0.04359
CPU times: total: 2.64 s
Wall time: 2.66 s


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [11]:
%%time
crossval(10, X_train, y_train, SVC(C=0.1, kernel='linear', probability=True, random_state=21))

train - 0.58120   |   valid - 0.55556
train - 0.57543   |   valid - 0.56296
train - 0.57378   |   valid - 0.57037
train - 0.59275   |   valid - 0.57037
train - 0.58120   |   valid - 0.54815
train - 0.57955   |   valid - 0.54815
train - 0.57296   |   valid - 0.61481
train - 0.59192   |   valid - 0.54815
train - 0.59967   |   valid - 0.52985
train - 0.57825   |   valid - 0.57463
Average accuracy on crossval is 0.56230
Std is 0.02177
CPU times: total: 2.86 s
Wall time: 2.86 s


In [12]:
%%time
crossval(10, X_train, y_train, SVC(C=5, kernel='linear', probability=True, random_state=21))

train - 0.71723   |   valid - 0.65185
train - 0.76257   |   valid - 0.81481
train - 0.77906   |   valid - 0.68148
train - 0.73537   |   valid - 0.67407
train - 0.72135   |   valid - 0.73333
train - 0.72960   |   valid - 0.70370
train - 0.72712   |   valid - 0.71852
train - 0.75021   |   valid - 0.65926
train - 0.76112   |   valid - 0.67164
train - 0.76359   |   valid - 0.68657
Average accuracy on crossval is 0.69952
Std is 0.04542
CPU times: total: 3.09 s
Wall time: 3.1 s


In [13]:
%%time
crossval(10, X_train, y_train, SVC(C=10, kernel='linear', probability=True, random_state=21))

train - 0.75021   |   valid - 0.72593
train - 0.77741   |   valid - 0.82963
train - 0.78566   |   valid - 0.68148
train - 0.76834   |   valid - 0.73333
train - 0.75185   |   valid - 0.77778
train - 0.75598   |   valid - 0.68889
train - 0.76257   |   valid - 0.74074
train - 0.77411   |   valid - 0.68889
train - 0.78254   |   valid - 0.71642
train - 0.78418   |   valid - 0.69403
Average accuracy on crossval is 0.72771
Std is 0.04417
CPU times: total: 3.77 s
Wall time: 3.78 s


## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [14]:
%%time
crossval(10, X_train, y_train, DecisionTreeClassifier(max_depth=10, random_state=21))

train - 0.81039   |   valid - 0.74074
train - 0.77741   |   valid - 0.74074
train - 0.83347   |   valid - 0.70370
train - 0.79720   |   valid - 0.76296
train - 0.82440   |   valid - 0.75556
train - 0.80379   |   valid - 0.68889
train - 0.80709   |   valid - 0.76296
train - 0.80132   |   valid - 0.65926
train - 0.80807   |   valid - 0.75373
train - 0.80478   |   valid - 0.68657
Average accuracy on crossval is 0.72551
Std is 0.03562
CPU times: total: 62.5 ms
Wall time: 61 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [15]:
%%time
crossval(10, X_train, y_train, DecisionTreeClassifier(max_depth=5, random_state=21))

train - 0.59522   |   valid - 0.53333
train - 0.56307   |   valid - 0.53333
train - 0.60181   |   valid - 0.55556
train - 0.59604   |   valid - 0.57037
train - 0.60264   |   valid - 0.57778
train - 0.57955   |   valid - 0.53333
train - 0.58368   |   valid - 0.54815
train - 0.59275   |   valid - 0.51111
train - 0.58237   |   valid - 0.56716
train - 0.60132   |   valid - 0.50000
Average accuracy on crossval is 0.54301
Std is 0.02423
CPU times: total: 62.5 ms
Wall time: 54.5 ms


In [16]:
%%time
crossval(10, X_train, y_train, DecisionTreeClassifier(max_depth=7, random_state=21))

train - 0.70322   |   valid - 0.64444
train - 0.67271   |   valid - 0.68148
train - 0.68261   |   valid - 0.57037
train - 0.68425   |   valid - 0.65185
train - 0.70734   |   valid - 0.64444
train - 0.68755   |   valid - 0.60741
train - 0.69662   |   valid - 0.71111
train - 0.68590   |   valid - 0.63704
train - 0.69357   |   valid - 0.70149
train - 0.70758   |   valid - 0.64925
Average accuracy on crossval is 0.64989
Std is 0.03971
CPU times: total: 62.5 ms
Wall time: 56 ms


In [17]:
%%time
crossval(10, X_train, y_train, DecisionTreeClassifier(max_depth=10, random_state=21))

train - 0.81039   |   valid - 0.74074
train - 0.77741   |   valid - 0.74074
train - 0.83347   |   valid - 0.70370
train - 0.79720   |   valid - 0.76296
train - 0.82440   |   valid - 0.75556
train - 0.80379   |   valid - 0.68889
train - 0.80709   |   valid - 0.76296
train - 0.80132   |   valid - 0.65926
train - 0.80807   |   valid - 0.75373
train - 0.80478   |   valid - 0.68657
Average accuracy on crossval is 0.72551
Std is 0.03562
CPU times: total: 62.5 ms
Wall time: 65 ms


In [18]:
%%time
crossval(10, X_train, y_train, DecisionTreeClassifier(max_depth=15, random_state=21))

train - 0.95796   |   valid - 0.82222
train - 0.93075   |   valid - 0.83704
train - 0.95631   |   valid - 0.83704
train - 0.95301   |   valid - 0.86667
train - 0.95136   |   valid - 0.88889
train - 0.94724   |   valid - 0.82222
train - 0.95466   |   valid - 0.90370
train - 0.94971   |   valid - 0.87407
train - 0.95305   |   valid - 0.83582
train - 0.94316   |   valid - 0.85821
Average accuracy on crossval is 0.85459
Std is 0.02682
CPU times: total: 62.5 ms
Wall time: 65.4 ms


## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [19]:
%%time
crossval(10, X_train, y_train, RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21))

train - 0.96455   |   valid - 0.88148
train - 0.96208   |   valid - 0.91852
train - 0.96785   |   valid - 0.86667
train - 0.96455   |   valid - 0.89630
train - 0.96538   |   valid - 0.91111
train - 0.96538   |   valid - 0.88148
train - 0.97115   |   valid - 0.91852
train - 0.96867   |   valid - 0.85185
train - 0.97364   |   valid - 0.88060
train - 0.97941   |   valid - 0.86567
Average accuracy on crossval is 0.88722
Std is 0.02204
CPU times: total: 734 ms
Wall time: 735 ms


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [20]:
%%time
crossval(10, X_train, y_train, RandomForestClassifier(n_estimators=20, max_depth=10, random_state=21))

train - 0.82358   |   valid - 0.77037
train - 0.84996   |   valid - 0.82963
train - 0.86645   |   valid - 0.80741
train - 0.88788   |   valid - 0.80741
train - 0.86645   |   valid - 0.85926
train - 0.84666   |   valid - 0.74074
train - 0.85326   |   valid - 0.82222
train - 0.84996   |   valid - 0.75556
train - 0.88221   |   valid - 0.79104
train - 0.87068   |   valid - 0.76119
Average accuracy on crossval is 0.79448
Std is 0.03555
CPU times: total: 297 ms
Wall time: 306 ms


In [21]:
%%time
crossval(10, X_train, y_train, RandomForestClassifier(n_estimators=100, max_depth=14, random_state=21))

train - 0.96950   |   valid - 0.87407
train - 0.96867   |   valid - 0.91111
train - 0.97197   |   valid - 0.87407
train - 0.96867   |   valid - 0.89630
train - 0.96867   |   valid - 0.90370
train - 0.96702   |   valid - 0.85926
train - 0.97444   |   valid - 0.91111
train - 0.96950   |   valid - 0.88148
train - 0.97117   |   valid - 0.88060
train - 0.97199   |   valid - 0.86567
Average accuracy on crossval is 0.88574
Std is 0.01773
CPU times: total: 1.41 s
Wall time: 1.4 s


In [22]:
%%time
crossval(10, X_train, y_train, RandomForestClassifier(n_estimators=60, max_depth=7, random_state=21))

train - 0.71805   |   valid - 0.65185
train - 0.70816   |   valid - 0.70370
train - 0.71723   |   valid - 0.62222
train - 0.73042   |   valid - 0.66667
train - 0.72712   |   valid - 0.71111
train - 0.75598   |   valid - 0.67407
train - 0.72630   |   valid - 0.73333
train - 0.73289   |   valid - 0.61481
train - 0.73558   |   valid - 0.66418
train - 0.72817   |   valid - 0.67164
Average accuracy on crossval is 0.67136
Std is 0.03541
CPU times: total: 719 ms
Wall time: 720 ms


In [23]:
%%time
crossval(10, X_train, y_train, RandomForestClassifier(n_estimators=100, max_depth=9, random_state=21))

train - 0.81781   |   valid - 0.72593
train - 0.85243   |   valid - 0.84444
train - 0.85326   |   valid - 0.77778
train - 0.84336   |   valid - 0.79259
train - 0.83759   |   valid - 0.80741
train - 0.83182   |   valid - 0.72593
train - 0.84336   |   valid - 0.79259
train - 0.84089   |   valid - 0.72593
train - 0.85173   |   valid - 0.77612
train - 0.84679   |   valid - 0.77612
Average accuracy on crossval is 0.77448
Std is 0.03705
CPU times: total: 1.22 s
Wall time: 1.23 s


In [24]:
%%time
crossval(10, X_train, y_train, RandomForestClassifier(n_estimators=150, max_depth=9, random_state=21))

train - 0.82523   |   valid - 0.73333
train - 0.86645   |   valid - 0.83704
train - 0.85903   |   valid - 0.78519
train - 0.85326   |   valid - 0.80000
train - 0.83100   |   valid - 0.81481
train - 0.84254   |   valid - 0.72593
train - 0.83677   |   valid - 0.80000
train - 0.84089   |   valid - 0.70370
train - 0.85997   |   valid - 0.77612
train - 0.85173   |   valid - 0.77612
Average accuracy on crossval is 0.77522
Std is 0.04002
CPU times: total: 1.89 s
Wall time: 1.9 s


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [25]:
best_model = RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21)
best_model.fit(X_train, y_train)

RandomForestClassifier(max_depth=14, n_estimators=50, random_state=21)

In [26]:
y_pred = best_model.predict(X_test)

In [27]:
test_acc = accuracy_score(y_test, y_pred)
print(f'Accuracy test: {test_acc:.5f}')

Accuracy test: 0.90828


In [28]:
cm = confusion_matrix(y_test, y_pred, labels=np.unique(y_test))
errors = (cm.sum(axis=1) - np.diag(cm)) / cm.sum(axis=1)
for i, day in enumerate(np.unique(y_test)):
    print(f'final accuracy for {day}: {errors[i]:.2%}')

final accuracy for 0: 25.93%
final accuracy for 1: 12.73%
final accuracy for 2: 6.67%
final accuracy for 3: 1.25%
final accuracy for 4: 19.05%
final accuracy for 5: 5.56%
final accuracy for 6: 9.86%


In [29]:
# 5. Сохраняем модель
joblib.dump(best_model, '../data/best_random_forest.joblib')

['../data/best_random_forest.joblib']

Сравнение моделей и анализ ошибок
Сравнение точности:

Логистическая регрессия: ~0.60
SVM (оптимальные параметры): ~0.72–0.75
DecisionTreeClassifier (оптимальные параметры): ~0.72–0.86
RandomForestClassifier (лучшие параметры): 0.88722 на тесте
Модель случайного леса с параметрами n_estimators=100, max_depth=14 показала наивысшую точность среди всех протестированных моделей.

Анализ ошибок по дням недели:

Наибольшая доля ошибок — для класса 0 (25.93%). Это может говорить о сложности распознавания этого дня недели, либо о недостатке признаков для его выделения.
Следующие по ошибкам: класс 4 (19.05%) и класс 1 (12.73%).
Минимальные ошибки — для классов 3 (1.25%), 5 (5.56%), 2 (6.67%), 6 (9.86%).
(0 - 6 дни недели)
Вывод:
Модель хорошо справляется с большинством классов, но для некоторых дней недели (особенно 0 и 4) ошибки выше среднего. Это может быть связано с пересечением признаков между днями или дисбалансом данных. Для дальнейшего улучшения можно попробовать увеличить количество признаков или использовать методы балансировки