# Day 09. Exercise 00
# Regularization

## 0. Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv('../data/dayofweek.csv')

X = df.drop('dayofweek', axis=1)
y = df['dayofweek']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [3]:
def print_crossval_scores(model, X_train, y_train, cv=10, verbose=True):
    skf = StratifiedKFold(n_splits=cv, random_state=21, shuffle=True)
    train_scores = []
    valid_scores = []

    for train_index, valid_index in skf.split(X_train, y_train):
        X_train_fold, X_valid_fold = X_train.iloc[train_index], X_train.iloc[valid_index]
        y_train_fold, y_valid_fold = y_train.iloc[train_index], y_train.iloc[valid_index]

        model.fit(X_train_fold, y_train_fold)

        train_score = model.score(X_train_fold, y_train_fold)
        valid_score = model.score(X_valid_fold, y_valid_fold)

        train_scores.append(train_score)
        valid_scores.append(valid_score)
        if verbose:
            print(f"train - {train_score:.5f}   |   valid - {valid_score:.5f}")

    avg_accuracy = np.mean(valid_scores)
    std_accuracy = np.std(valid_scores)
    print(f"Average accuracy on crossval is {avg_accuracy:.5f}")
    print(f"Std is {std_accuracy:.5f}")

In [4]:
%%time

model = LogisticRegression(random_state=21, fit_intercept=False)
print_crossval_scores(model, X_train, y_train)

train - 0.68920   |   valid - 0.66667
train - 0.68508   |   valid - 0.62963
train - 0.69250   |   valid - 0.63704
train - 0.69744   |   valid - 0.67407
train - 0.69085   |   valid - 0.68889
train - 0.68920   |   valid - 0.65926
train - 0.70981   |   valid - 0.62963
train - 0.70322   |   valid - 0.59259
train - 0.68534   |   valid - 0.71642
train - 0.70428   |   valid - 0.61940
Average accuracy on crossval is 0.65136
Std is 0.03475
CPU times: user 3.67 s, sys: 3.46 s, total: 7.12 s
Wall time: 1.1 s


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [5]:
%%time

params = {
    'l1': ['liblinear'],
    'l2': ['lbfgs', 'liblinear', 'newton-cg', 'sag', 'saga'],
    None: ['newton-cg']
}

for penalty in params:
    for solver in params[penalty]:
        model = LogisticRegression(penalty=penalty, solver=solver, random_state=21, fit_intercept=False)
        print(f"Penalty: {penalty}, Solver: {solver}")
        print_crossval_scores(model, X_train, y_train, verbose=False)

Penalty: l1, Solver: liblinear
Average accuracy on crossval is 0.63946
Std is 0.04613
Penalty: l2, Solver: lbfgs
Average accuracy on crossval is 0.65136
Std is 0.03475
Penalty: l2, Solver: liblinear
Average accuracy on crossval is 0.63504
Std is 0.04654
Penalty: l2, Solver: newton-cg
Average accuracy on crossval is 0.65136
Std is 0.03475
Penalty: l2, Solver: sag
Average accuracy on crossval is 0.65136
Std is 0.03475
Penalty: l2, Solver: saga
Average accuracy on crossval is 0.65136
Std is 0.03475
Penalty: None, Solver: newton-cg
Average accuracy on crossval is 0.66247
Std is 0.04018
CPU times: user 27.6 s, sys: 24.2 s, total: 51.8 s
Wall time: 9.61 s


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [6]:
%%time

model = SVC(probability=True, kernel='linear', random_state=21)
print_crossval_scores(model, X_train, y_train)

train - 0.77164   |   valid - 0.71111
train - 0.77988   |   valid - 0.67407
train - 0.76752   |   valid - 0.71852
train - 0.77659   |   valid - 0.69630
train - 0.76587   |   valid - 0.74815
train - 0.77329   |   valid - 0.76296
train - 0.79308   |   valid - 0.63704
train - 0.77824   |   valid - 0.65926
train - 0.76194   |   valid - 0.70896
train - 0.78336   |   valid - 0.64925
Average accuracy on crossval is 0.69656
Std is 0.03949
CPU times: user 3.8 s, sys: 18.3 ms, total: 3.82 s
Wall time: 3.82 s


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [7]:
%%time

params = [0.01, 0.1, 1, 10, 100]

for C in params:
    model = SVC(probability=True, C=C, kernel='linear', random_state=21)
    print(f"C: {C}")
    print_crossval_scores(model, X_train, y_train, verbose=False)

C: 0.01
Average accuracy on crossval is 0.38719
Std is 0.02551
C: 0.1
Average accuracy on crossval is 0.60459
Std is 0.03271
C: 1
Average accuracy on crossval is 0.69656
Std is 0.03949
C: 10
Average accuracy on crossval is 0.73886
Std is 0.03835
C: 100
Average accuracy on crossval is 0.77077
Std is 0.02579
CPU times: user 35.3 s, sys: 108 ms, total: 35.4 s
Wall time: 35.4 s


## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [8]:
%%time

model = DecisionTreeClassifier(max_depth=10, random_state=21)
print_crossval_scores(model, X_train, y_train)

train - 0.80379   |   valid - 0.75556
train - 0.80214   |   valid - 0.71852
train - 0.80049   |   valid - 0.71852
train - 0.81863   |   valid - 0.82963
train - 0.79802   |   valid - 0.68889
train - 0.81616   |   valid - 0.74815
train - 0.80791   |   valid - 0.66667
train - 0.81369   |   valid - 0.68148
train - 0.79901   |   valid - 0.76119
train - 0.82043   |   valid - 0.73881
Average accuracy on crossval is 0.73074
Std is 0.04503
CPU times: user 147 ms, sys: 0 ns, total: 147 ms
Wall time: 146 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [9]:
%%time

params = [2, 3, 5, 8, 10, 12]

for depth in params:
    print(f"Depth: {depth}")
    model = DecisionTreeClassifier(max_depth=depth, random_state=21)
    print_crossval_scores(model, X_train, y_train, verbose=False)

Depth: 2
Average accuracy on crossval is 0.43467
Std is 0.02310
Depth: 3
Average accuracy on crossval is 0.47105
Std is 0.01833
Depth: 5
Average accuracy on crossval is 0.57268
Std is 0.03400
Depth: 8
Average accuracy on crossval is 0.68548
Std is 0.04417
Depth: 10
Average accuracy on crossval is 0.73074
Std is 0.04503
Depth: 12
Average accuracy on crossval is 0.81603
Std is 0.03774
CPU times: user 736 ms, sys: 1.4 ms, total: 738 ms
Wall time: 737 ms


In [10]:
%%time

params = [2, 3, 4, 5]

for min_samples_split in params:
    print(f"Min samples split: {min_samples_split}")
    model = DecisionTreeClassifier(max_depth=10, min_samples_split=min_samples_split, random_state=21)
    print_crossval_scores(model, X_train, y_train, verbose=False)

Min samples split: 2
Average accuracy on crossval is 0.73074
Std is 0.04503
Min samples split: 3
Average accuracy on crossval is 0.72926
Std is 0.04659
Min samples split: 4
Average accuracy on crossval is 0.72925
Std is 0.04336
Min samples split: 5
Average accuracy on crossval is 0.72701
Std is 0.04688
CPU times: user 636 ms, sys: 1.95 ms, total: 638 ms
Wall time: 637 ms


In [11]:
%%time

params = [2, 3, 4, 5]

for min_samples_leaf in params:
    print(f"Min samples leaf: {min_samples_leaf}")
    model = DecisionTreeClassifier(max_depth=10, min_samples_leaf=min_samples_leaf, random_state=21)
    print_crossval_scores(model, X_train, y_train, verbose=False)

Min samples leaf: 2
Average accuracy on crossval is 0.72109
Std is 0.04850
Min samples leaf: 3
Average accuracy on crossval is 0.71589
Std is 0.05231
Min samples leaf: 4
Average accuracy on crossval is 0.70180
Std is 0.04843
Min samples leaf: 5
Average accuracy on crossval is 0.68693
Std is 0.05078
CPU times: user 493 ms, sys: 1.08 ms, total: 494 ms
Wall time: 492 ms


## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [12]:
%%time

model = RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21)
print_crossval_scores(model, X_train, y_train)

train - 0.95383   |   valid - 0.87407
train - 0.94806   |   valid - 0.81481
train - 0.95218   |   valid - 0.86667
train - 0.93817   |   valid - 0.85185
train - 0.94147   |   valid - 0.84444
train - 0.95383   |   valid - 0.85926
train - 0.94064   |   valid - 0.84444
train - 0.94559   |   valid - 0.82963
train - 0.94893   |   valid - 0.90299
train - 0.94399   |   valid - 0.76119
Average accuracy on crossval is 0.84494
Std is 0.03619
CPU times: user 1.35 s, sys: 3.08 ms, total: 1.36 s
Wall time: 1.36 s


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [13]:
%%time

params = {
    'n_estimators': [20, 50, 100],
    'max_depth': [5, 14, 18],
}

for n_estimators in params['n_estimators']:
    for max_depth in params['max_depth']:
        model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, random_state=21)
        print(f"n_estimators: {n_estimators}, max_depth: {max_depth}")
        print_crossval_scores(model, X_train, y_train, verbose=False)

n_estimators: 20, max_depth: 5
Average accuracy on crossval is 0.53334
Std is 0.03147
n_estimators: 20, max_depth: 14
Average accuracy on crossval is 0.83384
Std is 0.03670
n_estimators: 20, max_depth: 18
Average accuracy on crossval is 0.87016
Std is 0.03080
n_estimators: 50, max_depth: 5
Average accuracy on crossval is 0.56227
Std is 0.03321
n_estimators: 50, max_depth: 14
Average accuracy on crossval is 0.84494
Std is 0.03619
n_estimators: 50, max_depth: 18
Average accuracy on crossval is 0.87907
Std is 0.02579
n_estimators: 100, max_depth: 5
Average accuracy on crossval is 0.55338
Std is 0.03172
n_estimators: 100, max_depth: 14
Average accuracy on crossval is 0.84198
Std is 0.03177
n_estimators: 100, max_depth: 18
Average accuracy on crossval is 0.87759
Std is 0.02607
CPU times: user 12.9 s, sys: 30.7 ms, total: 13 s
Wall time: 13 s


In [14]:
%%time

params = [2, 3, 4, 5]

for min_samples_split in params:
    print(f"Min samples split: {min_samples_split}")
    model = RandomForestClassifier(n_estimators=50, max_depth=18, random_state=21)
    print_crossval_scores(model, X_train, y_train, verbose=False)

Min samples split: 2


Average accuracy on crossval is 0.87907
Std is 0.02579
Min samples split: 3
Average accuracy on crossval is 0.87907
Std is 0.02579
Min samples split: 4
Average accuracy on crossval is 0.87907
Std is 0.02579
Min samples split: 5
Average accuracy on crossval is 0.87907
Std is 0.02579
CPU times: user 5.71 s, sys: 18.7 ms, total: 5.73 s
Wall time: 5.74 s


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [15]:
best_model = RandomForestClassifier(n_estimators=50, max_depth=18, random_state=21)
best_model.fit(X_train, y_train)

y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Final Accuracy:", accuracy)

Final Accuracy: 0.8816568047337278


In [16]:
result = pd.DataFrame({'actual': y_test, 'predicted': y_pred})

errors = result[result['actual'] != result['predicted']]['actual'].value_counts(normalize=True) * 100

errors_df = pd.DataFrame({'weekday': errors.index, 'error_percentage': errors.values})
errors_df['weekday'] = errors_df['weekday'].replace({0: 'Monday', 1: 'Tuesday', 2: 'Wednesday', 3: 'Thursday', 4: 'Friday', 5: 'Saturday', 6: 'Sunday'})

errors_df

Unnamed: 0,weekday,error_percentage
0,Tuesday,20.0
1,Monday,20.0
2,Saturday,17.5
3,Wednesday,15.0
4,Sunday,12.5
5,Friday,10.0
6,Thursday,5.0


In [17]:
from joblib import dump

filepath = '../data/random_forest_model.joblib'
dump(best_model, filepath)

['../data/random_forest_model.joblib']