# Day 09. Exercise 00
# Regularization

## 0. Imports

In [4]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from joblib import dump, load
import numpy as np


## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. !!!!!!!!!!!!!!!!!!!!!!!! Use the additional parameter `stratify`.

In [81]:
df = pd.read_csv('../data/dayofweek.csv')
X = df.drop(['dayofweek'], axis=1).values
y = df[['dayofweek']].values[:, 0]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

#now the distribution partition is about ~ 4 for all classes (thx for parameter straitify)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [173]:
def crossval(n_splits, X, y, model):
    train_rmse = []
    test_rmse = []
    kf = KFold(n_splits=n_splits, random_state=21, shuffle=True)
    for train_index, test_index in kf.split(X):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        y_predict_train = model.predict(X_train)
        y_predict_test = model.predict(X_test)
        test = mean_squared_error(y_predict_test, y_test)
        train  = mean_squared_error(y_predict_train, y_train)
        print("train - ", round(train, 5), '\t|\t', "valid - ", round(test, 5))
        train_rmse += [train]
        test_rmse += [test]
    print("Average accuracy on crossval is", round(np.array(test_rmse).mean(), 5))
    print("Std is ", round(np.array(test_rmse).std(), 5))
    return np.array(test_rmse).mean(), np.array(test_rmse).std()

In [174]:
%%time
logreg = LogisticRegression(random_state=21, fit_intercept=False)
#logreg.fit(X_train, y_train)
crossval(10, X, y, logreg)

train -  2.21622     |     valid -  1.5503
train -  2.07251     |     valid -  2.84024
train -  2.14041     |     valid -  2.21893
train -  2.23665     |     valid -  2.65089
train -  2.17007     |     valid -  2.42012
train -  2.27818     |     valid -  2.07692
train -  2.29051     |     valid -  2.66071
train -  2.12055     |     valid -  2.52976
train -  2.20356     |     valid -  3.14881
train -  2.30632     |     valid -  1.8869
Average accuracy on crossval is 2.39836
Std is  0.45001
CPU times: user 6.98 s, sys: 5.59 s, total: 12.6 s
Wall time: 1.67 s


(2.3983586925894618, 0.45000929259262157)

### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [152]:
%%time
logreg_none = LogisticRegression(penalty='none', random_state=21, fit_intercept=False, solver='newton-cg')
#logreg_none.fit(X_train, y_train)
crossval(10, X, y, logreg_none)

train -  2.12195     |     valid -  1.53254
train -  2.0468     |     valid -  2.73964
train -  2.13052     |     valid -  2.23077
train -  2.07779     |     valid -  2.24852
train -  2.02175     |     valid -  2.07101
train -  2.12327     |     valid -  2.12426
train -  2.16601     |     valid -  2.6131
train -  2.08432     |     valid -  1.83333
train -  2.03294     |     valid -  3.17857
train -  2.02174     |     valid -  1.79167
Average accuracy on crossval is 2.23634
Std is  0.46676
CPU times: user 29.5 s, sys: 23 s, total: 52.5 s
Wall time: 6.97 s


In [153]:
%%time
logreg_l1 = LogisticRegression(penalty='l1', random_state=21, fit_intercept=False, solver='liblinear')
#logreg_none.fit(X_train, y_train)
crossval(10, X, y, logreg_none)

train -  2.12195     |     valid -  1.53254
train -  2.0468     |     valid -  2.73964
train -  2.13052     |     valid -  2.23077
train -  2.07779     |     valid -  2.24852
train -  2.02175     |     valid -  2.07101
train -  2.12327     |     valid -  2.12426
train -  2.16601     |     valid -  2.6131
train -  2.08432     |     valid -  1.83333
train -  2.03294     |     valid -  3.17857
train -  2.02174     |     valid -  1.79167
Average accuracy on crossval is 2.23634
Std is  0.46676
CPU times: user 29.4 s, sys: 22.4 s, total: 51.8 s
Wall time: 6.91 s


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [165]:
%%time
svc = SVC(probability=True, random_state=21, kernel='linear')
crossval(10, X, y, svc)

train -  2.01187     |     valid -  1.42604
train -  1.84641     |     valid -  2.48521
train -  1.91562     |     valid -  2.26627
train -  2.07976     |     valid -  2.45562
train -  1.91694     |     valid -  2.06509
train -  1.98484     |     valid -  1.39053
train -  2.02701     |     valid -  2.40476
train -  1.9137     |     valid -  2.38095
train -  1.93412     |     valid -  2.36905
train -  2.01713     |     valid -  1.65476
Average accuracy on crossval is 2.08983
Std is  0.41264
CPU times: user 5.64 s, sys: 0 ns, total: 5.64 s
Wall time: 5.64 s


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [176]:
%%time
#dofault value of C is 1
for c in [0.5, 1.7, 2, 3, 4, 10]:
    print("C parameter is:", c)
    svc = SVC(probability=True, C=c, random_state=21, kernel='linear')
    crossval(10, X, y, svc)
    print()
    

C parameter is: 0.5
train -  2.11338     |     valid -  1.62722
train -  1.98484     |     valid -  2.85799
train -  2.15755     |     valid -  2.40828
train -  2.19183     |     valid -  2.91716
train -  2.05603     |     valid -  1.88166
train -  2.12986     |     valid -  1.64497
train -  2.20553     |     valid -  2.24405
train -  2.15349     |     valid -  2.53571
train -  2.20356     |     valid -  2.46429
train -  2.18841     |     valid -  1.79167
Average accuracy on crossval is 2.2373
Std is  0.45447

C parameter is: 1.7
train -  2.00066     |     valid -  1.52071
train -  1.7027     |     valid -  2.20118
train -  1.86223     |     valid -  2.3432
train -  1.84838     |     valid -  2.29586
train -  1.9143     |     valid -  2.06509
train -  1.93408     |     valid -  1.4142
train -  2.06258     |     valid -  2.41071
train -  1.88603     |     valid -  2.33929
train -  1.93742     |     valid -  2.39286
train -  1.97563     |     valid -  1.84524
Average accuracy on crossval

## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [206]:
%%time
dtr = DecisionTreeRegressor(max_depth=10, random_state=21)
crossval(10, X, y, dtr)

train -  1.02315     |     valid -  1.01324
train -  0.95225     |     valid -  1.61382
train -  0.89033     |     valid -  1.48791
train -  1.10181     |     valid -  1.97395
train -  0.94216     |     valid -  1.30399
train -  1.04046     |     valid -  0.83123
train -  1.0133     |     valid -  1.55136
train -  0.95846     |     valid -  1.5373
train -  0.88345     |     valid -  1.6109
train -  0.88466     |     valid -  1.05682
Average accuracy on crossval is 1.39805
Std is  0.32698
CPU times: user 51.6 ms, sys: 0 ns, total: 51.6 ms
Wall time: 48.8 ms


(1.3980511980642922, 0.3269753579622608)

### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [205]:
%%time
for max_depth in [14, 17, 20, 25, 30, 50, 100]:
    print("Max depth parameter is:", max_depth)
    dtr = DecisionTreeRegressor(max_depth=max_depth, random_state=21)
    crossval(10, X, y, dtr)
    print()
#the optimal depth is near 25

Max depth parameter is: 14
train -  0.52356     |     valid -  0.85309
train -  0.5671     |     valid -  1.13364
train -  0.47109     |     valid -  1.4898
train -  0.70651     |     valid -  1.48805
train -  0.35379     |     valid -  0.92586
train -  0.50839     |     valid -  0.87273
train -  0.4908     |     valid -  1.24304
train -  0.54201     |     valid -  0.90476
train -  0.37308     |     valid -  1.28226
train -  0.34434     |     valid -  0.80634
Average accuracy on crossval is 1.09996
Std is  0.24993

Max depth parameter is: 17
train -  0.19661     |     valid -  0.76592
train -  0.20558     |     valid -  0.84105
train -  0.15992     |     valid -  1.21175
train -  0.50041     |     valid -  1.42551
train -  0.0815     |     valid -  0.75332
train -  0.19045     |     valid -  0.7454
train -  0.22106     |     valid -  1.0274
train -  0.19525     |     valid -  0.79481
train -  0.15535     |     valid -  1.30353
train -  0.10297     |     valid -  0.62858
Average accurac

In [214]:
%%time
for max_features in [4, 7, 10, 20]:
    print("max_features parameter is:", max_features)
    dtr = DecisionTreeRegressor(max_depth=25, max_features=max_features, random_state=21)
    crossval(10, X, y, dtr)
    print()

max_features parameter is: 4
train -  0.21043     |     valid -  0.69411
train -  0.24427     |     valid -  0.69413
train -  0.16982     |     valid -  1.44526
train -  0.30576     |     valid -  2.01882
train -  0.15506     |     valid -  1.05201
train -  0.24976     |     valid -  0.74177
train -  0.37656     |     valid -  0.96246
train -  0.27423     |     valid -  1.85279
train -  0.3441     |     valid -  1.3065
train -  0.16678     |     valid -  1.07519
Average accuracy on crossval is 1.1843
Std is  0.44512

max_features parameter is: 7
train -  0.10085     |     valid -  0.79768
train -  0.11865     |     valid -  1.09355
train -  0.03449     |     valid -  1.44914
train -  0.04093     |     valid -  1.15904
train -  0.15126     |     valid -  1.25578
train -  0.22676     |     valid -  0.44646
train -  0.03605     |     valid -  1.01563
train -  0.0214     |     valid -  1.23034
train -  0.06943     |     valid -  1.30306
train -  0.06972     |     valid -  0.99308
Average a

## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [215]:
%%time
rfr = RandomForestRegressor(n_estimators=50, max_depth=14, random_state=21)
crossval(10, X, y, dtr)

train -  0.01676     |     valid -  0.91627
train -  0.02163     |     valid -  1.03337
train -  0.00398     |     valid -  1.07512
train -  0.04247     |     valid -  0.62941
train -  0.0298     |     valid -  0.74577
train -  0.151     |     valid -  0.90008
train -  0.01027     |     valid -  1.00967
train -  0.01332     |     valid -  1.01083
train -  0.00489     |     valid -  1.045
train -  0.02225     |     valid -  0.9043
Average accuracy on crossval is 0.92698
Std is  0.13557
CPU times: user 47.7 ms, sys: 824 µs, total: 48.5 ms
Wall time: 45.6 ms


(0.926982450276123, 0.13557229445362104)

### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [213]:
best_mean_acc = 100

for max_depth in [10, 20, 25, 30, 50]:
    for n_estimators in [30, 70, 90, 120]:
        print("max_depth:", max_depth, "n_estimators", n_estimators)
        dtr = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=21)
        acc, _ = crossval(10, X, y, dtr)
        if (acc < best_mean_acc):
            best_mean_acc = acc
            best_depth = max_depth
            best_est = n_estimators
        print()
print("best_mean_acc", best_mean_acc, "best_depth", best_depth, "best n_est", best_est)    

max_depth: 10 n_estimators 30
train -  0.7702     |     valid -  0.73782
train -  0.71903     |     valid -  1.24285
train -  0.71961     |     valid -  1.30585
train -  0.72041     |     valid -  1.32863
train -  0.6721     |     valid -  0.80922
train -  0.78579     |     valid -  0.84013
train -  0.67977     |     valid -  1.26031
train -  0.67473     |     valid -  1.13262
train -  0.67635     |     valid -  1.06397
train -  0.72462     |     valid -  0.78805
Average accuracy on crossval is 1.05094
Std is  0.22352

max_depth: 10 n_estimators 70
train -  0.74707     |     valid -  0.70152
train -  0.70003     |     valid -  1.14894
train -  0.70005     |     valid -  1.32663
train -  0.69444     |     valid -  1.24459
train -  0.68106     |     valid -  0.81018
train -  0.79272     |     valid -  0.84688
train -  0.67221     |     valid -  1.2328
train -  0.69116     |     valid -  1.18575
train -  0.6664     |     valid -  1.07911
train -  0.70934     |     valid -  0.76153
Average

train -  0.06906     |     valid -  0.56744
train -  0.07427     |     valid -  0.61722
train -  0.07806     |     valid -  0.37235
Average accuracy on crossval is 0.51791
Std is  0.11896

max_depth: 50 n_estimators 30
train -  0.08317     |     valid -  0.49138
train -  0.08515     |     valid -  0.57946
train -  0.08625     |     valid -  0.78625
train -  0.08328     |     valid -  0.59425
train -  0.07986     |     valid -  0.40427
train -  0.08808     |     valid -  0.40682
train -  0.06997     |     valid -  0.5514
train -  0.08131     |     valid -  0.57753
train -  0.08002     |     valid -  0.61277
train -  0.08274     |     valid -  0.38834
Average accuracy on crossval is 0.53925
Std is  0.11554

max_depth: 50 n_estimators 70
train -  0.07089     |     valid -  0.42513
train -  0.06967     |     valid -  0.53223
train -  0.07242     |     valid -  0.77751
train -  0.071     |     valid -  0.56668
train -  0.07338     |     valid -  0.43586
train -  0.07721     |     valid -  0

## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [223]:
dtr = RandomForestRegressor(n_estimators=best_est, max_depth=best_depth, random_state=21)
crossval(10, X, y, dtr)
predict = dtr.predict(X_test)
print("Accuracy: ", mean_squared_error(predict, y_test))

train -  0.07008     |     valid -  0.40723
train -  0.07023     |     valid -  0.52955
train -  0.07206     |     valid -  0.77441
train -  0.0716     |     valid -  0.57044
train -  0.07305     |     valid -  0.42311
train -  0.07933     |     valid -  0.37072
train -  0.06681     |     valid -  0.53448
train -  0.06988     |     valid -  0.57456
train -  0.07262     |     valid -  0.62904
train -  0.07658     |     valid -  0.35155
Average accuracy on crossval is 0.51651
Std is  0.12453
Accuracy:  0.09465373657681349


In [258]:
worst_day = 1
worst_day_acc  = 1
for day in range(7):
    indexes = np.where(y_test == day)
    predict = dtr.predict(X_test[indexes])
    predict = np.round(predict, 0)
    error = accuracy_score(predict, y_test[indexes])
    #print("day", day, "Acc:", mean_squared_error(predict, y_test[indexes]))
    if error < worst_day_acc:
        worst_day_acc = error  
        worst_day = day
print("Worst day:", worst_day, "Error:", worst_day_acc)

Worst day: 0 Error: 0.7037037037037037


In [260]:
dump(dtr, "RandomForestModel")

['RandomForestModel']