# Day 09. Exercise 00
# Regularization

## 0. Imports

In [51]:
import pandas as pd 
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
import joblib
import numpy as np 

## 1. Preprocessing

1. Read the file `dayofweek.csv` that you used in the previous day to a dataframe.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [18]:
df=pd.read_csv("/home/danka/bootcampDS/DS_Bootcamp.Day08.ID_886521-1/src/data/dayofweek.csv", sep=',')
df.head()

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,-0.788667,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,-0.756764,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,-0.724861,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,-0.692958,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,-0.661055,-2.562352,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [19]:
X=df.loc[:, df.columns!='dayofweek']
Y=df['dayofweek']

In [20]:
X_train, X_test, Y_train, Y_test=train_test_split(X, Y, test_size=0.2, random_state=21, stratify=Y)

## 2. Logreg regularization

### a. Default regularization

1. Train a baseline model with the only parameters `random_state=21`, `fit_intercept=False`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model


The result of the code where you trained and evaluated the baseline model should be exactly like this (use `%%time` to get the info about how long it took to run the cell):

```
train -  0.62902   |   valid -  0.59259
train -  0.64633   |   valid -  0.62963
train -  0.63479   |   valid -  0.56296
train -  0.65622   |   valid -  0.61481
train -  0.63397   |   valid -  0.57778
train -  0.64056   |   valid -  0.59259
train -  0.64138   |   valid -  0.65926
train -  0.65952   |   valid -  0.56296
train -  0.64333   |   valid -  0.59701
train -  0.63674   |   valid -  0.62687
Average accuracy on crossval is 0.60165
Std is 0.02943
```

In [26]:
def stratified_cross_val(n_splits, X_1, y_1, model):
    folds=StratifiedKFold(n_splits=n_splits)
    accuracy_test=[]
    accuracy_train=[]

    for train_ind, test_ind in folds.split(X_1, y_1):
        model.fit(X_1.iloc[train_ind], y_1.iloc[train_ind])
        predict_train=model.predict(X_1.iloc[train_ind])
        predict_test=model.predict(X_1.iloc[test_ind])

        accuracy_train.append(accuracy_score(y_1.iloc[train_ind], predict_train))
        accuracy_test.append(accuracy_score(y_1.iloc[test_ind], predict_test))

    for i in range(len(accuracy_test)):
        print(f"train - {accuracy_train[i]:.5f}  |  test - {accuracy_test[i]:.5f}")
    print(f"Avereage accuracy in cross validation is {np.mean(accuracy_test):.5f}")
    print(f"STD is {np.std(accuracy_test):.5f}")


In [27]:
%%time
model_logreg=LogisticRegression(random_state=21, fit_intercept=False)
stratified_cross_val(10, X_train, Y_train, model_logreg)


train - 0.62819  |  test - 0.59259
train - 0.64716  |  test - 0.62963
train - 0.63479  |  test - 0.57037
train - 0.65540  |  test - 0.61481
train - 0.63314  |  test - 0.57778
train - 0.64056  |  test - 0.59259
train - 0.64221  |  test - 0.65926
train - 0.65952  |  test - 0.56296
train - 0.64333  |  test - 0.59701
train - 0.63591  |  test - 0.62687
Avereage accuracy in cross validation is 0.60239
STD is 0.02852
CPU times: user 5.9 s, sys: 39.8 ms, total: 5.94 s
Wall time: 785 ms


### b. Optimizing regularization parameters

1. In the cells below try different values of penalty: `none`, `l1`, `l2` – you can change the values of solver too.

In [28]:
model_logreg = LogisticRegression(random_state=21, fit_intercept=False, penalty=None, max_iter=1000)
stratified_cross_val(10, X_train,Y_train, model_logreg)

train - 0.66612  |  test - 0.63704
train - 0.65622  |  test - 0.65926
train - 0.66694  |  test - 0.57778
train - 0.66529  |  test - 0.62963
train - 0.66777  |  test - 0.61481
train - 0.65870  |  test - 0.57778
train - 0.64963  |  test - 0.69630
train - 0.68508  |  test - 0.61481
train - 0.66392  |  test - 0.62687
train - 0.65733  |  test - 0.60448
Avereage accuracy in cross validation is 0.62388
STD is 0.03392


In [29]:
model_logreg=LogisticRegression(random_state=21, fit_intercept=False,solver='saga', penalty='l1', max_iter=1000)
stratified_cross_val(10, X_train, Y_train, model_logreg)

train - 0.63726  |  test - 0.58519
train - 0.64221  |  test - 0.61481
train - 0.62984  |  test - 0.55556
train - 0.64386  |  test - 0.60000
train - 0.63232  |  test - 0.57778
train - 0.63644  |  test - 0.57778
train - 0.63644  |  test - 0.65926
train - 0.65622  |  test - 0.57778
train - 0.64580  |  test - 0.58955
train - 0.63839  |  test - 0.62687
Avereage accuracy in cross validation is 0.59646
STD is 0.02848


In [32]:
model_logreg=LogisticRegression(random_state=21, fit_intercept=False, solver='newton-cg', penalty='l2', max_iter=1000)
stratified_cross_val(10, X_train, Y_train, model_logreg)

train - 0.62902  |  test - 0.59259
train - 0.64633  |  test - 0.62963
train - 0.63479  |  test - 0.56296
train - 0.65622  |  test - 0.61481
train - 0.63397  |  test - 0.57778
train - 0.64056  |  test - 0.59259
train - 0.64138  |  test - 0.65926
train - 0.65952  |  test - 0.55556
train - 0.64333  |  test - 0.59701
train - 0.63674  |  test - 0.62687
Avereage accuracy in cross validation is 0.60091
STD is 0.03047


## 3. SVM regularization

### a. Default regularization

1. Train a baseline model with the only parameters `probability=True`, `kernel='linear'`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [36]:
model_svc=SVC(probability=True, kernel='linear', random_state=21)
stratified_cross_val(10, X_train, Y_train,model_svc)

train - 0.70486  |  test - 0.65926
train - 0.69662  |  test - 0.75556
train - 0.69415  |  test - 0.62222
train - 0.70239  |  test - 0.65185
train - 0.69085  |  test - 0.65185
train - 0.68920  |  test - 0.64444
train - 0.69250  |  test - 0.72593
train - 0.70074  |  test - 0.62222
train - 0.69605  |  test - 0.61940
train - 0.71087  |  test - 0.63433
Avereage accuracy in cross validation is 0.65871
STD is 0.04359


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `C`.

In [39]:
model_svc=SVC(probability=True, kernel='linear', random_state=21, C=0.01)
stratified_cross_val(10, X_train, Y_train,model_svc)

train - 0.37923  |  test - 0.40000
train - 0.37923  |  test - 0.40000
train - 0.38417  |  test - 0.35556
train - 0.35449  |  test - 0.36296
train - 0.38252  |  test - 0.37037
train - 0.38087  |  test - 0.38519
train - 0.37923  |  test - 0.40000
train - 0.38252  |  test - 0.37037
train - 0.38468  |  test - 0.35075
train - 0.38386  |  test - 0.35821
Avereage accuracy in cross validation is 0.37534
STD is 0.01848


In [40]:
model_svc=SVC(probability=True, kernel='linear', random_state=21, C=0.1)
stratified_cross_val(10, X_train, Y_train,model_svc)

train - 0.58120  |  test - 0.55556
train - 0.57543  |  test - 0.56296
train - 0.57378  |  test - 0.57037
train - 0.59275  |  test - 0.57037
train - 0.58120  |  test - 0.54815
train - 0.57955  |  test - 0.54815
train - 0.57296  |  test - 0.61481
train - 0.59192  |  test - 0.54815
train - 0.59967  |  test - 0.52985
train - 0.57825  |  test - 0.57463
Avereage accuracy in cross validation is 0.56230
STD is 0.02177


In [41]:
model_svc=SVC(probability=True, kernel='linear', random_state=21, C=10)
stratified_cross_val(10, X_train, Y_train,model_svc)

train - 0.75021  |  test - 0.72593
train - 0.77741  |  test - 0.82963
train - 0.78566  |  test - 0.68148
train - 0.76834  |  test - 0.73333
train - 0.75185  |  test - 0.77778
train - 0.75598  |  test - 0.68889
train - 0.76257  |  test - 0.74074
train - 0.77411  |  test - 0.68889
train - 0.78254  |  test - 0.71642
train - 0.78418  |  test - 0.69403
Avereage accuracy in cross validation is 0.72771
STD is 0.04417


In [42]:
model_svc=SVC(probability=True, kernel='linear', random_state=21, C=100)
stratified_cross_val(10, X_train, Y_train,model_svc)

train - 0.78401  |  test - 0.74815
train - 0.79720  |  test - 0.84444
train - 0.80956  |  test - 0.72593
train - 0.79060  |  test - 0.76296
train - 0.79060  |  test - 0.77778
train - 0.79637  |  test - 0.74815
train - 0.78401  |  test - 0.77037
train - 0.80462  |  test - 0.73333
train - 0.79819  |  test - 0.70896
train - 0.79901  |  test - 0.73881
Avereage accuracy in cross validation is 0.75589
STD is 0.03550


## 4. Tree

### a. Default regularization

1. Train a baseline model with the only parameter `max_depth=10` and `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [45]:
model_tree=DecisionTreeClassifier(random_state=21, max_depth=10)
stratified_cross_val(10, X_train, Y_train, model_tree)

train - 0.81039  |  test - 0.74074
train - 0.77741  |  test - 0.74074
train - 0.83347  |  test - 0.70370
train - 0.79720  |  test - 0.76296
train - 0.82440  |  test - 0.75556
train - 0.80379  |  test - 0.68889
train - 0.80709  |  test - 0.76296
train - 0.80132  |  test - 0.65926
train - 0.80807  |  test - 0.75373
train - 0.80478  |  test - 0.68657
Avereage accuracy in cross validation is 0.72551
STD is 0.03562


### b. Optimizing regularization parameters

1. In the cells below try different values of the parameter `max_depth`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [46]:
model_tree=DecisionTreeClassifier(random_state=21, max_depth=15)
stratified_cross_val(10, X_train, Y_train, model_tree)

train - 0.95796  |  test - 0.82222
train - 0.93075  |  test - 0.83704
train - 0.95631  |  test - 0.83704
train - 0.95301  |  test - 0.86667
train - 0.95136  |  test - 0.88889
train - 0.94724  |  test - 0.82222
train - 0.95466  |  test - 0.90370
train - 0.94971  |  test - 0.87407
train - 0.95305  |  test - 0.83582
train - 0.94316  |  test - 0.85821
Avereage accuracy in cross validation is 0.85459
STD is 0.02682


In [47]:
model_tree=DecisionTreeClassifier(random_state=21, max_depth=4)
stratified_cross_val(10, X_train, Y_train, model_tree)

train - 0.53998  |  test - 0.48148
train - 0.52679  |  test - 0.52593
train - 0.54493  |  test - 0.47407
train - 0.54163  |  test - 0.52593
train - 0.54493  |  test - 0.57037
train - 0.52679  |  test - 0.51111
train - 0.52844  |  test - 0.50370
train - 0.53669  |  test - 0.48889
train - 0.53624  |  test - 0.54478
train - 0.54613  |  test - 0.46269
Avereage accuracy in cross validation is 0.50889
STD is 0.03190


In [49]:
model_tree=DecisionTreeClassifier(random_state=21, max_depth=25, min_samples_split=10, min_samples_leaf=5)
stratified_cross_val(10, X_train, Y_train, model_tree)

train - 0.86727  |  test - 0.78519
train - 0.86645  |  test - 0.74074
train - 0.87716  |  test - 0.72593
train - 0.86727  |  test - 0.82963
train - 0.87552  |  test - 0.80741
train - 0.86150  |  test - 0.80000
train - 0.86810  |  test - 0.82222
train - 0.86892  |  test - 0.74074
train - 0.86079  |  test - 0.72388
train - 0.86738  |  test - 0.76866
Avereage accuracy in cross validation is 0.77444
STD is 0.03795


## 5. Random forest

### a. Default regularization

1. Train a baseline model with the only parameters `n_estimators=50`, `max_depth=14`, `random_state=21`.
2. Use stratified K-fold cross-validation with `10` splits to evaluate the accuracy of the model.
3. The format of the result of the code where you trained and evaluated the baseline model should be similar to what you have got for the logreg.

In [52]:
model_rand=RandomForestClassifier(n_estimators=50, max_depth=14, random_state=21)
stratified_cross_val(10, X_train, Y_train, model_rand)

train - 0.96455  |  test - 0.88148
train - 0.96208  |  test - 0.91852
train - 0.96785  |  test - 0.86667
train - 0.96455  |  test - 0.89630
train - 0.96538  |  test - 0.91111
train - 0.96538  |  test - 0.88148
train - 0.97115  |  test - 0.91852
train - 0.96867  |  test - 0.85185
train - 0.97364  |  test - 0.88060
train - 0.97941  |  test - 0.86567
Avereage accuracy in cross validation is 0.88722
STD is 0.02204


### b. Optimizing regularization parameters

1. In the new cells try different values of the parameters `max_depth` and `n_estimators`.
2. As a bonus, play with other regularization parameters trying to find the best combination.

In [53]:
model_rand=RandomForestClassifier(n_estimators=100, max_depth=10, random_state=21)
stratified_cross_val(10, X_train, Y_train, model_rand)

train - 0.86562  |  test - 0.77037
train - 0.88293  |  test - 0.85185
train - 0.89118  |  test - 0.80000
train - 0.90107  |  test - 0.82963
train - 0.87634  |  test - 0.84444
train - 0.87716  |  test - 0.77037
train - 0.87057  |  test - 0.81481
train - 0.87799  |  test - 0.76296
train - 0.88056  |  test - 0.79104
train - 0.88056  |  test - 0.78358
Avereage accuracy in cross validation is 0.80191
STD is 0.03034


In [55]:
model_rand=RandomForestClassifier(n_estimators=200, max_depth=25, random_state=21)
stratified_cross_val(10, X_train, Y_train, model_rand)

train - 1.00000  |  test - 0.90370
train - 1.00000  |  test - 0.95556
train - 1.00000  |  test - 0.89630
train - 1.00000  |  test - 0.94074
train - 1.00000  |  test - 0.91111
train - 1.00000  |  test - 0.88889
train - 1.00000  |  test - 0.92593
train - 1.00000  |  test - 0.91111
train - 1.00000  |  test - 0.92537
train - 1.00000  |  test - 0.91045
Avereage accuracy in cross validation is 0.91692
STD is 0.01926


In [57]:
model_rand=RandomForestClassifier(n_estimators=500, max_depth=10, random_state=21)
stratified_cross_val(10, X_train, Y_train, model_rand)

train - 0.87552  |  test - 0.78519
train - 0.89613  |  test - 0.86667
train - 0.90190  |  test - 0.82963
train - 0.89942  |  test - 0.81481
train - 0.88129  |  test - 0.84444
train - 0.88376  |  test - 0.77778
train - 0.88129  |  test - 0.85926
train - 0.88541  |  test - 0.75556
train - 0.88468  |  test - 0.81343
train - 0.88056  |  test - 0.80597
Avereage accuracy in cross validation is 0.81527
STD is 0.03406


In [59]:
model_rand=RandomForestClassifier(n_estimators=200, max_depth=25, random_state=21, max_features=5)
stratified_cross_val(10, X_train, Y_train, model_rand)

train - 0.99918  |  test - 0.89630
train - 0.99918  |  test - 0.95556
train - 0.99918  |  test - 0.89630
train - 1.00000  |  test - 0.92593
train - 0.99918  |  test - 0.91852
train - 1.00000  |  test - 0.88889
train - 1.00000  |  test - 0.92593
train - 1.00000  |  test - 0.90370
train - 1.00000  |  test - 0.91791
train - 1.00000  |  test - 0.89552
Avereage accuracy in cross validation is 0.91245
STD is 0.01931


In [60]:
model_rand=RandomForestClassifier(n_estimators=200, max_depth=25, random_state=21, max_features=5, bootstrap=False)
stratified_cross_val(10, X_train, Y_train, model_rand)

train - 1.00000  |  test - 0.91111
train - 1.00000  |  test - 0.94815
train - 1.00000  |  test - 0.89630
train - 1.00000  |  test - 0.92593
train - 1.00000  |  test - 0.92593
train - 1.00000  |  test - 0.89630
train - 1.00000  |  test - 0.92593
train - 1.00000  |  test - 0.90370
train - 1.00000  |  test - 0.93284
train - 1.00000  |  test - 0.91791
Avereage accuracy in cross validation is 0.91841
STD is 0.01584


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.
3. Analyze: for which weekday your model makes the most errors (in % of the total number of samples of that class in your test dataset).
4. Save the model.

In [61]:
model_rand=RandomForestClassifier(n_estimators=200, max_depth=15, random_state=21)
model_rand.fit(X_train, Y_train)
predict_forest_test=model_rand.predict(X_test)

In [62]:
accuracy_score(Y_test, predict_forest_test)

0.8994082840236687

In [63]:
conf_matrix=confusion_matrix(Y_test, predict_forest_test)
print('Errors per weekday')
for i in range(7):
    total=conf_matrix[i].sum()
    correct=conf_matrix[i][i]
    res_errors=total-correct
    eror_per_class=res_errors/total*100
    print(f'{i}: {eror_per_class:.2f}%')


Errors per weekday
0: 25.93%
1: 14.55%
2: 6.67%
3: 2.50%
4: 14.29%
5: 9.26%
6: 9.86%


In [64]:

joblib.dump(model_rand, '../data/random_forest_regul_model.pkl')


['../data/random_forest_regul_model.pkl']