# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [1]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

from tqdm.notebook import tqdm

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore). Don't forget to enrich the table with the 'dayofweek' column from the previous day's .csv-file.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')

df_day = pd.read_csv('../data/dayofweek.csv')
df['dayofweek'] = df_day['dayofweek']

X = df.drop('dayofweek', axis=1)
y = df['dayofweek']

In [3]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=21, stratify=y
)
df.columns

Index(['numTrials', 'hour', 'uid_user_0', 'uid_user_1', 'uid_user_10',
       'uid_user_11', 'uid_user_12', 'uid_user_13', 'uid_user_14',
       'uid_user_15', 'uid_user_16', 'uid_user_17', 'uid_user_18',
       'uid_user_19', 'uid_user_2', 'uid_user_20', 'uid_user_21',
       'uid_user_22', 'uid_user_23', 'uid_user_24', 'uid_user_25',
       'uid_user_26', 'uid_user_27', 'uid_user_28', 'uid_user_29',
       'uid_user_3', 'uid_user_30', 'uid_user_31', 'uid_user_4', 'uid_user_6',
       'uid_user_7', 'uid_user_8', 'labname_code_rvw', 'labname_lab02',
       'labname_lab03', 'labname_lab03s', 'labname_lab05s', 'labname_laba04',
       'labname_laba04s', 'labname_laba05', 'labname_laba06',
       'labname_laba06s', 'labname_project1', 'dayofweek'],
      dtype='object')

## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [4]:
param_grid = {
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'C': [0.01, 0.1, 1, 1.5, 5, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': ['balanced', None],
    'probability': [True],
    'random_state': [21],
}

svc = SVC()
grid = GridSearchCV(svc, param_grid, cv=5, n_jobs=-1, verbose=1)
grid.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  26 tasks      | elapsed:    4.5s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:   41.6s
[Parallel(n_jobs=-1)]: Done 360 out of 360 | elapsed:  4.0min finished


GridSearchCV(cv=5, estimator=SVC(), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 1.5, 5, 10],
                         'class_weight': ['balanced', None],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'rbf', 'sigmoid'],
                         'probability': [True], 'random_state': [21]},
             verbose=1)

In [5]:
results = pd.DataFrame(grid.cv_results_)
results_sorted = results.sort_values('rank_test_score')
results_sorted[['params', 'mean_test_score', 'rank_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score
70,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.876109,1
64,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.863500,2
58,"{'C': 5, 'class_weight': None, 'gamma': 'auto'...",0.816018,3
52,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.808608,4
63,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.721052,5
...,...,...,...
53,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.129792,68
65,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.115693,69
41,"{'C': 1.5, 'class_weight': 'balanced', 'gamma'...",0.079380,70
17,"{'C': 0.1, 'class_weight': 'balanced', 'gamma'...",0.062310,71


In [6]:
best_score = results_sorted['mean_test_score'].iloc[0]
best_score

0.8761090458488228

In [7]:
tenth_score = results_sorted['mean_test_score'].iloc[9]
tenth_score

0.7062343384276469

In [8]:
best_score - tenth_score

0.16987470742117583

## Есть небольшая разница между лучшей и десятой комбинацией (0.16987).
## Это значит, что разные параметры дерева решений дают разную точность на данных.
## Модель плохо устойчива к выбору гиперпараметров.

## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [9]:
param_grid_tree = {
    'max_depth': list(range(1, 49)),
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini'],
    'random_state': [21]
}

tree = DecisionTreeClassifier()
grid_tree = GridSearchCV(tree, param_grid_tree, cv=5, n_jobs=-1, verbose=1)
grid_tree.fit(X_train, y_train)

Fitting 5 folds for each of 192 candidates, totalling 960 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  28 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 960 out of 960 | elapsed:    1.0s finished


GridSearchCV(cv=5, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['entropy', 'gini'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29, 30, ...],
                         'random_state': [21]},
             verbose=1)

In [10]:
results_tree = pd.DataFrame(grid_tree.cv_results_)
results_tree_sorted = results_tree.sort_values('rank_test_score')
results_tree_sorted[['params', 'mean_test_score', 'rank_test_score']].head(10)

Unnamed: 0,params,mean_test_score,rank_test_score
68,"{'class_weight': 'balanced', 'criterion': 'gin...",0.873865,1
72,"{'class_weight': 'balanced', 'criterion': 'gin...",0.873854,2
69,"{'class_weight': 'balanced', 'criterion': 'gin...",0.872378,3
95,"{'class_weight': 'balanced', 'criterion': 'gin...",0.872372,4
93,"{'class_weight': 'balanced', 'criterion': 'gin...",0.872372,4
70,"{'class_weight': 'balanced', 'criterion': 'gin...",0.872372,4
74,"{'class_weight': 'balanced', 'criterion': 'gin...",0.872372,4
75,"{'class_weight': 'balanced', 'criterion': 'gin...",0.872372,4
76,"{'class_weight': 'balanced', 'criterion': 'gin...",0.872372,4
77,"{'class_weight': 'balanced', 'criterion': 'gin...",0.872372,4


In [11]:
best_score = results_tree_sorted['mean_test_score'].iloc[0]
tenth_score = results_tree_sorted['mean_test_score'].iloc[9]
best_score - tenth_score

0.0014924962136859676

## Разница между лучшей и десятой комбинацией составляет всего 0.0015 (0.15%).
## Это очень маленькая разница, значит, модель Decision tree устойчива к выбору гиперпараметров — можно выбрать любую из топовых комбинаций.

## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [12]:
param_grid_rf = {
    'n_estimators': [5, 10, 50, 100],
    'max_depth': list(range(1, 50)),
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini'],
    'random_state': [21]
}

rf = RandomForestClassifier()
grid_rf = GridSearchCV(rf, param_grid_rf, cv=5, n_jobs=-1, verbose=1)
grid_rf.fit(X_train, y_train)

Fitting 5 folds for each of 784 candidates, totalling 3920 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:    0.2s
[Parallel(n_jobs=-1)]: Done 584 tasks      | elapsed:    6.2s
[Parallel(n_jobs=-1)]: Done 1584 tasks      | elapsed:   16.4s
[Parallel(n_jobs=-1)]: Done 2984 tasks      | elapsed:   29.9s
[Parallel(n_jobs=-1)]: Done 3920 out of 3920 | elapsed:   38.6s finished


GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['entropy', 'gini'],
                         'max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29, 30, ...],
                         'n_estimators': [5, 10, 50, 100],
                         'random_state': [21]},
             verbose=1)

In [13]:
results_rf = pd.DataFrame(grid_rf.cv_results_)
results_rf_sorted = results_rf.sort_values('rank_test_score')
results_rf_sorted[['params', 'mean_test_score', 'rank_test_score']].head(10)

Unnamed: 0,params,mean_test_score,rank_test_score
95,"{'class_weight': 'balanced', 'criterion': 'ent...",0.904293,1
115,"{'class_weight': 'balanced', 'criterion': 'ent...",0.90429,2
698,"{'class_weight': None, 'criterion': 'gini', 'm...",0.90429,2
314,"{'class_weight': 'balanced', 'criterion': 'gin...",0.903549,4
711,"{'class_weight': None, 'criterion': 'gini', 'm...",0.903547,5
99,"{'class_weight': 'balanced', 'criterion': 'ent...",0.902809,6
326,"{'class_weight': 'balanced', 'criterion': 'gin...",0.902809,7
767,"{'class_weight': None, 'criterion': 'gini', 'm...",0.902806,8
779,"{'class_weight': None, 'criterion': 'gini', 'm...",0.902806,8
775,"{'class_weight': None, 'criterion': 'gini', 'm...",0.902806,8


In [14]:
best_rf_score = results_rf_sorted['mean_test_score'].iloc[0]
tenth_rf_score = results_rf_sorted['mean_test_score'].iloc[9]
best_rf_score - tenth_rf_score

0.0014869888475838033

## Разница между лучшей и десятой комбинацией составляет всего 0.0015 (0.15%).
## Это очень маленькая разница, значит, модель Random forest тоже устойчива к выбору гиперпараметров — можно выбрать любую из топовых комбинаций.

## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [15]:
param_combinations = [
    (n, d, cw, cr)
    for n in [5, 10, 50, 100]
    for d in range(1, 50)
    for cw in ['balanced', None]
    for cr in ['entropy', 'gini']
]

In [16]:
results_manual = []
for n, d, cw, cr in tqdm(param_combinations, desc="Manual GridSearch"):
    rf = RandomForestClassifier(
        n_estimators=n,
        max_depth=d,
        class_weight=cw,
        criterion=cr,
        random_state=21
    )
    scores = cross_val_score(rf, X_train, y_train, cv=5, n_jobs=-1)
    results_manual.append({
        'n_estimators': n,
        'max_depth': d,
        'class_weight': cw,
        'criterion': cr,
        'mean_accuracy': scores.mean(),
        'std_accuracy': scores.std()
    })

HBox(children=(FloatProgress(value=0.0, description='Manual GridSearch', max=784.0, style=ProgressStyle(descri…




In [17]:
df_manual = pd.DataFrame(results_manual)
df_manual_sorted = df_manual.sort_values('mean_accuracy', ascending=False)
df_manual_sorted.head(10)

Unnamed: 0,n_estimators,max_depth,class_weight,criterion,mean_accuracy,std_accuracy
680,100,24,balanced,entropy,0.904293,0.012361
503,50,28,,gini,0.90429,0.010961
700,100,29,balanced,entropy,0.90429,0.012156
509,50,30,balanced,gini,0.903549,0.012056
711,100,31,,gini,0.903547,0.01438
684,100,25,balanced,entropy,0.902809,0.013639
521,50,33,balanced,gini,0.902809,0.013628
783,100,49,,gini,0.902806,0.01046
507,50,29,,gini,0.902806,0.011698
731,100,36,,gini,0.902806,0.01046


In [18]:
best_score = df_manual_sorted['mean_accuracy'].iloc[0]
best_score

0.9042929918766351

In [19]:
t_score = df_manual_sorted['mean_accuracy'].iloc[9]
t_score

0.9028060030290513

In [20]:
best_score - tenth_score

0.03192069392812913

## Разница между лучшей и десятой комбинацией составляет всего 0.0015 (0.15%).
## Это очень маленькая разница, значит, модель Random forest тоже устойчива к выбору гиперпараметров — можно выбрать любую из топовых комбинаций.

## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [21]:
best_params = df_manual_sorted.iloc[0]

In [22]:
best_rf = RandomForestClassifier(
    n_estimators=best_params['n_estimators'],
    max_depth=best_params['max_depth'],
    class_weight=best_params['class_weight'],
    criterion=best_params['criterion'],
    random_state=21
)
best_rf.fit(X_train, y_train)

RandomForestClassifier(class_weight='balanced', criterion='entropy',
                       max_depth=24, random_state=21)

In [23]:
y_pred = best_rf.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)
test_accuracy

0.9260355029585798