# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [132]:
import pandas as pd
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from joblib import dump, load
from tqdm.notebook import trange, tqdm
import numpy as np

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1LjhbMTPnZsZpa1Uj75bCOaFpcfORXF3m/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore).
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [4]:
df = pd.read_csv("../data/dayofweek-not-scaled.csv")
X = df.drop(['dayofweek'], axis=1).values
y = df[['dayofweek']].values[:, 0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)
df.head()

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [42]:
parameters = {'kernel':('linear', 'rbf', 'sigmoid'),
              'C':(0.01, 0.1, 1, 1.5, 5, 10),
              'gamma': ['scale', 'auto'],
              'class_weight': ['balanced', None]}
svc = SVC(C=0.01, random_state=21, probability=True)
# svc.fit(X_train, y_train)
best_svc = GridSearchCV(svc, parameters, scoring='accuracy', n_jobs=4)
arr = best_svc.fit(X_train, y_train)

In [108]:
results =  pd.DataFrame(best_svc.cv_results_)
results = results[['param_C', 'param_class_weight', "param_gamma", "param_kernel", "rank_test_score", "mean_test_score"]]
results.sort_values(by=['rank_test_score'], inplace=True)
results

Unnamed: 0,param_C,param_class_weight,param_gamma,param_kernel,rank_test_score,mean_test_score
70,10,,auto,rbf,1,0.876109
64,10,balanced,auto,rbf,2,0.863500
58,5,,auto,rbf,3,0.816018
52,5,balanced,auto,rbf,4,0.808608
63,10,balanced,auto,linear,5,0.721052
...,...,...,...,...,...,...
53,5,balanced,auto,sigmoid,68,0.129792
65,10,balanced,auto,sigmoid,69,0.115693
41,1.5,balanced,auto,sigmoid,70,0.079380
17,0.1,balanced,auto,sigmoid,71,0.062310


## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [117]:
params = {'max_depth': tuple(range(1, 50)),
              'criterion': ('entropy', 'gini'),
              'class_weight': ['balanced', None]}
dtr = GridSearchCV(DecisionTreeClassifier(random_state=21), params,  scoring='accuracy', n_jobs=8)
dtr.fit(X_train, y_train)


GridSearchCV(estimator=DecisionTreeClassifier(random_state=21), n_jobs=8,
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ('entropy', 'gini'),
                         'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29, 30, ...)},
             scoring='accuracy')

In [119]:
results_dtr =  pd.DataFrame(dtr.cv_results_)
results_dtr = results_dtr[['param_max_depth', 'param_class_weight', "param_criterion",  "rank_test_score", "mean_test_score"]]
results_dtr.sort_values(by=['rank_test_score'], inplace=True)
results_dtr

Unnamed: 0,param_max_depth,param_class_weight,param_criterion,rank_test_score,mean_test_score
69,21,balanced,gini,1,0.873865
73,25,balanced,gini,2,0.873854
70,22,balanced,gini,3,0.872378
97,49,balanced,gini,4,0.872372
71,23,balanced,gini,4,0.872372
...,...,...,...,...,...
51,3,balanced,gini,192,0.373906
147,1,,gini,193,0.355330
98,1,,entropy,193,0.355330
49,1,balanced,gini,195,0.286358


## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [121]:
params = {'max_depth': tuple(range(1, 50)),
              'n_estimators': (5, 10, 50, 100),
              'criterion': ('entropy', 'gini'),
              'class_weight': ['balanced', None]}
rfc = GridSearchCV(RandomForestClassifier(random_state=21), params,  scoring='accuracy', n_jobs=8)
rfc.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(random_state=21), n_jobs=8,
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ('entropy', 'gini'),
                         'max_depth': (1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
                                       13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
                                       23, 24, 25, 26, 27, 28, 29, 30, ...),
                         'n_estimators': (5, 10, 50, 100)},
             scoring='accuracy')

In [123]:
results_rfc =  pd.DataFrame(rfc.cv_results_)
results_rfc = results_rfc[['param_max_depth', 'param_class_weight', "param_criterion",  "rank_test_score", "param_n_estimators", "mean_test_score"]]
results_rfc.sort_values(by=['rank_test_score'], inplace=True)
results_rfc

Unnamed: 0,param_max_depth,param_class_weight,param_criterion,rank_test_score,param_n_estimators,mean_test_score
95,24,balanced,entropy,1,100,0.904293
115,29,balanced,entropy,2,100,0.904290
698,28,,gini,2,50,0.904290
314,30,balanced,gini,4,50,0.903549
711,31,,gini,5,100,0.903547
...,...,...,...,...,...,...
392,1,,entropy,780,5,0.353832
4,2,balanced,entropy,781,5,0.353110
200,2,balanced,gini,782,5,0.346419
196,1,balanced,gini,783,5,0.283390


## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [148]:
max_depth_range = tuple(range(1, 50))
n_estimators_range =  (5, 10, 50, 100)
criterion_range =  ('entropy', 'gini')
class_weight_range = ['balanced', None]
result_df = pd.DataFrame(columns=['max_depth',
                                  'n_estimators',
                                  'criterion', 
                                  'class_weight', 
                                  'mean_accuracy', 'std_accuracy'])
i = 0
with tqdm(total=(len(max_depth_range) * len(n_estimators_range) * len(criterion_range) * len(class_weight_range))) as pbar:
    for max_depth in max_depth_range:
        for n_estimators in n_estimators_range:
            for criterion in criterion_range:
                for class_weight in class_weight_range:
                    rfc = RandomForestClassifier(random_state=21,
                                                 max_depth=max_depth,
                                                 n_estimators=n_estimators,
                                                 criterion=criterion,
                                                 class_weight=class_weight,
                                                 n_jobs=8)
                    scores = cross_val_score(rfc, X_test, y_test, cv=5)
                    result_df.loc[i] = [max_depth,
                                        n_estimators, criterion,
                                        class_weight, scores.mean(), scores.std()]
                    pbar.update(1)
                    i += 1
                
                        

  0%|          | 0/784 [00:00<?, ?it/s]

In [153]:
result_df.sort_values(by=['mean_accuracy'], ascending=False, inplace=True)
result_df

Unnamed: 0,max_depth,n_estimators,criterion,class_weight,mean_accuracy,std_accuracy
348,22,100,entropy,balanced,0.769315,0.029733
332,21,100,entropy,balanced,0.769315,0.031070
380,24,100,entropy,balanced,0.766330,0.035038
319,20,100,gini,,0.763389,0.031817
476,30,100,entropy,balanced,0.763345,0.032042
...,...,...,...,...,...,...
16,2,5,entropy,balanced,0.277700,0.101690
4,1,10,entropy,balanced,0.266198,0.055034
6,1,10,gini,balanced,0.254170,0.079035
0,1,5,entropy,balanced,0.212906,0.039580


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [162]:
final_model = RandomForestClassifier(random_state=21,
                                    max_depth=24,
                                    n_estimators=100,
                                    criterion='entropy',
                                    class_weight="balanced",
                                    n_jobs=8)
final_model.fit(X_train, y_train)
predict = final_model.predict(X_test)
print("final accuracy is:", accuracy_score(predict, y_test))

final accuracy is: 0.9260355029585798
