# Day 09. Exercise 04
# Pipelines and OOP

## 0. Imports

In [195]:
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
import numpy as np
import joblib
from tqdm.notebook import tqdm
from sklearn.tree import DecisionTreeClassifier

## 1. Preprocessing pipeline

Create three custom transformers, the first two of which will be used within a [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html).

1. `FeatureExtractor()` class:
 - Takes a dataframe with `uid`, `labname`, `numTrials`, `timestamp` from the file [`checker_submits.csv`](https://drive.google.com/file/d/14voc4fNJZiLEFaZyd8nEG-lQt5JjatYw/view?usp=sharing).
 - Extracts `hour` from `timestamp`.
 - Extracts the weekday from `timestamp` as a number and stores it in the `dayofweek` column.
 - Drops the `timestamp` column.
 - Returns the new dataframe.


2. `MyOneHotEncoder()` class:
 - Takes the dataframe from the result of the previous transformation and the name of the target column.
 - Identifies all the categorical features and transforms them with `OneHotEncoder()`. If the target column is categorical too, then the transformation should not apply to it.
 - Drops the initial categorical features.
 - Returns the dataframe with the features and the series with the target column.


3. `TrainValidationTest()` class:
 - Takes `X` and `y`.
 - Returns `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` (`test_size=0.2`, `random_state=21`, `stratified`).


In [196]:
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Stateless transformer that replaces `timestamp` with `hour` and `dayofweek`.

    The input frame must contain a `timestamp` column that `pd.to_datetime`
    can parse. The input itself is never modified.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; returns the transformer itself."""
        return self

    def transform(self, X):
        """Return a new frame without `timestamp` and with the derived columns.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame holding a `timestamp` column.

        Returns
        -------
        pandas.DataFrame
            The remaining columns of `X` plus `hour` and `dayofweek`
            (both taken from the `.dt` accessor of the parsed timestamp).
        """
        # Parse once and reuse the result for both derived features.
        stamps = pd.to_datetime(X['timestamp'])
        return (
            X.drop(columns=['timestamp'])
             .assign(hour=stamps.dt.hour, dayofweek=stamps.dt.dayofweek)
        )

In [197]:
class MyOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode the object-dtype feature columns and split off the target.

    Parameters
    ----------
    target_col : str
        Name of the target column. It is never encoded, even when it has
        object dtype, and it is returned separately by `transform`.

    Notes
    -----
    `transform` returns a `(features, target)` tuple rather than a single
    array-like, so inside a `Pipeline` this transformer has to be the last step.
    """

    def __init__(self, target_col):
        self.target_col = target_col
        # Populated by fit(): the fitted OneHotEncoder (None when there is
        # nothing to encode) and the list of encoded column names.
        self.encoder = None
        self.cat_cols = None

    @staticmethod
    def _make_encoder():
        """Build a dense-output OneHotEncoder on both old and new scikit-learn."""
        try:
            # `sparse` was renamed to `sparse_output` and later removed.
            return OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        except TypeError:
            # Older releases only know the `sparse` keyword.
            return OneHotEncoder(sparse=False, handle_unknown='ignore')

    def fit(self, X, y=None):
        """Find the object-dtype columns (target excluded) and fit the encoder.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing the features and the target column.
        y : ignored
            Present for scikit-learn API compatibility.

        Returns
        -------
        MyOneHotEncoder
            The fitted transformer.
        """
        self.cat_cols = [
            col for col in X.select_dtypes(include='object').columns
            if col != self.target_col
        ]
        # Fitting an encoder on zero columns raises, so skip it when there is
        # nothing to encode; transform() applies the same guard.
        self.encoder = None
        if self.cat_cols:
            self.encoder = self._make_encoder().fit(X[self.cat_cols])
        return self

    def transform(self, X):
        """Split off the target and replace categorical columns by indicators.

        Parameters
        ----------
        X : pandas.DataFrame
            Frame containing `target_col` and the columns seen during `fit`.

        Returns
        -------
        tuple of (pandas.DataFrame, pandas.Series)
            The feature frame (original non-categorical columns followed by
            the one-hot columns) and the target series.

        Raises
        ------
        KeyError
            If `target_col` is missing from `X`.
        """
        y = X[self.target_col].copy()
        X_ = X.drop(columns=[self.target_col])
        if self.cat_cols:
            encoded = self.encoder.transform(X_[self.cat_cols])
            # `get_feature_names` was replaced by `get_feature_names_out`;
            # use whichever the installed scikit-learn provides.
            names_fn = (
                getattr(self.encoder, 'get_feature_names_out', None)
                or self.encoder.get_feature_names
            )
            encoded_df = pd.DataFrame(
                encoded, columns=names_fn(self.cat_cols), index=X_.index
            )
            X_ = pd.concat([X_.drop(columns=self.cat_cols), encoded_df], axis=1)
        return X_, y

In [198]:
class TrainValidationTest:
    """Stratified three-way splitter producing train, validation and test sets.

    Parameters
    ----------
    test_size : float, default 0.2
        Share of the full data held out as the test set.
    valid_size : float, default 0.25
        Share of the remaining (non-test) data held out as the validation set.
    random_state : int, default 21
        Seed passed to both `train_test_split` calls.
    """

    def __init__(self, test_size=0.2, valid_size=0.25, random_state=21):
        self.test_size = test_size
        self.valid_size = valid_size
        self.random_state = random_state

    def split(self, X, y):
        """Split `X` and `y` in two stratified stages.

        Returns
        -------
        tuple
            `X_train, X_valid, X_test, y_train, y_valid, y_test`.
        """
        seed = self.random_state

        # Stage 1: carve the test set out of the full data.
        X_rest, X_test, y_rest, y_test = train_test_split(
            X, y, test_size=self.test_size, random_state=seed, stratify=y
        )
        # Stage 2: carve the validation set out of what is left.
        X_train, X_valid, y_train, y_valid = train_test_split(
            X_rest, y_rest, test_size=self.valid_size, random_state=seed, stratify=y_rest
        )
        return X_train, X_valid, X_test, y_train, y_valid, y_test

## 2. Model selection pipeline

`ModelSelection()` class

 - Takes a list of `GridSearchCV` instances and a dict where the keys are the indexes from that list and the values are the names of the models, the example is below in the reverse order (from high-level to low-level perspective):

```
ModelSelection(grids, grid_dict)

grids = [gs_svm, gs_tree, gs_rf]

gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=jobs), where jobs you can specify by yourself

svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 'C':[0.01, 0.1, 1, 1.5, 5, 10], 'gamma': ['scale', 'auto'], 'class_weight':('balanced', None), 'random_state':[21], 'probability':[True]}]
```

 - Method `choose()` takes `X_train`, `y_train`, `X_valid`, `y_valid` and returns the name of the best classifier among all the models on the validation set
 - Method `best_results()` returns a dataframe with the columns `model`, `params`, `valid_score` where the rows are the best models within each class of models.

```
model	params	valid_score
0	SVM	{'C': 10, 'class_weight': None, 'gamma': 'auto...	0.877778
1	Decision Tree	{'class_weight': 'balanced', 'criterion': 'gin...	0.866667
2	Random Forest	{'class_weight': None, 'criterion': 'entropy',...	0.907407
```

 - When you iterate through the parameters of a model class, print the name of that class and show the progress using `tqdm.notebook`, in the end of the cycle print the best model of that class.

```
Estimator: SVM
100%
125/125 [01:32<00:00, 1.36it/s]
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.773
Validation set accuracy score for best params: 0.878 

Estimator: Decision Tree
100%
57/57 [01:07<00:00, 1.22it/s]
Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
Best training accuracy: 0.801
Validation set accuracy score for best params: 0.867 

Estimator: Random Forest
100%
284/284 [06:47<00:00, 1.13s/it]
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 22, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.855
Validation set accuracy score for best params: 0.907 

Classifier with best validation set accuracy: Random Forest
```

In [199]:
class ModelSelection:
    """Fit several grid searches and pick the model that scores best on validation data.

    Parameters
    ----------
    grids : list
        Grid-search objects exposing `param_grid`, `cv`, `fit`, `score`,
        `best_params_`, `best_score_` and `best_estimator_`.
    grid_dict : dict
        Maps the position of each grid in `grids` to a display name.
    """

    def __init__(self, grids, grid_dict):
        self.grids = grids
        self.grid_dict = grid_dict
        self.results = []
        self.best_estimator = None

    @staticmethod
    def _total_fits(grid, X=None, y=None):
        """Estimate how many fits a grid search performs (candidates x folds).

        Accepts `param_grid` as a single dict or a list of dicts. The fold
        count is taken from `grid.cv`; the value is only used as the total of
        the progress bar.
        """
        param_grid = grid.param_grid
        sub_grids = [param_grid] if isinstance(param_grid, dict) else list(param_grid)
        n_candidates = sum(
            int(np.prod([len(values) for values in sub.values()], dtype=int))
            for sub in sub_grids
        )

        cv = grid.cv
        if cv is None:
            n_folds = 5  # assumes scikit-learn's default of 5 folds (>= 0.22)
        elif isinstance(cv, (int, np.integer)):
            n_folds = int(cv)
        elif hasattr(cv, 'get_n_splits'):
            n_folds = cv.get_n_splits(X, y)
        elif hasattr(cv, '__len__'):
            n_folds = len(cv)
        else:
            # One-shot iterable of splits: cannot be counted without consuming it.
            n_folds = 1
        return n_candidates * n_folds

    def choose(self, X_train, y_train, X_valid, y_valid):
        """Fit every grid on the training data and compare them on the validation data.

        Prints the best parameters and scores per model. Results of a previous
        call are discarded, so `best_results()` always reflects the latest run.

        Returns
        -------
        estimator or None
            The `best_estimator_` of the grid with the highest validation
            score, or None when `grids` is empty.
        """
        best_model_name = None
        # -inf so that a model is still selected when every score is <= 0.
        best_score = -np.inf
        self.best_estimator = None
        # Start from a clean slate; otherwise repeated calls duplicate rows.
        self.results = []

        for i, grid in enumerate(self.grids):
            model_name = self.grid_dict[i]
            print(f"Estimator {model_name}")
            total_fits = self._total_fits(grid, X_train, y_train)

            with tqdm(total=total_fits, unit="it") as pbar:
                grid.fit(X_train, y_train)

                valid_score = grid.score(X_valid, y_valid)

                # Save the results
                result = {
                    'model': model_name,
                    'best_params': grid.best_params_,
                    'train_score': grid.best_score_,
                    'valid_score': valid_score,
                    'best_estimator': grid.best_estimator_
                }
                self.results.append(result)

                if valid_score > best_score:
                    best_score = valid_score
                    best_model_name = model_name
                    self.best_estimator = grid.best_estimator_

                # The search reports no per-fit progress here, so the bar is
                # simply completed once fitting has finished.
                pbar.update(total_fits)
                print(f"Best params: {grid.best_params_}")
                print(f"Best training accuracy: {grid.best_score_:.3f}")
                print(f"Validation set accuracy for best params: {valid_score:.3f}")

        print(f"\nClassifier with best validation set accuracy: {best_model_name}")
        return self.best_estimator

    def best_results(self):
        """Return one row per model with its scores and stringified best parameters.

        Returns
        -------
        pandas.DataFrame
            Columns `model`, `train_score`, `valid_score`, `best_params`;
            empty (but with those columns) before `choose()` has been run.
        """
        columns = ['model', 'train_score', 'valid_score', 'best_params']
        rows = [{
            'model': r['model'],
            'train_score': r['train_score'],
            'valid_score': r['valid_score'],
            'best_params': str(r['best_params'])
        } for r in self.results]
        return pd.DataFrame(rows, columns=columns)

In [200]:
# class ModelSelection:
#     def __init__(self, grids, grid_dict):
#         self.grids = grids
#         self.grid_dict = grid_dict
#         self.results = []
#     def choose(self, X_train, y_train, X_valid, y_valid):
#         best_score = -np.inf
#         best_model = None
#         best_name = None
#         for idx, grid in enumerate(self.grids):
#             name = self.grid_dict[idx]
#             print(f"Estimator: {name}")
#             
#          
#             
#             
#             with tqdm(total=len(grid.param_grid), unit="it") as pbar:
#                 grid.fit(X_train, y_train)
#                 pbar.update(len(grid.param_grid), unit="it")
#                 grid.fit(X_train, y_train)
#             train_score = grid.best_score_
#             valid_score = grid.score(X_valid, y_valid)
#             print(f"Best params: {grid.best_params_}")
#             print(f"Best training accuracy: {train_score:.3f}")
#             print(f"Validation set accuracy score for best params: {valid_score:.3f}\n")
#             self.results.append({
#                 'model': name,
#                 'params': grid.best_params_,
#                 'valid_score': valid_score
#             })
#             if valid_score > best_score:
#                 best_score = valid_score
#                 best_model = grid.best_estimator_
#                 best_name = name
#         print(f"Classifier with best validation set accuracy: {best_name}")
#         self.best_model = best_model
#         return best_model
#     def best_results(self):
#         return pd.DataFrame(self.results)

## 3. Finalization

`Finalize()` class
 - Takes an estimator.
 - Method `final_score()` takes `X_train`, `y_train`, `X_test`, `y_test` and returns the accuracy of the model as in the example below:
```
final.final_score(X_train, y_train, X_test, y_test)
Accuracy of the final model is 0.908284023668639
```
 - Method `save_model()` takes a path, saves the model to this path and prints that the model was successfully saved.

In [201]:
class Finalize:
    """Final stage: refit the selected estimator, report its test score, persist it.

    Parameters
    ----------
    estimator : object
        Any object exposing `fit(X, y)` and `score(X, y)`.
    """

    def __init__(self, estimator):
        self.estimator = estimator

    def final_score(self, X_train, y_train, X_test, y_test):
        """Refit the estimator on the training data and score it on the test data.

        The estimator is fitted in place. The score is printed and returned
        exactly as produced by `estimator.score`.
        """
        model = self.estimator
        model.fit(X_train, y_train)
        accuracy = model.score(X_test, y_test)
        print(f"Accuracy of the final model is {accuracy}")
        return accuracy

    def save_model(self, path):
        """Serialize the estimator to `path` with joblib and print a confirmation."""
        joblib.dump(self.estimator, path)
        print(f"Model saved to {path}")

## 4. Main program

1. Load the data from the file `checker_submits.csv`.
2. Create the preprocessing pipeline that consists of two custom transformers: `FeatureExtractor()` and `MyOneHotEncoder()`:
```
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])
```
3. Use that pipeline and its method `fit_transform()` on the initial dataset.
```
data = preprocessing.fit_transform(df)
```
4. Get `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` using `TrainValidationTest()` and the result of the pipeline.
5. Create an instance of `ModelSelection()`, use the method `choose()` applying it to the models that you want and parameters that you want, get the dataframe of the best results.
6. Create an instance of `Finalize()` with your best model, use the method `final_score()`, and save the model in the format: `name_of_the_model_{accuracy on test dataset}.sav`.

That is it, congrats!

In [202]:
# Load the raw checker submissions; the path is relative to the notebook's working directory.
df = pd.read_csv('../data/checker_submits.csv')

In [203]:
# Two-step preprocessing: derive hour/dayofweek from the timestamp, then one-hot
# encode the object-dtype columns while keeping 'dayofweek' aside as the target.
preprocessing = Pipeline([
    ('feature_extractor', FeatureExtractor()),
    ('onehot_encoder', MyOneHotEncoder('dayofweek')) 
])


In [204]:
# The last pipeline step returns a (features, target) tuple, so it is unpacked here.
X, y = preprocessing.fit_transform(df)


In [205]:
# Stratified split using the class defaults: test_size=0.2, valid_size=0.25, random_state=21.
splitter = TrainValidationTest()
X_train, X_valid, X_test, y_train, y_valid, y_test = splitter.split(X, y)

In [206]:
# Hyper-parameter grids, one per model family. Every grid pins random_state=21
# so that repeated runs evaluate the same candidates.
svm_params = [{
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'C': [0.01, 0.1, 1, 1.5, 5, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': ['balanced', None],
    'random_state': [21],
    'probability': [True]
}]
tree_params = [{
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10, 15, 21],
    'class_weight': ['balanced', None],
    'random_state': [21]
}]
rf_params = [{
    'n_estimators': [10, 30, 50, 100],
    'criterion': ['gini', 'entropy'],
    'max_depth': [10, 15, 21, 22],
    'class_weight': ['balanced', None],
    'random_state': [21]
}]

# 2-fold cross-validated grid searches scored by accuracy; n_jobs=-1 is passed to each search.
gs_svm = GridSearchCV(SVC(), svm_params, scoring='accuracy', cv=2, n_jobs=-1)
gs_tree = GridSearchCV(DecisionTreeClassifier(), tree_params, scoring='accuracy', cv=2, n_jobs=-1)
gs_rf = GridSearchCV(RandomForestClassifier(), rf_params, scoring='accuracy', cv=2, n_jobs=-1)

# The keys of grid_dict are positions in `grids`; the values are the display names.
grids = [gs_svm, gs_tree, gs_rf]
grid_dict = {0: 'SVM', 1: 'Decision Tree', 2: 'Random Forest'}

# choose() returns the best fitted estimator (not its name); it is reused by the Finalize cell.
selector = ModelSelection(grids, grid_dict)
best_model = selector.choose(X_train, y_train, X_valid, y_valid)
results_df = selector.best_results()
print(results_df)

Estimator SVM


HBox(children=(FloatProgress(value=0.0, max=144.0), HTML(value='')))

Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.752
Validation set accuracy for best params: 0.855

Estimator Decision Tree


HBox(children=(FloatProgress(value=0.0, max=32.0), HTML(value='')))

Best params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
Best training accuracy: 0.802
Validation set accuracy for best params: 0.864

Estimator Random Forest


HBox(children=(FloatProgress(value=0.0, max=128.0), HTML(value='')))

Best params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 22, 'n_estimators': 100, 'random_state': 21}
Best training accuracy: 0.856
Validation set accuracy for best params: 0.884


Classifier with best validation set accuracy: Random Forest
           model  train_score  valid_score  \
0            SVM     0.751714     0.854599   
1  Decision Tree     0.802201     0.863501   
2  Random Forest     0.855596     0.884273   

                                         best_params  
0  {'C': 10, 'class_weight': None, 'gamma': 'auto...  
1  {'class_weight': None, 'criterion': 'gini', 'm...  
2  {'class_weight': None, 'criterion': 'gini', 'm...  


In [207]:
# Refit the selected estimator on X_train only (the validation rows are not added back)
# and score it on the held-out test set.
final = Finalize(best_model)
acc = final.final_score(X_train, y_train, X_test, y_test)
# File name: estimator class name plus the test accuracy rounded to three decimals.
model_name = f"../data/{type(best_model).__name__}_{acc:.3f}.sav"
final.save_model(model_name)

Accuracy of the final model is 0.9053254437869822
Model saved to ../data/RandomForestClassifier_0.905.sav
