# Day 09. Exercise 04
# Pipelines and OOP

## 0. Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from joblib import dump
from tqdm.notebook import tqdm

## 1. Preprocessing pipeline

Create three custom transformers, the first two of which will be used within a [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html).

1. `FeatureExtractor()` class:
 - Takes a dataframe with `uid`, `labname`, `numTrials`, `timestamp` from the file [`checker_submits.csv`](https://drive.google.com/file/d/14voc4fNJZiLEFaZyd8nEG-lQt5JjatYw/view?usp=sharing).
 - Extracts `hour` from `timestamp`.
 - Extracts the weekday from `timestamp` as a number (Monday = 0) and stores it in the `dayofweek` column.
 - Drops the `timestamp` column.
 - Returns the new dataframe.


2. `MyOneHotEncoder()` class:
 - Takes the dataframe from the result of the previous transformation and the name of the target column.
 - Identifies all the categorical features and transforms them with `OneHotEncoder()`. If the target column is categorical too, then the transformation should not apply to it.
 - Drops the initial categorical features.
 - Returns the dataframe with the features and the series with the target column.


3. `TrainValidationTest()` class:
 - Takes `X` and `y`.
 - Returns `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` (`test_size=0.2`, `random_state=21`, `stratified`).


In [2]:
class FeatureExtractor(BaseEstimator, TransformerMixin):
    """Replace the `timestamp` column with `hour` and `dayofweek` features.

    The transformer is stateless: `fit` learns nothing, and `transform`
    works on any dataframe that has a `timestamp` column.
    """

    def __init__(self):
        pass

    def fit(self, X, y=None):
        """Do nothing and return `self` (required by the Pipeline API)."""
        return self

    def transform(self, df):
        """Return a new dataframe with time features derived from `timestamp`.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain a `timestamp` column parseable by `pd.to_datetime`.

        Returns
        -------
        pandas.DataFrame
            All original columns except `timestamp`, followed by `hour` and
            `dayofweek` (integer weekday from `Series.dt.weekday`).
            The input dataframe is not modified, so the same raw frame can
            be fed through the transformer more than once.

        Raises
        ------
        KeyError
            If `df` has no `timestamp` column.
        """
        # Parse once and reuse for both derived features.
        timestamps = pd.to_datetime(df['timestamp'])
        # `drop` and `assign` both return new frames, so the caller's object
        # keeps its `timestamp` column.
        return df.drop(columns=['timestamp']).assign(
            hour=timestamps.dt.hour,
            dayofweek=timestamps.dt.weekday,
        )

class MyOneHotEncoder(BaseEstimator, TransformerMixin):
    """One-hot encode every object-dtype column except the target column.

    Parameters
    ----------
    target_column : str
        Name of the target column; it is never encoded, even when its dtype
        is `object`.
    """

    def __init__(self, target_column):
        self.target_column = target_column
        self.encoder = OneHotEncoder()

    def fit(self, df, y=None):
        """Learn the categories of the object-dtype feature columns.

        Parameters
        ----------
        df : pandas.DataFrame
            Dataframe containing the features (and possibly the target).
        y : ignored
            Accepted only so the transformer can be fitted by a Pipeline
            that passes a target or uses it as a non-final step.

        Returns
        -------
        self
        """
        object_cols = df.select_dtypes(include=['object']).columns
        # Remember which columns were encoded so `transform` handles the same set.
        self.categorical_cols = [col for col in object_cols if col != self.target_column]
        self.encoder.fit(df[self.categorical_cols])
        return self

    def transform(self, df):
        """Return a new dataframe with the categorical columns one-hot encoded.

        Parameters
        ----------
        df : pandas.DataFrame
            Must contain every column recorded in `categorical_cols` by `fit`.

        Returns
        -------
        pandas.DataFrame
            The non-categorical columns (index reset to 0..n-1) followed by
            the encoded indicator columns. The input dataframe is not modified.
        """
        encoded_features = self.encoder.transform(df[self.categorical_cols])
        encoded_df = pd.DataFrame(
            encoded_features.toarray(),
            columns=self.encoder.get_feature_names_out(input_features=self.categorical_cols),
        )
        # Non-inplace drop: the caller's dataframe keeps its original columns.
        # The index is reset so the rows line up with `encoded_df`'s RangeIndex.
        remaining = df.drop(columns=self.categorical_cols).reset_index(drop=True)
        return pd.concat([remaining, encoded_df], axis=1)

class TrainValidationTest(BaseEstimator, TransformerMixin):
    """Split a preprocessed dataframe into train / validation / test parts.

    Parameters
    ----------
    target_column : str, default='dayofweek'
        Column used as the target `y`; every other column goes into `X`.
    test_size : float, default=0.2
        Fraction of the rows held out from training. The held-out part is
        then divided equally between the validation and the test set.
    random_state : int, default=21
        Seed passed to both `train_test_split` calls.
    """

    def __init__(self, target_column='dayofweek', test_size=0.2, random_state=21):
        self.target_column = target_column
        self.test_size = test_size
        self.random_state = random_state

    def fit(self, X, y=None):
        """Do nothing and return `self`; the splitter has no state to learn."""
        return self

    def transform(self, df):
        """Split `df` into stratified train, validation and test sets.

        Parameters
        ----------
        df : pandas.DataFrame
            Dataframe that contains the target column and all features.

        Returns
        -------
        tuple
            `(X_train, X_valid, X_test, y_train, y_valid, y_test)`.
        """
        X = df.drop(columns=self.target_column)
        y = df[self.target_column]
        # First split: training rows versus a temporary hold-out set.
        X_train, X_temp, y_train, y_temp = train_test_split(
            X, y, test_size=self.test_size, random_state=self.random_state, stratify=y
        )
        # Second split: halve the hold-out set into validation and test.
        X_valid, X_test, y_valid, y_test = train_test_split(
            X_temp, y_temp, test_size=0.5, random_state=self.random_state, stratify=y_temp
        )
        return X_train, X_valid, X_test, y_train, y_valid, y_test


## 2. Model selection pipeline

`ModelSelection()` class

 - Takes a list of `GridSearchCV` instances and a dict whose keys are the indexes in that list and whose values are the names of the models. The example below is written top-down, from the high-level call to the low-level definitions:

```
ModelSelection(grids, grid_dict)

grids = [gs_svm, gs_tree, gs_rf]

gs_svm = GridSearchCV(estimator=svm, param_grid=svm_params, scoring='accuracy', cv=2, n_jobs=jobs), where jobs you can specify by yourself

svm_params = [{'kernel':('linear', 'rbf', 'sigmoid'), 'C':[0.01, 0.1, 1, 1.5, 5, 10], 'gamma': ['scale', 'auto'], 'class_weight':('balanced', None), 'random_state':[21], 'probability':[True]}]
```

 - Method `choose()` takes `X_train`, `y_train`, `X_valid`, `y_valid` and returns the name of the best classifier among all the models on the validation set
 - Method `best_results()` returns a dataframe with the columns `model`, `params`, `valid_score` where the rows are the best models within each class of models.

```
model	params	valid_score
0	SVM	{'C': 10, 'class_weight': None, 'gamma': 'auto...	0.772727
1	Decision Tree	{'class_weight': 'balanced', 'criterion': 'gin...	0.801484
2	Random Forest	{'class_weight': None, 'criterion': 'entropy',...	0.855288
```

 - When you iterate through the parameters of a model class, print the name of that class and show the progress using `tqdm.notebook`; at the end of the loop, print the best model of that class.

```
Estimator: SVM
100%
125/125 [01:32<00:00, 1.36it/s]
Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.773
Validation set accuracy score for best params: 0.878 

Estimator: Decision Tree
100%
57/57 [01:07<00:00, 1.22it/s]
Best params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21, 'random_state': 21}
Best training accuracy: 0.801
Validation set accuracy score for best params: 0.867 

Estimator: Random Forest
100%
284/284 [06:47<00:00, 1.13s/it]
Best params: {'class_weight': None, 'criterion': 'entropy', 'max_depth': 22, 'n_estimators': 50, 'random_state': 21}
Best training accuracy: 0.855
Validation set accuracy score for best params: 0.907 

Classifier with best validation set accuracy: Random Forest
```

In [3]:
class ModelSelection:
    """Fit several `GridSearchCV` objects and pick the best one on a validation set.

    Parameters
    ----------
    grids : list
        `GridSearchCV` instances (one per model family).
    grid_dict : dict
        Maps the position of a grid in `grids` to a human-readable model name.
    """

    def __init__(self, grids, grid_dict):
        self.grids = grids
        self.grid_dict = grid_dict

    def best_results(self):
        """Summarize the best configuration found by every grid.

        Must be called after `choose`, because it reads the fitted attributes
        `best_params_` and `best_score_` of each grid.

        Returns
        -------
        pandas.DataFrame
            Columns `model`, `params`, `valid_score`, one row per grid.
            `valid_score` holds the grid's `best_score_`.
        """
        results = [
            {
                'model': self.grid_dict[i],
                'params': grid.best_params_,
                'valid_score': grid.best_score_,
            }
            for i, grid in enumerate(self.grids)
        ]
        # Explicit columns keep the schema stable even when `grids` is empty.
        return pd.DataFrame(results, columns=['model', 'params', 'valid_score'])

    def choose(self, X_train, y_train, X_valid, y_valid):
        """Fit every grid and select the model with the best validation score.

        Parameters
        ----------
        X_train, y_train
            Data used to fit each grid search.
        X_valid, y_valid
            Data passed to `grid.score` to compare the fitted searches.

        Returns
        -------
        tuple
            `(best_model, best_estimator)`: the name from `grid_dict` and the
            corresponding `best_estimator_`. Both are `None` when `grids` is
            empty.
        """
        best_model = None
        best_estimator = None
        # -inf (not 0) so that a model scoring exactly 0 is still selectable
        # and `best_estimator` is always bound when at least one grid was fitted.
        best_score = float('-inf')
        for i, grid in enumerate(self.grids):
            print(f"Estimator: {self.grid_dict[i]}")
            original_param_grid = grid.param_grid
            # `param_grid` may be one dict or a sequence of dicts; iterating a
            # bare dict would yield its keys, so wrap it first.
            if isinstance(original_param_grid, dict):
                sub_grids = [original_param_grid]
            else:
                sub_grids = list(original_param_grid)
            try:
                for params in tqdm(sub_grids, desc="Grid search progress"):
                    grid.set_params(param_grid=params)
                    grid.fit(X_train, y_train)
                    print(f"Best params: {grid.best_params_}")
                    print(f"Best training accuracy: {grid.best_score_:.3f}")
                    score = grid.score(X_valid, y_valid)
                    print(f"Validation set accuracy score for best params: {score:.3f}\n")
                    if score > best_score:
                        best_score = score
                        best_model = self.grid_dict[i]
                        best_estimator = grid.best_estimator_
            finally:
                # Restore the caller's configuration so `choose` can be run
                # again on the same grids.
                grid.set_params(param_grid=original_param_grid)
        print(f"Classifier with best validation set accuracy: {best_model}")
        return best_model, best_estimator


## 3. Finalization

`Finalize()` class
 - Takes an estimator.
 - Method `final_score()` takes `X_train`, `y_train`, `X_test`, `y_test` and returns the accuracy of the model as in the example below:
```
final.final_score(X_train, y_train, X_test, y_test)
Accuracy of the final model is 0.908284023668639
```
 - Method `save_model()` takes a path, saves the model to this path and prints that the model was successfully saved.

In [4]:
class Finalize:
    """Train the selected estimator, report its test accuracy and persist it."""

    def __init__(self, estimator):
        self.estimator = estimator

    def final_score(self, X_train, y_train, X_test, y_test):
        """Fit the estimator on the training data and score it on the test data.

        Prints the accuracy and also returns it so the caller can reuse the
        value (for example in a file name).
        """
        model = self.estimator
        model.fit(X_train, y_train)
        accuracy = accuracy_score(y_test, model.predict(X_test))
        print(f'Accuracy of the final model is {accuracy}')
        return accuracy

    def save_model(self, path):
        """Serialize the estimator to `path` with `joblib.dump` and confirm it."""
        model = self.estimator
        dump(model, path)
        print(f'The model was successfully saved to {path}')

## 4. Main program

1. Load the data from the file `checker_submits.csv`.
2. Create the preprocessing pipeline that consists of two custom transformers: `FeatureExtractor()` and `MyOneHotEncoder()`:
```
preprocessing = Pipeline([('feature_extractor', FeatureExtractor()), ('onehot_encoder', MyOneHotEncoder('dayofweek'))])
```
3. Use that pipeline and its method `fit_transform()` on the initial dataset.
```
data = preprocessing.fit_transform(df)
```
4. Get `X_train`, `X_valid`, `X_test`, `y_train`, `y_valid`, `y_test` using `TrainValidationTest()` and the result of the pipeline.
5. Create an instance of `ModelSelection()`, use the method `choose()` applying it to the models that you want and parameters that you want, get the dataframe of the best results.
6. Create an instance of `Finalize()` with your best model, use the method `final_score()` and save the model in the format: `name_of_the_model_{accuracy on test dataset}.sav`.

That is it, congrats!

In [5]:
# Raw checker submissions; the first pipeline step reads the `timestamp` column.
df = pd.read_csv('../data/checker_submits.csv')

# Step 1 derives the time features, step 2 one-hot encodes the object-dtype
# columns ('dayofweek' is declared as the target and is never encoded).
preprocessing = Pipeline(
    steps=[
        ('feature_extractor', FeatureExtractor()),
        ('onehot_encoder', MyOneHotEncoder('dayofweek')),
    ]
)

data = preprocessing.fit_transform(df)

In [6]:
# Split the preprocessed frame into the six train / validation / test pieces.
(
    X_train, X_valid, X_test,
    y_train, y_valid, y_test,
) = TrainValidationTest().fit_transform(data)

In [7]:
# Base estimators for the three model families; all share the same seed.
svm, tree, rf = (
    SVC(random_state=21),
    DecisionTreeClassifier(random_state=21),
    RandomForestClassifier(random_state=21),
)

# Hyper-parameter grids, one list of dicts per model family
# (the format accepted by GridSearchCV's `param_grid`).

# Support vector machine
svm_params = [{
    'kernel': ('linear', 'rbf', 'sigmoid'),
    'C': [0.01, 0.1, 1, 1.5, 5, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': ('balanced', None),
    'random_state': [21],
    'probability': [True],
}]

# Decision tree
tree_params = [{
    'max_depth': range(1, 50),
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini'],
}]

# Random forest: same tree settings plus the number of trees
rf_params = [{
    'n_estimators': [5, 10, 50, 100],
    'max_depth': range(1, 50),
    'class_weight': ['balanced', None],
    'criterion': ['entropy', 'gini'],
}]

In [8]:
jobs = -1  # value forwarded to every GridSearchCV as `n_jobs`

# The three searches differ only in estimator and parameter grid.
gs_svm, gs_tree, gs_rf = (
    GridSearchCV(estimator=estimator, param_grid=params, scoring='accuracy', cv=2, n_jobs=jobs)
    for estimator, params in ((svm, svm_params), (tree, tree_params), (rf, rf_params))
)

# Position in the grid list -> display name used in the printed report.
grid_dict = dict(enumerate(['SVM', 'Decision Tree', 'Random Forest']))

selector = ModelSelection([gs_svm, gs_tree, gs_rf], grid_dict)
best_model, best_estimator = selector.choose(X_train, y_train, X_valid, y_valid)

Estimator: SVM


Grid search progress:   0%|          | 0/1 [00:00<?, ?it/s]

Best params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf', 'probability': True, 'random_state': 21}
Best training accuracy: 0.809
Validation set accuracy score for best params: 0.870

Estimator: Decision Tree


Grid search progress:   0%|          | 0/1 [00:00<?, ?it/s]

Best params: {'class_weight': None, 'criterion': 'gini', 'max_depth': 17}
Best training accuracy: 0.829
Validation set accuracy score for best params: 0.811

Estimator: Random Forest


Grid search progress:   0%|          | 0/1 [00:00<?, ?it/s]

Best params: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 26, 'n_estimators': 100}
Best training accuracy: 0.877
Validation set accuracy score for best params: 0.917

Classifier with best validation set accuracy: Random Forest


In [9]:
# One row per grid: model name, best parameters and the grid's `best_score_`
# (shown as `valid_score`). Requires `selector.choose(...)` to have run first,
# because the grids are fitted there.
selector.best_results()

Unnamed: 0,model,params,valid_score
0,SVM,"{'C': 10, 'class_weight': None, 'gamma': 'auto...",0.809347
1,Decision Tree,"{'class_weight': None, 'criterion': 'gini', 'm...",0.829377
2,Random Forest,"{'class_weight': 'balanced', 'criterion': 'ent...",0.876855


In [10]:
# Refit the winning estimator on the training split and score it on the test split.
finalize = Finalize(best_estimator)
accuracy = finalize.final_score(
    X_train, y_train,
    X_test, y_test,
)

# File name pattern: <model name>_<test accuracy, 4 decimals>.sav
model_name = f'../data/{best_model}_{accuracy:.4f}.sav'
finalize.save_model(model_name)

Accuracy of the final model is 0.9467455621301775
The model was successfully saved to ../data/Random Forest_0.9467.sav
