# Training Regression Models using sklearn pipelines

# Without Pipeline

## Importing Libraries

In [1]:
#importing Libraries
import pandas as pd
# import dvc.api
import os
import sys
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import mlflow
#import local libraries
#Adding scripts path
sys.path.append(os.path.abspath(os.path.join('..')))
from scripts.data_loader import load_df_from_csv
from scripts.ML_modelling_utils import *


## Loading Clean Data

In [2]:
clean_data = load_df_from_csv('../data/train.csv')
y_values = clean_data['Sales']
x_values = clean_data.drop(['Sales'],axis=1)

## Training using Random Forest Regressor

In [3]:
# Splitting Data (60,20,20)
x_train, x_test, y_train, y_test = train_test_split(x_values, y_values, test_size=0.2, random_state=42)
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [7]:
mlflow.autolog(log_input_examples=True, log_model_signatures=True, log_models=False, silent=True)
# Create a based model
rf = RandomForestRegressor()
with mlflow.start_run() as run:
    rf.fit(x_train, y_train)

    train_score = rf.score(x_train, y_train)
    valid_score = rf.score(x_valid,y_valid)
    valid_metrics = calculate_metrics(y_valid,rf.predict(x_valid),"Validation ")
    test_score = rf.score(x_test,y_test)
    test_metrics = calculate_metrics(y_test, rf.predict(x_test), "Test ")

    mlflow.log_metric("Valid Score", valid_score)
    mlflow.log_metrics(valid_metrics)
    mlflow.log_metric("Test Score", test_score)
    mlflow.log_metrics(test_metrics)
    mlflow.sklearn.log_model(rf, generate_model_name(test_metrics['Test RMSE Score']))


Modelling Utilities:INFO->RMSE Score is: 0.23
Modelling Utilities:INFO->R2 Square Score is: 0.95
Modelling Utilities:INFO->MAE Score is: 0.13
Modelling Utilities:INFO->RMSE Score is: 0.23
Modelling Utilities:INFO->R2 Square Score is: 0.95
Modelling Utilities:INFO->MAE Score is: 0.13


In [5]:
train_score

0.9918217208283953

In [6]:
valid_score

0.9464088698187895

In [7]:
valid_metrics

{'RMSE Score': 0.23207756030571391,
 'R2_Squared': 0.9464088698187895,
 'MAE Score': 0.13363749553641205}

In [8]:
test_metrics


{'RMSE Score': 0.23242887038376997,
 'R2_Squared': 0.9458018010617835,
 'MAE Score': 0.13350047867580667}

In [9]:
features = pd.DataFrame()
features["Feature"] = x_train.columns
features["Importance"] = rf.feature_importances_
features.sort_values(by='Importance', ascending=False)


Unnamed: 0,Feature,Importance
7,Open,0.460141
15,CompetitionDistance,0.161633
8,Promo,0.073437
17,CompetitionOpenSinceYear,0.050305
16,CompetitionOpenSinceMonth,0.049603
0,DayOfWeek,0.033223
19,Promo2SinceWeek,0.024841
13,StoreType,0.024464
5,Day,0.022929
3,Month,0.019423


## Parameter Tunning

In [9]:
# Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [True],
    'criterion': ['mse'],
    'max_depth': [10, 15, 20],
    'max_features': [2, 3],
    'n_estimators': [10, 15],
    'warm_start': [True]
}

rf2 = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf2, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=0)


**MSE** is used to check how close estimates or forecasts are to actual values. Lower the MSE, the closer is forecast to actual.

In [12]:
# Fit the grid search to the data
mlflow.autolog(log_input_examples=True, log_model_signatures=True, log_models=False, silent=True)
with mlflow.start_run() as run:
    best_model = grid_search.fit(x_train, y_train)

    train_score = best_model.score(x_train,y_train)
    valid_score = best_model.score(x_valid, y_valid)
    valid_metrics = calculate_metrics(
        y_valid, best_model.predict(x_valid), "Validation ")
    test_score = best_model.score(x_test, y_test)
    test_metrics = calculate_metrics(
        y_test, best_model.predict(x_test), "Test ")

    mlflow.log_metric("Valid Score", valid_score)
    mlflow.log_metrics(valid_metrics)
    mlflow.log_metric("Test Score", test_score)
    mlflow.log_metrics(test_metrics)
    mlflow.sklearn.log_model(best_model, generate_model_name(
        test_metrics['Test RMSE Score']))


Modelling Utilities:INFO->RMSE Score is: 45.95221%
Modelling Utilities:INFO->R2 Square Score is: 78.98935%
Modelling Utilities:INFO->MAE Score is: 29.64358%
Modelling Utilities:INFO->RMSE Score is: 45.58945%
Modelling Utilities:INFO->R2 Square Score is: 79.14868%
Modelling Utilities:INFO->MAE Score is: 29.50637%


In [13]:
best_model.best_params_

{'bootstrap': True,
 'criterion': 'mse',
 'max_depth': 20,
 'max_features': 3,
 'n_estimators': 10,
 'warm_start': True}

In [14]:
grid_features = pd.DataFrame()
grid_features["Feature"] = x_train.columns
grid_features["Importance"] = best_model.best_estimator_.feature_importances_
grid_features.sort_values(by='Importance', ascending=False)


Unnamed: 0,Feature,Importance
7,Open,0.235976
0,DayOfWeek,0.221043
8,Promo,0.123794
15,CompetitionDistance,0.080959
1,WeekDay,0.051712
9,StateHoliday,0.037606
16,CompetitionOpenSinceMonth,0.036478
17,CompetitionOpenSinceYear,0.035154
5,Day,0.025608
13,StoreType,0.022285


# Using Pipeline

## Loading just merged unclean data

In [53]:
merged_data = load_df_from_csv('../data/train.csv')
y_values = merged_data['Sales']
x_values = merged_data.drop(['Sales'], axis=1)

## Spliting Data Sets

In [54]:
# Splitting Data (60,20,20)
x_train, x_test, y_train, y_test = train_test_split(x_values, y_values, test_size=0.2, random_state=42)
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

## Creating Transformers for our numeric and categorical data

In [55]:
class CustomMaxImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()
        
    def fit(self, X, y=0):
        self.max_value = X.max()

        return self

    def transform(self, X, y=0):
        return np.where(X.isna(), self.max_value, X)


In [41]:
class CustomMostFrequentImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()

    def fit(self, X, y=0):
        most_occuring = Counter(X.flat).most_common(1)
        self.mode_value = most_occuring[0][0]

        return self

    def transform(self, X, y=0):
        return np.where(X.isna(), self.mode_value, X)


In [56]:
numeric_transformer = Pipeline(steps=[
    ('custom_max', CustomMaxImputer()),
    ('scaler', StandardScaler())
])
categorical_transformer = Pipeline(steps=[
    ('custom_mode', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])


## Identifying our columns and passing it to a ColumnTransformer

In [57]:
# Scaling Sales column
merged_data['Sales'] = numeric_transformer.fit_transform(merged_data[["Sales"]])


In [58]:
numeric_features = merged_data.select_dtypes(include=['int64', 'float64','uint8','uint16','float32']).columns

categorical_features = merged_data.select_dtypes(include=['object']).columns

class make_present_col_selector_class:
    def __init__(self, selected_columns):
        self.selected_columns = selected_columns

    def __call__(self, df):
        return [col for col in df.columns if col in self.selected_columns]

preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', numeric_transformer, make_present_col_selector_class(numeric_features)), 
        ('categorical', categorical_transformer, make_present_col_selector_class(categorical_features))
    ])


## Creating our RandomForestClassifier Pipeline with our preprocessor

In [59]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor), 
    ('regressor', RandomForestRegressor())
])


## Training Model using Pipeline

In [60]:
# Fit the pipeline with the data
mlflow.autolog(log_input_examples=True, disable_for_unsupported_versions=True, silent=True)
with mlflow.start_run() as run:
    best_model = pipeline.fit(x_train, y_train)

    train_score = best_model.score(x_train, y_train)
    valid_score = best_model.score(x_valid, y_valid)
    valid_metrics = calculate_metrics(y_valid, best_model.predict(x_valid))
    test_score = best_model.score(x_test, y_test)
    test_metrics = calculate_metrics(y_test, best_model.predict(x_test))

    mlflow.log_metric("Valid Score", valid_score)
    mlflow.log_metrics(valid_metrics)
    mlflow.log_metric("Test Score", test_score)
    mlflow.log_metrics(test_metrics)


## Parameter Tunning

In [12]:
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [{
    "regressor": [RandomForestClassifier()],
    "regressor__n_estimators": [10, 15],
    "regressor__max_depth":[5, 8, 15],
    "regressor__min_samples_leaf":[1, 2],
    "regressor__max_leaf_nodes": [2, 5]
}]
     
# create a gridsearch of the pipeline, the fit the best model
grid_search_pipeline = GridSearchCV(
    pipeline_rf, grid_param, cv=5, verbose=0, n_jobs=-1)  # Fit grid search


In [None]:
# Fit the grid search to the data
with mlflow.start_run() as run:
    best_model = grid_search_pipeline.fit(X_train, y_train)

    train_score = best_model.score(x_train, y_train)
    valid_score = best_model.score(x_valid, y_valid)
    valid_metrics = calculate_metrics(y_valid, best_model.predict(x_valid))
    test_score = best_model.score(x_test, y_test)
    test_metrics = calculate_metrics(y_test, best_model.predict(x_test))

    mlflow.log_metric("Valid Score", valid_score)
    mlflow.log_metrics(valid_metrics)
    mlflow.log_metric("Test Score", test_score)
    mlflow.log_metrics(test_metrics)

# Prediciton Interval

In [None]:
from sklearn.ensemble import GradientBoostingRegressor
# Set lower and upper quantile
LOWER_ALPHA = 0.1
UPPER_ALPHA = 0.9
# Each model has to be separate
lower_model = GradientBoostingRegressor(loss="quantile",
                                        alpha=LOWER_ALPHA)
# The mid model will use the default loss
mid_model = GradientBoostingRegressor(loss="ls")
upper_model = GradientBoostingRegressor(loss="quantile",
                                        alpha=UPPER_ALPHA)


In [None]:
# Fit models
lower_model.fit(X_train, y_train)
mid_model.fit(X_train, y_train)
upper_model.fit(X_train, y_train)
# Record actual values on test set
predictions = pd.DataFrame(y_test)
# Predict
predictions['lower'] = lower_model.predict(X_test)
predictions['mid'] = mid_model.predict(X_test)
predictions['upper'] = upper_model.predict(X_test)


In [None]:
energy_data = go.Scatter(x=energy_series.index,
                         y=energy_series.values)
layout = go.Layout(title='Energy Plot', xaxis=dict(title='Date'),
                   yaxis=dict(title='(kWh)'))
fig = go.Figure(data=[energy_data], layout=layout)
py.iplot(fig, sharing='public')


# Get the steam data
steam_series = df.loc[:, ("Steam", "4")]
# Create the steam data object
steam_data = go.Scatter(x=steam_series.index,
                        y=steam_series.values,
                        # Specify axis
                        yaxis='y2')

layout = go.Layout(height=600, width=800,
                   title='Energy and Steam Plot',
                   # Same x and first y
                   xaxis=dict(title='Date'),
                   yaxis=dict(title='Energy', color='red'),
                   # Add a second yaxis to the right of the plot
                   yaxis2=dict(title='Steam', color='blue',
                               overlaying='y', side='right')
                   )
fig = go.Figure(data=[energy_data, steam_data], layout=layout)
py.iplot(fig, sharing='public')
