# ML Modeling

In [1]:
# importing useful modules
!pip install category_encoders
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split 
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn import preprocessing
from sklearn import metrics
import numpy as np 
import matplotlib.pyplot as plt
import category_encoders as ce
%matplotlib inline
import pickle
import mlflow
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, GridSearchCV
import warnings
warnings.filterwarnings("ignore")
np.random.seed(40)
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import ElasticNet
from urllib.parse import urlparse
import mlflow
import mlflow.sklearn



In [2]:
#Adding scripts path
import sys
sys.path.insert(0,'../scripts/')
from data_preProcessing import data_preProcessing_script
from data_cleaner import DataCleaner

In [3]:
# importing versioned datasets
df = pd.read_csv('../data/train_store.csv')
df.head()

Unnamed: 0,DayOfWeek,WeekDay,Year,Month,Season,Day,MonthTiming,Sales,Customers,Open,...,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,SalesPerCustomer,CompetitionOpenSince
0,0.501484,0.630672,1.502077,0.346724,0.389712,1.740766,1.205894,-0.063134,-0.067027,0.452399,...,-0.940975,-0.538742,0.436247,-0.385752,-1.001128,-0.421904,-0.305054,0.045131,-0.093996,0.595717
1,0.000831,0.630672,1.502077,0.346724,0.389712,1.626969,1.205894,-0.136239,-0.092863,0.452399,...,-0.940975,-0.538742,0.436247,-0.385752,-1.001128,-0.421904,-0.305054,0.045131,-0.207208,0.595717
2,-0.499823,0.630672,1.502077,0.346724,0.389712,1.513173,1.205894,-0.20784,-0.158887,0.452399,...,-0.940975,-0.538742,0.436247,-0.385752,-1.001128,-0.421904,-0.305054,0.045131,-0.227101,0.595717
3,-1.000476,0.630672,1.502077,0.346724,0.389712,1.399377,1.205894,-0.138946,-0.052674,0.452399,...,-0.940975,-0.538742,0.436247,-0.385752,-1.001128,-0.421904,-0.305054,0.045131,-0.303632,0.595717
4,-1.501129,0.630672,1.502077,0.346724,0.389712,1.285581,1.205894,0.189274,0.096598,0.452399,...,-0.940975,-0.538742,0.436247,-0.385752,-1.001128,-0.421904,-0.305054,0.045131,0.097227,0.595717


# Building models with sklearn pipelines

In [4]:
## separate the independent and target variable 
X = df.drop(['Customers', 'Sales', 'SalesPerCustomer'], axis = 1)
y = df['Sales']

In [5]:
# Splitting our dataset into train and test subsets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.2, random_state=42)
print ("Training and testing split was successful.")

Training and testing split was successful.


In [6]:
# Define the Pipeline
model_pipeline = Pipeline(steps=[('random_forest', RandomForestRegressor(max_depth=10,random_state=2))])

In [7]:
def calculate_metrics(y_test, y_preds, name:str=''):
    try:
        rmse = np.sqrt(mean_squared_error(y_test, y_preds))
        r_sq = r2_score(y_test, y_preds)
        mae = mean_absolute_error(y_test, y_preds)

        # Logging Values
        mlflow.log_param("{} RMSE Score is: {:.5%}".format(name, rmse))
        mlflow.log_param("{} R2 Square Score is: {:.5%}".format(name, r_sq))
        mlflow.log_param("{} MAE Score is: {:.5%}".format(name, mae))

        return {f'{name}RMSE Score': rmse, f'{name}R2_Squared': r_sq, f'{name}MAE Score': mae}
    except Exception as e:
        print("unable to calculate matrix")    

In [8]:
# Fit the pipeline with the data
mlflow.autolog(log_input_examples=True, disable_for_unsupported_versions=True, silent=True)
with mlflow.start_run() as run:
    best_model = model_pipeline.fit(X_train, y_train)

    train_score = best_model.score(X_train, y_train)
    valid_score = best_model.score(X_valid, y_valid)
    valid_metrics = calculate_metrics(y_valid, best_model.predict(X_valid))
    test_score = best_model.score(X_test, y_test)
    test_metrics = calculate_metrics(y_test, best_model.predict(X_test))

    mlflow.log_metric("Valid Score", valid_score)
    mlflow.log_metrics(valid_metrics)
    mlflow.log_metric("Test Score", test_score)
    mlflow.log_metrics(test_metrics)

MlflowException: Could not find experiment with ID 0

In [9]:
# mlflow run
def eval_metrics(actual, pred):
    rmse = np.sqrt(mean_squared_error(actual, pred))
    mae = mean_absolute_error(actual, pred)
    r2 = r2_score(actual, pred)
    return rmse, mae, r2

with mlflow.start_run():
    model_pipeline.fit(X_train, y_train)

    predicted_qualities = model_pipeline.predict(X_test)

    (rmse, mae, r2) = eval_metrics(y_test, predicted_qualities)

    print("  RMSE: %s" % (rmse*100))
    print("  MAE: %s" % (mae*100))
    print("  R2: %s" % (r2*100))

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("r2", r2)
    mlflow.log_metric("mae", mae)

    # tracking_url_type_store = urlparse(mlflow.get_tracking_uri()).scheme

    # # Model registry does not work with file store
    # if tracking_url_type_store != "file":

    #     # Register the model
    #     # There are other ways to use the Model Registry, which depends on the use case,
    #     # please refer to the doc for more information:
    #     # https://mlflow.org/docs/latest/model-registry.html#api-workflow
    #     mlflow.sklearn.log_model(
    #         lr, "model", registered_model_name="Regression")
    # else:
    #     mlflow.sklearn.log_model(lr, "model")

MlflowException: Could not find experiment with ID 0

In [None]:
save_model(model, test_metrics['RMSE Score'])

# Parameter Tunning

In [None]:
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [{
    "regressor": [RandomForestRegressor()],
    "regressor__n_estimators": [10, 15],
    "regressor__max_depth":[5, 8, 15],
    "regressor__min_samples_leaf":[1, 2],
    "regressor__bootstrap": [True, False],
    "regressor__criterion": ['mse'],
    "regressor__max_leaf_nodes": [2, 5],
    "regressor__max_features": [2, 3],
    "regressor__warm_start": [True, False]
}]
     
# create a gridsearch of the pipeline, the fit the best model
grid_search_pipeline = GridSearchCV(
    model_pipeline, grid_param, cv=3, verbose=0, n_jobs=-1)  # Fit grid search

In [None]:
def calculate_metrics(y_test, y_preds, name:str=''):
    try:
        rmse = np.sqrt(mean_squared_error(y_test, y_preds))
        r_sq = r2_score(y_test, y_preds)
        mae = mean_absolute_error(y_test, y_preds)

        # Logging Values
        logger.info("{} RMSE Score is: {:.5%}".format(name, rmse))
        logger.info("{} R2 Square Score is: {:.5%}".format(name, r_sq))
        logger.info("{} MAE Score is: {:.5%}".format(name, mae))

        return {f'{name}RMSE Score': rmse, f'{name}R2_Squared': r_sq, f'{name}MAE Score': mae}
    except Exception as e:
        logger.exception("Model Metrics Calculation failed")

In [None]:
# Fit the pipeline with the data
mlflow.autolog(log_input_examples=True, disable_for_unsupported_versions=True, silent=True)
with mlflow.start_run() as run:
    best_model = model_pipeline.fit(X_train, y_train)

    train_score = best_model.score(X_train, y_train)
    valid_score = best_model.score(X_valid, y_valid)
    valid_metrics = calculate_metrics(y_valid, best_model.predict(X_valid))
    test_score = best_model.score(X_test, y_test)
    test_metrics = calculate_metrics(y_test, best_model.predict(X_test))

    mlflow.log_metric("Valid Score", valid_score)
    mlflow.log_metrics(valid_metrics)
    mlflow.log_metric("Test Score", test_score)
    mlflow.log_metrics(test_metrics)