# Training Regression Models using sklearn pipelines

## Importing Libraries

In [1]:
#importing Libraries
import pandas as pd
# import dvc.api
import os
import sys
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import mlflow
#import local libraries
#Adding scripts path
sys.path.append(os.path.abspath(os.path.join('..')))
from scripts.data_loader import load_df_from_csv
from scripts.ML_modelling_utils import *
#importing dvc_data_loader script
# from scripts.dvc_data_loader import *


## Loading Clean Data

In [2]:
clean_data = load_df_from_csv('../data/train.csv')
y_values = clean_data['Sales']
x_values = clean_data.drop(['Sales'],axis=1)

## Training using Random Forest Regressor

In [3]:
# Splitting Data (60,20,20)
x_train, x_test, y_train, y_test = train_test_split(x_values, y_values, test_size=0.2, random_state=42)
x_train, x_valid, y_train, y_valid = train_test_split(x_train, y_train, test_size=0.2, random_state=42)

In [4]:
mlflow.autolog()
# Create a based model
rf = RandomForestRegressor()
with mlflow.start_run() as run:
    rf.fit(x_train, y_train)

2021/07/28 22:10:25 INFO mlflow.tracking.fluent: Autologging successfully enabled for sklearn.
2021/07/28 22:10:25 INFO mlflow.tracking.fluent: Autologging successfully enabled for xgboost.


In [5]:
train_score = rf.score(x_train, y_train)


In [6]:
valid_score = rf.score(x_valid,y_valid)

In [7]:
valid_metrics = calculate_metrics(y_valid,rf.predict(x_valid))

In [8]:
test_metrics = calculate_metrics(y_test, rf.predict(x_test))


In [9]:
features = pd.DataFrame()
features["Feature"] = x_train.columns
features["Importance"] = rf.feature_importances_
features.sort_values(by='Importance', ascending=False)


Unnamed: 0,Feature,Importance
7,Open,0.460084
15,CompetitionDistance,0.160795
8,Promo,0.073357
17,CompetitionOpenSinceYear,0.051225
16,CompetitionOpenSinceMonth,0.049667
0,DayOfWeek,0.033195
13,StoreType,0.024443
19,Promo2SinceWeek,0.024357
5,Day,0.022875
3,Month,0.019388


In [10]:
# with mlflow.start_run():
#     mlflow.log_parameter('')
#     mlflow.log_metric('')
#     mlflow.sklearn.log_model(rf,'model')

In [11]:
# Create the parameter grid based on the results of random search
param_grid = {
    'bootstrap': [True],
    'criterion': ['mse'],
    'max_depth': [80, 90],
    'max_features': [2, 3],
    'min_samples_leaf': [3, 4],
    'min_samples_split': [8, 10],
    'n_estimators': [100, 200]
}

rf2 = RandomForestRegressor()
# Instantiate the grid search model
grid_search = GridSearchCV(estimator=rf2, param_grid=param_grid,
                           cv=3, n_jobs=-1, verbose=2)


In [12]:
# Fit the grid search to the data
# grid_search.fit(x_train, y_train)

In [13]:
# grid_search.best_params_

# best_grid = grid_search.best_estimator_
