In [1]:
import mlflow
import os
import numpy as np
import pandas as pd

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

import seaborn as sns
import matplotlib.pyplot as plt

from pickle import dump

from hyperopt import STATUS_OK

from scripts.preprocessing_regression_models import Preprocess
from scripts.model_registry import ModelRegistry
from scripts.scoring import Scoring
from scripts.monitoring import Monitoring
from scripts.config_regression import (year_month_train, 
    input_data_path_train,
    seed)

In [2]:
# Local scratch folder where this notebook writes artifacts (e.g. the pickled
# encoders) before they are logged to MLflow.
local_path_save = "./local_artifacts_tmp/01_Linear_Regression/"

### MLflow settings

In [3]:
# Create the local artifact folder. exist_ok=True makes this idempotent on
# re-runs and avoids the race between a separate "exists" check and the create.
os.makedirs(local_path_save, exist_ok=True)

# Save all metadata in a SQLite db. Artifacts will be saved on local folder ./mlflow
mlflow.set_tracking_uri("sqlite:///mlflow.db")

# Name of the experiment
exp_name = "01 - Linear Regression"
# Set up the MLflow experiment (makes it the active experiment for later runs).
# NOTE(review): despite its name, `experiment_id` holds whatever
# `mlflow.set_experiment` returns; in current MLflow releases that is an
# Experiment object, not the id string (use `.experiment_id` for the id) -- confirm.
experiment_id = mlflow.set_experiment(exp_name)

In [4]:
# Short aliases for the training-period settings imported from the project
# config; `objective_lr` reads these globals when tagging MLflow runs.
year_month, input_data_path = year_month_train, input_data_path_train

### Fitting and evaluation functions

In [5]:
def lr_evaluation(Y_train, Y_test, Y_pred_train, Y_pred_test):
    '''
    Score Linear Regression predictions on the train and test splits.

    Parameters
    ----------
    Y_train, Y_test : true target values for each split.
    Y_pred_train, Y_pred_test : model predictions for the matching split.

    Returns
    -------
    tuple
        (rmse_train, rmse_test, mae_train, mae_test). Each RMSE is the square
        root of `mean_squared_error`; each MAE comes from `mean_absolute_error`.
    '''
    # (truth, prediction) pairs, kept in train -> test order.
    splits = ((Y_train, Y_pred_train), (Y_test, Y_pred_test))

    rmse_train, rmse_test = (
        mean_squared_error(actual, predicted) ** 0.5
        for actual, predicted in splits
    )
    mae_train, mae_test = (
        mean_absolute_error(actual, predicted)
        for actual, predicted in splits
    )

    return rmse_train, rmse_test, mae_train, mae_test

In [6]:
def objective_lr(X_train, X_test, Y_train, Y_test, run_name: str = 'Unnamed'):
    '''
    Fit a LinearRegression inside an MLflow run and log everything about it.

    Within a single run named `run_name` this sets the run tags and params,
    fits the model on the train split, logs train/test RMSE and MAE (also
    printed), uploads the encoder pickle found at
    `local_path_save + run_name + '_ohe.pkl'` and logs the fitted model.

    Relies on the notebook globals `year_month`, `input_data_path` and
    `local_path_save`; the encoder pickle is expected to exist already.

    Returns
    -------
    dict
        {'loss': <test RMSE>, 'status': STATUS_OK}
    '''
    with mlflow.start_run(run_name=run_name):
        # Run metadata: tags first, then params.
        mlflow.set_tag('model_type', 'LinearRegression')
        mlflow.set_tag('year_month', year_month)
        mlflow.log_param('model_type', 'LinearRegression')
        mlflow.log_param('data', input_data_path)

        model = LinearRegression()
        model.fit(X_train, Y_train)

        train_predictions = model.predict(X_train)
        test_predictions = model.predict(X_test)
        rmse_train, rmse_test, mae_train, mae_test = lr_evaluation(
            Y_train, Y_test, train_predictions, test_predictions
        )

        metrics = {
            'rmse_train': rmse_train,
            'rmse_test': rmse_test,
            'mae_train': mae_train,
            'mae_test': mae_test,
        }
        mlflow.log_metrics(metrics)

        print(
            'rmse_train = ', rmse_train,
            '\n rmse_test', rmse_test,
            '\n mae_train', mae_train,
            '\n mae_test', mae_test,
        )

        # Attach the encoder saved for this run next to the model itself.
        encoder_file = local_path_save + run_name + '_ohe.pkl'
        mlflow.log_artifact(local_path=encoder_file, artifact_path='preprocessing')
        mlflow.sklearn.log_model(model, artifact_path='model')

    return {'loss': rmse_test, 'status': STATUS_OK}
    

## Experiments

### Experiment 1: base (keeping all observations)

In [None]:
# Name of this experiment's MLflow run; also the prefix of its encoder pickle.
run_name = 'base'

In [7]:
# Load the training data; request_tgt=True also returns the target as Y.
prepr = Preprocess(input_data_path_train)
X, Y = prepr.read_dataframe(request_tgt=True)

# Hold out 30% of the rows for testing; `seed` keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)

# preprocessing ohe
# The train call returns the encoder (presumably fitted there, given
# fit_ohe=True); the test call is handed that same encoder via `ohe=`.
shapes_pre = (X_train.shape[0], X_test.shape[0])
X_train_ohe, ohe = prepr.preprocess(df=X_train, fit_ohe=True, drop_first_column=False)
X_test_ohe, _ = prepr.preprocess(df=X_test, fit_ohe=False, drop_first_column=False, ohe=ohe)
# NOTE(review): this re-checks the row counts of the *input* frames, so it only
# detects `preprocess` dropping rows in place; the encoded outputs are not checked.
assert shapes_pre == (X_train.shape[0], X_test.shape[0])

# Persist the encoder so `objective_lr` can log it as an artifact. The context
# manager closes (and flushes) the file handle, which the bare `open()` did not.
with open(local_path_save + run_name + '_ohe.pkl', 'wb') as ohe_file:
    dump(ohe, ohe_file)

In [8]:
# Train and log the run named by `run_name`; the returned dict is left as the
# cell's last expression so the notebook displays it.
objective_lr(X_train_ohe, X_test_ohe, Y_train, Y_test, run_name=run_name)

rmse_train =  3.340266027366716 
 rmse_test 16291.005898020625 
 mae_train 2.3235100430914657 
 mae_test 513.3030565867936




{'loss': 16291.005898020625, 'status': 'ok'}

### Experiment 2: no outliers (replacing trip_distance outliers in the training set with the median)

In [None]:
# Name of this experiment's MLflow run; also the prefix of its encoder pickle.
run_name = 'no_outliers'

In [9]:
# Load the training data; request_tgt=True also returns the target as Y.
prepr = Preprocess(input_data_path_train)
X, Y = prepr.read_dataframe(request_tgt=True)

# Hold out 30% of the rows for testing; `seed` keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)

# Trip distances above this value are treated as outliers.
TRIP_DISTANCE_OUTLIER_THRESHOLD = 1000

# Work on an explicit copy so the assignment below always writes to an
# independent frame (and pandas has no reason to warn about a slice of X).
X_train = X_train.copy()
# Replace outlying distances with the training-set median, computed before any
# value is replaced. Only the training split is modified; X_test is left as-is.
trip_distance_median = X_train['trip_distance'].median()
is_outlier = X_train['trip_distance'] > TRIP_DISTANCE_OUTLIER_THRESHOLD
X_train.loc[is_outlier, 'trip_distance'] = trip_distance_median

# preprocessing ohe
# The train call returns the encoder (presumably fitted there, given
# fit_ohe=True); the test call is handed that same encoder via `ohe=`.
shapes_pre = (X_train.shape[0], X_test.shape[0])
X_train_ohe, ohe = prepr.preprocess(df=X_train, fit_ohe=True, drop_first_column=False)
X_test_ohe, _ = prepr.preprocess(df=X_test, fit_ohe=False, drop_first_column=False, ohe=ohe)
# NOTE(review): this re-checks the row counts of the *input* frames, so it only
# detects `preprocess` dropping rows in place; the encoded outputs are not checked.
assert shapes_pre == (X_train.shape[0], X_test.shape[0])

# Persist the encoder so `objective_lr` can log it as an artifact. The context
# manager closes (and flushes) the file handle, which the bare `open()` did not.
with open(local_path_save + run_name + '_ohe.pkl', 'wb') as ohe_file:
    dump(ohe, ohe_file)

In [10]:
# Train and log the run named by `run_name`; the returned dict is left as the
# cell's last expression so the notebook displays it.
objective_lr(X_train_ohe, X_test_ohe, Y_train, Y_test, run_name=run_name)

rmse_train =  3.340266027366716 
 rmse_test 84.88740387525604 
 mae_train 2.3235100430914657 
 mae_test 5.209576851415573


{'loss': 84.88740387525604, 'status': 'ok'}