In [1]:
import mlflow
import os
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

from pickle import dump

from scripts.Preprocessing import Preprocessing
from scripts.LinearRegressionTraining import LinearRegressionTraining

from scripts.config import (year_month_train, 
    input_data_path_train,
    seed)

In [2]:
# Training-period settings from scripts.config, bound to the shorter names used below.
input_data_path = input_data_path_train
year_month = year_month_train

# Folder where this notebook writes its local files (e.g. the pickled encoder).
local_path_save = './local_artifacts_tmp/01_Linear_Regression/'

### MLflow settings

In [3]:
# Make sure the local output folder exists. exist_ok=True turns this into a no-op when the
# folder is already there and avoids the race of a separate exists() check followed by a create.
os.makedirs(local_path_save, exist_ok=True)

# Store all run metadata in a SQLite database file (mlflow.db, relative to the working directory).
# NOTE(review): the original comment said artifacts are saved in the local folder ./mlflow;
# MLflow's default local artifact root is ./mlruns - confirm which one applies here.
mlflow.set_tracking_uri("sqlite:///mlflow.db")

# Name of the experiment
exp_name = "01 - Linear Regression"
# Make the experiment with this name the active MLflow experiment.
# NOTE(review): despite the variable name, recent MLflow versions return an Experiment
# object from set_experiment rather than a bare id - confirm before using it as an id.
experiment_id = mlflow.set_experiment(exp_name)

## Experiments

In [4]:
# Label for the baseline run: passed to objective_lr as run_name and used as the
# filename prefix of the pickled encoder.
run_name = "base"

In [5]:
# Trainer object shared by every run in this notebook.
# Positional arguments, in order: input data path, local output folder, year-month value
# from the config, and a string label.
# NOTE(review): how each argument is used is defined in scripts.LinearRegressionTraining,
# which is not visible here - confirm against the constructor signature.
linear_regression_training = LinearRegressionTraining(
    input_data_path, local_path_save, year_month, 'linear_regression'
)

In [6]:
# Load the training data as a feature frame X and a target Y.
# input_data_path is the notebook-level alias of input_data_path_train (same value),
# used here for consistency with the trainer construction above.
prepr = Preprocessing(input_data_path)
X, Y = prepr.read_dataframe(request_tgt=True)

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)

# One-hot encoding: the encoder is fitted on the training split only (fit_ohe=True)
# and then reused as-is on the test split (fit_ohe=False, ohe=ohe).
shapes_pre = (X_train.shape[0], X_test.shape[0])
X_train_ohe, ohe = prepr.preprocess_for_regression(df=X_train, fit_ohe=True, drop_first_column=True)
X_test_ohe, _ = prepr.preprocess_for_regression(df=X_test, fit_ohe=False, drop_first_column=True, ohe=ohe)
# Guard: the row counts of the split frames are unchanged after the preprocessing calls.
# NOTE(review): this re-checks the input frames, not the encoded outputs; consider also
# comparing the row counts of X_train_ohe / X_test_ohe against shapes_pre.
assert shapes_pre == (X_train.shape[0], X_test.shape[0])

# Persist the fitted encoder next to the other local files of this run.
# The with-block guarantees the file is flushed and closed even if pickling fails
# (the previous dump(ohe, open(...)) never closed the handle explicitly).
with open(os.path.join(local_path_save, run_name + '_ohe.pkl'), 'wb') as ohe_file:
    dump(ohe, ohe_file)

  df['lpep_pickup_datetime_week'] = df['lpep_pickup_datetime'].dt.week
  df['lpep_pickup_datetime_week'] = df['lpep_pickup_datetime'].dt.week


In [7]:
# Fit and score the baseline on the encoded train/test split.
# The displayed result is a dict with 'loss' and 'status' keys.
best_result = linear_regression_training.objective_lr(
    X_train=X_train_ohe,
    Y_train=Y_train,
    X_test=X_test_ohe,
    Y_test=Y_test,
    run_name=run_name,
)

best_result



{'loss': 5.186934719528432, 'status': 'ok'}

In [12]:
# NOTE(review): the run name suggests this is a deliberately "wrong" baseline - the model
# is scored on the same rows it was trained on (see the objective_lr call below).
# Confirm that this is intentional and not a copy-paste slip.
run_name = "wrong_base_2"

# Reload the data and rebuild the same split as the baseline run (same seed, same test_size).
# input_data_path is the notebook-level alias of input_data_path_train (same value).
prepr = Preprocessing(input_data_path)
X, Y = prepr.read_dataframe(request_tgt=True)

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)

# One-hot encoding: the encoder is fitted on the training split only (fit_ohe=True)
# and then reused as-is on the test split (fit_ohe=False, ohe=ohe).
shapes_pre = (X_train.shape[0], X_test.shape[0])
X_train_ohe, ohe = prepr.preprocess_for_regression(df=X_train, fit_ohe=True, drop_first_column=True)
X_test_ohe, _ = prepr.preprocess_for_regression(df=X_test, fit_ohe=False, drop_first_column=True, ohe=ohe)
# Guard: the row counts of the split frames are unchanged after the preprocessing calls.
assert shapes_pre == (X_train.shape[0], X_test.shape[0])

# Persist the fitted encoder under this run's name. The with-block guarantees the file is
# flushed and closed (the previous dump(ohe, open(...)) never closed the handle explicitly).
with open(os.path.join(local_path_save, run_name + '_ohe.pkl'), 'wb') as ohe_file:
    dump(ohe, ohe_file)

# The training split is passed as both the train and the "test" set, so the reported loss
# is a training loss; X_test_ohe / Y_test are not used by this run.
best_result = linear_regression_training.objective_lr(X_train=X_train_ohe,
         X_test=X_train_ohe,
         Y_train=Y_train,
         Y_test=Y_train,
         run_name=run_name)

best_result

  df['lpep_pickup_datetime_week'] = df['lpep_pickup_datetime'].dt.week
  df['lpep_pickup_datetime_week'] = df['lpep_pickup_datetime'].dt.week


{'loss': 4.898901459708821, 'status': 'ok'}

In [11]:
# Show the column names of the training split as it currently exists in the kernel.
# NOTE(review): this cell's execution counter (11) is lower than the cell above it (12),
# so the saved output reflects the kernel state at that earlier point - re-run top to bottom.
X_train.columns

Index(['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime',
       'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID',
       'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',
       'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge',
       'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge',
       'lpep_pickup_datetime_week', 'lpep_pickup_datetime_day',
       'lpep_pickup_datetime_hour', 'lpep_pickup_datetime_minute',
       'lpep_pickup_datetime_dayofweek', 'PU_DO'],
      dtype='object')