In [1]:
import mlflow
import os
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from pickle import dump

from hyperopt import STATUS_OK

from scripts.training_regression_models import Preprocess
from scripts.model_registry import ModelRegistry
from scripts.scoring import Scoring
from scripts.monitoring import Monitoring
from scripts.config_regression import *

### MLflow settings

In [12]:
# Make sure the local folder used for saved files (e.g. the pickled encoder) exists.
# exist_ok=True makes the call idempotent, so no separate existence check is needed.
os.makedirs(local_path_save, exist_ok=True)

# Runs are tracked in a local file store. The location can be overridden with the
# MLFLOW_TRACKING_URI environment variable so the notebook is not tied to one
# user's home directory; when the variable is unset the original path is used.
# (A SQLite backend could be selected instead with
#  mlflow.set_tracking_uri("sqlite:///mlflow.db").)
mlflow.set_tracking_uri(
    os.environ.get("MLFLOW_TRACKING_URI", "file:///Users/davideposillipo/mlruns")
)

# Set up the MLflow experiment that the runs below are logged under.
# NOTE(review): despite the variable name, recent MLflow versions return an
# Experiment object from set_experiment, not a bare id - confirm before using
# this value where an id string is expected.
experiment_id = mlflow.set_experiment(exp_name)

In [13]:
# Notebook-level settings that objective_lr reads when it describes a run:
# the input file is logged as the 'data' parameter and the month as a tag.
# (The *_train names are presumably provided by scripts.config_regression.)
input_data_path = input_data_path_train
year_month = year_month_train

In [14]:
def lr_evaluation(Y_train, Y_test, Y_pred_train, Y_pred_test):
    """
    Compute the root mean squared error on the train and test splits.

    Parameters
    ----------
    Y_train, Y_test : true target values of the two splits.
    Y_pred_train, Y_pred_test : predictions for the corresponding split.

    Returns
    -------
    tuple
        ``(rmse_train, rmse_test)``
    """
    mse_by_split = (
        mean_squared_error(Y_train, Y_pred_train),
        mean_squared_error(Y_test, Y_pred_test),
    )
    # RMSE is the square root of the mean squared error.
    rmse_train, rmse_test = (mse ** 0.5 for mse in mse_by_split)
    return rmse_train, rmse_test

In [15]:
def objective_lr(X_train, X_test, Y_train, Y_test):
    """
    Fit a LinearRegression inside a new MLflow run and return its test RMSE.

    Besides its arguments, the function reads the notebook-level names
    ``year_month``, ``input_data_path`` and ``local_path_save``. The file
    ``local_path_save + "ohe.pkl"`` is logged as a run artifact, so it has to
    exist before this function is called.

    Parameters
    ----------
    X_train, X_test : feature matrices used for fitting / prediction.
    Y_train, Y_test : targets matching the two feature matrices.

    Returns
    -------
    dict
        ``{'loss': rmse_test, 'status': STATUS_OK}``
    """
    with mlflow.start_run():
        # Describe the run: model family, data month and input file.
        mlflow.set_tag('model_type', 'LinearRegression')
        mlflow.set_tag('year_month', year_month)
        mlflow.log_param('model_type', 'LinearRegression')
        mlflow.log_param('data', input_data_path)

        model = LinearRegression().fit(X_train, Y_train)

        train_predictions = model.predict(X_train)
        test_predictions = model.predict(X_test)
        rmse_train, rmse_test = lr_evaluation(
            Y_train, Y_test, train_predictions, test_predictions
        )

        scores = {'rmse_train': rmse_train, 'rmse_test': rmse_test}
        mlflow.log_metrics(scores)
        print('rmse_train = ', rmse_train, '\n rmse_test', rmse_test)

        # Store the pickled encoder and the fitted model with the run.
        encoder_file = local_path_save+"ohe.pkl"
        mlflow.log_artifact(local_path=encoder_file, artifact_path='preprocessing')
        mlflow.sklearn.log_model(model, artifact_path='model')

    # The reported loss is the RMSE on the test split. A cross-validated score
    # could be minimised instead, e.g.
    #   cross_val_score(lr, X_train, Y_train, cv=10,
    #                   scoring='neg_root_mean_squared_error', n_jobs=-1).mean()
    # with loss = -(that mean), since the scorer returns a negated RMSE.
    return {'loss': rmse_test, 'status': STATUS_OK}

In [16]:
# Load the features and the target from the training file.
prepr = Preprocess(input_data_path_train)
X, Y = prepr.read_dataframe(request_tgt=True)

# 70/30 train/test split, seeded so that it is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.3, random_state=seed)

# One-hot encoding: the encoder is fitted on the training split only and then
# reused, unchanged, on the test split.
shapes_pre = (X_train.shape[0], X_test.shape[0])
X_train_ohe, ohe = prepr.preprocess(df=X_train, fit_ohe=True)
X_test_ohe, _ = prepr.preprocess(df=X_test, fit_ohe=False, ohe=ohe)

# Preprocessing must not have removed rows from the input frames ...
assert shapes_pre == (X_train.shape[0], X_test.shape[0])
# ... and the encoded outputs must have one row per input row, otherwise they
# no longer line up with Y_train / Y_test.
# (Assumes the encoded outputs expose .shape, e.g. an array or sparse matrix.)
assert shapes_pre == (X_train_ohe.shape[0], X_test_ohe.shape[0])

# Persist the fitted encoder; objective_lr logs this file as a run artifact.
# The context manager guarantees the file is flushed and closed.
with open(local_path_save+'ohe.pkl', 'wb') as ohe_file:
    dump(ohe, ohe_file)

df shape (3576, 20)


In [17]:
# Fit on the encoded training split and evaluate on the encoded test split;
# objective_lr logs the run to MLflow.
# NOTE(review): the recorded output shows rmse_test several orders of magnitude
# above rmse_train. Investigate before trusting this model (possibly categories
# unseen at fit time or an unregularised fit on sparse dummies - to be confirmed).
objective_lr(
    X_train=X_train_ohe,
    X_test=X_test_ohe,
    Y_train=Y_train,
    Y_test=Y_test,
)

rmse_train =  3.340266027366716 
 rmse_test 16291.005898020625


{'loss': 16291.005898020625, 'status': 'ok'}

In [18]:
# Sanity check: the training split is passed as the "test" split as well, so
# both reported errors are the training RMSE.
# NOTE(review): objective_lr starts a new MLflow run on every call, so this
# check is stored as a regular run whose 'rmse_test' metric is really the
# training error - keep that in mind when comparing runs.
objective_lr(
    X_train=X_train_ohe,
    X_test=X_train_ohe,
    Y_train=Y_train,
    Y_test=Y_train,
)

rmse_train =  3.340266027366716 
 rmse_test 3.340266027366716


{'loss': 3.340266027366716, 'status': 'ok'}

In [19]:
# Peek at the first rows of the training features before one-hot encoding.
X_train.head()

Unnamed: 0,VendorID,lpep_pickup_datetime,lpep_dropoff_datetime,store_and_fwd_flag,RatecodeID,PULocationID,DOLocationID,passenger_count,trip_distance,fare_amount,...,mta_tax,tip_amount,tolls_amount,ehail_fee,improvement_surcharge,total_amount,payment_type,trip_type,congestion_surcharge,PU_DO
2339,2,2022-01-17 05:58:27,2022-01-17 06:07:59,N,1.0,74,262,1.0,2.95,10.5,...,0.5,0.0,0.0,,0.3,14.55,2.0,1.0,2.75,74_262
3189,2,2022-01-18 20:34:52,2022-01-18 20:42:03,N,1.0,74,42,1.0,0.8,6.5,...,0.5,0.0,0.0,,0.3,7.8,2.0,1.0,0.0,74_42
2135,2,2022-01-09 18:42:59,2022-01-09 18:50:17,N,1.0,75,263,1.0,0.95,6.5,...,0.5,0.73,0.0,,0.3,8.03,1.0,1.0,0.0,75_263
779,2,2022-01-02 14:06:10,2022-01-02 14:14:16,N,1.0,74,263,1.0,1.74,8.0,...,0.5,1.0,0.0,,0.3,12.55,1.0,1.0,2.75,74_263
2545,2,2022-01-14 22:35:56,2022-01-14 22:40:04,N,1.0,134,134,1.0,0.72,4.5,...,0.5,1.16,0.0,,0.3,6.96,1.0,1.0,0.0,134_134


In [10]:
# Show the training target.
# NOTE(review): this cell's execution count is lower than that of the cells
# above it, i.e. the notebook was run out of order - re-run top to bottom.
Y_train

2339     9.533333
3189     7.183333
2135     7.300000
779      8.100000
2545     4.133333
          ...    
2154    11.266667
3089    14.516667
1766     6.566667
1122    12.483333
1346    15.116667
Name: duration, Length: 2398, dtype: float64

In [11]:
# Show which file the training data is read from.
input_data_path_train

'./data/input/green_tripdata_2022-01.parquet'