In [2]:
from preprocess_data import run_data_prep

import pandas as pd
import numpy as np
import datetime as dt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
import sys
from sklearn.feature_extraction import DictVectorizer
import pickle

In [2]:
def load_data(file_name, lower_threshold, upper_threshold):
    """Read a taxi-trip parquet file and keep trips with a plausible duration.

    A ``dur_min`` column (drop-off time minus pick-up time, in minutes) is
    added, then every row whose duration is above ``upper_threshold`` or below
    ``lower_threshold`` is removed.

    Parameters
    ----------
    file_name : str or path-like
        Passed straight to ``pd.read_parquet``. The file must contain the
        columns ``tpep_pickup_datetime`` and ``tpep_dropoff_datetime``.
    lower_threshold : float
        Shortest duration, in minutes, that is kept (inclusive).
    upper_threshold : float
        Longest duration, in minutes, that is kept (inclusive).

    Returns
    -------
    pandas.DataFrame
        The filtered trips. Surviving rows keep their original index labels.
    """
    # read the file to dataframe
    df = pd.read_parquet(file_name)

    # make sure both timestamps are real datetimes before subtracting them
    df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
    df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])

    # trip duration in minutes, computed on the whole column at once
    trip_time = df['tpep_dropoff_datetime'] - df['tpep_pickup_datetime']
    df['dur_min'] = trip_time.dt.total_seconds() / 60

    # Row-aligned boolean mask instead of positional numbers fed to
    # drop(index=...): positions only coincide with index labels when the
    # frame has a default 0..n-1 index, so the mask is correct for any index.
    # Rows with a missing duration compare False on both sides and are kept.
    is_outlier = (df['dur_min'] > upper_threshold) | (df['dur_min'] < lower_threshold)

    # .copy() hands the caller an independent frame it can safely assign into
    return df.loc[~is_outlier].copy()

In [3]:
# MLflow tracking setup: runs are recorded in the SQLite database `mlflow.db`
# under the experiment named below.
import mlflow

mlflow.set_tracking_uri(
    'sqlite:///mlflow.db'
)
# Kept as the cell's final bare expression so the returned Experiment object
# is displayed as the cell output.
mlflow.set_experiment(
    'taxi-linear-regr-1'
)

<Experiment: artifact_location='/home/eugene/Documents/projects/mlops-zoomcamp-ep/2-mlflow/mlruns/1', creation_time=1684595945353, experiment_id='1', last_update_time=1684595945353, lifecycle_stage='active', name='taxi-linear-regr-1', tags={}>

In [4]:
# Duration bounds (minutes) handed to load_data for outlier removal
lower_threshold = 1.0
upper_threshold = 60.0

# January 2022 trips -> df1 (used to fit the model)
df1 = load_data(
    file_name='../../data/taxi/yellow_tripdata_2022-01.parquet',
    lower_threshold=lower_threshold,
    upper_threshold=upper_threshold,
)

# February 2022 trips -> df2 (used to validate the model)
df2 = load_data(
    file_name='../../data/taxi/yellow_tripdata_2022-02.parquet',
    lower_threshold=lower_threshold,
    upper_threshold=upper_threshold,
)

In [5]:
# Feature encoding: pick-up / drop-off zone ids become dummy variables.
cat = ['PULocationID', 'DOLocationID']

# Step 1 - cast the id columns to strings in both frames so they are handled
# as categorical values.
df1[cat] = df1[cat].astype(str)
df2[cat] = df2[cat].astype(str)

# Step 2 - one {column: value} dict per trip, the input format of DictVectorizer
train_dict = df1[cat].to_dict(orient='records')
test_dict = df2[cat].to_dict(orient='records')

# Step 3 - learn the feature mapping on the training month only, then reuse
# the fitted vectorizer for the validation month.
dv = DictVectorizer()
X_train = dv.fit_transform(train_dict)
X_test = dv.transform(test_dict)

In [8]:
# Regression target: trip duration in minutes, as plain NumPy arrays
target = 'dur_min'

y_train = df1[target].to_numpy()   # training month
y_test = df2[target].to_numpy()    # validation month

In [10]:
# Train the linear regression inside one MLflow run, recording the data
# sources, the outlier thresholds and the validation RMSE.
with mlflow.start_run():

    # run metadata
    mlflow.set_tag("developer", "eugene")
    mlflow.log_param('train-data-path', '../../data/taxi/yellow_tripdata_2022-01.parquet')
    mlflow.log_param('validation-data-path', '../../data/taxi/yellow_tripdata_2022-02.parquet')
    mlflow.log_param('outliers-lower-threshold', lower_threshold)
    mlflow.log_param('outliers-upper-threshold', upper_threshold)

    # train linear regression model
    model = LinearRegression()
    model.fit(X_train, y_train)

    # prediction on validation dataset for RMSE calculation
    predictions_test = model.predict(X_test)

    # RMSE is taken as sqrt(MSE) explicitly: the `squared=False` shortcut of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # whereas this form gives the same value on every version.
    rmse = np.sqrt(metrics.mean_squared_error(y_test, predictions_test))

    mlflow.log_metric('rmse', rmse)

In [11]:
# Store the fitted vectorizer together with the model in a single pickle, so
# that the same feature mapping can be restored alongside the regression.
with open('../../models/mlops-zoomcamp-ep/lin_reg.bin', mode='wb') as f_out:
    pickle.Pickler(f_out).dump((dv, model))