In [2]:
import pandas as pd
import mlflow

from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

import warnings
warnings.filterwarnings('ignore')

## Configure MLflow

In [None]:
# A tracking server started locally with `mlflow server` / `mlflow ui` serves plain HTTP on
# this port; an https:// URI makes client requests fail in the TLS handshake unless a
# TLS-terminating proxy is placed in front of it.
mlflow.set_tracking_uri("http://127.0.0.1:5000")
# Select the experiment by name for the runs logged from this notebook.
mlflow.set_experiment("nyc_yellow_taxi_exp")

## Import datasets

In [5]:
# Read the January and February 2022 yellow-taxi trip files into one frame per month.
y_t_jan, y_t_fev = map(
    pd.read_parquet,
    (
        '../data/yellow_tripdata_2022-01.parquet',
        '../data/yellow_tripdata_2022-02.parquet',
    ),
)

### Number of Features

In [6]:
# Report how many columns the January frame has.
n_columns = len(y_t_jan.columns)
print(f'Taxi data set have {n_columns} columns')

Taxi data set have 19 columns


### Prepare Data

In [7]:
def preprate_taxi_data(taxi_data: pd.DataFrame, verbose:bool=True) -> pd.DataFrame:
    """
    Derive the trip duration in minutes and keep only trips lasting 1 to 60 minutes.

    The input frame is left untouched; only the columns that are returned are copied.

    :param taxi_data: yellow taxi data with 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
        'PULocationID' and 'DOLocationID' columns
    :param verbose: print the description of the unfiltered duration feature DEFAULT=True
    :return: DataFrame with 'PULocationID','DOLocationID', 'duration' columns
    """
    # Copy just the two kept columns instead of the whole frame.
    t_data = taxi_data[['PULocationID', 'DOLocationID']].copy()
    trip_time = taxi_data['tpep_dropoff_datetime'] - taxi_data['tpep_pickup_datetime']
    # total_seconds() does not depend on the timedelta storage unit (ns/us/ms), unlike an
    # int64 cast divided by a nanosecond constant; missing timestamps become NaN and are
    # removed by the range filter below.
    t_data['duration'] = trip_time.dt.total_seconds() / 60
    if verbose:
        print(t_data['duration'].describe(), "\n")
    # between() is inclusive on both ends: 1 <= duration <= 60 minutes.
    return t_data[t_data['duration'].between(1, 60)]

In [8]:
# January trips, reduced to the location IDs and the filtered duration, form the training set.
train_data = preprate_taxi_data(taxi_data=y_t_jan)

count    2.463931e+06
mean     1.421220e+01
std      4.644531e+01
min     -3.442400e+03
25%      6.316667e+00
50%      1.018333e+01
75%      1.616667e+01
max      8.513183e+03
Name: duration, dtype: float64 



In [9]:
# February trips, reduced to the location IDs and the filtered duration, form the test set.
test_data = preprate_taxi_data(taxi_data=y_t_fev)

count    2.979431e+06
mean     1.565368e+01
std      4.726394e+01
min     -9.833333e-01
25%      6.950000e+00
50%      1.125000e+01
75%      1.783333e+01
max      5.489383e+03
Name: duration, dtype: float64 



### Outliers removal

In [10]:
# Share of January rows that survived the 1-60 minute duration filter.
kept_ratio = len(train_data) / len(y_t_jan)
print(f'Records left ratio: {kept_ratio}')

Records left ratio: 0.9827547930522406


### Pipeline

In [11]:
# One-hot encode the input columns, then fit an ordinary least-squares regression on them.
location_encoder = OneHotEncoder(drop="first", handle_unknown="ignore")
pipe = Pipeline(steps=[("ohe", location_encoder), ("lin_reg", LinearRegression())])

In [12]:
# Features are the pickup/drop-off location IDs; the target is the trip duration.
feature_cols = ['PULocationID', 'DOLocationID']
target_col = 'duration'

train_X, train_y = train_data[feature_cols], train_data[target_col]
test_X, test_y = test_data[feature_cols], test_data[target_col]

# Kept as the last expression so the fitted pipeline is displayed as the cell output.
pipe.fit(train_X, train_y)

In [10]:

train_y_pred = pipe.predict(train_X)
# Take the square root of the MSE explicitly: the `squared=False` flag was deprecated in
# scikit-learn 1.4 and removed in 1.6, where passing it raises a TypeError.
train_rmse = mean_squared_error(train_y, train_y_pred) ** 0.5
print(f"RMSE for the training data is {train_rmse}")

RMSE for the training data is 6.9861902204330235


In [11]:
test_y_pred = pipe.predict(test_X)
# Take the square root of the MSE explicitly: the `squared=False` flag was deprecated in
# scikit-learn 1.4 and removed in 1.6, where passing it raises a TypeError.
test_rmse = mean_squared_error(test_y, test_y_pred) ** 0.5
print(f'RMSE for the test data is {test_rmse}')

RMSE for the test data is 7.786501997417462
