# Create model

# 1. Imports

## 1.1 Packages

In [1]:
# Essential
import os
import sys
import yaml

# Data science
import pandas as pd
from sklearn.metrics import mean_squared_error

# Plots
import seaborn as sns

In [2]:
# Make the project's data-science pipeline directory importable from the notebook.
# NOTE(review): the relative path resolves against the kernel's working directory —
# presumably the notebook is launched from its own folder; confirm.
sys.path.append('../src/nyc_taxi/pipelines/data_science')
# Project-local helpers (presumably the modules living in the directory added above).
from feature_engineering import create_hour_feat
from nodes import column_transformer, feature_imputer, pipe_estimator
from log_model import log_hgbr_model

## 1.2 Options

In [3]:
# Pandas: allow up to 100 columns when a frame is rendered
pd.set_option("display.max_columns", 100)

# Seaborn: "white" style, "deep" palette for the colour codes
sns.set_style(style="white")
sns.set_color_codes("deep")

In [4]:
# Load IPython's autoreload extension; mode 2 reloads modules before code is executed,
# so edits to the imported project .py files are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [5]:
# Directory (relative to the kernel's working directory) from which the
# train/validation pickles are read.
path_data = "../data/03_primary"

In [6]:
# Load credentials from the local configuration file.
# The encoding is pinned so decoding does not depend on the platform's locale default;
# safe_load restricts the document to plain YAML types.
with open('../conf/local/credentials.yml', encoding="utf-8") as cred_file:
    credentials_local = yaml.safe_load(cred_file)

## 1.3 Dataset

In [7]:
# Read the pickled training and validation frames from `path_data`
df_train, df_valid = (
    pd.read_pickle(os.path.join(path_data, pkl_name))
    for pkl_name in ("df_train.pkl", "df_valid.pkl")
)

# 2. Create model

In [8]:
# Add the hour feature derived from the pickup timestamp to both splits.
df_train = df_train.pipe(create_hour_feat, "tpep_pickup_datetime")
df_valid = df_valid.pipe(create_hour_feat, "tpep_pickup_datetime")

# Separate the target from the features.
# NOTE: after this cell the frames no longer contain "duration", so re-running it
# without reloading the data raises KeyError.
# NOTE(review): the feature frames still carry tpep_dropoff_datetime; if "duration"
# is derived from it, confirm the column transformer excludes it (target leakage).
y_train = df_train["duration"]
y_valid = df_valid["duration"]

# Non-mutating drop: returns new frames instead of modifying in place the objects
# handed back by create_hour_feat.
df_train = df_train.drop(columns=["duration"])
df_valid = df_valid.drop(columns=["duration"])

In [9]:
# Preview five random rows; the fixed seed keeps the preview identical across re-runs
df_train.sample(5, random_state=12)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee,Airport_fee,tpep_pickup_datetime_hour
751981,2.0,2023-02-08 16:47:36,2023-02-08 16:58:54,1.0,1.42,1.0,N,236.0,238.0,1.0,12.1,2.5,0.5,3.72,0.0,1.0,22.32,2.5,,0.0,16
2967197,2.0,2023-01-31 18:25:20,2023-01-31 18:34:24,1.0,1.44,1.0,N,163.0,140.0,1.0,10.7,2.5,0.5,4.3,0.0,1.0,21.5,2.5,0.0,,18
2983749,2.0,2023-01-31 21:54:48,2023-01-31 21:58:53,1.0,1.07,1.0,N,237.0,236.0,1.0,7.2,1.0,0.5,2.44,0.0,1.0,14.64,2.5,0.0,,21
1734877,2.0,2023-01-19 14:58:45,2023-01-19 15:13:23,1.0,1.51,1.0,N,264.0,264.0,1.0,13.5,0.0,0.5,3.5,0.0,1.0,21.0,2.5,0.0,,14
1131304,1.0,2023-02-11 23:17:59,2023-02-11 23:43:51,1.0,6.4,1.0,N,162.0,33.0,1.0,28.9,3.5,0.5,6.8,0.0,1.0,40.7,2.5,,0.0,23


In [10]:
# Columns handed to the estimator as its `categorical_features`
feat_cat = [
    "VendorID",
    "RatecodeID",
    "store_and_fwd_flag",
    "PULocationID",
    "DOLocationID",
    "payment_type",
]

In [11]:
# Estimator hyper-parameters, forwarded as keyword arguments to pipe_estimator
params = {
    "loss": "squared_error",
    "learning_rate": 0.05,
    "max_iter": 1000,
    "max_depth": 7,
    "categorical_features": feat_cat,
    "random_state": 12,
}

# Preprocessing steps supplied by the project's `nodes` module
col_transf = column_transformer()
feat_imp = feature_imputer()

# Assemble the model from the imputer, the column transformer and the hyper-parameters
model = pipe_estimator(
    feat_imp=feat_imp,
    col_transf=col_transf,
    **params,
)

In [12]:
# Fit the model built by pipe_estimator on the training features and target
model.fit(df_train, y_train)

In [13]:
# Predict on the training and validation features with the fitted model
pred_train, pred_valid = (model.predict(split) for split in (df_train, df_valid))

In [14]:
# RMSE on both splits, computed as the square root of the MSE.
# The `squared=False` shortcut of mean_squared_error is deprecated since
# scikit-learn 1.4 and removed in 1.6; taking the root explicitly works on every version.
rmse_train = mean_squared_error(y_true=y_train, y_pred=pred_train) ** 0.5
rmse_valid = mean_squared_error(y_true=y_valid, y_pred=pred_valid) ** 0.5
print("RMSE train:", rmse_train)
print("RMSE valid:", rmse_valid)

RMSE train: 2448.81495014428
RMSE valid: 2516.9346805301047


In [15]:
# Metrics passed to the model-logging call
metrics = {"RMSE_train": rmse_train, "RMSE_valid": rmse_valid}

In [16]:
# Log the fitted model together with its parameters and metrics to Comet
log_hgbr_model(
    api_key=credentials_local["api_key"],
    params=params,
    metrics=metrics,
    model=model,
    model_name="HistGradientBoostingRegressor_model",
)



[1;38;5;39mCOMET INFO:[0m Experiment is live on comet.com https://www.comet.com/bwallyn/nyc-taxi-trip/7c1beb2412f24b65acddfb3695d1e61e

[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m Comet.ml Experiment Summary
[1;38;5;39mCOMET INFO:[0m ---------------------------------------------------------------------------------------
[1;38;5;39mCOMET INFO:[0m   Data:
[1;38;5;39mCOMET INFO:[0m     display_summary_level : 1
[1;38;5;39mCOMET INFO:[0m     url                   : https://www.comet.com/bwallyn/nyc-taxi-trip/7c1beb2412f24b65acddfb3695d1e61e
[1;38;5;39mCOMET INFO:[0m   Metrics:
[1;38;5;39mCOMET INFO:[0m     RMSE_train : 2448.81495014428
[1;38;5;39mCOMET INFO:[0m     RMSE_valid : 2516.9346805301047
[1;38;5;39mCOMET INFO:[0m   Parameters:
[1;38;5;39mCOMET INFO:[0m     categorical_features : ['VendorID', 'RatecodeID', 'store_and_fwd_flag', 'PULocationID', 'DOLocationID', 'pay

In [17]:
# Display the predictions made on the training split
pred_train

array([ 599.77910924,  401.76508085,  809.93402149, ..., 2260.13100917,
       1013.42493027, 3972.34091828])

In [18]:
# Display the training targets
y_train

0            506.0
1            379.0
2            765.0
3            577.0
4            650.0
            ...   
2932889    57176.0
3033658      535.0
3040040     2549.0
3133953      885.0
3139871    43995.0
Name: duration, Length: 5980791, dtype: float64