In [11]:
import pandas as pd
import numpy as np

import yaml

from catboost import CatBoostRegressor, Pool, metrics
from sklearn.model_selection import train_test_split

In [12]:
# config params
def config():
    with open("../config/params.yaml", "r") as f:
            return yaml.safe_load(f)

cfg = config()

# data split cfg
test_size = cfg['data_split']['test_size']
test_split_seed = cfg['data_split']['test_split_seed']
valid_size = cfg['data_split']['valid_size']
valid_split_seed = cfg['data_split']['valid_split_seed']

# tuning params
test_size = cfg['data_split']['test_size']

In [13]:
df = pd.read_csv('https://github.com/EuMentality/datasets/raw/main/taxi_train.csv', sep=';')

In [14]:
df.head()

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,trip_duration,pickup_month,pickup_weekday,pickup_hour,high_traffic,anomaly,manh_length
0,2,1,-73.982155,40.767937,-73.96463,40.765602,N,6.122493,3,0,17,1,0,2.006876
1,1,1,-73.980415,40.738564,-73.999481,40.731152,N,6.498282,6,6,0,0,0,2.394278
2,2,1,-73.979027,40.763939,-74.005333,40.710087,N,7.661527,1,1,11,1,0,7.304991
3,2,1,-74.01004,40.719971,-74.012268,40.706718,N,6.063785,4,2,19,1,0,1.93013
4,2,1,-73.973053,40.793209,-73.972923,40.78252,N,6.077642,3,5,13,1,0,1.608787


In [15]:
df.store_and_fwd_flag.unique()

array(['N', 'Y'], dtype=object)

In [16]:
cat_feature_indices = np.array([0, 1, 6, 8, 9, 10, 11])
X = df.drop('trip_duration', axis=1)
X.iloc[:, cat_feature_indices] = X.iloc[:, cat_feature_indices].astype(str)
y = df.trip_duration

In [17]:
X.columns

Index(['vendor_id', 'passenger_count', 'pickup_longitude', 'pickup_latitude',
       'dropoff_longitude', 'dropoff_latitude', 'store_and_fwd_flag',
       'pickup_month', 'pickup_weekday', 'pickup_hour', 'high_traffic',
       'anomaly', 'manh_length'],
      dtype='object')

In [18]:
len(X.columns)

13

In [19]:
cfg = config()
cfg_split = cfg['data_split']
cfg_hprms = cfg['hyperparams']

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=20)
X_train, X_valid, y_train, y_valid = train_test_split(X_train, y_train, test_size=0.25, random_state=20)

In [72]:
model = CatBoostRegressor(
            loss_function='RMSE',
            random_seed=20,
            early_stopping_rounds=25,
            verbose=100
            )

In [75]:
model.fit(X_train, y_train,
         cat_features=cat_feature_indices,
         eval_set=(X_valid, y_valid))

Learning rate set to 0.142665
0:	learn: 0.6663251	test: 0.6682139	best: 0.6682139 (0)	total: 148ms	remaining: 2m 27s
100:	learn: 0.4226705	test: 0.4258532	best: 0.4258532 (100)	total: 12.6s	remaining: 1m 51s
200:	learn: 0.4132376	test: 0.4173542	best: 0.4173542 (200)	total: 26s	remaining: 1m 43s
300:	learn: 0.4086339	test: 0.4136738	best: 0.4136738 (300)	total: 38.5s	remaining: 1m 29s
400:	learn: 0.4052051	test: 0.4111904	best: 0.4111904 (400)	total: 50.5s	remaining: 1m 15s
500:	learn: 0.4028380	test: 0.4096838	best: 0.4096838 (500)	total: 1m 3s	remaining: 1m 3s
600:	learn: 0.4007728	test: 0.4085810	best: 0.4085810 (600)	total: 1m 17s	remaining: 51.6s
700:	learn: 0.3988359	test: 0.4075460	best: 0.4075460 (700)	total: 1m 32s	remaining: 39.4s
800:	learn: 0.3971837	test: 0.4069013	best: 0.4068966 (799)	total: 1m 46s	remaining: 26.5s
900:	learn: 0.3955470	test: 0.4062224	best: 0.4062216 (899)	total: 1m 59s	remaining: 13.2s
999:	learn: 0.3942625	test: 0.4056558	best: 0.4056558 (999)	total: 

<catboost.core.CatBoostRegressor at 0x7fbec9cd31c0>

In [79]:
model.save_model('../model/catboost_test.dump')

In [22]:
model = CatBoostRegressor().load_model('../model/catboost_test.dump')


In [23]:
X_test

Unnamed: 0,vendor_id,passenger_count,pickup_longitude,pickup_latitude,dropoff_longitude,dropoff_latitude,store_and_fwd_flag,pickup_month,pickup_weekday,pickup_hour,high_traffic,anomaly,manh_length
1014416,2,1,-73.994431,40.760929,-73.993378,40.718971,N,1,0,19,1,0,6.298186
867840,2,1,-73.971230,40.792828,-73.971848,40.786343,N,5,4,16,1,0,0.959240
55474,2,1,-73.998322,40.728653,-73.952553,40.784039,N,2,5,4,0,0,7.375032
1114250,2,1,-73.993011,40.747692,-73.980339,40.740200,N,2,2,20,1,0,1.702462
843792,1,1,-73.982811,40.769188,-73.946396,40.777248,N,3,3,22,0,0,4.309079
...,...,...,...,...,...,...,...,...,...,...,...,...,...
426637,1,1,-73.960175,40.765369,-73.958588,40.776539,N,1,3,8,1,0,1.635847
315360,1,1,-73.956146,40.778648,-73.968925,40.761040,N,6,0,6,0,0,2.258254
1067269,2,1,-73.988907,40.743629,-73.982933,40.721111,N,4,1,12,1,0,3.211386
182327,1,1,-73.997841,40.720352,-73.976959,40.775963,N,6,2,21,0,0,7.739591


In [25]:
X_test.shape

(314518, 13)

In [27]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 314518 entries, 1014416 to 398256
Data columns (total 13 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   vendor_id           314518 non-null  object 
 1   passenger_count     314518 non-null  object 
 2   pickup_longitude    314518 non-null  float64
 3   pickup_latitude     314518 non-null  float64
 4   dropoff_longitude   314518 non-null  float64
 5   dropoff_latitude    314518 non-null  float64
 6   store_and_fwd_flag  314518 non-null  object 
 7   pickup_month        314518 non-null  int64  
 8   pickup_weekday      314518 non-null  object 
 9   pickup_hour         314518 non-null  object 
 10  high_traffic        314518 non-null  object 
 11  anomaly             314518 non-null  object 
 12  manh_length         313485 non-null  float64
dtypes: float64(5), int64(1), object(7)
memory usage: 33.6+ MB


In [26]:
model.predict(X_test.iloc[1, :])

5.485760122718789

In [80]:
# asd = CatBoostRegressor()
# asd.load_model('../model/catboost_test.dump')

<catboost.core.CatBoostRegressor at 0x7fbec9cccf40>