## Vessel Dwell Time Prediction with CatBoost

In [1]:
import pandas as pd 
import numpy as np
from catboost import CatBoostRegressor, Pool, metrics, cv
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split
import optuna

import warnings
warnings.filterwarnings("ignore")

RANDOM_STATE = 42

#### Data Preparation 

In [2]:
# Load vessel data
df_vessel = pd.read_csv('./data/cleansed/vessel_dwell_time.csv')
X = df_vessel.drop(columns=[ 'imo', 'vessel_name', 'time_seen', 'vessel_type', 'dwell_in_hr'])
y = df_vessel['dwell_in_hr']

In [3]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1634 entries, 0 to 1633
Data columns (total 11 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   target_terminal                   1634 non-null   object 
 1   avg_dwell_at_target_terminal      1634 non-null   float64
 2   num_of_vessel_at_target_terminal  1634 non-null   float64
 3   num_of_vessel_in_port             1634 non-null   float64
 4   weekday                           1634 non-null   int64  
 5   hour_of_day                       1634 non-null   int64  
 6   is_holiday                        1634 non-null   bool   
 7   vessel_operator                   1634 non-null   object 
 8   vessel_width                      1634 non-null   float64
 9   vessel_length                     1634 non-null   float64
 10  vessel_dwt                        1634 non-null   float64
dtypes: bool(1), float64(6), int64(2), object(2)
memory usage: 129.4+ KB


In [4]:
# create training set and testing set
# CatBoost has built-in support for categorical data. We are not required to handle it seperately
categorical_features_indices = [0,6,7]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)

# create data pool for training set. Pool is specific to CatBoost
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features_indices)
test_pool = Pool(data=X_test, label=y_test, cat_features=categorical_features_indices)

Build a simple CatBoost Model

In [5]:
model = CatBoostRegressor(
    loss_function='MAE',
    random_seed=RANDOM_STATE
)

In [6]:
model.fit(
    X=train_pool,
    eval_set=test_pool,
    verbose=True,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 80.8138644	test: 87.6545834	best: 87.6545834 (0)	total: 162ms	remaining: 2m 41s
1:	learn: 79.6128316	test: 86.7105302	best: 86.7105302 (1)	total: 181ms	remaining: 1m 30s
2:	learn: 78.7398579	test: 86.0167636	best: 86.0167636 (2)	total: 200ms	remaining: 1m 6s
3:	learn: 77.6083004	test: 85.1102109	best: 85.1102109 (3)	total: 218ms	remaining: 54.3s
4:	learn: 76.7828059	test: 84.3539546	best: 84.3539546 (4)	total: 238ms	remaining: 47.3s
5:	learn: 75.8586098	test: 83.6303727	best: 83.6303727 (5)	total: 256ms	remaining: 42.4s
6:	learn: 75.0221757	test: 82.8666964	best: 82.8666964 (6)	total: 279ms	remaining: 39.5s
7:	learn: 74.3631443	test: 82.1779949	best: 82.1779949 (7)	total: 298ms	remaining: 37s
8:	learn: 73.6412371	test: 81.5654275	best: 81.5654275 (8)	total: 330ms	remaining: 36.3s
9:	learn: 72.8165816	test: 80.8771745	best: 80.8771745 (9)	total: 349ms	remaining: 34.6s
10:	learn: 72.2364955	test: 80.3080085	best: 80.3080085 (10)	total: 372ms	remaining: 33.5s
11:	learn: 71.58678

<catboost.core.CatBoostRegressor at 0x1a7d9063788>

In [7]:
train_mae = mean_absolute_error(y_train, model.predict(X_train))
test_mae = mean_absolute_error(y_test, model.predict(X_test))

print(f'MAE for training set: {train_mae:.3f}')
print(f'MAE for test set: {test_mae:.3f}')

MAE for training set: 33.901
MAE for test set: 55.472


In [8]:
feature_importances = model.get_feature_importance()
feature_names = X_train.columns
for score, name in sorted(zip(feature_importances, feature_names), reverse=True):
    print(f'{name}: {score:.2f}')

num_of_vessel_in_port: 24.47
avg_dwell_at_target_terminal: 21.76
vessel_operator: 18.01
target_terminal: 8.69
vessel_dwt: 5.53
vessel_length: 5.43
hour_of_day: 5.32
num_of_vessel_at_target_terminal: 4.70
weekday: 3.50
vessel_width: 2.37
is_holiday: 0.23


### Hyperparameters Tuning 

We will use Optuna for hyperparameters tuning.  
Visit [Optuna.org](https://optuna.org/) for more detail

In [9]:
def optuna_objective(trial):

    params = {
        'iterations':trial.suggest_int("iterations", 500, 3000), # iterations is essentially the nubmer of trees
        'learning_rate' : trial.suggest_float('learning_rate',0.0001, 1),
        'depth': trial.suggest_int('depth',1, 10),
        'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 1e-3, 1e0, log=True)
    }

    model = CatBoostRegressor(
        loss_function='MAE',
        random_seed=RANDOM_STATE,
        verbose=False,
        **params
    )
    
    cv_data = cv(
        pool=train_pool,
        params=model.get_params(), 
        fold_count=10,
        logging_level='Silent',
    )
    
    best_accuracy = np.min(cv_data['test-MAE-mean'])
    
    return best_accuracy 

In [10]:
study = optuna.create_study(direction="minimize")
# make sure your env has enough CPU to support the setting of n_jobs 
study.optimize(optuna_objective, n_trials=50, n_jobs=4) 
study.best_params

[32m[I 2022-10-10 16:17:19,124][0m A new study created in memory with name: no-name-1af3b9ce-ec3a-444e-97c9-7c329e49f577[0m
[32m[I 2022-10-10 16:19:39,393][0m Trial 0 finished with value: 56.815271115423606 and parameters: {'iterations': 1212, 'learning_rate': 0.4812028442900207, 'depth': 2, 'l2_leaf_reg': 0.009369205637532494}. Best is trial 0 with value: 56.815271115423606.[0m
[32m[I 2022-10-10 16:20:14,096][0m Trial 1 finished with value: 68.82876858207236 and parameters: {'iterations': 2880, 'learning_rate': 0.005761194181596011, 'depth': 1, 'l2_leaf_reg': 0.04859078760194356}. Best is trial 0 with value: 56.815271115423606.[0m
[32m[I 2022-10-10 16:21:57,778][0m Trial 3 finished with value: 60.04395039711894 and parameters: {'iterations': 2419, 'learning_rate': 0.9948279985727957, 'depth': 2, 'l2_leaf_reg': 0.0767988956970419}. Best is trial 0 with value: 56.815271115423606.[0m
[32m[I 2022-10-10 16:29:50,841][0m Trial 2 finished with value: 57.114949305130985 and para

{'iterations': 1538,
 'learning_rate': 0.052313318764570016,
 'depth': 5,
 'l2_leaf_reg': 0.02252067954779648}

In [11]:
print(f"Number of completed trials: {len(study.trials)}")
print(f"Best trial: {study.best_trial.number}")
print(f"Best Score: {study.best_trial.value}")
print("Best Params: ")
for key, value in study.best_trial.params.items():
    print(f"    {key}: {value}")

Number of completed trials: 50
Best trial: 30
Best Score: 52.05599777614075
Best Params: 
    iterations: 1538
    learning_rate: 0.052313318764570016
    depth: 5
    l2_leaf_reg: 0.02252067954779648


In [12]:
# Show the visualization for parameter importances 
optuna.visualization.plot_param_importances(study)

This is the best parameters we've got after hyperparameters tuning.
Also from the above optuna plot we know learning_rate and depth are the most important hyperparameters.
```
model_params ={'iterations': 1538,
    'learning_rate': 0.052313318764570016,
    'depth': 5,
    'l2_leaf_reg': 0.02252067954779648,
    'loss_function': 'MAE',
    'random_seed': 42,
    'verbose': False}
```

In [13]:
# train a model with the best param

# save the best params
best_params = study.best_params

best_model = CatBoostRegressor(
        loss_function='MAE',
        random_seed=RANDOM_STATE,
        verbose=False,
        **best_params
)

In [14]:
best_model.fit(
    X=train_pool,
    eval_set=test_pool,
    verbose=True,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 80.0718881	test: 87.0502826	best: 87.0502826 (0)	total: 16ms	remaining: 24.7s
1:	learn: 78.2719173	test: 85.2442312	best: 85.2442312 (1)	total: 34.8ms	remaining: 26.7s
2:	learn: 76.4921155	test: 83.6467685	best: 83.6467685 (2)	total: 50.3ms	remaining: 25.8s
3:	learn: 75.0040598	test: 82.4586087	best: 82.4586087 (3)	total: 82.8ms	remaining: 31.8s
4:	learn: 74.5292946	test: 82.0987919	best: 82.0987919 (4)	total: 93.1ms	remaining: 28.6s
5:	learn: 73.0944281	test: 80.8940292	best: 80.8940292 (5)	total: 112ms	remaining: 28.5s
6:	learn: 71.9557193	test: 79.8675637	best: 79.8675637 (6)	total: 127ms	remaining: 27.7s
7:	learn: 70.4673315	test: 78.3962741	best: 78.3962741 (7)	total: 143ms	remaining: 27.3s
8:	learn: 69.8191889	test: 77.6215673	best: 77.6215673 (8)	total: 158ms	remaining: 26.9s
9:	learn: 68.8063558	test: 76.7860888	best: 76.7860888 (9)	total: 179ms	remaining: 27.4s
10:	learn: 68.2048035	test: 76.1127616	best: 76.1127616 (10)	total: 197ms	remaining: 27.3s
11:	learn: 67.59

<catboost.core.CatBoostRegressor at 0x1a7dc0127c8>

In [15]:
train_mae = mean_absolute_error(y_train, best_model.predict(X_train))
test_mae = mean_absolute_error(y_test, best_model.predict(X_test))

print(f'MAE for training set: {train_mae:.3f}')
print(f'MAE for test set: {test_mae:.3f}')

MAE for training set: 33.103
MAE for test set: 54.475


### Find the CV score for the best params model 

In [16]:
best_model.get_params()

{'iterations': 1538,
 'learning_rate': 0.052313318764570016,
 'depth': 5,
 'l2_leaf_reg': 0.02252067954779648,
 'loss_function': 'MAE',
 'random_seed': 42,
 'verbose': False}

In [17]:
cv_data = cv(
    pool=train_pool,
    params=best_model.get_params(), # get params from the best model 
    fold_count=10,
    logging_level='Silent',
)

In [18]:
cv_data.tail()

Unnamed: 0,iterations,test-MAE-mean,test-MAE-std,train-MAE-mean,train-MAE-std
1533,1533,54.321525,11.543025,26.302379,0.861408
1534,1534,54.321185,11.543103,26.298282,0.861873
1535,1535,54.318501,11.5419,26.292712,0.861953
1536,1536,54.317159,11.541449,26.289147,0.861932
1537,1537,54.325687,11.532876,26.278719,0.856171


In [19]:
# since we have set the iterations param (essential this is nubmer of tree)
# The last row, which is the value we set for iterations, have the cv scores we want
index = best_params['iterations'] - 1

print('The CV MAE is {:.3f}±{:.3f}'.format(cv_data.at[index, 'test-MAE-mean'], cv_data.at[index, 'test-MAE-std']))

The CV MAE is 54.326±11.533


In [20]:
# save model
best_model.save_model('./model/catboost_20221009.cbm')

<a style='text-decoration:none;line-height:16px;display:flex;color:#5B5B62;padding:10px;justify-content:end;' href='https://deepnote.com?utm_source=created-in-deepnote-cell&projectId=6b18b33d-3a56-4f49-ad6e-71ecea9f0183' target="_blank">
 </img>
Created in <span style='font-weight:600;margin-left:4px;'>Deepnote</span></a>