In [4]:
import xgboost as xgb
import pandas as pd
import model_helper_functions as mod
from sklearn.model_selection import train_test_split
from sklearn.metrics import root_mean_squared_error,  r2_score
from sklearn.preprocessing import LabelEncoder
import numpy as np


In [5]:
df =  pd.read_parquet("all_cleaned_data_augmented.parquet")
df = mod.necessary_fields(df)
df['Hour'] = df['DropoffDatetime'].apply(mod.round_time_to_int)
df.head()

Unnamed: 0,PickupDatetime,DropoffDatetime,TripDuration,TripDistance,PULocationID,DOLocationID,FareAmount,TipAmount,NextPU,Hour
0,2023-10-01 00:57:33,2023-10-01 01:07:58,10.416667,1.45,166.0,74.0,12.1,2.92,74,1
1,2023-10-01 01:00:16,2023-10-01 01:06:13,5.95,0.89,74.0,42.0,7.9,0.0,82,1
2,2023-10-01 00:51:52,2023-10-01 01:00:32,8.666667,2.38,83.0,129.0,13.5,0.0,116,1
3,2023-10-01 00:03:39,2023-10-01 00:11:20,7.683333,2.26,74.0,263.0,11.4,3.33,7,0
4,2023-10-01 00:27:42,2023-10-01 00:39:10,11.466667,2.14,74.0,236.0,13.5,2.81,80,1


In [6]:

# X = df.fillna(-999)

y = df["NextPU"]

X = df.drop(columns = ['NextPU', 'DropoffDatetime', 'PickupDatetime','TripDuration', 'TripDistance', 'FareAmount', 'TipAmount'])

le = LabelEncoder()
encoded_y = le.fit_transform(y)

train_X, test_X, train_y, test_y = train_test_split(X, encoded_y, test_size=0.1, random_state=42, stratify=y)

num_classes = len(np.unique(train_y))

print('X, y generated', X)



X, y generated           PULocationID  DOLocationID  Hour
0                166.0          74.0     1
1                 74.0          42.0     1
2                 83.0         129.0     1
3                 74.0         263.0     0
4                 74.0         236.0     1
...                ...           ...   ...
24145193          74.0         247.0    13
24145194          74.0         236.0    13
24145195          66.0          37.0     0
24145196         166.0         116.0     0
24145197          97.0         256.0     0

[24145198 rows x 3 columns]


In [9]:
model = xgb.XGBClassifier(
    objective='multi:softmax',
    num_class=num_classes,
    eval_metric = 'mlogloss',
    use_label_encoder=False,
    n_estimators = 500, 
    max_depth = 4, 
    eta = 0.1, 
    subsample = 0.1, 
    colsample_bytree = 1.0,
    random_state=42,
    verbosity=1
) # Need to fine tune all these parameters

from tqdm import tqdm

from xgboost.callback import TrainingCallback

class TQDMProgressBar(TrainingCallback):
    def __init__(self, total):
        self.pbar = tqdm(total=total, desc="Training")
    
    def after_iteration(self, model, epoch, evals_log):
        self.pbar.update(1)
        return False  # return True to stop training early

    def after_training(self, model):
        self.pbar.close()

dtrain = xgb.DMatrix(train_X, label=train_y)
dvalid = xgb.DMatrix(test_X, label=test_y)

params = {
    'objective': 'multi:softmax',
    'num_class': num_classes,
    'eval_metric': 'mlogloss',
    'learning_rate': 0.05,
    'max_depth': 6,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
}

# booster = xgb.train(
#     params,
#     dtrain,
#     num_boost_round=num_rounds,
#     evals=[(dvalid, "eval")],
#     early_stopping_rounds=20,
#     callbacks=[TQDMProgressBar(total=num_rounds)]
# )

In [11]:
model.fit(
    train_X, 
    train_y,
    eval_set=[(test_X, test_y)],
    verbose=True,
    # early_stopping_rounds=20,
)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[0]	validation_0-mlogloss:3.77850
[1]	validation_0-mlogloss:3.62862
[2]	validation_0-mlogloss:3.51702
[3]	validation_0-mlogloss:3.42790
[4]	validation_0-mlogloss:3.35402
[5]	validation_0-mlogloss:3.29113
[6]	validation_0-mlogloss:3.23670
[7]	validation_0-mlogloss:3.18900
[8]	validation_0-mlogloss:3.14676
[9]	validation_0-mlogloss:3.10914
[10]	validation_0-mlogloss:3.07540
[11]	validation_0-mlogloss:3.04490
[12]	validation_0-mlogloss:3.01728
[13]	validation_0-mlogloss:2.99204
[14]	validation_0-mlogloss:2.96906
[15]	validation_0-mlogloss:2.94802
[16]	validation_0-mlogloss:2.92861
[17]	validation_0-mlogloss:2.91076
[18]	validation_0-mlogloss:2.89421
[19]	validation_0-mlogloss:2.87890
[20]	validation_0-mlogloss:2.86469
[21]	validation_0-mlogloss:2.85159
[22]	validation_0-mlogloss:2.83942
[23]	validation_0-mlogloss:2.82803
[24]	validation_0-mlogloss:2.81740
[25]	validation_0-mlogloss:2.80751
[26]	validation_0-mlogloss:2.79826
[27]	validation_0-mlogloss:2.78959
[28]	validation_0-mlogloss:2.7

In [12]:
pred_y = model.predict(test_X)

print(r2_score(test_y, pred_y))
print(root_mean_squared_error(test_y, pred_y))

-0.03329154902513132
14.165278979094346


In [17]:
from sklearn.metrics import mean_squared_error
rmse = mean_squared_error(test_y, pred_y, squared=False)
rms_percent = (rmse / np.mean(test_y)) * 100
rms_percent



59.774683449434264

In [14]:
model.save_model('classifier.json')