In [1]:
import pandas as pd
import os
import matplotlib.pyplot as plt
import numpy as np
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf
import statsmodels.api as sm
from statsmodels.tsa.stattools import adfuller
# new import statements
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb
import optuna
from catboost import Pool, CatBoostRegressor
from xgboost import XGBRegressor

### Retailrocket recommender system dataset

Source: https://www.kaggle.com/retailrocket/ecommerce-dataset

Load the dataset files.

In [2]:
events = pd.read_csv(os.path.join(\
    '..', 'timeseries_lec_data', 'events.csv'))
item_properties = pd.read_csv(os.path.join(\
    '..', 'timeseries_lec_data', 'item_properties_part1.csv'))
category_tree = pd.read_csv(os.path.join(\
    '..', 'timeseries_lec_data', 'category_tree.csv'))

In [3]:
events['event_datetime'] = pd.to_datetime(events['timestamp'], unit='ms')
events['event_date'] = events['event_datetime'].dt.date
events

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,event_datetime,event_date
0,1433221332117,257597,view,355908,,2015-06-02 05:02:12.117,2015-06-02
1,1433224214164,992329,view,248676,,2015-06-02 05:50:14.164,2015-06-02
2,1433221999827,111016,view,318965,,2015-06-02 05:13:19.827,2015-06-02
3,1433221955914,483717,view,253185,,2015-06-02 05:12:35.914,2015-06-02
4,1433221337106,951259,view,367447,,2015-06-02 05:02:17.106,2015-06-02
...,...,...,...,...,...,...,...
2756096,1438398785939,591435,view,261427,,2015-08-01 03:13:05.939,2015-08-01
2756097,1438399813142,762376,view,115946,,2015-08-01 03:30:13.142,2015-08-01
2756098,1438397820527,1251746,view,78144,,2015-08-01 02:57:00.527,2015-08-01
2756099,1438398530703,1184451,view,283392,,2015-08-01 03:08:50.703,2015-08-01


In [4]:
item_properties

Unnamed: 0,timestamp,itemid,property,value
0,1435460400000,460429,categoryid,1338
1,1441508400000,206783,888,1116713 960601 n277.200
2,1439089200000,395014,400,n552.000 639502 n720.000 424566
3,1431226800000,59481,790,n15360.000
4,1431831600000,156781,917,828513
...,...,...,...,...
10999994,1439694000000,86599,categoryid,618
10999995,1435460400000,153032,1066,n1020.000 424566
10999996,1440298800000,421788,888,35975 856003 37346
10999997,1437879600000,159792,400,n552.000 639502 n720.000 424566


In [5]:
category_tree

Unnamed: 0,categoryid,parentid
0,1016,213.0
1,809,169.0
2,570,9.0
3,1691,885.0
4,536,1691.0
...,...,...
1664,49,1125.0
1665,1112,630.0
1666,1336,745.0
1667,689,207.0


Extract `categoryid` data from `item_properties`.

In [6]:
category_items = item_properties[item_properties['property'] == 'categoryid'][['itemid', 'value']]
# 'value' column contains the category IDs, so let's rename value to categoryid
category_items.columns = ['itemid', 'categoryid']  
category_items['categoryid'] = category_items['categoryid'].astype(int)
category_items

Unnamed: 0,itemid,categoryid
0,460429,1338
140,281245,1277
151,35575,1059
189,8313,1147
197,55102,47
...,...,...
10999880,441523,1167
10999917,250848,769
10999932,116380,1509
10999960,84186,209


Merge `events` with `category_items`.

In [7]:
events_with_categories = pd.merge(events, category_items, on="itemid", how="left")
events_with_categories

Unnamed: 0,timestamp,visitorid,event,itemid,transactionid,event_datetime,event_date,categoryid
0,1433221332117,257597,view,355908,,2015-06-02 05:02:12.117,2015-06-02,1173.0
1,1433224214164,992329,view,248676,,2015-06-02 05:50:14.164,2015-06-02,
2,1433221999827,111016,view,318965,,2015-06-02 05:13:19.827,2015-06-02,
3,1433221955914,483717,view,253185,,2015-06-02 05:12:35.914,2015-06-02,
4,1433221337106,951259,view,367447,,2015-06-02 05:02:17.106,2015-06-02,
...,...,...,...,...,...,...,...,...
4439576,1438398785939,591435,view,261427,,2015-08-01 03:13:05.939,2015-08-01,
4439577,1438399813142,762376,view,115946,,2015-08-01 03:30:13.142,2015-08-01,1616.0
4439578,1438397820527,1251746,view,78144,,2015-08-01 02:57:00.527,2015-08-01,969.0
4439579,1438398530703,1184451,view,283392,,2015-08-01 03:08:50.703,2015-08-01,


In [8]:
grouped_data = (
    events_with_categories[events_with_categories['event'] == 'view']
    .groupby(['timestamp', 'event_datetime', 'categoryid'])
    .size()
    .reset_index(name='view_count') 
)
grouped_data

Unnamed: 0,timestamp,event_datetime,categoryid,view_count
0,1430622028399,2015-05-03 03:00:28.399,421.0,5
1,1430622028399,2015-05-03 03:00:28.399,1674.0,13
2,1430622033686,2015-05-03 03:00:33.686,1339.0,1
3,1430622036210,2015-05-03 03:00:36.210,368.0,1
4,1430622040988,2015-05-03 03:00:40.988,683.0,1
...,...,...,...,...
1459107,1442545134195,2015-09-18 02:58:54.195,1196.0,1
1459108,1442545152365,2015-09-18 02:59:12.365,1293.0,1
1459109,1442545153842,2015-09-18 02:59:13.842,956.0,1
1459110,1442545174109,2015-09-18 02:59:34.109,491.0,1


## Gradient Boosting techniques

### Time series forecasting requirements: supervised learning

While boosting techniques can be used for time series forecasting, it requires that the time series dataset be transformed into a supervised learning problem first. It also requires the use of a specialized technique for evaluating the model called walk-forward validation, as evaluating the model using k-fold cross validation would result in optimistically biased results.

### Walk-forward validation

In walk-forward validation, the dataset is first split into train and test sets by selecting a cut point, e.g. all data except the last 12 days is used for training and the last 12 days is used for testing.

- Splitting the data into multiple overlapping train-validation folds, where each fold includes:
    - A growing training set (e.g., using data from t=0 to t=n).
    - A validation set that follows the training set (e.g., data from t=n+1 to t=n+m).
Training and evaluating the model on each fold.

In [9]:
# Train-Test Split
train, test = train_test_split(grouped_data, test_size=0.2, random_state=42, shuffle=False)

first_date = test['event_datetime'].min()
last_date = test['event_datetime'].max()

print(f"Testing period: {first_date} to {last_date}")

# Assumption: the validation set should be the same length as the test set 
# and immediately precede it
validation_duration = last_date - first_date
val_start_date = first_date - validation_duration
val_end_date = first_date - pd.Timedelta(seconds=1)

# Filter the train set to define the validation set
val = train[(train['event_datetime'] >= val_start_date) & (train['event_datetime'] <= val_end_date)]

print(f"Validation period: {val_start_date} to {val_end_date}")
print(f"Number of records in validation set: {len(val)}")

# Remove validation set from training set to prevent data leakage
train = train[~train.index.isin(val.index)]

Testing period: 2015-08-17 22:28:00.761000 to 2015-09-18 02:59:41.778000
Validation period: 2015-07-17 17:56:19.744000 to 2015-08-17 22:27:59.761000
Number of records in validation set: 330165


**When you might not take the immediately‑preceding data for validation?**

when your dataset has severe seasonality: e.g. e-commerce sales time period like holiday season or prime day, etc.,

### Time series forecasting with gradient boosting: LightGBM, XGBoost 

Tree-based machine learning methods designed to improve gradient boosting decision trees (GBDT) --- for regression and classification tasks.

Key benefits of using gradient boosting models for forecasting include:

- The ease with which exogenous variables can be included in the model, in addition to autoregressive variables.
- The ability to capture non-linear relationships between variables.
- High scalability, allowing models to handle large volumes of data.
- Some implementations allow the inclusion of categorical variables without the need for additional encoding, such as one-hot encoding.

Challenges of using boosting techniques for forecasting:

- Transforming the data so that it can be used as a regression problem.
- Depending on how many future predictions are needed (prediction horizon), an iterative process may be required where each new prediction is based on previous ones.
- Model validation requires specific strategies such as backtesting, walk-forward validation or time series cross-validation. Traditional cross-validation cannot be used.

### LightGBM

LightGBM is a gradient boosting framework that uses tree based learning algorithms. It is designed to be distributed and efficient with the following advantages:

Faster training speed and higher efficiency.
- Lower memory usage.
- Better accuracy.
- Support of parallel, distributed, and GPU learning.
- Capable of handling large-scale data.

LightGBM builds decision trees iteratively. Key feature: Uses leaf-wise growth, where the tree expands its most significant leaf nodes first. This can result in deeper, more focused trees and faster convergence.

Optimizations:
- Handles large datasets efficiently.
- Uses a histogram-based algorithm for faster computation.
- Includes mechanisms for dealing with categorical features natively.

Categorical features:
- LightGBM does better when you have the original categorical features as such and not have them one-hot encoded

In [10]:
grouped_data.columns

Index(['timestamp', 'event_datetime', 'categoryid', 'view_count'], dtype='object')

In [11]:
# Define features and labels
features = ['timestamp', 'categoryid']  # Use the numerical timestamp
Y_train = train['view_count']
X_train = train[features]
X_train.loc[:, 'categoryid'] = X_train['categoryid'].astype(int)  
X_train.loc[:, 'categoryid'] = X_train['categoryid'].astype("category")
# LightGBM does better when you have original categorical feature instead of one-hot encoded version

Y_val = val['view_count']
X_val = val[features]
X_val.loc[:, 'categoryid'] = X_val['categoryid'].astype(int) 
X_val.loc[:, 'categoryid'] = X_val['categoryid'].astype("category")

Y_test = test['view_count']
X_test = test[features]
X_test.loc[:, 'categoryid'] = X_test['categoryid'].astype(int)  
X_test.loc[:, 'categoryid'] = X_test['categoryid'].astype("category")

print(Y_train.shape, X_train.shape, Y_val.shape, X_val.shape)

(837124,) (837124, 2) (330165,) (330165, 2)


Length: 837124
Categories (1077, float64): [0.0, 1.0, 2.0, 3.0, ..., 1690.0, 1694.0, 1695.0, 1697.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  X_train.loc[:, 'categoryid'] = X_train['categoryid'].astype("category")
Length: 330165
Categories (1044, float64): [0.0, 1.0, 2.0, 3.0, ..., 1690.0, 1694.0, 1695.0, 1697.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  X_val.loc[:, 'categoryid'] = X_val['categoryid'].astype("category")
Length: 291823
Categories (1044, float64): [0.0, 1.0, 2.0, 3.0, ..., 1690.0, 1694.0, 1695.0, 1697.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  X_test.loc[:, 'categoryid'] = X_test['categoryid'].astype("category")


In [12]:
# LightGBM Dataset
lgbtrain = lgb.Dataset(data=X_train, label=Y_train, feature_name=features)
lgbval = lgb.Dataset(data=X_val, label=Y_val, reference=lgbtrain, feature_name=features)

In [13]:
lgbtrain

<lightgbm.basic.Dataset at 0x7add30662800>

In [14]:
lgbval

<lightgbm.basic.Dataset at 0x7add30663f70>

In [15]:
# Define RMSLE Metric
def rmsle(y_pred, y_true):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))

def lgbm_rmsle(preds, train_data):
    labels = train_data.get_label()
    rmsle_val = rmsle(preds, labels)
    return 'RMSLE', rmsle_val, False

In [16]:
# LightGBM Parameters
lgb_params = {
    'num_leaves': 15,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'max_depth': 6,
    'verbose': 1,
    'num_boost_round': 5000,
    'early_stopping_rounds': 100,
    'nthread': -1,
}

# Train LightGBM Model
model = lgb.train(
    lgb_params,
    lgbtrain,
    valid_sets=[lgbtrain, lgbval],
    feval=lgbm_rmsle
)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007088 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 960
[LightGBM] [Info] Number of data points in the train set: 837124, number of used features: 2
[LightGBM] [Info] Start training from score 2.082992
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[55]	training's l2: 2.7401	training's RMSLE: 0.294389	valid_1's l2: 2.73785	valid_1's RMSLE: 0.295956


In [17]:
# Predictions on Validation Set
y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)
val_rmsle = rmsle(y_pred_val, Y_val)
print(f"Validation RMSLE: {val_rmsle}")

# Predictions on Test Set
test_preds = model.predict(X_test, num_iteration=model.best_iteration)
test_rmsle = rmsle(test_preds, Y_test)
print(f"Test RMSLE: {test_rmsle}")

# Output predictions
test['predicted_view_count'] = test_preds
print(test[['event_datetime', 'categoryid', 'view_count', 'predicted_view_count']].head())

Validation RMSLE: 0.2959559444152678
Test RMSLE: 0.3028402919334681
                 event_datetime  categoryid  view_count  predicted_view_count
1167289 2015-08-17 22:28:00.761       195.0           1              1.401071
1167290 2015-08-17 22:28:00.923       282.0           4              3.959921
1167291 2015-08-17 22:28:00.923       929.0          14             10.521571
1167292 2015-08-17 22:28:05.159        84.0           1              1.096222
1167293 2015-08-17 22:28:13.380       769.0           1              1.093405


#### Feature Engineering for Boosting Models

#### Date transformation

- **Periodicity**: 
  - `day_of_week`, `week_of_year`, `month`, `quarter`, `hour`, `is_weekend`

- **Cyclic Seasonality**: 
  - `sin(2π·day_of_year/365.25)`, `cos(2π·day_of_year/365.25)` (Fourier pairs)

- **Special Dates**: 
  - Binary flags for holidays, promos, pay-days, month-end, etc.

- **Ordinal Trend**: 
  - `days_since_start` or `year` (often cast to category in CatBoost)

- **Lagged Demand**: 
  - `sales_t-1`, `sales_t-7`, rolling means, moving standard deviations, etc.

In [18]:
# Extract meaningful date-based features from 'event_datetime'
grouped_data['event_datetime_year'] = grouped_data['event_datetime'].dt.year
grouped_data['event_datetime_month'] = grouped_data['event_datetime'].dt.month
grouped_data['event_datetime_day'] = grouped_data['event_datetime'].dt.day
grouped_data['event_datetime_dayofweek'] = grouped_data['event_datetime'].dt.dayofweek
grouped_data['event_datetime_is_weekend'] = (grouped_data['event_datetime_dayofweek'] >= 5).astype(int)
grouped_data['event_datetime_quarter'] = grouped_data['event_datetime'].dt.quarter
grouped_data['event_datetime_hour'] = grouped_data['event_datetime'].dt.hour
grouped_data['event_datetime_minute'] = grouped_data['event_datetime'].dt.minute
grouped_data['event_datetime_second'] = grouped_data['event_datetime'].dt.second

In [19]:
grouped_data.columns

Index(['timestamp', 'event_datetime', 'categoryid', 'view_count',
       'event_datetime_year', 'event_datetime_month', 'event_datetime_day',
       'event_datetime_dayofweek', 'event_datetime_is_weekend',
       'event_datetime_quarter', 'event_datetime_hour',
       'event_datetime_minute', 'event_datetime_second'],
      dtype='object')

In [20]:
# Define features and labels
features = [col for col in grouped_data.columns if col not in ['timestamp', 'event_datetime', 'view_count']]
print(features)

['categoryid', 'event_datetime_year', 'event_datetime_month', 'event_datetime_day', 'event_datetime_dayofweek', 'event_datetime_is_weekend', 'event_datetime_quarter', 'event_datetime_hour', 'event_datetime_minute', 'event_datetime_second']


In [21]:
# Train-Test Split
train, test = train_test_split(grouped_data, test_size=0.2, random_state=42, shuffle=False)

first_date = test['event_datetime'].min()
last_date = test['event_datetime'].max()

print(f"Testing period: {first_date} to {last_date}")

# Assumption: the validation set should be the same length as the test set 
# and immediately precede it
validation_duration = last_date - first_date
val_start_date = first_date - validation_duration
val_end_date = first_date - pd.Timedelta(seconds=1)

# Filter the train set to define the validation set
val = train[(train['event_datetime'] >= val_start_date) & (train['event_datetime'] <= val_end_date)]

print(f"Validation period: {val_start_date} to {val_end_date}")
print(f"Number of records in validation set: {len(val)}")

# Remove validation set from training set to prevent data leakage
train = train[~train.index.isin(val.index)]

Testing period: 2015-08-17 22:28:00.761000 to 2015-09-18 02:59:41.778000
Validation period: 2015-07-17 17:56:19.744000 to 2015-08-17 22:27:59.761000
Number of records in validation set: 330165


In [22]:
Y_train = train['view_count']
X_train = train[features]
X_train.loc[:, 'categoryid'] = X_train['categoryid'].astype(int)  
X_train.loc[:, 'categoryid'] = X_train['categoryid'].astype("category")
# LightGBM does better when you have original categorical feature instead of one-hot encoded version

Y_val = val['view_count']
X_val = val[features]
X_val.loc[:, 'categoryid'] = X_val['categoryid'].astype(int) 
X_val.loc[:, 'categoryid'] = X_val['categoryid'].astype("category")

Y_test = test['view_count']
X_test = test[features]
X_test.loc[:, 'categoryid'] = X_test['categoryid'].astype(int)  
X_test.loc[:, 'categoryid'] = X_test['categoryid'].astype("category")

print(Y_train.shape, X_train.shape, Y_val.shape, X_val.shape)

(837124,) (837124, 10) (330165,) (330165, 10)


Length: 837124
Categories (1077, float64): [0.0, 1.0, 2.0, 3.0, ..., 1690.0, 1694.0, 1695.0, 1697.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  X_train.loc[:, 'categoryid'] = X_train['categoryid'].astype("category")
Length: 330165
Categories (1044, float64): [0.0, 1.0, 2.0, 3.0, ..., 1690.0, 1694.0, 1695.0, 1697.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  X_val.loc[:, 'categoryid'] = X_val['categoryid'].astype("category")
Length: 291823
Categories (1044, float64): [0.0, 1.0, 2.0, 3.0, ..., 1690.0, 1694.0, 1695.0, 1697.0]' has dtype incompatible with float64, please explicitly cast to a compatible dtype first.
  X_test.loc[:, 'categoryid'] = X_test['categoryid'].astype("category")


In [23]:
# LightGBM Dataset
lgbtrain = lgb.Dataset(data=X_train, label=Y_train, feature_name=features)
lgbval = lgb.Dataset(data=X_val, label=Y_val, reference=lgbtrain, feature_name=features)

# Define RMSLE Metric
def rmsle(y_pred, y_true):
    return np.sqrt(np.mean(np.square(np.log1p(y_pred) - np.log1p(y_true))))

def lgbm_rmsle(preds, train_data):
    labels = train_data.get_label()
    rmsle_val = rmsle(preds, labels)
    return 'RMSLE', rmsle_val, False

# LightGBM Parameters
lgb_params = {
    'num_leaves': 15,
    'learning_rate': 0.05,
    'feature_fraction': 0.8,
    'max_depth': 6,
    'verbose': 1,
    'num_boost_round': 5000,
    'early_stopping_rounds': 100,
    'nthread': -1,
}

# Train LightGBM Model
model = lgb.train(
    lgb_params,
    lgbtrain,
    valid_sets=[lgbtrain, lgbval],
    feval=lgbm_rmsle
)

# Predictions on Validation Set
y_pred_val = model.predict(X_val, num_iteration=model.best_iteration)
val_rmsle = rmsle(y_pred_val, Y_val)
print(f"Validation RMSLE: {val_rmsle}")

# Predictions on Test Set
test_preds = model.predict(X_test, num_iteration=model.best_iteration)
test_rmsle = rmsle(test_preds, Y_test)
print(f"Test RMSLE: {test_rmsle}")

# Output predictions
test['predicted_view_count'] = test_preds
print(test[['event_datetime', 'categoryid', 'view_count', 'predicted_view_count']].head())

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.052013 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 897
[LightGBM] [Info] Number of data points in the train set: 837124, number of used features: 9
[LightGBM] [Info] Start training from score 2.082992
Training until validation scores don't improve for 100 rounds
Early stopping, best iteration is:
[75]	training's l2: 2.73908	training's RMSLE: 0.294527	valid_1's l2: 2.73882	valid_1's RMSLE: 0.294519
Validation RMSLE: 0.29451856550348077
Test RMSLE: 0.3017657745356095
                 event_datetime  categoryid  view_count  predicted_view_count
1167289 2015-08-17 22:28:00.761       195.0           1              1.339717
1167290 2015-08-17 22:28:00.923       282.0           4              3.965328
1167291 2015-08-17 22:28:00.923       929.0          14             10.669858
1167292 2015-08-17 22:28:05.159        84.0           1              1.085745


#### Previous RMSLE values

- Validation RMSLE: 0.2959559444152678
- Test RMSLE: 0.3028402919334681

### CatBoost (Categorical Boosting)

CatBoost is a gradient‑boosting library from Yandex designed to handle categorical data natively.  
It builds oblivious (symmetric) decision trees one after another, each correcting the residuals of the ensemble so far.

Key features:
- Native categorical handling  
  - Uses ordered target statistics and permutation‑driven encoding to turn high‑cardinality categories into powerful numerical features without one‑hot expansion.  
  - Eliminates the need for manual target or frequency encoding.
- Oblivious trees  
  - Every split depth shares the same condition across all branches, leading to balanced trees that are GPU‑friendly and fast to evaluate.
- Robust default settings  
  - Automatic handling of missing values, text features, and monotonic constraints; fewer “gotchas” compared with other GBMs.

Optimizations:
- Ordered boosting  
  - Trains with different random permutations of data to reduce prediction shift and overfitting on small datasets.
- Efficient GPU implementation  
  - Supports GPU training out of the box with minimal parameter changes (`task_type="GPU"`), often orders‑of‑magnitude faster on large data.
- Built‑in regularization  
  - L2 leaf regularization, depth‑wise shrinkage, and “random strength” parameters to prevent overfitting.
- Quantization & sparse features  
  - Converts continuous features to integer bins on‑the‑fly, enabling cache‑friendly training and low inference latency.
- Model analysis utilities  
  - Integrated SHAP value computation (`get_feature_importance(type="ShapValues")`), loss curves, and evaluation plots make interpretation straightforward.

When to choose CatBoost

- Datasets with many categorical variables (high‑cardinality IDs, strings).  
- Need for quick, strong baseline with minimal preprocessing.  
- Situations where GPU acceleration is available and large training sets must be handled efficiently.


In [25]:
catboost_params = {
    'iterations': 1000,           # Number of boosting rounds
    'learning_rate': 0.05,        # Learning rate for gradient boosting
    'depth': 6,                   # Depth of each tree
    'loss_function': 'RMSE',      # Loss function (Root Mean Squared Error for regression)
    'eval_metric': 'RMSE',        # Evaluation metric
    'random_seed': 42,            # Ensures reproducibility
    'early_stopping_rounds': 50,  # Stops training if no improvement after 50 rounds
    'verbose': 100                # Prints training progress every 100 rounds
}

In [26]:
def fix_cat(df, col):
    # convert float->int safely, then to str (CatBoost likes strings best)
    df[col] = df[col].astype(float).astype('Int64')   # nullable pandas int
    df[col] = df[col].astype(str)                     
    return df

# Applying changes to all dataframes
for _df in (X_train, X_val, X_test):
    fix_cat(_df, 'categoryid')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(float).astype('Int64')   # nullable pandas int
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(float).astype('Int64')   # nullable pandas int
A value is trying to be set on 

In [27]:
model = CatBoostRegressor(**catboost_params)

cat_features = ["categoryid"]
model.fit(
    X_train, 
    Y_train, 
    cat_features=cat_features,     # Specify categorical feature indices if any
    eval_set=(X_val, Y_val),       # Validation set for early stopping
    use_best_model=True            # Save the best model (optional)
)

0:	learn: 2.8023437	test: 2.6731517	best: 2.6731517 (0)	total: 305ms	remaining: 5m 4s
100:	learn: 1.7991163	test: 1.7610380	best: 1.7610380 (100)	total: 14.1s	remaining: 2m 5s
200:	learn: 1.7591253	test: 1.7311115	best: 1.7311115 (200)	total: 31.4s	remaining: 2m 4s
300:	learn: 1.7383029	test: 1.7162541	best: 1.7162369 (299)	total: 49s	remaining: 1m 53s
400:	learn: 1.7260171	test: 1.7091558	best: 1.7091558 (400)	total: 1m 7s	remaining: 1m 40s
500:	learn: 1.7165680	test: 1.7030569	best: 1.7030412 (499)	total: 1m 40s	remaining: 1m 39s
600:	learn: 1.7098133	test: 1.6994275	best: 1.6994275 (600)	total: 2m 14s	remaining: 1m 29s
700:	learn: 1.7044064	test: 1.6972425	best: 1.6972425 (700)	total: 2m 48s	remaining: 1m 11s
800:	learn: 1.6998333	test: 1.6947979	best: 1.6947979 (800)	total: 3m 22s	remaining: 50.2s
900:	learn: 1.6958957	test: 1.6931465	best: 1.6931465 (900)	total: 3m 57s	remaining: 26.1s
999:	learn: 1.6918573	test: 1.6913270	best: 1.6913270 (999)	total: 4m 33s	remaining: 0us

bestTe

<catboost.core.CatBoostRegressor at 0x7add2ad7f730>

In [28]:
test_preds = model.predict(X_test)

In [29]:
test['predicted_view_count'] = test_preds
print(test[['event_datetime', 'categoryid', 'view_count', 'predicted_view_count']].head())

                 event_datetime  categoryid  view_count  predicted_view_count
1167289 2015-08-17 22:28:00.761       195.0           1              1.701340
1167290 2015-08-17 22:28:00.923       282.0           4              4.009186
1167291 2015-08-17 22:28:00.923       929.0          14              8.445875
1167292 2015-08-17 22:28:05.159        84.0           1              1.014705
1167293 2015-08-17 22:28:13.380       769.0           1              1.029508


### XGBoost (Extreme Gradient Boosting)

XGBoost builds decision trees sequentially, improving the predictions of the previous trees. Key feature: Uses level-wise growth, which grows the tree evenly level by level.

Optimizations:
- Regularization techniques (L1, L2) to prevent overfitting.
- Support for distributed computing.
- Highly customizable hyperparameters.

In [30]:
for df in (X_train, X_val, X_test):       
    df['categoryid'] = df['categoryid'].astype('category')  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['categoryid'] = df['categoryid'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['categoryid'] = df['categoryid'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['categoryid'] = df['categoryid'].astype('category')


In [31]:
model = XGBRegressor(random_state=42, enable_categorical=True)
model.fit(X_train, Y_train)

In [32]:
test_preds = model.predict(X_test)

In [33]:
test['predicted_view_count'] = test_preds
print(test[['event_datetime', 'categoryid', 'view_count', 'predicted_view_count']].head())

                 event_datetime  categoryid  view_count  predicted_view_count
1167289 2015-08-17 22:28:00.761       195.0           1              5.741546
1167290 2015-08-17 22:28:00.923       282.0           4              1.023822
1167291 2015-08-17 22:28:00.923       929.0          14              5.268720
1167292 2015-08-17 22:28:05.159        84.0           1              5.906624
1167293 2015-08-17 22:28:13.380       769.0           1              0.956117


## Gradient‑Boosted Decision‑Tree Libraries — LightGBM vs CatBoost vs XGBoost
*A historical and technical comparison*

| Aspect | **XGBoost** | **LightGBM** | **CatBoost** |
|---|---|---|---|
| **Year Introduced / Paper** | 2014 (Chen & Guestrin, arXiv 2016) | 2016 (Ke et al., NIPS 2017) | 2017 (Prokhorenkova et al., NIPS 2018) |
| **Core Goal** | Highly scalable, regularized gradient boosting | Faster training & lower memory via novel tree growth | Handle categorical features natively & reduce overfitting |
| **Tree‑Growing Strategy** | *Level‑wise* (breadth‑first) – expands all candidates at the same depth before moving deeper | *Leaf‑wise* with depth limit – splits the leaf with max loss‑reduction (can create deeper, unbalanced trees) | Symmetric (oblivious) trees – all leaves at same depth use identical split; simplifies fast inference |
| **Handling Categorical Features** | One‑hot or label encoding (external preprocessing) | Same as XGBoost; no built‑in categorical handling | **Built‑in**: target statistics + ordered boosting to fight target leakage |
| **Major Speed Tricks** | • Block structure for cache‑aware access  <br>• Out‑of‑core & sparsity‑aware ops  <br>• Weighted quantile sketch for histograms | • Gradient‑based One‑Side Sampling (GOSS)  <br>• Exclusive Feature Bundling (EFB) to cut sparsity  <br>• Histogram‑based splits in GPU/CPU | • Ordered boosting (permutation‑driven) to reduce prediction shift  <br>• Efficient oblivious trees for SIMD/GPU |
| **Typical Strengths** | • Robust baseline across tasks  <br>• Fine‑grained regularization  <br>• Mature ecosystem | • Very fast on large tabular data  <br>• Lower memory footprint  <br>• Often wins Kaggle when tuned | • Minimal preprocessing (categoricals)  <br>• Strong default accuracy  <br>• Symmetric trees give stable latency |
| **Typical Weaknesses** | • Categorical handling manual  <br>• Slower than LightGBM on massive data | • Leaf‑wise growth can overfit on small data unless depth constrained | • Longer single‑CPU training  <br>• Smaller community / fewer third‑party tutorials |

---

### Key Take‑Aways
* **When speed & memory dominate** (very large, mostly numerical data) → **LightGBM**  
* **When categorical features dominate** or you want strong out‑of‑the‑box accuracy → **CatBoost**  
* **When you need maximum flexibility** (custom objectives, wide community support) → **XGBoost**