In [27]:
!pip install catboost -q

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

from catboost import CatBoostRegressor, Pool
import time

In [30]:
df = pd.read_csv("preprocessed_daily_transport_journeys.csv")
df['Date'] = pd.to_datetime(df['Date'])
df = df.sort_values('Date').reset_index(drop=True)

In [31]:
df['Total_Daily'] = (
    df['Local Route'] +
    df['Light Rail'] +
    df['Peak Service'] +
    df['Rapid Route'] +
    df['School'] +
    df['Other']
)

# Calendar features
df['dayofweek'] = df['Date'].dt.dayofweek      # 0=Monday, 6=Sunday
df['month'] = df['Date'].dt.month              # 1-12
df['day'] = df['Date'].dt.day                  # 1-31
df['weekofyear'] = df['Date'].dt.isocalendar().week.astype(int)
df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int)

# Lag features (autoregressive)
for lag in [1, 7, 14, 28]:
    df[f'lag_{lag}'] = df['Total_Daily'].shift(lag)

# Rolling statistics
for window in [7, 14, 28]:
    df[f'rolling_mean_{window}'] = df['Total_Daily'].shift(1).rolling(window=window).mean()
    df[f'rolling_std_{window}'] = df['Total_Daily'].shift(1).rolling(window=window).std()

# Drop NaN rows from lag/rolling operations
df_clean = df.dropna().reset_index(drop=True)

print("Data preparation complete!")
print(f"Shape: {df_clean.shape}")
print(f"Date range: {df_clean['Date'].min().date()} to {df_clean['Date'].max().date()}")

Data preparation complete!
Shape: (1879, 23)
Date range: 2019-07-29 to 2024-09-29


In [32]:
feature_columns = [
    # Calendar features
    'dayofweek', 'month', 'day', 'weekofyear', 'is_weekend',

    # Lag features
    'lag_1', 'lag_7', 'lag_14', 'lag_28',

    # Rolling features
    'rolling_mean_7', 'rolling_std_7',
    'rolling_mean_14', 'rolling_std_14',
    'rolling_mean_28', 'rolling_std_28'
]

categorical_features = ['dayofweek', 'month', 'is_weekend', 'weekofyear']

X = df_clean[feature_columns].copy()
y = df_clean['Total_Daily'].copy()

print(f"Features shape: {X.shape}")
print(f"Target shape: {y.shape}")
print(f"\nFeature columns ({len(feature_columns)}):")
for i, col in enumerate(feature_columns, 1):
    print(f"  {i:2d}. {col}")

Features shape: (1879, 15)
Target shape: (1879,)

Feature columns (15):
   1. dayofweek
   2. month
   3. day
   4. weekofyear
   5. is_weekend
   6. lag_1
   7. lag_7
   8. lag_14
   9. lag_28
  10. rolling_mean_7
  11. rolling_std_7
  12. rolling_mean_14
  13. rolling_std_14
  14. rolling_mean_28
  15. rolling_std_28


In [33]:
test_size = 180
train_size = len(df_clean) - test_size

X_train = X.iloc[:train_size].copy()
y_train = y.iloc[:train_size].copy()
dates_train = df_clean['Date'].iloc[:train_size].copy()

X_test = X.iloc[train_size:].copy()
y_test = y.iloc[train_size:].copy()
dates_test = df_clean['Date'].iloc[train_size:].copy()

print(f"Train set: {len(X_train)} samples ({dates_train.min().date()} to {dates_train.max().date()})")
print(f"Test set: {len(X_test)} samples ({dates_test.min().date()} to {dates_test.max().date()})")

Train set: 1699 samples (2019-07-29 to 2024-04-02)
Test set: 180 samples (2024-04-03 to 2024-09-29)


In [34]:
start_time = time.time()

# Initialize CatBoost regressor
model = CatBoostRegressor(
    iterations=500,                    # Number of boosting rounds
    learning_rate=0.05,                # Learning rate
    depth=6,                           # Tree depth
    loss_function='RMSE',              # Loss function (MAE also good)
    eval_metric='MAE',                 # Evaluation metric
    subsample=0.8,                     # Subsample ratio
    colsample_bylevel=0.8,            # Feature subsample
    random_state=42,
    verbose=0,                         # Set to 0 to reduce output
    cat_features=categorical_features, # Categorical features
    early_stopping_rounds=50,          # Early stopping
)

# Train on training set
print("\nTraining CatBoost model...")
model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    verbose=False
)

training_time = time.time() - start_time
print(f"✓ Model trained in {training_time:.2f} seconds")


Training CatBoost model...
✓ Model trained in 3.28 seconds


In [36]:
y_pred_test = model.predict(X_test)

mae_test = mean_absolute_error(y_test, y_pred_test)
# Calculate RMSE by taking the square root of MSE
rmse_test = np.sqrt(mean_squared_error(y_test, y_pred_test))
r2_test = r2_score(y_test, y_pred_test)
mape_test = np.mean(np.abs((y_test - y_pred_test) / y_test)) * 100

print(f"\nTest Set Performance:")
print(f"  MAE:   {mae_test:>10,.0f} passengers")
print(f"  RMSE:  {rmse_test:>10,.0f} passengers")
print(f"  MAPE:  {mape_test:>10.1f}%")
print(f"  R²:    {r2_test:>10.3f}")

# Also evaluate on train set to check for overfitting
y_pred_train = model.predict(X_train)
mae_train = mean_absolute_error(y_train, y_pred_train)
# Calculate RMSE by taking the square root of MSE
rmse_train = np.sqrt(mean_squared_error(y_train, y_pred_train))

print(f"\nTrain Set Performance:")
print(f"  MAE:   {mae_train:>10,.0f} passengers")
print(f"  RMSE:  {rmse_train:>10,.0f} passengers")

# Check for overfitting
if rmse_test / rmse_train > 1.2:
    print(f"\n⚠ Warning: Possible overfitting (test RMSE / train RMSE = {rmse_test/rmse_train:.2f})")
else:
    print(f"\n✓ No significant overfitting detected")


Test Set Performance:
  MAE:        3,064 passengers
  RMSE:       5,823 passengers
  MAPE:       135.2%
  R²:         0.880

Train Set Performance:
  MAE:        2,005 passengers
  RMSE:       3,556 passengers

