## Modules and Imports

In [None]:
# import modules
import pandas as pd
from google.colab import drive
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
!pip install category-encoders
from category_encoders import TargetEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import xgboost as xgb
from sklearn.model_selection import RandomizedSearchCV
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

# Mount Google Drive inside the Colab filesystem.
# force_remount=True remounts even when the drive is already mounted.
mount_point = '/content/drive'
drive.mount(mount_point, force_remount=True)

Collecting category-encoders
  Downloading category_encoders-2.8.0-py3-none-any.whl.metadata (7.9 kB)
Downloading category_encoders-2.8.0-py3-none-any.whl (85 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m85.7/85.7 kB[0m [31m2.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: category-encoders
Successfully installed category-encoders-2.8.0
Mounted at /content/drive


In [None]:
# Load the merged dataset from Drive.
# The file is parsed as semicolon-delimited text even though its name ends in .xls.
data_path = '/content/drive/MyDrive/Colab Notebooks/clean_merge_v2.xls'
df = pd.read_csv(data_path, sep=";")

## First Look and Preprocess

In [None]:
# Show the dtype pandas inferred for each column of the raw frame
df.dtypes

Unnamed: 0,0
IncidentNumber,int64
CalYear_compat_mobs,int64
HourOfCall_compat_mobs,int64
PerformanceReporting,object
DateAndTimeMobilised,object
DateAndTimeMobile,object
DateAndTimeArrived,object
TurnoutTimeSeconds,float64
TravelTimeSeconds,float64
AttendanceTimeSeconds,float64


In [None]:
# Columns considered irrelevant for modelling; `df` itself is left untouched
# because DataFrame.drop returns a new frame.
cols_to_drop = [
    "IncidentNumber",
    "DateAndTimeMobilised",
    "DateAndTimeMobile",
    "DateAndTimeArrived",
    "TurnoutTimeSeconds",
    "TravelTimeSeconds",
    "Incident_Date",
    "DateOfCall",
    "IncidentStationGround",
    "ProperCase",
    "FirstPumpArriving_AttendanceTime",
    "FirstPumpArriving_DeployedFromStation",
    "SecondPumpArriving_AttendanceTime",
    "SecondPumpArriving_DeployedFromStation",
    "Postcode_district",
    "PerformanceReporting",
    "Notional Cost (£)",
    "PumpMinutesRounded",
]
df_drop = df.drop(columns=cols_to_drop)

In [None]:
# Report the number of distinct categories of every object-typed feature
df_object = df_drop.select_dtypes(include=["object"])

for col_name in df_object.columns:
    n_categories = df_object[col_name].nunique()
    print(f"feature {col_name} nunique: {n_categories}")

feature DeployedFromStation_Name nunique: 116
feature DeployedFromLocation nunique: 2
feature DelayCode_Description nunique: 12
feature Weekday_compat_mobs nunique: 7
feature IncidentGroup nunique: 3
feature StopCodeDescription nunique: 28
feature PropertyCategory nunique: 9
feature AddressQualifier nunique: 11


In [None]:
# Train/test split, performed before any encoding / standardization.
# Define target and features
target = "AttendanceTimeSeconds"
X = df_drop.drop(columns=[target])  # Features
# Take the target from the same frame as the features so that X and y are
# guaranteed to describe the same rows, even if df_drop is later row-filtered.
y = df_drop[target]  # Target variable

# Perform the split (80% train, 20% test); fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Encode the object (categorical) feature columns.
# Work on explicit copies: the split frames are derived from X, and assigning columns
# on them can trigger pandas' SettingWithCopyWarning (on versions without copy-on-write).
X_train = X_train.copy()
X_test = X_test.copy()

# Low-cardinality features (2 to 11 unique values here) -> Label Encoding
label_cols = ["DeployedFromLocation", "IncidentGroup",
              "Weekday_compat_mobs", "PropertyCategory", "AddressQualifier"]

# One encoder per column, kept in a dict. Refitting a single shared encoder would
# overwrite the previous column's mapping, making it impossible to encode new data
# or invert the codes for any column but the last one.
# NOTE: LabelEncoder.transform raises ValueError for categories not seen during fit.
label_encoders = {}
for col in label_cols:
    le = LabelEncoder()  # fresh encoder for this column
    X_train[col] = le.fit_transform(X_train[col])
    X_test[col] = le.transform(X_test[col])  # reuse the mapping learned on the training split
    label_encoders[col] = le

# Medium/high-cardinality categorical features (12 to 116 unique values here) -> Target Encoding
target_cols = ["DelayCode_Description", "StopCodeDescription", "DeployedFromStation_Name"]

# Fitted with the training target only, so no test-set information leaks into the encoding
te = TargetEncoder(cols=target_cols)
X_train[target_cols] = te.fit_transform(X_train[target_cols], y_train)
X_test[target_cols] = te.transform(X_test[target_cols])  # reuse the encoder fitted on the training split


In [None]:
# Continuous feature(s) to standardize
num_cols_to_scale = ["distance_km"]

# Learn mean / standard deviation from the training split only ...
scaler = StandardScaler()
scaler.fit(X_train[num_cols_to_scale])

# ... and apply the same transformation to both splits
for split in (X_train, X_test):
    split[num_cols_to_scale] = scaler.transform(split[num_cols_to_scale])


## Modelling

## Baseline Linear Regression + Default XGBoost

In [None]:
# Baseline: linear regression fitted on the prepared training features
lr = LinearRegression().fit(X_train, y_train)

# Predict the held-out split and score it
y_pred = lr.predict(X_test)
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"R² Score: {r2:.2f}")

MAE: 78.40
MSE: 12397.93
R² Score: 0.46


In [None]:
# XGBoost regressor with hand-picked settings (no tuning yet)
xgb_params = dict(
    objective="reg:squarederror",
    n_estimators=100,
    learning_rate=0.1,
    max_depth=6,
    random_state=42,
)
xgb_reg = xgb.XGBRegressor(**xgb_params)
xgb_reg.fit(X_train, y_train)

# Predict the held-out split and score it
y_pred = xgb_reg.predict(X_test)
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"R² Score: {r2:.2f}")


MAE: 69.54
MSE: 10431.16
R² Score: 0.55


In [None]:
# Compare training vs. test performance of the fitted XGBoost model.
# Both prediction sets are computed here from the same estimator instead of reusing
# the `y_pred` left behind by an earlier cell, so re-running this cell out of order
# cannot silently mix predictions coming from different models.
y_train_pred = xgb_reg.predict(X_train)
y_test_pred = xgb_reg.predict(X_test)

# Training set performance
train_mae = mean_absolute_error(y_train, y_train_pred)
train_mse = mean_squared_error(y_train, y_train_pred)
train_r2 = r2_score(y_train, y_train_pred)

# Test set performance
test_mae = mean_absolute_error(y_test, y_test_pred)
test_mse = mean_squared_error(y_test, y_test_pred)
test_r2 = r2_score(y_test, y_test_pred)

print(f"Training MAE: {train_mae:.2f}, Test MAE: {test_mae:.2f}")
print(f"Training MSE: {train_mse:.2f}, Test MSE: {test_mse:.2f}")
print(f"Training R²: {train_r2:.2f}, Test R²: {test_r2:.2f}")

Training MAE: 69.18, Test MAE: 69.54
Training MSE: 10257.28, Test MSE: 10431.16
Training R²: 0.55, Test R²: 0.55


## XGBoost Randomized Search

In [None]:
# Candidate values the randomized search samples from
param_grid = {
    "n_estimators": [100, 200, 300, 500],     # number of trees
    "learning_rate": [0.01, 0.05, 0.1, 0.2],  # step size
    "max_depth": [3, 5, 6, 8],                # tree depth
    "subsample": [0.6, 0.8, 1.0],             # fraction of rows used per tree
    "colsample_bytree": [0.6, 0.8, 1.0],      # fraction of features used per tree
    "reg_lambda": [0, 1, 10],                 # L2 regularization
    "reg_alpha": [0, 0.1, 1],                 # L1 regularization
}

# Base estimator handed to the search.
# NOTE(review): this rebinds `xgb_reg` to a new, unfitted regressor, replacing any
# model previously stored under that name.
xgb_reg = xgb.XGBRegressor(objective="reg:squarederror", random_state=42)

# 20 sampled configurations x 5-fold CV, ranked by (negated) mean absolute error
search_settings = dict(
    n_iter=20,
    cv=5,
    scoring="neg_mean_absolute_error",
    n_jobs=-1,
    random_state=42,
    verbose=1,
)
random_search = RandomizedSearchCV(xgb_reg, param_distributions=param_grid, **search_settings)
random_search.fit(X_train, y_train)

# Winning configuration
print("Best Parameters:", random_search.best_params_)

# Score the best estimator found by the search on the held-out split
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_test)
mae, mse, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

print(f"MAE: {mae:.2f}")
print(f"MSE: {mse:.2f}")
print(f"R² Score: {r2:.2f}")


Fitting 5 folds for each of 20 candidates, totalling 100 fits




Best Parameters: {'subsample': 1.0, 'reg_lambda': 0, 'reg_alpha': 0.1, 'n_estimators': 300, 'max_depth': 8, 'learning_rate': 0.2, 'colsample_bytree': 0.6}
MAE: 67.25
MSE: 9939.09
R² Score: 0.57
