In [2]:
# 1. Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from xgboost import XGBRegressor


XGBoostError: 
XGBoost Library (libxgboost.dylib) could not be loaded.
Likely causes:
  * OpenMP runtime is not installed
    - vcomp140.dll or libgomp-1.dll for Windows
    - libomp.dylib for Mac OSX
    - libgomp.so for Linux and other UNIX-like OSes
    Mac OSX users: Run `brew install libomp` to install OpenMP runtime.

  * You are running 32-bit Python on a 64-bit OS

Error message(s): ["dlopen(/Users/tracythandaaye/.pyenv/versions/3.12.0/lib/python3.12/site-packages/xgboost/lib/libxgboost.dylib, 0x0006): Library not loaded: @rpath/libomp.dylib\n  Referenced from: <3438F411-CE75-3B9D-AC3F-79994953DB2D> /Users/tracythandaaye/.pyenv/versions/3.12.0/lib/python3.12/site-packages/xgboost/lib/libxgboost.dylib\n  Reason: tried: '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/usr/local/opt/libomp/lib/libomp.dylib' (no such file), '/Users/tracythandaaye/.pyenv/versions/3.12.0/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/Users/tracythandaaye/.pyenv/versions/3.12.0/lib/libomp.dylib' (no such file), '/Users/tracythandaaye/.pyenv/versions/3.12.0/lib/libomp.dylib' (no such file), '/System/Volumes/Preboot/Cryptexes/OS/Users/tracythandaaye/.pyenv/versions/3.12.0/lib/libomp.dylib' (no such file)"]


In [None]:
# 2. Load and filter data
df = pd.read_csv('cleaned_wait_times.csv')
# Keep only the provincial-level rows that report the 50th percentile.
df_model = df[(df["Metric"] == "50th Percentile") & (df['Reporting level'] == 'Provincial')].copy()

# shift()/rolling() are positional: each Province/Indicator series must be in
# chronological order before lagging. Ordering the whole frame by year first
# also keeps the rows in time order for the positional TimeSeriesSplit used
# during hyperparameter tuning.
df_model = df_model.sort_values(['Data year', 'Province', 'Indicator'])

# 3. Create lag and rolling features
group_keys = ['Province', 'Indicator']
lag_cols = ['Lag_1', 'Lag_2', 'Lag_3']
result_by_series = df_model.groupby(group_keys)['Indicator result']

# Lag_k = the value observed k rows earlier within the same Province/Indicator series.
lag_features = {f'Lag_{k}': result_by_series.shift(k) for k in (1, 2, 3)}
# Mean of the two previous observations of the same series. transform() applies
# shift + rolling *inside* each group and returns a Series aligned to
# df_model's index. (Calling .rolling() on the output of groupby().shift()
# would roll across group boundaries, and that output has a single-level
# index, so reset_index(level=[0, 1]) has no second level to drop.)
lag_features['Rolling_Mean_2'] = result_by_series.transform(
    lambda s: s.shift(1).rolling(2).mean()
)
df_model = df_model.assign(**lag_features)

# Drop rows with NaN (from lags), then renumber 0..n-1 so that df_model, y and
# X_final all share the same index labels and .loc lookups between them line up.
df_model = df_model.dropna(subset=lag_cols + ['Rolling_Mean_2']).reset_index(drop=True)

# 4. Create feature matrix X and target y
categorical_cols = ["Province", "Indicator"]
numeric_cols = ["Data year", "Lag_1", "Lag_2", "Lag_3", "Rolling_Mean_2"]
X = df_model[categorical_cols + numeric_cols]
y = df_model["Indicator result"]

# Encode categorical features
# drop='first' removes one dummy column per categorical feature.
encoder = OneHotEncoder(drop='first', sparse_output=False)
X_encoded = encoder.fit_transform(X[categorical_cols])
X_encoded_df = pd.DataFrame(
    X_encoded,
    columns=encoder.get_feature_names_out(categorical_cols),
    index=X.index,  # keep row labels identical to df_model / y
)
X_final = pd.concat([X_encoded_df, X[numeric_cols]], axis=1)


In [None]:

# 5. Time-aware train/test split
# X_final is built row-for-row from df_model, but its index labels are not
# guaranteed to match df_model's, so label-based .loc[train.index] can raise
# KeyError or pick the wrong rows. Select with positional boolean masks
# instead: they depend only on row order, which the two frames share.
assert len(X_final) == len(df_model) == len(y), "X_final / y must have one row per df_model row"

train_mask = (df_model['Data year'] < 2020).to_numpy()
test_mask = (df_model['Data year'] >= 2020).to_numpy()

train = df_model[train_mask]
test = df_model[test_mask]

# A boolean ndarray passed to .loc selects by position, not by label.
X_train = X_final.loc[train_mask]
y_train = y.loc[train_mask]
X_test = X_final.loc[test_mask]
y_test = y.loc[test_mask]
naive_pred = test['Lag_1']  # naive baseline: predict the previous observation



In [None]:
# 6. XGBoost with hyperparameter tuning
param_grid = {
    'n_estimators': [50, 100],
    'max_depth': [3, 5],
    'learning_rate': [0.05, 0.1]
}
model = XGBRegressor(random_state=42)

# TimeSeriesSplit builds its folds purely from row position (earlier rows
# train, later rows validate). Put the training rows in chronological order
# first so that no validation fold contains years older than its training fold.
# A stable sort keeps the existing relative order of rows within the same year.
chrono_order = np.argsort(X_train['Data year'].to_numpy(), kind='stable')
X_train_chrono = X_train.iloc[chrono_order]
y_train_chrono = y_train.iloc[chrono_order]

tscv = TimeSeriesSplit(n_splits=5)
# GridSearchCV maximises the score, hence the negated MSE.
grid_search = GridSearchCV(model, param_grid, cv=tscv, scoring='neg_mean_squared_error')
grid_search.fit(X_train_chrono, y_train_chrono)
# With the default refit=True, best_estimator_ is refit on all rows passed to fit().
best_model = grid_search.best_estimator_

# 7. Evaluate
# Score the tuned model and the naive baseline (naive_pred) on the same
# held-out targets so the two sets of metrics are directly comparable.
preds = best_model.predict(X_test)
mae = mean_absolute_error(y_test, preds)
mse = mean_squared_error(y_test, preds)
naive_mae = mean_absolute_error(y_test, naive_pred)
naive_mse = mean_squared_error(y_test, naive_pred)

print("Model MAE:", mae)
print("Model MSE:", mse)
print("Naive MAE:", naive_mae)
print("Naive MSE:", naive_mse)

In [None]:
# Optional: overlay actual values, model predictions and the naive baseline
# for the test rows, drawn through the explicit Figure/Axes interface.
fig, ax = plt.subplots(figsize=(10, 5))
ax.plot(y_test.values, label='Actual')
ax.plot(preds, label='Predicted')
ax.plot(naive_pred.values, linestyle='--', label='Naive')
ax.legend()
ax.set_title('Model vs Actual vs Naive')
plt.show()