In [1]:
# ================================
# Singapore — RF & XGB (log-arrivals)
# ================================
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_absolute_error
from math import sqrt

# ------- 1) Load data -------
# Expect columns: date, visitor_arrivals, (optional) hotel_occupancy, google_trends
df_sg = pd.read_csv("singapore_with_recovery_index.csv", parse_dates=["date"])

# ------- 2) Feature builder (standardized) -------
def build_features(df, use_exogenous=True):
    d = df.copy()
    d = d.sort_values("date").reset_index(drop=True)
    d["date"] = pd.to_datetime(d["date"])
    d = d.set_index("date").asfreq("MS")

    # Target (log1p to be safe even if small values appear)
    d["log_arrivals"] = np.log1p(d["visitor_arrivals"])

    # Lags
    for L in [1, 3, 6, 12]:
        d[f"log_lag{L}"] = d["log_arrivals"].shift(L)

    # Rolling means (SHIFTED to avoid leakage)
    d["log_roll3"] = d["log_arrivals"].rolling(3).mean().shift(1)
    d["log_roll6"] = d["log_arrivals"].rolling(6).mean().shift(1)

    # Month dummies
    d["month"] = d.index.month
    month_dummies = pd.get_dummies(d["month"], prefix="m", drop_first=True)
    d = pd.concat([d, month_dummies], axis=1)

    # Optional exogenous (log1p + lag1)
    if use_exogenous:
        for col in ["hotel_occupancy", "google_trends"]:
            if col in d.columns:
                d[f"{col}_log"] = np.log1p(pd.to_numeric(d[col], errors="coerce"))
                d[f"{col}_lag1"] = d[f"{col}_log"].shift(1)

    return d

# ------- 3) Train / Test split -------
def train_test_split_std(d):
    mask_train = ((d.index.year >= 2017) & (d.index.year <= 2019)) | (d.index.year == 2023)
    mask_test  = (d.index.year == 2024)
    train = d.loc[mask_train].dropna()
    test  = d.loc[mask_test].dropna()
    return train, test

# ------- 4) Metrics (in arrivals space) -------
def evaluate(name, y_true, y_pred):
    r2   = r2_score(y_true, y_pred)
    mae  = mean_absolute_error(y_true, y_pred)
    rmse = sqrt(np.mean((y_true - y_pred)**2))
    mape = float(np.mean(np.abs((y_true - y_pred) / y_true)) * 100)
    print(f"\n{name}")
    print(f"R²   : {r2:.3f}")
    print(f"MAE  : {mae:,.0f}")
    print(f"RMSE : {rmse:,.0f}")
    print(f"MAPE : {mape:.2f}%")
    return {"R2": r2, "MAE": mae, "RMSE": rmse, "MAPE": mape}

# ------- 5) Build features, split, train, evaluate -------
d = build_features(df_sg, use_exogenous=True)
train, test = train_test_split_std(d)

feature_cols = [c for c in train.columns if c.startswith(("log_lag","log_roll","m_","hotel","google"))]
target_col = "log_arrivals"

X_train, y_train = train[feature_cols], train[target_col]
X_test,  y_test  = test[feature_cols],  test[target_col]

# Models
rf = RandomForestRegressor(
    n_estimators=600, max_depth=None, min_samples_leaf=2,
    random_state=42, n_jobs=-1
).fit(X_train, y_train)

xgb = XGBRegressor(
    n_estimators=600, learning_rate=0.05, max_depth=4,
    subsample=0.9, colsample_bytree=0.9, random_state=42
).fit(X_train, y_train)

# Invert log1p to evaluate in arrivals space
def inv(y_log): return np.expm1(y_log)

y_true = inv(y_test)
y_rf   = inv(rf.predict(X_test))
y_xgb  = inv(xgb.predict(X_test))

m_rf  = evaluate("Random Forest (log-arrivals)", y_true, y_rf)
m_xgb = evaluate("XGBoost (log-arrivals)",      y_true, y_xgb)

# ------- 6) Month-by-month table (2024) -------
out = pd.DataFrame({
    "date": X_test.index,
    "actual": y_true.round().astype(int),
    "rf_pred": np.round(y_rf).astype(int),
    "xgb_pred": np.round(y_xgb).astype(int),
})
out["rf_abs_err"]  = (out["actual"] - out["rf_pred"]).abs()
out["rf_ape_%"]    = (out["rf_abs_err"] / out["actual"] * 100).round(2)
out["xgb_abs_err"] = (out["actual"] - out["xgb_pred"]).abs()
out["xgb_ape_%"]   = (out["xgb_abs_err"] / out["actual"] * 100).round(2)

print("\n=== 2024 month-by-month — Singapore ===")
print(out.set_index("date"))



Random Forest (log-arrivals)
R²   : -4.850
MAE  : 70,276
RMSE : 95,456
MAPE : 5.04%

XGBoost (log-arrivals)
R²   : -7.883
MAE  : 104,926
RMSE : 117,628
MAPE : 7.56%

=== 2024 month-by-month — Singapore ===
             actual  rf_pred  xgb_pred  rf_abs_err  rf_ape_%  xgb_abs_err  \
date                                                                        
2024-01-01  1439569  1205982   1203961      233587     16.23       235608   
2024-02-01  1436562  1268328   1367204      168234     11.71        69358   
2024-03-01  1403706  1307979   1381952       95727      6.82        21754   
2024-04-01  1350161  1381276   1260981       31115      2.30        89180   
2024-05-01  1346214  1398739   1272642       52525      3.90        73572   
2024-06-01  1297310  1383042   1253456       85732      6.61        43854   
2024-07-01  1348400  1371825   1261316       23425      1.74        87084   
2024-08-01  1410318  1419662   1265102        9344      0.66       145216   
2024-09-01  1362487  14