## CGH ER ARRIVAL PREDICTION - INFERENCE AND APPEND

In [1]:
# IMPORT LIB
import re
import pickle
import numpy as np
import pandas as pd
import fastf1

In [2]:
# CONFIG
MODEL_PKL_PATH = "CGH_ER_SARIMAX_TemporalExog_ProdModel_FullFit_2022to2025.pkl"
MOM_URL = "https://www.mom.gov.sg/employment-practices/public-holidays"

# Concert list (Please update accordingly)
CONCERT_DATES = [
        "2025-10-01", "2025-11-29", "2025-12-31"
]

In [3]:
# LOAD MODEL BUNDLE
def load_cgh_er_model(path=MODEL_PKL_PATH):
    with open(path, "rb") as f:
        return pickle.load(f)

In [4]:
# MOM HOLIDAY SCRAPER 
def clean_date_text(s: str) -> str:
    s = str(s)
    s = re.sub(r"\(.*?\)", "", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s

def fetch_tables():
    return pd.read_html(MOM_URL)

def extract_year(year: int, tables):
    out = []
    date_pattern = re.compile(r"\d{1,2}\s+[A-Za-z]+\s+20\d{2}")

    for df in tables:
        df.columns = [str(c).strip().lower() for c in df.columns]
        if "date" not in df.columns:
            continue

        rows = df[df["date"].astype(str).str.contains(str(year))]
        if rows.empty:
            continue

        rows = rows.copy()
        rows["date_clean"] = rows["date"].astype(str).map(clean_date_text)

        for _, r in rows.iterrows():
            text = r["date_clean"]
            matches = date_pattern.findall(text)
            parts = matches if matches else [text]

            for part in parts:
                dt = pd.to_datetime(part, dayfirst=True, errors="coerce")
                if pd.isna(dt) or dt.year != year:
                    continue
                out.append(dt.normalize())

    out = sorted(set(out))
    return out

def build_holiday_flag(full_index: pd.DatetimeIndex) -> pd.Series:
    full_index = pd.to_datetime(full_index).tz_localize(None).normalize()
    years = sorted(set(full_index.year.tolist()))
    y0, y1 = years[0], years[-1]

    try:
        tables = fetch_tables()
        hol_dates = []
        for y in range(y0, y1 + 1):
            hol_dates.extend(extract_year(y, tables))
        hol_set = set(hol_dates)
        return pd.Series([1 if d in hol_set else 0 for d in full_index], index=full_index, dtype=int)
    except Exception:
        return pd.Series(0, index=full_index, dtype=int)

In [5]:
# CONCERT EVENT WINDOW 
def build_concert_event_window(full_index: pd.DatetimeIndex) -> pd.Series:
    full_index = pd.to_datetime(full_index).tz_localize(None).normalize()
    ds = pd.to_datetime(CONCERT_DATES, errors="coerce")
    ds = pd.to_datetime(ds).tz_localize(None).normalize()
    ds = ds[~pd.isna(ds)]

    base = pd.Series(0, index=full_index, dtype=int)
    base.loc[base.index.isin(set(ds))] = 1

    temp = base.replace(0, np.nan)
    return (
        temp.bfill(limit=1)
            .ffill(limit=2)
            .fillna(0)
            .astype(int)
    )

In [6]:
# F1 EVENT WINDOW 
def build_f1_event_window(full_index: pd.DatetimeIndex) -> pd.Series:
    full_index = pd.to_datetime(full_index).tz_localize(None).normalize()
    years = sorted(set(full_index.year.tolist()))

    all_races = []
    for year in years:
        df_f1_year = fastf1.get_event_schedule(year)
        df_f1_year = df_f1_year[df_f1_year["Country"] == "Singapore"]
        df_f1_year = df_f1_year[["EventDate", "Session1Date"]]
        all_races.append(df_f1_year)

    if len(all_races) == 0:
        return pd.Series(0, index=full_index, dtype=int)

    df_f1 = pd.concat(all_races, ignore_index=True)
    if df_f1.empty:
        return pd.Series(0, index=full_index, dtype=int)

    df_f1["Session1Date"] = pd.to_datetime(df_f1["Session1Date"]).dt.tz_localize(None).dt.normalize()
    df_f1["EventDate"] = pd.to_datetime(df_f1["EventDate"]).dt.tz_localize(None).dt.normalize()

    def get_f1_dates(row):
        return pd.date_range(start=row["Session1Date"], end=row["EventDate"], freq="D")

    f1_dates = df_f1.apply(get_f1_dates, axis=1)
    f1_dates = pd.to_datetime(pd.Series(f1_dates.explode().values)).dt.normalize()
    f1_dates = set(f1_dates.dropna().tolist())

    base = pd.Series(0, index=full_index, dtype=int)
    base.loc[base.index.isin(f1_dates)] = 1

    temp = base.replace(0, np.nan)
    window = (
        temp.bfill(limit=4)
            .ffill(limit=2)
            .fillna(0)
            .astype(int)
    )
    return window

In [7]:
# Get last observed date from model
def get_model_last_observed_date(model_bundle) -> pd.Timestamp:
    lod = model_bundle.get("last_observed_date", None)
    if lod is not None and str(lod).strip() != "" and str(lod).lower() != "none":
        return pd.to_datetime(lod).tz_localize(None).normalize()

    ffe = model_bundle.get("full_fit_end", None)
    if ffe is not None and str(ffe).strip() != "" and str(ffe).lower() != "none":
        return pd.to_datetime(ffe).tz_localize(None).normalize()

    raise ValueError(
        "Cannot detect last observed date from the saved model bundle. "
        "Expected 'last_observed_date' or 'full_fit_end'."
    )

In [8]:
# Detect datetime column + compute daily arrivals
_DT_REGEX = re.compile(r"\b\d{1,2}/\d{1,2}/\d{4}\b")  # e.g., 30/9/2025

def _read_raw_visits(path: str) -> pd.DataFrame:
    try:
        df = pd.read_csv(path, sep=None, engine="python", dtype=str, encoding_errors="ignore")
        if df.shape[1] >= 2:
            return df
    except Exception:
        pass

    # Fallback: assume tab-separated, no header
    df = pd.read_csv(path, sep="\t", engine="python", header=None, dtype=str, encoding_errors="ignore")
    return df

def _pick_best_datetime_column(df: pd.DataFrame) -> str:
    best_col = None
    best_rate = -1.0

    # Work on a sample for speed if huge
    sample = df.copy()
    if len(sample) > 50000:
        sample = sample.sample(50000, random_state=42)

    for col in sample.columns:
        s = sample[col].astype(str).str.strip()

        # quick filter: must contain a date-like token
        has_date = s.str.contains(_DT_REGEX, na=False)
        if has_date.mean() < 0.05:  # too few date-like values
            continue

        parsed = pd.to_datetime(s, errors="coerce", dayfirst=True)
        rate = parsed.notna().mean()

        if rate > best_rate:
            best_rate = rate
            best_col = col

    if best_col is None:
        raise ValueError(
            "Could not detect a datetime column. "
            "If your file has no headers, consider adding one or tell me which column is the visit time."
        )
    return best_col

def count_arrivals_for_date(raw_visits_path: str, observed_date: str) -> int:
    df = _read_raw_visits(raw_visits_path)

    dt_col = _pick_best_datetime_column(df)
    dt = pd.to_datetime(df[dt_col].astype(str).str.strip(), errors="coerce", dayfirst=True)

    obs_date = pd.to_datetime(observed_date).date()
    arrivals = int((dt.dt.date == obs_date).sum())

    if arrivals == 0:
        # Useful debug info if you're accidentally using the wrong date column
        non_null = int(dt.notna().sum())
        raise ValueError(
            f"Arrivals count is 0 for observed_date={observed_date}. "
            f"Detected datetime column='{dt_col}'. Parsed non-null datetimes={non_null}/{len(df)}. "
            f"Double-check the observed_date or which datetime field represents arrival/registration time."
        )

    return arrivals

In [9]:
# BUILD EXOG FOR FUTURE DATES
def build_future_exog(model_bundle, future_index: pd.DatetimeIndex) -> pd.DataFrame:
    future_index = pd.to_datetime(future_index).tz_localize(None).normalize()

    dom_mean = float(model_bundle["dom_mean"])
    dom_std = float(model_bundle["dom_std"])
    exog_cols = list(model_bundle["exogenous_feature_columns"])

    df = pd.DataFrame(index=future_index)

    # temporal features
    df["dow"] = df.index.dayofweek
    df["dom"] = df.index.day
    df["qtr"] = df.index.quarter

    # holiday flag (MOM scrape)
    holiday = build_holiday_flag(future_index)
    df["holiday"] = holiday.values

    # dom_norm using SAVED mean/std
    df["dom_norm"] = (df["dom"] - dom_mean) / (dom_std + 1e-8)

    # holiday_window (+/-2)
    temp_hol = pd.Series(df["holiday"].values, index=future_index).replace(0, np.nan)
    df["holiday_window"] = temp_hol.bfill(limit=2).ffill(limit=2).fillna(0).astype(int).values

    # holiday_post_window_2d (+1 to +2 days after each holiday)
    hol = pd.Series(df["holiday"].values, index=future_index).fillna(0).astype(int)
    post2 = pd.Series(0, index=future_index, dtype=int)
    hol_pos = np.where(hol.values == 1)[0]
    for p in hol_pos:
        start = p + 1
        end = min(p + 2, len(post2) - 1)
        if start <= end:
            post2.iloc[start:end + 1] = 1
    df["holiday_post_window_2d"] = post2.values

    # cny_post_window: detect 2 consecutive holiday days as CNY block, then +1 to +4 days after
    cny_flag = pd.Series(0, index=future_index, dtype=int)
    i = 0
    while i < len(hol) - 1:
        if hol.iloc[i] == 1 and hol.iloc[i + 1] == 1:
            cny_flag.iloc[i:i+2] = 1
            i += 2
        else:
            i += 1

    cny_post = pd.Series(0, index=future_index, dtype=int)
    idx = np.where(cny_flag.values == 1)[0]
    if idx.size > 0:
        breaks = np.where(np.diff(idx) != 1)[0]
        block_ends = np.r_[breaks, len(idx) - 1]
        for be in block_ends:
            end_pos = idx[be]
            cny_post.iloc[end_pos + 1 : min(end_pos + 4 + 1, len(cny_post))] = 1
    df["cny_post_window"] = cny_post.values

    # event windows
    df["f1_event"] = build_f1_event_window(future_index).values
    df["concert_event"] = build_concert_event_window(future_index).values

    # dummies 
    dow_dummies = pd.get_dummies(df["dow"], prefix="dow", drop_first=True)
    qtr_dummies = pd.get_dummies(df["qtr"], prefix="qtr", drop_first=True)

    exog = pd.concat(
        [
            dow_dummies,
            qtr_dummies,
            df[
                [
                    "dom_norm",
                    "holiday",
                    "holiday_window",
                    "holiday_post_window_2d",
                    "cny_post_window",
                    "f1_event",
                    "concert_event",
                ]
            ],
        ],
        axis=1
    )

    exog = (
        exog.apply(pd.to_numeric, errors="coerce")
            .replace([np.inf, -np.inf], np.nan)
            .fillna(0.0)
            .astype("float64")
    )

    # Force columns to match training bundle
    for c in exog_cols:
        if c not in exog.columns:
            exog[c] = 0.0
    exog = exog[exog_cols]

    return exog

In [10]:
# FORECAST
def forecast_cgh_er_arrivals_next_3_days(
    model_bundle,
    observed_date=None,
    new_observed_arrivals=None,
    new_observed_exog=None,
):
    sarimax_res = model_bundle["sarimax_results"]
    has_append = (new_observed_arrivals is not None) and (len(new_observed_arrivals) > 0)

    # Determine anchor date
    if has_append:
        if observed_date is None:
            raise ValueError("If you append new data, you must provide observed_date (e.g. '2025-09-01').")
        last_observed_date = pd.to_datetime(observed_date).tz_localize(None).normalize()

        # Append (no refit)
        arrivals_array = np.asarray(new_observed_arrivals, float)
        exog_array = np.asarray(new_observed_exog, float)

        # Safety reshape (prevents silent shape bugs)
        if arrivals_array.ndim != 1:
            arrivals_array = arrivals_array.reshape(-1)
        if exog_array.ndim == 1:
            exog_array = exog_array.reshape(1, -1)

        sarimax_res = sarimax_res.append(endog=arrivals_array, exog=exog_array, refit=False)
    else:
        # No append -> use model's last date
        last_observed_date = get_model_last_observed_date(model_bundle)

    # Build next 3 target dates
    future_index = pd.date_range(start=last_observed_date + pd.Timedelta(days=1), periods=3, freq="D")

    future_exog = build_future_exog(model_bundle, future_index).values
    fc = sarimax_res.get_forecast(steps=3, exog=future_exog)

    yhat = np.asarray(fc.predicted_mean, dtype=float)
    yhat = np.clip(yhat, 0.0, None)

    return pd.DataFrame(
        {
            "Last_Observed_Date_Used": [last_observed_date] * 3,
            "Target_Date": future_index,
            "Horizon_Days": [1, 2, 3],
            "Predicted_Arrivals": yhat,
        }
    )

In [11]:
# SAVE UPDATED BUNDLE (ONLY IF YOU APPEND)
def save_updated_model_bundle(model_bundle, save_path: str, updated_results=None, last_observed_date=None):
    new_bundle = dict(model_bundle)

    if updated_results is not None:
        new_bundle["sarimax_results"] = updated_results

    if last_observed_date is not None:
        new_bundle["last_observed_date"] = str(pd.to_datetime(last_observed_date).tz_localize(None).normalize())

    with open(save_path, "wb") as f:
        pickle.dump(new_bundle, f)

    return save_path

In [12]:
# EXAMPLE USAGE
# OPTION 1: NO APPEND
# This will forecast the NEXT 3 DAYS after the last date already inside the model.
if __name__ == "__main__":
    bundle = load_cgh_er_model(MODEL_PKL_PATH)
    pred_df = forecast_cgh_er_arrivals_next_3_days(
        bundle,
        observed_date=None,
        new_observed_arrivals=None,
        new_observed_exog=None
    )
    print(pred_df.to_string(index=False))



Last_Observed_Date_Used Target_Date  Horizon_Days  Predicted_Arrivals
             2025-09-30  2025-10-01             1          397.511278
             2025-09-30  2025-10-02             2          391.717559
             2025-09-30  2025-10-03             3          386.677290


In [13]:
# EXAMPLE USAGE
# OPTION 2A: APPEND 1 DAY THEN FORECAST (manual entry)
if __name__ == "__main__":
    bundle = load_cgh_er_model(MODEL_PKL_PATH)

    OBSERVED_DATE = "2025-10-01"  # sample date

    # Build the correct exog row for the observed day
    observed_idx = pd.DatetimeIndex([pd.to_datetime(OBSERVED_DATE)]).tz_localize(None).normalize()
    observed_exog_row = build_future_exog(bundle, observed_idx).iloc[0].values.astype(float)

    pred_df = forecast_cgh_er_arrivals_next_3_days(
        bundle,
        observed_date=OBSERVED_DATE,
        new_observed_arrivals=[405.0],              # replace with actual arrivals
        new_observed_exog=[observed_exog_row],
    )

    print(pred_df.to_string(index=False))

Last_Observed_Date_Used Target_Date  Horizon_Days  Predicted_Arrivals
             2025-10-01  2025-10-02             1          386.928576
             2025-10-01  2025-10-03             2          381.176968
             2025-10-01  2025-10-04             3          339.650892


In [14]:
# EXAMPLE USAGE
# OPTION 2B: APPEND 1 DAY THEN FORECAST
if __name__ == "__main__":
    bundle = load_cgh_er_model(MODEL_PKL_PATH)

    OBSERVED_DATE = "2025-10-01" # sample date

    # Path to your raw visit-level export
    RAW_VISITS_PATH = "cgh_raw_visits_2025_10_01.csv"  # change to your file

    # Derive the daily arrivals count from raw rows
    arrivals_count = count_arrivals_for_date(RAW_VISITS_PATH, OBSERVED_DATE)

    # Build the correct exog row for that observed day
    observed_idx = pd.DatetimeIndex([pd.to_datetime(OBSERVED_DATE)]).tz_localize(None).normalize()
    observed_exog_row = build_future_exog(bundle, observed_idx).iloc[0].values.astype(float)

    # Append + forecast next 3 days
    pred_df = forecast_cgh_er_arrivals_next_3_days(
        bundle,
        observed_date=OBSERVED_DATE,
        new_observed_arrivals=[float(arrivals_count)],
        new_observed_exog=[observed_exog_row],      # correct exog row (NOT zeros)
    )
    print(f"Observed arrivals for {OBSERVED_DATE}: {arrivals_count}")
    print(pred_df.to_string(index=False))

Observed arrivals for 2025-10-01: 409
Last_Observed_Date_Used Target_Date  Horizon_Days  Predicted_Arrivals
             2025-10-01  2025-10-02             1          387.642607
             2025-10-01  2025-10-03             2          381.734022
             2025-10-01  2025-10-04             3          340.200448
