In [215]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.patches import Patch
from matplotlib import gridspec
from statsmodels.tsa.api import ExponentialSmoothing
import datetime


import warnings
from statsmodels.tools.sm_exceptions import ConvergenceWarning
warnings.simplefilter('ignore', ConvergenceWarning)

In [216]:
def seasonal_average_forecast(data, target_dates, years_back=5, day_window=5):
    df = data.copy()
    df["created_date"] = pd.to_datetime(df["created_date"])
    df["doy"] = df["created_date"].dt.dayofyear
    df["year"] = df["created_date"].dt.year

    forecasts = []
    for target_date in target_dates:
        target_doy = target_date.dayofyear
        target_year = target_date.year
        mask = (
            (df["year"] >= target_year - years_back) &
            (df["year"] < target_year) &
            (np.abs(df["doy"] - target_doy) <= day_window)
        )

        forecasts.append(df.loc[mask, "count"].mean())

    return pd.Series(forecasts, index=target_dates)

In [217]:
metrics_table = pd.DataFrame(columns=['cut_off', 'last_day', 'rmse_hw', 'rss_hw', 'rmse_sa', 'rss_sa', 'rmse_hw_better'])

In [218]:
df = pd.read_csv('../scr/data/cleaned_rat_sightings_data/daily_borough_rs.csv')
df['created_date'] = pd.to_datetime(df['created_date']) 

In [219]:
# rs = pd.read_csv("../scr/data/cleaned_rat_sightings_data/daily_borough_rs.csv")
# rs_test = rs[rs['created_date']>=cut_off]
# rs_test = rs_test[rs_test['created_date']<=last_day]
# rs_train = rs[rs['created_date']<cut_off]
# rs_train = rs_train[rs_train['created_date']>='2020-01-01']

# rs_test["created_date"] = pd.to_datetime(rs_test["created_date"])
# rs_train["created_date"] = pd.to_datetime(rs_train["created_date"])

In [220]:
borough = 'MANHATTAN'

In [221]:
last_cut_off = '2025-12-30'

date_list = pd.date_range(start="2025-01-01", end=last_cut_off, freq='7D').strftime("%Y-%m-%d").tolist()


In [222]:
for cut_off in date_list:
    second_date_list = pd.date_range(start=cut_off, end='2025-12-30', freq='MS').strftime("%Y-%m-%d").tolist()
    for last_day in second_date_list:
        rs = pd.read_csv("../scr/data/cleaned_rat_sightings_data/daily_borough_rs.csv")
        rs_test = rs[rs['created_date']>=cut_off]
        rs_test = rs_test[rs_test['created_date']<=last_day]
        rs_train = rs[rs['created_date']<cut_off]
        rs_train = rs_train[rs_train['created_date']>='2020-01-01']

        rs_test["created_date"] = pd.to_datetime(rs_test["created_date"])
        rs_train["created_date"] = pd.to_datetime(rs_train["created_date"])

        
        day_before_obj = datetime.datetime.strptime(cut_off, '%Y-%m-%d') - datetime.timedelta(days=1)
        # Convert back to string (date only)
        before_cut_off = day_before_obj.strftime('%Y-%m-%d')

        dfb = df[df['borough'] == borough].copy()
        dfb['created_date'] = pd.to_datetime(dfb['created_date'])

        # Fill missing dates with 0 for count
        full_dates = pd.date_range('2020-01-01', last_day, freq='D')
        full_index = pd.MultiIndex.from_product([[borough], full_dates], names=['borough', 'created_date'])

        dfb = dfb.set_index(['borough', 'created_date']).reindex(full_index).fillna({'count': 0}).reset_index()
        dfb['count'] = dfb['count'].astype(int)

        # Split into training and testing data.
        df_train = dfb[dfb['created_date'] < cut_off]
        df_test  = dfb[dfb['created_date'] >= cut_off]

        # Holt-Winters
        model = ExponentialSmoothing(df_train['count'].values, trend='add', seasonal='add', seasonal_periods=365).fit(optimized=True)
        if len(df_test['created_date'])==0:
            break
        forecast = model.forecast(len(df_test))


        rmse_hw = np.sqrt(np.mean((df_test['count'].values - forecast) ** 2))
        rss_hw =  np.sum((df_test['count'].values - forecast)**2)

        full_range = pd.date_range(start="2020-01-01", end=before_cut_off, freq="D")

        test_range = pd.date_range(start=cut_off, end = last_day, freq="D")

        borough_data = (rs_train[rs_train["borough"] == borough]
                        .assign(created_date=lambda df: pd.to_datetime(df["created_date"]))
                        .sort_values("created_date").set_index("created_date"))
        # reindex and fill missing days with 0
        borough_data = (borough_data.reindex(full_range)
                        .assign(count=lambda df: df["count"].fillna(0),borough=borough)
                        .rename_axis("created_date")
                        .reset_index())
            
        borough_data_test = (rs_test[rs_test["borough"] == borough]
                        .assign(created_date=lambda df: pd.to_datetime(df["created_date"]))
                        .sort_values("created_date").set_index("created_date"))

        borough_data_test = (borough_data_test.reindex(test_range)
                        .assign(count=lambda df: df["count"].fillna(0),borough=borough)
                        .rename_axis("created_date")
                        .reset_index())


        last_date = rs_train["created_date"].max()
        future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=len(borough_data_test), freq="D")
            
        # compute seasonal-average forecast
        forecast = seasonal_average_forecast(borough_data,future_dates,years_back=5)

            
        actual_series = borough_data_test.set_index('created_date')['count']
        actual_aligned = actual_series.reindex(forecast.index, fill_value=0)

        rmse_sa = np.sqrt(np.mean((actual_aligned - forecast.values)**2))
        rss_sa = np.sum((actual_aligned - forecast.values)**2)

        metrics_table = pd.concat([metrics_table, pd.DataFrame({'cut_off': cut_off,'last_day': last_day, 
                                                                'rmse_hw': [rmse_hw],
                                                                'rss_hw': [rss_hw], 
                                                                'rmse_sa': [rmse_sa], 
                                                                'rss_sa': [rss_sa],
                                                                'rmse_hw_better': [rmse_hw<rmse_sa]
                                                                })], ignore_index=True)


KeyboardInterrupt: 

In [None]:
metrics_table['rmse_hw_better'].values

array([False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, True, True, False, False, False, False, False,
       True, True, True, True, True, True, True, True, True, True, True,
       True, True, True, True, True, True, True, True, True, True, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       F

In [None]:
metrics_table['rmse_hw_better'].value_counts()

rmse_hw_better
False    263
True      77
Name: count, dtype: int64

In [None]:
metrics_table

Unnamed: 0,cut_off,last_day,rmse_hw,rss_hw,rmse_sa,rss_sa,rmse_hw_better
0,2025-01-01,2025-12-30,6.663361,16161.737847,6.200743,13995.511996,False
1,2025-01-08,2025-12-30,6.568977,15405.072017,6.247257,13933.076648,False
2,2025-01-15,2025-12-30,6.561944,15070.688032,6.285879,13829.294168,False
3,2025-01-22,2025-12-30,6.650889,15172.371568,6.318156,13692.248548,False
4,2025-01-29,2025-12-30,7.008051,16501.894684,6.257913,13158.256152,False
...,...,...,...,...,...,...,...
335,2025-10-29,2025-12-01,4.973307,840.94854,7.402416,1863.055868,True
336,2025-11-05,2025-12-01,4.630339,578.881165,6.631459,1187.358678,True
337,2025-11-12,2025-12-01,4.822891,465.205613,6.508431,847.193388,True
338,2025-11-19,2025-12-01,5.550485,400.50251,5.566992,402.888264,True


In [None]:
metrics_table.drop_duplicates(inplace=True)

## Cross Validated

In [283]:
def seasonal_average_forecast(data, target_dates, years_back=5, day_window=5):
    df = data.copy()
    df["index"] = pd.to_datetime(df["index"])
    df["doy"] = df["index"].dt.dayofyear
    df["year"] = df["index"].dt.year

    forecasts = []
    for target_date in target_dates:
        target_doy = target_date.dayofyear
        target_year = target_date.year
        mask = (
            (df["year"] >= target_year - years_back) &
            (df["year"] < target_year) &
            (np.abs(df["doy"] - target_doy) <= day_window)
        )

        forecasts.append(df.loc[mask, "count"].mean())

    return pd.Series(forecasts, index=target_dates)

In [284]:
metrics_table = pd.DataFrame(columns=['cut_off', 'last_day', 'rmse_hw', 'rss_hw', 'rmse_sa', 'rss_sa', 'rmse_hw_better','difference'])

In [285]:
df = pd.read_csv('../scr/data/cleaned_rat_sightings_data/daily_borough_rs.csv')
df['created_date'] = pd.to_datetime(df['created_date']) 

df = df[df['borough']=='MANHATTAN']

In [286]:
from sklearn.model_selection import TimeSeriesSplit

In [317]:
df["created_date"] = pd.to_datetime(df["created_date"])


# Convert dates to ordinal numbers
date_index = df["created_date"].map(pd.Timestamp.toordinal)

df = (
    df
    .sort_values("created_date", ascending=True)
    .reset_index(drop=True)
)

tscv = TimeSeriesSplit(
    n_splits=273,
    test_size=7
)

for fold, (train_idx, test_idx) in enumerate(tscv.split(date_index)):
    train_dates = df.iloc[train_idx]["created_date"]
    test_dates = df.iloc[test_idx]["created_date"]
    print(f"Fold {fold + 1}")
    print("Train:", train_dates.min(), "→", train_dates.max())
    print("Test: ", test_dates.min(), "→", test_dates.max())
    print()


Fold 1
Train: 2020-01-01 00:00:00 → 2020-10-05 00:00:00
Test:  2020-10-06 00:00:00 → 2020-10-12 00:00:00

Fold 2
Train: 2020-01-01 00:00:00 → 2020-10-12 00:00:00
Test:  2020-10-13 00:00:00 → 2020-10-19 00:00:00

Fold 3
Train: 2020-01-01 00:00:00 → 2020-10-19 00:00:00
Test:  2020-10-20 00:00:00 → 2020-10-26 00:00:00

Fold 4
Train: 2020-01-01 00:00:00 → 2020-10-26 00:00:00
Test:  2020-10-27 00:00:00 → 2020-11-02 00:00:00

Fold 5
Train: 2020-01-01 00:00:00 → 2020-11-02 00:00:00
Test:  2020-11-03 00:00:00 → 2020-11-09 00:00:00

Fold 6
Train: 2020-01-01 00:00:00 → 2020-11-09 00:00:00
Test:  2020-11-10 00:00:00 → 2020-11-16 00:00:00

Fold 7
Train: 2020-01-01 00:00:00 → 2020-11-16 00:00:00
Test:  2020-11-17 00:00:00 → 2020-11-23 00:00:00

Fold 8
Train: 2020-01-01 00:00:00 → 2020-11-23 00:00:00
Test:  2020-11-24 00:00:00 → 2020-11-30 00:00:00

Fold 9
Train: 2020-01-01 00:00:00 → 2020-11-30 00:00:00
Test:  2020-12-01 00:00:00 → 2020-12-07 00:00:00

Fold 10
Train: 2020-01-01 00:00:00 → 2020-12-0

In [318]:
# # for cut_off in date_list:
# #     second_date_list = pd.date_range(start=cut_off, end='2025-12-30', freq='MS').strftime("%Y-%m-%d").tolist()
# #     for last_day in second_date_list:
# for fold, (train_idx, test_idx) in enumerate(tscv.split(date_index)):
#         rs = pd.read_csv("../scr/data/cleaned_rat_sightings_data/daily_borough_rs.csv")
#         rs_test = test_idx
#         rs_train = train_idx

#         rs_test["created_date"] = pd.to_datetime(rs_test["created_date"])
#         rs_train["created_date"] = pd.to_datetime(rs_train["created_date"])

#         day_before_obj = datetime.datetime.strptime(rs_test['created_date'].min(), '%Y-%m-%d') - datetime.timedelta(days=1)
#         # Convert back to string (date only)
#         before_cut_off = day_before_obj.strftime('%Y-%m-%d')

#         dfb = df[df['borough'] == borough].copy()
#         dfb['created_date'] = pd.to_datetime(dfb['created_date'])

#         # Fill missing dates with 0 for count
#         full_dates = pd.date_range('2020-01-01', last_day, freq='D')
#         full_index = pd.MultiIndex.from_product([[borough], full_dates], names=['borough', 'created_date'])

#         dfb = dfb.set_index(['borough', 'created_date']).reindex(full_index).fillna({'count': 0}).reset_index()
#         dfb['count'] = dfb['count'].astype(int)

#         # Split into training and testing data.
#         df_train = dfb[dfb['created_date'] < cut_off]
#         df_test  = dfb[dfb['created_date'] >= cut_off]

#         # Holt-Winters
#         model = ExponentialSmoothing(df_train['count'].values, trend='add', seasonal='add', seasonal_periods=365).fit(optimized=True)
#         if len(df_test['created_date'])==0:
#             break
#         forecast = model.forecast(len(df_test))


#         rmse_hw = np.sqrt(np.mean((df_test['count'].values - forecast) ** 2))
#         rss_hw =  np.sum((df_test['count'].values - forecast)**2)

#         full_range = pd.date_range(start="2020-01-01", end=before_cut_off, freq="D")

#         test_range = pd.date_range(start=cut_off, end = last_day, freq="D")

#         borough_data = (rs_train[rs_train["borough"] == borough]
#                         .assign(created_date=lambda df: pd.to_datetime(df["created_date"]))
#                         .sort_values("created_date").set_index("created_date"))
#         # reindex and fill missing days with 0
#         borough_data = (borough_data.reindex(full_range)
#                         .assign(count=lambda df: df["count"].fillna(0),borough=borough)
#                         .rename_axis("created_date")
#                         .reset_index())
            
#         borough_data_test = (rs_test[rs_test["borough"] == borough]
#                         .assign(created_date=lambda df: pd.to_datetime(df["created_date"]))
#                         .sort_values("created_date").set_index("created_date"))

#         borough_data_test = (borough_data_test.reindex(test_range)
#                         .assign(count=lambda df: df["count"].fillna(0),borough=borough)
#                         .rename_axis("created_date")
#                         .reset_index())


#         last_date = rs_train["created_date"].max()
#         future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=len(borough_data_test), freq="D")
            
#         # compute seasonal-average forecast
#         forecast = seasonal_average_forecast(borough_data,future_dates,years_back=5)

            
#         actual_series = borough_data_test.set_index('created_date')['count']
#         actual_aligned = actual_series.reindex(forecast.index, fill_value=0)

#         rmse_sa = np.sqrt(np.mean((actual_aligned - forecast.values)**2))
#         rss_sa = np.sum((actual_aligned - forecast.values)**2)

#         metrics_table = pd.concat([metrics_table, pd.DataFrame({'cut_off': cut_off,'last_day': last_day, 
#                                                                 'rmse_hw': [rmse_hw],
#                                                                 'rss_hw': [rss_hw], 
#                                                                 'rmse_sa': [rmse_sa], 
#                                                                 'rss_sa': [rss_sa],
#                                                                 'rmse_hw_better': [rmse_hw<rmse_sa]
#                                                                 })], ignore_index=True)


In [319]:
for fold, (train_idx, test_idx) in enumerate(tscv.split(date_index)):

    # Load once per fold
    rs = pd.read_csv(
        "../scr/data/cleaned_rat_sightings_data/daily_borough_rs.csv",
        parse_dates=["created_date"]
    )

    # Properly split using indices
    rs_train = rs.iloc[train_idx].copy()
    rs_test  = rs.iloc[test_idx].copy()

    # Define cutoff from test set
    cut_off = rs_test["created_date"].min()
    last_day = rs_test["created_date"].max()

    before_cut_off = cut_off - pd.Timedelta(days=1)

    # ===============================
    # Borough-specific full time series
    # ===============================
    dfb = rs[rs["borough"] == borough].copy()

    full_dates = pd.date_range(
        start=dfb["created_date"].min(),
        end=last_day,
        freq="D"
    )

    full_index = pd.MultiIndex.from_product(
        [[borough], full_dates],
        names=["borough", "created_date"]
    )

    dfb = (
        dfb.set_index(["borough", "created_date"])
           .reindex(full_index)
           .fillna({"count": 0})
           .reset_index()
    )
    dfb["count"] = dfb["count"].astype(int)

    # Train / test split using cutoff
    df_train = dfb[dfb["created_date"] < cut_off]
    df_test  = dfb[dfb["created_date"] >= cut_off]

    if df_test.empty:
        break

    # ===============================
    # Holt–Winters model
    # ===============================
    if len(df_train['count']) > 365*2:
        hw_model = ExponentialSmoothing(
            df_train["count"].values,
            trend="add",
            seasonal="add",
            seasonal_periods=365
        ).fit(optimized=True)
    else:
        hw_model = ExponentialSmoothing(
            df_train["count"].values,
            trend="add",
            seasonal="add", 
            seasonal_periods=30
        ).fit(optimized=True)
        

    hw_forecast = hw_model.forecast(len(df_test))

    rmse_hw = np.sqrt(np.mean((df_test["count"].values - hw_forecast) ** 2))
    rss_hw  = np.sum((df_test["count"].values - hw_forecast) ** 2)

    # ===============================
    # Seasonal average model
    # ===============================
    full_range = pd.date_range(
        start=df_train["created_date"].min(),
        end=before_cut_off,
        freq="D"
    )

    test_range = pd.date_range(
        start=cut_off,
        end=last_day,
        freq="D"
    )

    borough_train = (
        rs_train[rs_train["borough"] == borough]
        .set_index("created_date")
        .reindex(full_range)
        .assign(count=lambda x: x["count"].fillna(0), borough=borough)
        .reset_index()
    )

    borough_test = (
        rs_test[rs_test["borough"] == borough]
        .set_index("created_date")
        .reindex(test_range)
        .assign(count=lambda x: x["count"].fillna(0), borough=borough)
        .reset_index()
    )

    future_dates = borough_test['index']

    sa_forecast = seasonal_average_forecast(
        borough_train,
        future_dates,
        years_back=5
    )

    actual = borough_test.set_index("index")["count"]
    actual = actual.reindex(sa_forecast.index, fill_value=0)

    rmse_sa = np.sqrt(np.mean((actual.values - sa_forecast.values) ** 2))
    rss_sa  = np.sum((actual.values - sa_forecast.values) ** 2)

    # ===============================
    # Store metrics
    # ===============================
    metrics_table = pd.concat(
        [
            metrics_table,
            pd.DataFrame({
                "cut_off": [cut_off],
                "last_day": [last_day],
                "rmse_hw": [rmse_hw],
                "rss_hw": [rss_hw],
                "rmse_sa": [rmse_sa],
                "rss_sa": [rss_sa],
                "rmse_hw_better": [rmse_hw < rmse_sa],
                'difference' : [rmse_sa-rmse_hw]
            })
        ],
        ignore_index=True
    )

In [320]:
metrics_table

Unnamed: 0,cut_off,last_day,rmse_hw,rss_hw,rmse_sa,rss_sa,rmse_hw_better,difference
0,2020-04-06 00:00:00,2020-04-08 00:00:00,3.859031,44.676371,,,False,
1,2020-04-08 00:00:00,2020-04-09 00:00:00,2.723444,14.834297,,,False,
2,2020-04-10 00:00:00,2020-04-11 00:00:00,4.559046,41.569806,,,False,
3,2020-04-11 00:00:00,2020-04-13 00:00:00,3.84662,44.389454,,,False,
4,2020-04-13 00:00:00,2020-04-14 00:00:00,5.712883,65.274063,,,False,
...,...,...,...,...,...,...,...,...
387,2021-04-06 00:00:00,2021-04-07 00:00:00,12.469653,310.9845,13.354047,356.661157,True,0.884394
388,2021-04-07 00:00:00,2021-04-08 00:00:00,10.470265,219.252914,17.283489,597.438017,True,6.813224
389,2021-04-09 00:00:00,2021-04-10 00:00:00,4.838539,46.82292,7.188719,103.355372,True,2.35018
390,2021-04-10 00:00:00,2021-04-11 00:00:00,8.905092,158.601309,5.613222,63.016529,False,-3.291869


In [321]:
metrics_table = metrics_table.dropna()

metrics_table

Unnamed: 0,cut_off,last_day,rmse_hw,rss_hw,rmse_sa,rss_sa,rmse_hw_better,difference
325,2021-01-02 00:00:00,2021-01-03 00:00:00,4.193778,35.175549,6.464989,83.592156,True,2.271211
326,2021-01-03 00:00:00,2021-01-04 00:00:00,4.407938,38.859838,3.05355,18.648341,False,-1.354388
327,2021-01-04 00:00:00,2021-01-06 00:00:00,16.761875,842.881357,17.313662,899.288722,True,0.551787
328,2021-01-06 00:00:00,2021-01-07 00:00:00,1.178671,2.77853,2.501239,12.512397,True,1.322569
329,2021-01-07 00:00:00,2021-01-09 00:00:00,1.479574,6.567421,7.829097,183.884298,True,6.349523
...,...,...,...,...,...,...,...,...
387,2021-04-06 00:00:00,2021-04-07 00:00:00,12.469653,310.9845,13.354047,356.661157,True,0.884394
388,2021-04-07 00:00:00,2021-04-08 00:00:00,10.470265,219.252914,17.283489,597.438017,True,6.813224
389,2021-04-09 00:00:00,2021-04-10 00:00:00,4.838539,46.82292,7.188719,103.355372,True,2.35018
390,2021-04-10 00:00:00,2021-04-11 00:00:00,8.905092,158.601309,5.613222,63.016529,False,-3.291869


In [322]:
metrics_table.drop_duplicates(inplace=True)
metrics_table['rmse_hw_better'].value_counts()

rmse_hw_better
True     54
False    13
Name: count, dtype: int64

In [323]:
metrics_table['difference'].describe()

count     67.000000
unique    67.000000
top        2.271211
freq       1.000000
Name: difference, dtype: float64

In [324]:
s = metrics_table['difference']

In [325]:
s = pd.to_numeric(s, errors="coerce")
s.describe()

count    67.000000
mean      2.653215
std       2.833228
min      -3.968536
25%       0.898217
50%       2.929405
75%       4.679981
max       7.956062
Name: difference, dtype: float64