# Baseline Models

We make some baseline models. Our method of evaluating our model's performance in this notebook will be on how well it performs against data observed from 2025-01-01 to 2025-12-31. We will consider both RMSE and RSS.
In the process of making these models, it became clear very quickly that the Seasonal Naive Forecast was performing far worse than the Seasonal Average Forecast, and so we focused on the latter.

In [None]:
# import packages

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import gridspec
from matplotlib.patches import Patch
import datetime


## Baseline for Daily Citywide Rat Sightings Data

In [None]:
# Load the cleaned rat-sightings CSV into `rs`; every later citywide cell derives from it.
rs = pd.read_csv(filepath_or_buffer='../scr/data/cleaned_rat_sightings_data/cleaned_rat_sightings.csv')

In [None]:

# Parse the three timestamp columns so they can be compared and grouped as dates.
for date_col in ('created_date', 'closed_date', 'resolution_action_updated_date'):
    rs[date_col] = pd.to_datetime(rs[date_col])


# Keep only sightings created from 2020-01-01 through the end of 2025-12-31.
# The upper bound is exclusive at 2026-01-01: comparing with `<= '2025-12-31'`
# would compare against midnight and drop any sighting time-stamped later on Dec 31.
rs = rs[(rs['created_date'] >= '2020-01-01') & (rs['created_date'] < '2026-01-01')]

# First day of the test period; everything strictly before it is training data.
cut_off = '2025-01-01'

day_before_obj = datetime.datetime.strptime(cut_off, '%Y-%m-%d') - datetime.timedelta(days=1)
# Convert back to string (date only): the last day of the training period.
before_cut_off = day_before_obj.strftime('%Y-%m-%d')

rs_train = rs[rs['created_date'] < cut_off]
rs_test = rs[rs['created_date'] >= cut_off]


### Baseline: Seasonal Average Forecast

Our EDA indicates that the number of rat sightings has seasonality, but no trend. We consider a baseline model of a seasonal average forecast, i.e. we simply forecast the average of all observations at the same point in past seasons. We assume here that each season is 1 year, or 365 days, long.

In [None]:
# Count sightings per calendar day. Normalising to midnight (rather than `.dt.date`)
# keeps the group key as datetime64, so it matches the `pd.date_range` used for
# reindexing without relying on implicit promotion of Python `date` objects.
nrs_test = rs_test.groupby(rs_test['created_date'].dt.normalize()).size()
nrs_train = rs_train.groupby(rs_train['created_date'].dt.normalize()).size()

# Training window: 2020-01-01 up to the day before the cut-off.
full_range = pd.date_range(start="2020-01-01", end=before_cut_off, freq='D')

# reindex and fill missing dates with 0
nrs_train = nrs_train.reindex(full_range, fill_value=0).rename_axis('created_date').reset_index(name='count')

# Test window: the cut-off day through 2025-12-31.
full_range = pd.date_range(start=cut_off, end="2025-12-31", freq='D')

# reindex and fill missing dates with 0
nrs_test = nrs_test.reindex(full_range, fill_value=0).rename_axis('created_date').reset_index(name='count')

def seasonal_average_forecast(data, target_dates, years_back=5, day_window=5):
    """Forecast each target date as the mean count of nearby days in earlier years.

    For every date in ``target_dates`` the forecast is the mean of ``count`` over
    all rows of ``data`` whose year lies in ``[target_year - years_back, target_year)``
    and whose day-of-year is within ``day_window`` days of the target's day-of-year.
    The day-of-year distance wraps around the year boundary, so a target of Jan 1
    also uses the last days of December and a target of Dec 31 also uses the first
    days of January.

    Parameters
    ----------
    data : DataFrame with a ``created_date`` column (anything ``pd.to_datetime``
        accepts) and a numeric ``count`` column.
    target_dates : iterable of ``pd.Timestamp`` (e.g. a ``DatetimeIndex``).
    years_back : number of earlier years to average over.
    day_window : half-width, in days, of the window around the target day-of-year.

    Returns
    -------
    pd.Series indexed by ``target_dates``; NaN where no row falls in the window.
    """
    dates = pd.to_datetime(data["created_date"])
    doy = dates.dt.dayofyear.to_numpy()
    year = dates.dt.year.to_numpy()
    # Length of the year each source row belongs to (needed for the wrap-around).
    source_len = np.where(dates.dt.is_leap_year.to_numpy(), 366, 365)
    counts = data["count"].to_numpy(dtype=float)

    forecasts = []
    for target_date in target_dates:
        target_doy = target_date.dayofyear
        target_year = target_date.year
        target_len = 366 if target_date.is_leap_year else 365

        # Seasonal distance in days: directly, or across New Year in either direction.
        direct = np.abs(doy - target_doy)
        across_back = target_doy + (source_len - doy)      # source near Dec, target near Jan
        across_forward = (target_len - target_doy) + doy   # source near Jan, target near Dec
        distance = np.minimum(direct, np.minimum(across_back, across_forward))

        mask = (year >= target_year - years_back) & (year < target_year) & (distance <= day_window)
        # Guard the empty window explicitly so the result is NaN without a numpy warning.
        forecasts.append(counts[mask].mean() if mask.any() else np.nan)
    return pd.Series(forecasts, index=target_dates)


# define future dates
# number of years to forecast
years_to_forecast = 1

# Forecast one value per day of the test window, starting the day after training ends.
last_date = nrs_train["created_date"].max()
future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=len(nrs_test), freq="D")

# compute seasonal-average forecast
forecast = seasonal_average_forecast(nrs_train, future_dates, years_back=5)

# Residuals are positional: row k of nrs_test corresponds to the k-th forecast day.
residuals = nrs_test['count'] - forecast.values
rmse = np.sqrt((residuals**2).mean())
# RSS is the residual sum of squares, i.e. no square root.
rss = (residuals**2).sum()

fig = plt.figure(figsize=(100,30))

# plot observed data
plt.plot(nrs_train["created_date"], nrs_train["count"], "o", color="b", markersize=10, label="Observed")

# plot forecast
plt.plot(forecast.index, forecast.values, color="black", linewidth=5, linestyle = "-", label="Seasonal Avg Forecast")

# plot held-out observations (lighter markers)
plt.plot(nrs_test["created_date"], nrs_test["count"], "o",color="b", markersize=10, alpha=0.3, label="Observed")

plt.grid(True)
plt.tick_params(axis='x', labelsize=24)
plt.tick_params(axis='y', labelsize=24)

# Append the error metrics to the legend as blank patches carrying the text.
metric_labels = [f"RMSE: {rmse:.6f}", f"RSS: {rss:.6f}"]
handles, labels = plt.gca().get_legend_handles_labels()
handles.extend(Patch(facecolor='white', edgecolor='black', label=text) for text in metric_labels)
labels.extend(metric_labels)

plt.legend(handles=handles, labels=labels, fontsize=36)

plt.title("Daily Rat Sightings in NYC: Seasonal Average Forecast", fontsize=36)
plt.show()

### Baseline: Seasonal Naive Forecast

In [None]:
fig = plt.figure(figsize=(100,30))

# plot observed data
plt.plot(nrs_train["created_date"], nrs_train["count"], "o", color="b", markersize=10, label="Observed")

# Seasonal naive forecast: repeat the 2024 observations shifted forward one year.
# 2024-02-29 is dropped because 2025 has no leap day. `.copy()` makes the shift
# operate on an independent frame instead of a slice of nrs_train.
is_source_day = (nrs_train['created_date'] >= "2024-01-01") & (nrs_train['created_date'] != "2024-02-29")
forecast = nrs_train[is_source_day].copy()
forecast['created_date'] = forecast['created_date'] + pd.DateOffset(years=1)
forecast = forecast.reset_index(drop=True)

plt.plot(forecast['created_date'],forecast['count'], color="black",
    linewidth=5, linestyle = "--", alpha=0.5, label="Seasonal Naive Forecast")

plt.plot(nrs_test["created_date"], nrs_test["count"], "o",
    color="b", markersize=10, alpha=0.3, label="Observed",)

plt.grid(True)
plt.tick_params(axis='x', labelsize=30)
plt.tick_params(axis='y', labelsize=30)

# Positional residuals; numpy raises if the two series ever differ in length
# instead of silently producing NaNs through index alignment.
residuals = nrs_test['count'].to_numpy() - forecast['count'].to_numpy()
rmse = np.sqrt(np.mean(residuals**2))
# RSS is the residual sum of squares, i.e. no square root.
rss = np.sum(residuals**2)

# Append the error metrics to the legend as blank patches carrying the text.
metric_labels = [f"RMSE: {rmse:.6f}", f"RSS: {rss:.6f}"]
handles, labels = plt.gca().get_legend_handles_labels()
handles.extend(Patch(facecolor='white', edgecolor='black', label=text) for text in metric_labels)
labels.extend(metric_labels)

plt.legend(handles=handles, labels=labels, fontsize=36)

plt.title("Daily Rat Sightings in NYC: Seasonal Naive Forecast", fontsize=46)
plt.show()




## Baseline for Weekly Citywide Rat Sightings

### Baseline: Seasonal Average Forecast

### Baseline: Seasonal Naive Forecast

## Baseline for Monthly Citywide Rat Sightings

### Baseline: Seasonal Average Forecast

### Baseline: Seasonal Naive Forecast

## Baseline for Daily Borough Rat Sightings

In [None]:
# Load the pre-aggregated daily-by-borough CSV.
daily_borough_rs = pd.read_csv(filepath_or_buffer="../scr/data/cleaned_rat_sightings_data/daily_borough_rs.csv")

### Baseline: Seasonal Average Forecast

In [None]:
# Test rows: created_date within calendar year 2025 (both bounds inclusive).
in_test_window = (daily_borough_rs['created_date'] >= '2025-01-01') & (daily_borough_rs['created_date'] <= '2025-12-31')
cdate_borough_test = daily_borough_rs[in_test_window]

# Training rows: 2020-01-01 up to, but not including, 2025-01-01.
in_train_window = (daily_borough_rs['created_date'] < '2025-01-01') & (daily_borough_rs['created_date'] >= '2020-01-01')
cdate_borough = daily_borough_rs[in_train_window]


# Borough names are taken from the citywide frame `rs`, skipping missing values
# and the 'Unspecified' placeholder.
boroughs = [b for b in rs['borough'].dropna().unique() if b != 'Unspecified']


def seasonal_average_forecast(data, target_dates, years_back=5, day_window=5):
    """Forecast each target date as the mean count of nearby days in earlier years.

    For every date in ``target_dates`` the forecast is the mean of ``count`` over
    all rows of ``data`` whose year lies in ``[target_year - years_back, target_year)``
    and whose day-of-year is within ``day_window`` days of the target's day-of-year.
    The day-of-year distance wraps around the year boundary, so a target of Jan 1
    also uses the last days of December and a target of Dec 31 also uses the first
    days of January.

    Parameters
    ----------
    data : DataFrame with a ``created_date`` column (anything ``pd.to_datetime``
        accepts) and a numeric ``count`` column.
    target_dates : iterable of ``pd.Timestamp`` (e.g. a ``DatetimeIndex``).
    years_back : number of earlier years to average over.
    day_window : half-width, in days, of the window around the target day-of-year.

    Returns
    -------
    pd.Series indexed by ``target_dates``; NaN where no row falls in the window.
    """
    dates = pd.to_datetime(data["created_date"])
    doy = dates.dt.dayofyear.to_numpy()
    year = dates.dt.year.to_numpy()
    # Length of the year each source row belongs to (needed for the wrap-around).
    source_len = np.where(dates.dt.is_leap_year.to_numpy(), 366, 365)
    counts = data["count"].to_numpy(dtype=float)

    forecasts = []
    for target_date in target_dates:
        target_doy = target_date.dayofyear
        target_year = target_date.year
        target_len = 366 if target_date.is_leap_year else 365

        # Seasonal distance in days: directly, or across New Year in either direction.
        direct = np.abs(doy - target_doy)
        across_back = target_doy + (source_len - doy)      # source near Dec, target near Jan
        across_forward = (target_len - target_doy) + doy   # source near Jan, target near Dec
        distance = np.minimum(direct, np.minimum(across_back, across_forward))

        mask = (
            (year >= target_year - years_back) &
            (year < target_year) &
            (distance <= day_window)
        )

        # Guard the empty window explicitly so the result is NaN without a numpy warning.
        forecasts.append(counts[mask].mean() if mask.any() else np.nan)

    return pd.Series(forecasts, index=target_dates)


# ensure global dataframe is datetime; `assign` returns a new frame, so the
# filtered slice of daily_borough_rs is not written to in place
cdate_borough = cdate_borough.assign(created_date=pd.to_datetime(cdate_borough["created_date"]))

# define future dates
# number of years to forecast
years_to_forecast = 1

last_date = cdate_borough["created_date"].max()
future_dates = pd.date_range(start=last_date + pd.Timedelta(days=1), periods=365 * years_to_forecast, freq="D")

fig = plt.figure(figsize=(50,80))
# One row per borough, so the grid follows the borough list instead of assuming five.
gs = gridspec.GridSpec(len(boroughs), 1, figure=fig, wspace=0.3, hspace=0.3)

colors = ["r", "b", "g", "purple", "orange"]

# create a complete daily date range for the training window (same for every borough)
full_range = pd.date_range(start="2020-01-01", end="2024-12-31", freq="D")

for i, borough in enumerate(boroughs):
    ax = fig.add_subplot(gs[i])
    color = colors[i % len(colors)]

    # Training series for this borough, reindexed to every day with missing days as 0.
    borough_data = (
        cdate_borough[cdate_borough["borough"] == borough]
        .sort_values("created_date")
        .set_index("created_date")
        .reindex(full_range)
        .assign(count=lambda df: df["count"].fillna(0), borough=borough)
        .rename_axis("created_date")
        .reset_index()
    )

    # Held-out observations for this borough, with parsed dates.
    borough_data_test = (
        cdate_borough_test[cdate_borough_test["borough"] == borough]
        .assign(created_date=lambda df: pd.to_datetime(df["created_date"]))
        .sort_values("created_date")
    )

    # compute seasonal-average forecast
    forecast = seasonal_average_forecast(borough_data,future_dates,years_back=5)

    # plot observed data
    ax.plot(borough_data["created_date"], borough_data["count"], "o", color=color, markersize=10, label="Observed")

    # plot forecast
    ax.plot(forecast.index, forecast.values, color="black", linewidth=5, linestyle = "-", label="Seasonal Avg Forecast")

    ax.plot(borough_data_test["created_date"], borough_data_test["count"], "o", color=color, markersize=10, alpha=0.3, label="Observed")

    # Align the actuals to the forecast dates; days without a row count as 0 sightings.
    actual_series = borough_data_test.set_index('created_date')['count']
    actual_aligned = actual_series.reindex(forecast.index, fill_value=0)

    residuals = actual_aligned - forecast.values
    rmse = np.sqrt((residuals**2).mean())
    # RSS is the residual sum of squares, i.e. no square root.
    rss = (residuals**2).sum()

    ax.set_title(f"{borough}", fontsize=35)
    ax.set_xlabel("Date", fontsize=15)
    ax.set_ylabel("Number of Rat Sightings", fontsize=25)
    ax.grid(True)
    ax.set_ylim(0,70)
    ax.tick_params(axis='x', labelsize=24)
    ax.tick_params(axis='y', labelsize=24)
    text_box = Patch(facecolor='white', edgecolor='black', label=f'RMSE: {rmse}')
    text_box2 = Patch(facecolor='white', edgecolor='black', label=f'RSS: {rss}')
    ax.legend(handles=[ax.lines[0], text_box, text_box2], fontsize=22)

plt.suptitle("Daily Rat Sightings in NYC: Seasonal Average Forecast", fontsize=36)
plt.show()





### Baseline: Seasonal Naive Forecast

## Baseline for Weekly Borough Rat Sightings

In [None]:
# Load the pre-aggregated weekly-by-borough CSV.
weekly_borough_rs = pd.read_csv(filepath_or_buffer="../scr/data/cleaned_rat_sightings_data/weekly_borough_rs.csv")

# `created_date` holds a "start/end" range string; keep the part before the first
# "/" and parse it as the start-of-week datetime.
week_range_parts = weekly_borough_rs["created_date"].str.split("/")
weekly_borough_rs["week_start"] = pd.to_datetime(week_range_parts.str[0])

### Baseline: Seasonal Average Forecast

In [None]:
# Split train/test on the week's start date: 2020-2024 for training, 2025 for testing.
week_starts = weekly_borough_rs["week_start"]
is_train_week = (week_starts >= "2020-01-01") & (week_starts < "2025-01-01")
is_test_week = (week_starts >= "2025-01-01") & (week_starts <= "2025-12-31")
train = weekly_borough_rs[is_train_week]
test = weekly_borough_rs[is_test_week]

# Boroughs present in the weekly data, skipping missing values and the "Unspecified" placeholder.
boroughs = [b for b in weekly_borough_rs["borough"].dropna().unique() if b != "Unspecified"]

# Define seasonal average forecast
def seasonal_average_forecast_weekly(data, target_weeks, years_back=5):
    """Forecast each target week as the mean count of the same ISO week in earlier years.

    Both the year and the week number are taken from the ISO calendar. Pairing the
    calendar year with the ISO week number would mis-group weeks that straddle
    New Year (e.g. a week starting 2024-12-30 is ISO week 1 of 2025, not of 2024).

    Parameters
    ----------
    data : DataFrame with a datetime ``week_start`` column and a numeric ``count`` column.
    target_weeks : iterable of ``pd.Timestamp`` week-start dates to forecast.
    years_back : number of earlier ISO years to average over.

    Returns
    -------
    pd.Series indexed by ``target_weeks``; NaN where no matching week exists in ``data``.
    """
    df = data.copy()
    iso = df["week_start"].dt.isocalendar()
    df["year"] = iso["year"]
    df["week"] = iso["week"]

    forecasts = []
    for d in target_weeks:
        target_iso = d.isocalendar()
        target_year = target_iso.year
        target_week = target_iso.week
        mask = (df["year"] >= target_year - years_back) & (df["year"] < target_year) & (df["week"] == target_week)
        forecasts.append(df.loc[mask, "count"].mean())
    return pd.Series(forecasts, index=target_weeks)


future_weeks = test["week_start"].drop_duplicates().sort_values() # future weeks to forecast


fig = plt.figure(figsize=(20, 5 * len(boroughs)))
# Build the grid for this figure; reusing a GridSpec created for an earlier
# figure makes the cell depend on that cell having been run first.
gs = gridspec.GridSpec(len(boroughs), 1, figure=fig, wspace=0.3, hspace=0.3)
colors = ["r", "b", "g", "purple", "orange"]

# plot for each borough

for i, borough in enumerate(boroughs):
    ax = fig.add_subplot(gs[i])
    color = colors[i % len(colors)]

    # Training and test data for this borough, in chronological order
    b_train = train[train["borough"] == borough].sort_values("week_start")
    b_test = test[test["borough"] == borough].sort_values("week_start")

    # forecast
    forecast = seasonal_average_forecast_weekly(b_train, future_weeks, years_back=5)

    # plot the training data
    ax.plot(b_train["week_start"], b_train["count"], "o", color=color, label="Observed (Train)")
    # plot our forecast
    ax.plot(forecast.index, forecast.values, "--", color="black", linewidth=2, label="Seasonal Avg Forecast")
    # plot the testing data
    ax.plot(b_test["week_start"], b_test["count"], "o", color=color, alpha=0.3, label="Observed (Test)")

    # RMSE & RSS to include in legend; weeks without a test row count as 0 sightings
    actual = b_test.set_index("week_start")["count"]
    actual_aligned = actual.reindex(forecast.index, fill_value=0)
    residuals = actual_aligned - forecast.values
    rmse = np.sqrt((residuals**2).mean())
    # RSS is the residual sum of squares, i.e. no square root.
    rss = (residuals**2).sum()

    text_box = Patch(facecolor='white', edgecolor='black', label=f'RMSE: {rmse:.4f}')
    text_box2 = Patch(facecolor='white', edgecolor='black', label=f'RSS: {rss:.4f}')

    ax.set_title(f"{borough}", fontsize=16)
    ax.set_ylabel("Rat Sightings")
    ax.set_xlabel("Week Start")
    ax.grid(True)
    ax.tick_params(axis="x", rotation=45)
    ax.legend(handles=[ax.lines[0], text_box, text_box2])

plt.suptitle("NYC Rat Sightings: Weekly Seasonal Average Forecast", fontsize=20)
plt.show()

### Baseline: Seasonal Naive Forecast

In [None]:
def seasonal_naive_forecast_weekly(train_data, forecast_weeks, lag_years=1):
    """Forecast each week with the count of the same ISO week ``lag_years`` ISO years earlier.

    Both the year and the week number are taken from the ISO calendar. Pairing the
    calendar year with the ISO week number can match two different weeks of the
    same calendar year (e.g. 2024-01-01 and 2024-12-30 are both "week 1") and
    average them together.

    Parameters
    ----------
    train_data : DataFrame with a datetime ``week_start`` column and a numeric ``count`` column.
    forecast_weeks : iterable of ``pd.Timestamp`` week-start dates to forecast.
    lag_years : how many ISO years back to look for the source week.

    Returns
    -------
    pd.Series indexed by ``forecast_weeks``; NaN where the source week is absent.
    """
    df = train_data.copy()
    iso = df["week_start"].dt.isocalendar()
    df["year"] = iso["year"]
    df["week"] = iso["week"]
    forecasts = []
    for d in forecast_weeks:
        target_iso = d.isocalendar()
        target_week = target_iso.week
        source_year = target_iso.year - lag_years
        mask = (df["year"] == source_year) & (df["week"] == target_week)
        # mean collapses any duplicate rows for the same week and yields NaN when none match
        value = df.loc[mask, "count"].mean()
        forecasts.append(value)
    return pd.Series(forecasts, index=forecast_weeks)

future_weeks = test["week_start"].drop_duplicates().sort_values() # future weeks to forecast


fig = plt.figure(figsize=(20, 5 * len(boroughs)))
# Build the grid for this figure; reusing a GridSpec created for an earlier
# figure makes the cell depend on that cell having been run first.
gs = gridspec.GridSpec(len(boroughs), 1, figure=fig, wspace=0.3, hspace=0.3)
colors = ["r", "b", "g", "purple", "orange"]

for i, borough in enumerate(boroughs):
    ax = fig.add_subplot(gs[i])
    color = colors[i % len(colors)]

    # Training and test data for this borough, in chronological order
    b_train = train[train["borough"] == borough].sort_values("week_start")
    b_test = test[test["borough"] == borough].sort_values("week_start")

    # Seasonal naive forecast (use last year)
    forecast = seasonal_naive_forecast_weekly(b_train, future_weeks, lag_years=1)

    # Plot training data
    ax.plot(b_train["week_start"], b_train["count"], "o", color=color, label="Observed (Train)")

    # Plot forecast
    ax.plot(forecast.index, forecast.values, "--", color="black", linewidth=2, label="Seasonal Naive Forecast")

    # Plot test data
    ax.plot(b_test["week_start"], b_test["count"], "o", color=color, alpha=0.3, label="Observed (Test)")

    # align test data to compute errors; weeks without a test row count as 0
    # sightings, matching how the seasonal-average cell treats them
    actual = b_test.set_index("week_start")["count"]
    actual_aligned = actual.reindex(forecast.index, fill_value=0)

    residuals = actual_aligned - forecast
    rmse = np.sqrt((residuals ** 2).mean())
    # RSS is the residual sum of squares, i.e. no square root.
    rss = (residuals ** 2).sum()

    # Legend text boxes
    text_box_rmse = Patch(facecolor="white", edgecolor="black", label=f"RMSE: {rmse:.2f}")
    text_box_rss = Patch(facecolor="white", edgecolor="black", label=f"RSS: {rss:.2f}")

    ax.set_title(f"{borough}", fontsize=16)
    ax.set_ylabel("Rat Sightings")
    ax.set_xlabel("Week Start")
    ax.grid(True)
    ax.tick_params(axis="x", rotation=45)

    ax.legend(handles=[ax.lines[0], ax.lines[1], text_box_rmse, text_box_rss])

plt.suptitle("NYC Rat Sightings: Weekly Seasonal Naive Forecast", fontsize=20)
plt.show()

## Baseline for Monthly Borough Rat Sightings

In [None]:
# Load the pre-aggregated monthly-by-borough CSV.
monthly_borough_rs = pd.read_csv(filepath_or_buffer="../scr/data/cleaned_rat_sightings_data/monthly_borough_rs.csv")

### Baseline: Seasonal Average Forecast

In [None]:
# Split train/test
train = monthly_borough_rs[(monthly_borough_rs["created_date"] >= "2020-01") & (monthly_borough_rs["created_date"] < "2025-01")]
test = monthly_borough_rs[(monthly_borough_rs["created_date"] >= "2025-01") & (monthly_borough_rs["created_date"] <= "2025-12")]

train["created_date"] = pd.to_datetime(train["created_date"])
test["created_date"] = pd.to_datetime(test["created_date"])

In [None]:
# Boroughs present in the monthly data, skipping missing values and the "Unspecified" placeholder.
boroughs = [b for b in monthly_borough_rs["borough"].dropna().unique() if b != "Unspecified"]

In [None]:
# Define seasonal average forecast
def seasonal_average_forecast_monthly(data, target_months, years_back=5):
    """Forecast each target month as the mean count of that calendar month in earlier years.

    Rows of ``data`` contribute to a target when their month equals the target's
    month and their year lies in ``[target_year - years_back, target_year)``.

    Parameters
    ----------
    data : DataFrame with a datetime ``created_date`` column and a numeric ``count`` column.
    target_months : iterable of ``pd.Timestamp`` values to forecast.
    years_back : number of earlier years to average over.

    Returns
    -------
    pd.Series indexed by ``target_months``; NaN where no row matches.
    """
    history = data.copy()
    history["year"] = history["created_date"].dt.year
    history["month"] = history["created_date"].dt.month

    def _mean_for(stamp):
        # Same calendar month, restricted to the `years_back` years before the target.
        first_year = stamp.year - years_back
        same_month = history["month"] == stamp.month
        in_year_span = (history["year"] >= first_year) & (history["year"] < stamp.year)
        return history.loc[in_year_span & same_month, "count"].mean()

    return pd.Series([_mean_for(stamp) for stamp in target_months], index=target_months)

future_months = test["created_date"].drop_duplicates().sort_values() # future months to forecast


fig = plt.figure(figsize=(20, 5 * len(boroughs)))
# Build the grid for this figure; reusing a GridSpec created for an earlier
# figure makes the cell depend on that cell having been run first.
gs = gridspec.GridSpec(len(boroughs), 1, figure=fig, wspace=0.3, hspace=0.3)
colors = ["r", "b", "g", "purple", "orange"]

# plot for each borough

for i, borough in enumerate(boroughs):
    ax = fig.add_subplot(gs[i])
    color = colors[i % len(colors)]

    # Training and test data for this borough, in chronological order
    b_train = train[train["borough"] == borough].sort_values("created_date")
    b_test = test[test["borough"] == borough].sort_values("created_date")

    # forecast
    forecast = seasonal_average_forecast_monthly(b_train, future_months, years_back=5)

    # plot the training data
    ax.plot(b_train["created_date"], b_train["count"], "o", color=color, label="Observed (Train)")
    # plot our forecast
    ax.plot(forecast.index, forecast.values, "--", color="black", linewidth=2, label="Seasonal Avg Forecast")
    # plot the testing data
    ax.plot(b_test["created_date"], b_test["count"], "o", color=color, alpha=0.3, label="Observed (Test)")

    # RMSE & RSS to include in legend; months without a test row count as 0 sightings
    actual = b_test.set_index("created_date")["count"]
    actual_aligned = actual.reindex(forecast.index, fill_value=0)
    residuals = actual_aligned - forecast.values
    rmse = np.sqrt((residuals**2).mean())
    # RSS is the residual sum of squares, i.e. no square root.
    rss = (residuals**2).sum()

    text_box = Patch(facecolor='white', edgecolor='black', label=f'RMSE: {rmse:.2f}')
    text_box2 = Patch(facecolor='white', edgecolor='black', label=f'RSS: {rss:.2f}')

    ax.set_title(f"{borough}", fontsize=16)
    ax.set_ylabel("Rat Sightings")
    ax.set_xlabel("Month")
    ax.grid(True)
    ax.tick_params(axis="x", rotation=45)
    ax.legend(handles=[ax.lines[0], text_box, text_box2])

plt.suptitle("NYC Rat Sightings: Monthly Seasonal Average Forecast", fontsize=20)
plt.show()

### Baseline: Seasonal Naive Forecast

In [None]:
def seasonal_naive_forecast_monthly(train_data, forecast_months, lag_years=1):
    """Forecast each month with the count of the same calendar month ``lag_years`` earlier.

    Parameters
    ----------
    train_data : DataFrame with a datetime ``created_date`` column and a numeric ``count`` column.
    forecast_months : iterable of ``pd.Timestamp`` values to forecast.
    lag_years : how many years back to look for the source month.

    Returns
    -------
    pd.Series indexed by ``forecast_months``; NaN where the source month is absent.
    """
    history = train_data.copy()
    history["year"] = history["created_date"].dt.year
    history["month"] = history["created_date"].dt.month

    def _lagged_value(stamp):
        is_source_row = (history["year"] == stamp.year - lag_years) & (history["month"] == stamp.month)
        # The mean collapses duplicate rows for the same month and is NaN when none match.
        return history.loc[is_source_row, "count"].mean()

    return pd.Series([_lagged_value(stamp) for stamp in forecast_months], index=forecast_months)

# Months to forecast. (Previously misspelled `future_monthss`, which left the loop
# below silently depending on `future_months` from the seasonal-average cell.)
future_months = test["created_date"].drop_duplicates().sort_values()


fig = plt.figure(figsize=(20, 5 * len(boroughs)))
# Build the grid for this figure; reusing a GridSpec created for an earlier
# figure makes the cell depend on that cell having been run first.
gs = gridspec.GridSpec(len(boroughs), 1, figure=fig, wspace=0.3, hspace=0.3)
colors = ["r", "b", "g", "purple", "orange"]

for i, borough in enumerate(boroughs):
    ax = fig.add_subplot(gs[i])
    color = colors[i % len(colors)]

    # Training and test data for this borough, in chronological order
    b_train = train[train["borough"] == borough].sort_values("created_date")
    b_test = test[test["borough"] == borough].sort_values("created_date")

    # Seasonal naive forecast (use last year)
    forecast = seasonal_naive_forecast_monthly(b_train, future_months, lag_years=1)

    # Plot training data
    ax.plot(b_train["created_date"], b_train["count"], "o", color=color, label="Observed (Train)")

    # Plot forecast
    ax.plot(forecast.index, forecast.values, "--", color="black", linewidth=2, label="Seasonal Naive Forecast")

    # Plot test data
    ax.plot(b_test["created_date"], b_test["count"], "o", color=color, alpha=0.3, label="Observed (Test)")

    # align test data to compute errors; months without a test row count as 0
    # sightings, matching how the seasonal-average cell treats them
    actual = b_test.set_index("created_date")["count"]
    actual_aligned = actual.reindex(forecast.index, fill_value=0)

    residuals = actual_aligned - forecast
    rmse = np.sqrt((residuals ** 2).mean())
    # RSS is the residual sum of squares, i.e. no square root.
    rss = (residuals ** 2).sum()

    # Legend text boxes
    text_box_rmse = Patch(facecolor="white", edgecolor="black", label=f"RMSE: {rmse:.2f}")
    text_box_rss = Patch(facecolor="white", edgecolor="black", label=f"RSS: {rss:.2f}")

    ax.set_title(f"{borough}", fontsize=16)
    ax.set_ylabel("Rat Sightings")
    ax.set_xlabel("Month")
    ax.grid(True)
    ax.tick_params(axis="x", rotation=45)

    ax.legend(handles=[ax.lines[0], ax.lines[1], text_box_rmse, text_box_rss])

plt.suptitle("NYC Rat Sightings: Monthly Seasonal Naive Forecast", fontsize=20)
plt.show()

# Data Splits

In [None]:
from sklearn.model_selection import TimeSeriesSplit

Our cross-validation will work as follows. 

We first train on data from 2020-01-01 to 2020-12-31, and test using 2021-01-01 to 2021-12-31. Then we train on 2020-01-01 to 2021-12-31 and test from 2022-01-01 to 2022-12-31. We continue until we test on 2025-01-01 to 2025-12-31. 

In other words, our forecasting horizon is 1 year, and so we have 5 splits (one for each test year from 2021 through 2025).

In [None]:
# Expanding-window splitter for a daily series: each test fold is 365 days.
# Six calendar years of data (2020-2025) with the first year reserved for training
# leaves five one-year test folds (2021 through 2025); a sixth 365-day fold would
# leave only a couple of days of training data for the first split.
kfold = TimeSeriesSplit(n_splits = 5,
                           test_size = 365)

In [None]:
df = rs.copy()
df["created_date"] = pd.to_datetime(df["created_date"])

df = (
    df
    .sort_values("created_date", ascending=True)
    .reset_index(drop=True)
)

# Split over calendar days rather than over individual sighting rows: splitting
# rows yields folds balanced by number of records, whose boundaries do not line
# up with the one-year horizon used for evaluation.
day_index = pd.date_range(
    start=df["created_date"].min().normalize(),
    end=df["created_date"].max().normalize(),
    freq="D",
)

# Convert dates to ordinal numbers (one entry per day; only its length matters to the splitter)
date_index = pd.Series(day_index.map(pd.Timestamp.toordinal))

# Five expanding folds with a 365-day test window each.
tscv = TimeSeriesSplit(n_splits=5, test_size=365)

for fold, (train_idx, test_idx) in enumerate(tscv.split(date_index)):
    train_dates = day_index[train_idx]
    test_dates = day_index[test_idx]
    print(f"Fold {fold + 1}")
    print("Train:", train_dates.min(), "→", train_dates.max())
    print("Test: ", test_dates.min(), "→", test_dates.max())
    print()

# SARIMA Models

We might consider a SARIMA model. However, it is by nature only good at forecasting over a short horizon, so it would not be ideal for forecasting a year out. If we are simply forecasting a month in advance, though, it does a pretty good job.

In [None]:
import statsmodels.api as sm
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [None]:
# Daily sighting counts (only days with at least one sighting appear as rows).
cdate_rat = rs.groupby(rs['created_date'].dt.date).size().reset_index(name='count')

# Draw the ACF and the PACF over two years of lags. The functions imported from
# statsmodels.graphics.tsaplots are called directly, so this cell keeps working
# after the `sm` alias is rebound to statsmodels.tsa.api further down.
for plot_fn, y_label in ((plot_acf, "Autocorrelation"), (plot_pacf, "Partial Autocorrelation")):
    fig, ax = plt.subplots(1, 1, figsize=(55,36))
    plot_fn(cdate_rat['count'],
            lags = 365*2,
            ax=ax)
    plt.xlabel("Lag",fontsize=24)
    plt.ylabel(y_label,fontsize=30)
    plt.xticks(fontsize=24)
    plt.yticks(fontsize=24)
    plt.show()

In [None]:
import statsmodels.tsa.api as sm
from pmdarima import auto_arima

In [None]:
# Count sightings per calendar day. Normalising to midnight (rather than `.dt.date`)
# keeps the group key as datetime64, so it matches the `pd.date_range` used for
# reindexing without relying on implicit promotion of Python `date` objects.
y = rs.groupby(rs['created_date'].dt.normalize()).size()

full_range = pd.date_range(
    start="2020-01-01",
    end=y.index.max(),
    freq='D'
)

# reindex and fill missing dates with 0
y = (
    y
    .reindex(full_range, fill_value=0)
    .rename_axis('created_date')
    .reset_index(name='count')
)

plt.figure(figsize=(30,5))
plt.plot(y['created_date'], y['count'])
plt.show()

In the code block below, we set the cut-off date between our training and test data. If the cut-off is set to 2025-01-01 and the remaining cells are run, one finds that the SARIMA model gives a really bad prediction. 

However, if one changes it to 2025-12-01 (the value currently set below) and tries to forecast only a month out, then the model does an okay job.

In [None]:
# Cut-off for the SARIMA experiment only. It has its own name so that it does not
# overwrite the global `cut_off` ('2025-01-01') that the baseline cells rely on.
sarima_cut_off = "2025-12-01"
y_train = y[y['created_date'] < sarima_cut_off]
y_test = y[y['created_date'] >= sarima_cut_off]

In [None]:
# A SARIMA model is not wise for seasonality of 1 year with daily data. Picking m = 7 is NOT appropriate here.
# NOTE(review): the statement above says m = 7 is not appropriate, yet m = 7 is the
# value used in the auto_arima call below — confirm whether m = 365 was meant.
# See https://alkaline-ml.com/pmdarima/2.0.1/tips_and_tricks.html?highlight=seasonal


# Uncomment the line below to find AIC minimizing values to use for the ARIMA model. 
# In current state of this code, the optimal choice is (2,1,1)(1,0,1).
# `z` is the training counts as a plain numpy array (positional, no date index).
z = y_train['count'].to_numpy()
#auto_arima(z, trace=True, error_action="ignore", stepwise=True, seasonal=True, m = 7)

In [None]:
# Fit a seasonal ARIMA with weekly seasonality (period 7) on the training counts.
# `sm` here is statsmodels.tsa.api.
model = sm.ARIMA(z, order = (2, 1, 1), seasonal_order=(1,0,1,7)).fit()
print(model.summary())

# Forecast the whole test window once and reuse it for the plot and the metrics.
sarima_forecast = np.asarray(model.forecast(len(y_test['created_date'])))

plt.figure(figsize=(40,10))
plt.plot(y_train['created_date'], y_train['count'], label="Training Data")
plt.plot(y_test['created_date'], y_test['count'], label="Test Data")

plt.plot(y_train['created_date'], model.fittedvalues, label="Fitted SARIMA Model")
plt.plot(y_test['created_date'], sarima_forecast, label="SARIMA Forecast")


residuals = y_test['count'].to_numpy() - sarima_forecast
rmse = np.sqrt(np.mean(residuals**2))
# RSS is the residual sum of squares, i.e. no square root.
rss = np.sum(residuals**2)

# Append the error metrics to the legend as blank patches carrying the text.
metric_labels = [f"RMSE: {rmse:.6f}", f"RSS: {rss:.6f}"]
handles, labels = plt.gca().get_legend_handles_labels()
handles.extend(Patch(facecolor='white', edgecolor='black', label=text) for text in metric_labels)
labels.extend(metric_labels)

plt.legend(handles=handles, labels=labels, fontsize=18)
plt.show()

# Nonlinear Regression on Fourier Series Models

In [None]:
# Aliases (not copies) of the daily citywide train/test count frames.
df_train, df_test = nrs_train, nrs_test

We can use nonlinear regression and try to fit a model of the following form
$$
f(t) = \beta_0 + \beta_1t + \sum_{j=1}^T \left[ \phi_j \cos\left(\frac{2j\pi}{365} t\right) + \psi_j\sin\left(\frac{2j\pi}{365} t\right) \right]
$$
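with a linear component to capture any upward trend, and a Fourier polynomial term to capture the periodic part. Note that the code below fits only the intercept $\beta_0$ and the Fourier terms; the linear term $\beta_1 t$ is not included among the features.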

In [None]:
from sklearn.linear_model import LinearRegression


T = 9 # number of Fourier harmonics j = 1, ..., T (each adds one cosine and one sine term)
harmonics = range(1, T + 1)  # inclusive of T; range(1, T) would fit only T - 1 harmonics

# Time is the row position: 0..n_train-1 for training, continuing from n_train for the test window.
train_times = df_train.index.values
cosines = np.stack([np.cos(2*np.pi*train_times*j/365) for j in harmonics], axis = 1)
sines = np.stack([np.sin(2*np.pi*train_times*j/365) for j in harmonics], axis = 1)
train_features = np.concatenate([cosines, sines], axis = 1)

# Ordinary least squares on the Fourier features (the intercept is fitted by LinearRegression).
lr = LinearRegression()
lr.fit(train_features,df_train['count'])
lr_train_preds = lr.predict(train_features)

n_train = len(df_train)
test_times = np.arange(n_train, n_train + len(df_test))
cosines_test = np.stack([np.cos(2*np.pi*test_times*j/365) for j in harmonics], axis=1)
sines_test = np.stack([np.sin(2*np.pi*test_times*j/365) for j in harmonics], axis=1)
test_features = np.concatenate([cosines_test, sines_test], axis=1)

lr_test_preds = lr.predict(test_features)


fig = plt.figure(figsize=(24,8))
plt.plot(df_train['created_date'], df_train['count'], label = 'data')
plt.plot(df_test['created_date'], df_test['count'], label='test data')
plt.plot(df_train['created_date'], lr_train_preds, 'k',label = 'regression')
plt.plot(df_test['created_date'],lr_test_preds,'k', label='regression' )

plt.gcf().autofmt_xdate()

train_r2 = lr.score(train_features, df_train['count'])
test_r2 = lr.score(test_features, df_test['count'])

residuals = df_test['count'].to_numpy() - lr_test_preds
rmse = np.sqrt(np.mean(residuals**2))
# RSS is the residual sum of squares, i.e. no square root.
rss = np.sum(residuals**2)

# Append the metrics to the legend as blank patches carrying the text.
metric_labels = [f'RMSE: {rmse:.4f}', f'RSS: {rss:.4f}',
                 f'R² on training data: {train_r2:.4f}', f'R² on testing data: {test_r2:.4f}']
handles, labels = plt.gca().get_legend_handles_labels()
handles.extend(Patch(facecolor='white', edgecolor='black', label=text) for text in metric_labels)
labels.extend(metric_labels)

plt.legend(handles=handles, labels=labels, fontsize=11)

plt.show()


We observe that even as we increase $T$, the model doesn't do significantly better, as the following table shows. In fact, it starts doing worse on the test data due to overfitting to the training data!

In [None]:
results = []

# Time is the row position: 0..n_train-1 for training, continuing from n_train for
# the test window. Neither depends on T, so both are computed once.
n_train = len(df_train)
train_times = df_train.index.values
test_times = np.arange(n_train, n_train + len(df_test))

# T is the number of Fourier harmonics actually fitted (j = 1, ..., T). The sweep
# covers 1 through 59 harmonics; range(1, T) would fit one fewer than reported.
for T in range(1, 60):
    harmonics = range(1, T + 1)

    cosines = np.stack([np.cos(2*np.pi*train_times*j/365) for j in harmonics], axis = 1)
    sines = np.stack([np.sin(2*np.pi*train_times*j/365) for j in harmonics], axis = 1)
    train_features = np.concatenate([cosines, sines], axis = 1)

    lr = LinearRegression()
    lr.fit(train_features,df_train['count'])
    lr_train_preds = lr.predict(train_features)

    train_r2 = lr.score(train_features, df_train['count'])

    cosines_test = np.stack([np.cos(2*np.pi*test_times*j/365) for j in harmonics], axis=1)
    sines_test = np.stack([np.sin(2*np.pi*test_times*j/365) for j in harmonics], axis=1)
    test_features = np.concatenate([cosines_test, sines_test], axis=1)

    test_r2 = lr.score(test_features, df_test['count'])
    lr_test_preds = lr.predict(test_features)

    residuals = df_test['count'].to_numpy() - lr_test_preds
    rmse = np.sqrt(np.mean(residuals**2))
    # RSS is the residual sum of squares, i.e. no square root.
    rss = np.sum(residuals**2)

    results.append({
        'T': T,
        'Train R²': train_r2,
        'Test R²': test_r2,
        'Test RMSE': rmse,
        'Test RSS': rss
    })

results_df = pd.DataFrame(results)
display(results_df)