In [1]:
# --- Import Required Libraries ---
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error
import warnings
warnings.filterwarnings("ignore")

In [4]:
# --- Load Dataset ---
data = pd.read_csv("./datasets/ratnapark_pm25_after_imputation.csv")

print("âœ… Dataset Loaded:", data.shape)
display(data.head())

âœ… Dataset Loaded: (26304, 15)


Unnamed: 0.1,Unnamed: 0,PM2.5,YEAR,MO,DY,HR,PS,WS2M,WD2M,WS10M,WD10M,PRECTOTCORR,RH2M,QV2M,T2M
0,0,75.780952,2022,1,1,0,88.03,0.47,317.6,0.77,319.2,0.0,83.08,6.7,8.85
1,1,56.584127,2022,1,1,1,88.0,0.4,310.0,0.64,313.7,0.0,80.28,6.48,8.86
2,2,49.53871,2022,1,1,2,87.96,0.26,319.6,0.46,323.9,0.0,78.29,6.27,8.74
3,3,47.398438,2022,1,1,3,87.93,0.28,300.3,0.46,306.1,0.0,77.32,6.02,8.32
4,4,42.821875,2022,1,1,4,87.91,0.29,329.0,0.51,330.4,0.0,75.66,5.8,8.09


In [5]:
# --- Ensure Date/Time Column Exists ---
# Replace 'Date' with the actual date/time column name in your dataset
date_col = 'Date'  # change if your column name differs
data[date_col] = pd.to_datetime(data[date_col])

# --- Sort and Set as Index ---
data = data.sort_values(by=date_col)
data.set_index(date_col, inplace=True)

# --- Extract Target Series ---
pm_series = data['PM2.5'].asfreq('H')  # hourly frequency, change if daily ('D')

# --- Check for Missing Values ---
# pm_series = pm_series.interpolate()  # smooth fill for time gaps

KeyError: 'Date'

In [None]:




# --- Visualize Time Series ---
plt.figure(figsize=(12,4))
plt.plot(pm_series, label="PM2.5 Concentration")
plt.title("PM2.5 Time Series (Ratna Park)")
plt.xlabel("Time")
plt.ylabel("PM2.5 (Âµg/mÂ³)")
plt.legend()
plt.show()

# --- Train-Test Split (last 20% for testing) ---
train_size = int(len(pm_series) * 0.8)
train, test = pm_series[:train_size], pm_series[train_size:]

# --- SARIMA Model Configuration ---
# order(p,d,q): AR, differencing, MA
# seasonal_order(P,D,Q,s): seasonal ARIMA, where s = seasonal period (24 for hourly, 12 for monthly)
model = SARIMAX(train,
                order=(1,1,1),
                seasonal_order=(1,1,1,24),  # assuming 24-hour seasonality
                enforce_stationarity=False,
                enforce_invertibility=False)

sarima_fit = model.fit(disp=False)

print("\nâœ… SARIMA Model Summary:")
print(sarima_fit.summary())

# --- Forecast on Test Data ---
forecast = sarima_fit.forecast(steps=len(test))

# --- Evaluation Metrics ---
mae = mean_absolute_error(test, forecast)
rmse = np.sqrt(mean_squared_error(test, forecast))

print("\nðŸ“Š Model Performance:")
print(f"MAE  : {mae:.3f}")
print(f"RMSE : {rmse:.3f}")

# --- Plot Actual vs Predicted ---
plt.figure(figsize=(12,4))
plt.plot(train.index, train, label="Train")
plt.plot(test.index, test, label="Actual")
plt.plot(test.index, forecast, label="Predicted", color='red')
plt.title("PM2.5 Actual vs Predicted (SARIMA)")
plt.xlabel("Time")
plt.ylabel("PM2.5 (Âµg/mÂ³)")
plt.legend()
plt.show()

# --- Forecast Future Values (e.g., next 7 days) ---
future_steps = 24 * 7  # 7 days ahead (if hourly)
future_forecast = sarima_fit.forecast(steps=future_steps)

plt.figure(figsize=(12,4))
plt.plot(pm_series, label="Observed")
plt.plot(pd.date_range(pm_series.index[-1], periods=future_steps+1, freq='H')[1:], 
         future_forecast, label="Future Forecast", color='green')
plt.title("Future PM2.5 Forecast (7 Days Ahead)")
plt.xlabel("Time")
plt.ylabel("PM2.5 (Âµg/mÂ³)")
plt.legend()
plt.show()
