In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pmdarima import auto_arima
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error
from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

In [2]:
data = pd.read_csv("../data/HMIS_DATA_CORRECTED_17_21/mh_dist17_21_with_IDs_date_correction.csv")
data = data[(data['indicator_type'] == 'Total [(A+B) or (C+D)]')]
data['date'] = pd.to_datetime(data['date'])
data = data.set_index('date')
data.index = pd.DatetimeIndex(data.index)

In [3]:
def sarima_forecast_district(series, district_name, seasonal=True, 
                            max_p=3, max_q=3, max_P=2, max_Q=2, m=12):
    """
    Complete SARIMA forecasting with data storage and plot saving
    """
    # Create directory structure
    os.makedirs('SARIMA', exist_ok=True)
    
    # 1. Stationarity Check
    adf_result = adfuller(series.dropna())
    is_stationary = adf_result[1] < 0.05

    # 2. Train-Test Split (80:20)
    train_size = int(len(series) * 0.8)
    train, test = series.iloc[:train_size], series.iloc[train_size:]

    # 3. Auto-SARIMA Modeling
    model = auto_arima(
        train,
        start_p=0, start_q=0,
        max_p=max_p, max_q=max_q,
        d=None,
        start_P=0, start_Q=0,
        max_P=max_P, max_Q=max_Q,
        m=m,
        seasonal=seasonal,
        trace=False,
        error_action='ignore',
        suppress_warnings=True,
        stepwise=True
    )
    best_order = model.order
    best_seasonal_order = model.seasonal_order

    # 4. Model Fitting with SARIMAX
    sarima_model = SARIMAX(train, 
                          order=best_order,
                          seasonal_order=best_seasonal_order)
    model_fit = sarima_model.fit(disp=False)
    
    # 5. Forecasting with Confidence Intervals
    forecast_result = model_fit.get_forecast(steps=len(test))
    forecast = forecast_result.predicted_mean
    conf_int = forecast_result.conf_int()

    # 6. Create forecast DataFrame
    forecast_df = pd.DataFrame({
        'district': district_name,
        'date': test.index,
        'actual': test.values,
        'forecast': forecast.values,
        'lower_ci': conf_int.iloc[:, 0],
        'upper_ci': conf_int.iloc[:, 1]
    })

    # 7. Save forecasts to CSV (append mode)
    forecast_csv_path = os.path.join('SARIMA', 'sarima_forecasts.csv')
    write_header = not os.path.exists(forecast_csv_path)
    forecast_df.to_csv(forecast_csv_path, 
                      mode='a', 
                      header=write_header, 
                      index=False)

    # 8. Calculate and save metrics
    rmse = np.sqrt(mean_squared_error(test, forecast))
    metrics_df = pd.DataFrame([{
        'district': district_name,
        'rmse': rmse,
        'best_order': str(best_order),
        'best_seasonal_order': str(best_seasonal_order),
        'stationary': is_stationary
    }])

    metrics_csv_path = os.path.join('SARIMA', 'sarima_results.csv')
    write_header_metrics = not os.path.exists(metrics_csv_path)
    metrics_df.to_csv(metrics_csv_path, 
                     mode='a', 
                     header=write_header_metrics, 
                     index=False)

    # 9. Save plot
    plt.figure(figsize=(12, 6))
    plt.plot(train, label='Training Data')
    plt.plot(test, label='Actual Values', color='navy')
    plt.plot(forecast, label='Forecast', color='darkorange')
    plt.fill_between(forecast.index, 
                    conf_int.iloc[:, 0],
                    conf_int.iloc[:, 1],
                    color='orange', alpha=0.1)
    plt.title(f'SARIMA{best_order}x{best_seasonal_order} Forecast for {district_name}\nRMSE: {rmse:.2f}')
    plt.legend()
    plt.savefig(os.path.join('SARIMA', f'SARIMA_forecast_{district_name}.png'))
    plt.close()

    return {
        'district': district_name,
        'forecast_df': forecast_df,
        'metrics_df': metrics_df
    }


In [4]:
districts = data['district'].unique()
rmse_values = []

def run_for_each_district():
    results = {}
    
    for district in districts:
        district_data = data[data['district'] == district]
        # ts = district_data["I8"].asfreq('MS')
        ts = district_data["I48"].astype(float).asfreq('MS')

        
        results = sarima_forecast_district(ts, district)
    
        # Show results
        # print("=== Metrics ===")
        # print(results['metrics_df'])
        # print("\n=== Forecast Data ===")
        # print(results['forecast_df'].head())
    
    return results
run_for_each_district()

  warn('Non-stationary starting seasonal autoregressive'
  warn('Non-invertible starting seasonal moving average'
  warn('Non-invertible starting seasonal moving average'
  warn('Non-stationary starting seasonal autoregressive'
  warn('Non-stationary starting seasonal autoregressive'
  warn('Non-invertible starting seasonal moving average'
  warn('Non-invertible starting seasonal moving average'


ValueError: There are no more samples after a first-order seasonal differencing. See http://alkaline-ml.com/pmdarima/seasonal-differencing-issues.html for a more in-depth explanation and potential work-arounds.