In [4]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from pmdarima import auto_arima
from statsmodels.tsa.statespace.sarimax import SARIMAX
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error

In [5]:
data = pd.read_csv("../data/HMIS_DATA_CORRECTED_17_21/mh_dist17_21_with_IDs_date_correction.csv")
data = data[(data['indicator_type'] == 'Total [(A+B) or (C+D)]')]
data['date'] = pd.to_datetime(data['date'])
data = data.set_index('date')
data.index = pd.DatetimeIndex(data.index)

In [6]:
def sarima_forecast_district(series, district_name, seasonal=True, 
                            max_p=3, max_q=3, max_P=2, max_Q=2, m=12):
    """
    Complete SARIMA forecasting with data storage and plot saving
    """
    # Create directory structure
    os.makedirs('SARIMA', exist_ok=True)
    
    # 1. Stationarity Check
    adf_result = adfuller(series.dropna())
    is_stationary = adf_result[1] < 0.05

    # 2. Train-Test Split (80:20)
    train_size = int(len(series) * 0.8)
    train, test = series.iloc[:train_size], series.iloc[train_size:]

    # 3. Auto-SARIMA Modeling
    model = auto_arima(
        train,
        start_p=0, start_q=0,
        max_p=max_p, max_q=max_q,
        d=None,
        start_P=0, start_Q=0,
        max_P=max_P, max_Q=max_Q,
        m=m,
        seasonal=seasonal,
        trace=False,
        error_action='ignore',
        suppress_warnings=True,
        stepwise=True
    )
    best_order = model.order
    best_seasonal_order = model.seasonal_order

    # 4. Model Fitting with SARIMAX
    sarima_model = SARIMAX(train, 
                          order=best_order,
                          seasonal_order=best_seasonal_order)
    model_fit = sarima_model.fit(disp=False)
    
    # 5. Forecasting with Confidence Intervals
    forecast_result = model_fit.get_forecast(steps=len(test))
    forecast = forecast_result.predicted_mean
    conf_int = forecast_result.conf_int()

    # 6. Create forecast DataFrame
    forecast_df = pd.DataFrame({
        'district': district_name,
        'date': test.index,
        'actual': test.values,
        'forecast': forecast.values,
        'lower_ci': conf_int.iloc[:, 0],
        'upper_ci': conf_int.iloc[:, 1]
    })

    # 7. Save forecasts to CSV (append mode)
    forecast_csv_path = os.path.join('SARIMA', 'sarima_forecasts.csv')
    write_header = not os.path.exists(forecast_csv_path)
    forecast_df.to_csv(forecast_csv_path, 
                      mode='a', 
                      header=write_header, 
                      index=False)

    # 8. Calculate and save metrics
    rmse = np.sqrt(mean_squared_error(test, forecast))
    metrics_df = pd.DataFrame([{
        'district': district_name,
        'rmse': rmse,
        'best_order': str(best_order),
        'best_seasonal_order': str(best_seasonal_order),
        'stationary': is_stationary
    }])

    metrics_csv_path = os.path.join('SARIMA', 'sarima_results.csv')
    write_header_metrics = not os.path.exists(metrics_csv_path)
    metrics_df.to_csv(metrics_csv_path, 
                     mode='a', 
                     header=write_header_metrics, 
                     index=False)

    # 9. Save plot
    plt.figure(figsize=(12, 6))
    plt.plot(train, label='Training Data')
    plt.plot(test, label='Actual Values', color='navy')
    plt.plot(forecast, label='Forecast', color='darkorange')
    plt.fill_between(forecast.index, 
                    conf_int.iloc[:, 0],
                    conf_int.iloc[:, 1],
                    color='orange', alpha=0.1)
    plt.title(f'SARIMA{best_order}x{best_seasonal_order} Forecast for {district_name}\nRMSE: {rmse:.2f}')
    plt.legend()
    plt.savefig(os.path.join('SARIMA', f'SARIMA_forecast_{district_name}.png'))
    plt.close()

    return {
        'district': district_name,
        'forecast_df': forecast_df,
        'metrics_df': metrics_df
    }


In [7]:
districts = data['district'].unique()
rmse_values = []

def run_for_each_district():
    results = {}
    
    for district in districts:
        district_data = data[data['district'] == district]
        ts = district_data["I1"].asfreq('MS')
        
        results = sarima_forecast_district(ts, district)
    
        # Show results
        print("=== Metrics ===")
        print(results['metrics_df'])
        print("\n=== Forecast Data ===")
        print(results['forecast_df'].head())
    
    return results
run_for_each_district()

=== Metrics ===
     district         rmse best_order best_seasonal_order  stationary
0  AHMEDNAGAR  6603.957851  (0, 0, 0)       (0, 0, 0, 12)        True

=== Forecast Data ===
              district       date  actual  forecast      lower_ci  \
2020-06-01  AHMEDNAGAR 2020-06-01    6875       0.0 -13639.468321   
2020-07-01  AHMEDNAGAR 2020-07-01    6478       0.0 -13639.468321   
2020-08-01  AHMEDNAGAR 2020-08-01    5975       0.0 -13639.468321   
2020-09-01  AHMEDNAGAR 2020-09-01    6550       0.0 -13639.468321   
2020-10-01  AHMEDNAGAR 2020-10-01    6471       0.0 -13639.468321   

                upper_ci  
2020-06-01  13639.468321  
2020-07-01  13639.468321  
2020-08-01  13639.468321  
2020-09-01  13639.468321  
2020-10-01  13639.468321  


  warn('Non-invertible starting seasonal moving average'


=== Metrics ===
  district         rmse best_order best_seasonal_order  stationary
0    AKOLA  1608.783964  (0, 0, 0)       (0, 0, 1, 12)        True

=== Forecast Data ===
           district       date  actual  forecast     lower_ci     upper_ci
2020-06-01    AKOLA 2020-06-01    2999   1595.50 -1802.462561  4993.462561
2020-07-01    AKOLA 2020-07-01    3121   1497.25 -1900.712561  4895.212561
2020-08-01    AKOLA 2020-08-01    3029   1373.25 -2024.712561  4771.212561
2020-09-01    AKOLA 2020-09-01    2763   1401.50 -1996.462561  4799.462561
2020-10-01    AKOLA 2020-10-01    2737   1207.00 -2190.962561  4604.962561
=== Metrics ===
   district         rmse best_order best_seasonal_order  stationary
0  AMRAVATI  3612.235128  (0, 0, 1)       (0, 0, 0, 12)        True

=== Forecast Data ===
            district       date  actual     forecast     lower_ci     upper_ci
2020-06-01  AMRAVATI 2020-06-01    3883  1819.029504 -2151.987036  5790.046045
2020-07-01  AMRAVATI 2020-07-01    3459     

  warn('Non-invertible starting seasonal moving average'


=== Metrics ===
  district         rmse best_order best_seasonal_order  stationary
0    JALNA  1604.893244  (0, 0, 1)       (0, 0, 1, 12)        True

=== Forecast Data ===
           district       date  actual     forecast     lower_ci     upper_ci
2020-06-01    JALNA 2020-06-01    3865  3575.682164  1049.863879  6101.500449
2020-07-01    JALNA 2020-07-01    3234  1745.879763 -1798.145524  5289.905050
2020-08-01    JALNA 2020-08-01    3247  1540.391720 -2003.633567  5084.417007
2020-09-01    JALNA 2020-09-01    3401  1683.879248 -1860.146039  5227.904535
2020-10-01    JALNA 2020-10-01    3244  1638.892269 -1905.133018  5182.917556
=== Metrics ===
   district         rmse best_order best_seasonal_order  stationary
0  KOLHAPUR  4806.402573  (0, 0, 1)       (0, 0, 0, 12)        True

=== Forecast Data ===
            district       date  actual     forecast     lower_ci     upper_ci
2020-06-01  KOLHAPUR 2020-06-01    4861  2159.614625 -3231.494117  7550.723367
2020-07-01  KOLHAPUR 2020-

  warn('Non-invertible starting seasonal moving average'


=== Metrics ===
  district         rmse best_order best_seasonal_order  stationary
0    LATUR  2046.262023  (0, 0, 0)       (0, 0, 1, 12)        True

=== Forecast Data ===
           district       date  actual  forecast     lower_ci     upper_ci
2020-06-01    LATUR 2020-06-01    3989   1895.00 -3048.572635  6838.572635
2020-07-01    LATUR 2020-07-01    3694   1672.75 -3270.822635  6616.322635
2020-08-01    LATUR 2020-08-01    3498   1955.00 -2988.572635  6898.572635
2020-09-01    LATUR 2020-09-01    3654   1881.00 -3062.572635  6824.572635
2020-10-01    LATUR 2020-10-01    3751   1805.50 -3138.072635  6749.072635
=== Metrics ===
  district         rmse best_order best_seasonal_order  stationary
0   MUMBAI  1628.089401  (0, 1, 0)       (1, 0, 0, 12)       False

=== Forecast Data ===
           district       date  actual     forecast     lower_ci      upper_ci
2020-06-01   MUMBAI 2020-06-01    2613  1860.485086 -3427.762575   7148.732748
2020-07-01   MUMBAI 2020-07-01    2916  1518.3

  warn('Too few observations to estimate starting parameters%s.'


=== Metrics ===
    district       rmse best_order best_seasonal_order  stationary
0  NANDURBAR  699.34567  (1, 0, 0)       (2, 0, 2, 12)        True

=== Forecast Data ===
             district       date  actual     forecast     lower_ci  \
2020-06-01  NANDURBAR 2020-06-01    4043  4389.918835  3926.878571   
2020-07-01  NANDURBAR 2020-07-01    3577  4207.539700  3566.587872   
2020-08-01  NANDURBAR 2020-08-01    2901  3483.464176  2714.903962   
2020-09-01  NANDURBAR 2020-09-01    3040  3577.102934  2708.007388   
2020-10-01  NANDURBAR 2020-10-01    2588  3612.795272  2660.976968   

               upper_ci  
2020-06-01  4852.959099  
2020-07-01  4848.491528  
2020-08-01  4252.024391  
2020-09-01  4446.198481  
2020-10-01  4564.613576  
=== Metrics ===
  district         rmse best_order best_seasonal_order  stationary
0   NASHIK  1566.498522  (1, 0, 0)       (0, 0, 0, 12)        True

=== Forecast Data ===
           district       date  actual     forecast     lower_ci      upper_c

  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting seasonal moving average'


=== Metrics ===
  district         rmse best_order best_seasonal_order  stationary
0  PALGHAR  1010.223758  (1, 0, 0)       (0, 0, 1, 12)        True

=== Forecast Data ===
           district       date  actual     forecast     lower_ci     upper_ci
2020-06-01  PALGHAR 2020-06-01    5473  6170.275006  5371.163957  6969.386054
2020-07-01  PALGHAR 2020-07-01    4867  6114.720262  4987.364270  7242.076255
2020-08-01  PALGHAR 2020-08-01    4606  5990.849086  4613.489341  7368.208831
2020-09-01  PALGHAR 2020-09-01    4753  5879.497833  4292.928090  7466.067576
2020-10-01  PALGHAR 2020-10-01    4616  5822.686702  4053.155105  7592.218298
=== Metrics ===
   district         rmse best_order best_seasonal_order  stationary
0  PARBHANI  3123.087668  (0, 0, 1)       (0, 0, 0, 12)        True

=== Forecast Data ===
            district       date  actual   forecast     lower_ci     upper_ci
2020-06-01  PARBHANI 2020-06-01    3561  896.61906 -2316.248576  4109.486695
2020-07-01  PARBHANI 2020-07-0

  warn('Non-invertible starting MA parameters found.'


=== Metrics ===
  district         rmse best_order best_seasonal_order  stationary
0  SOLAPUR  6585.335815  (0, 0, 1)       (0, 0, 0, 12)        True

=== Forecast Data ===
           district       date  actual     forecast      lower_ci  \
2020-06-01  SOLAPUR 2020-06-01    6951  4137.406571  -3111.205369   
2020-07-01  SOLAPUR 2020-07-01    6858     0.000000 -10122.133831   
2020-08-01  SOLAPUR 2020-08-01    6356     0.000000 -10122.133831   
2020-09-01  SOLAPUR 2020-09-01    6758     0.000000 -10122.133831   
2020-10-01  SOLAPUR 2020-10-01    6321     0.000000 -10122.133831   

                upper_ci  
2020-06-01  11386.018511  
2020-07-01  10122.133831  
2020-08-01  10122.133831  
2020-09-01  10122.133831  
2020-10-01  10122.133831  
=== Metrics ===
  district          rmse best_order best_seasonal_order  stationary
0    THANE  13841.070297  (0, 0, 1)       (0, 0, 0, 12)        True

=== Forecast Data ===
           district       date  actual     forecast      lower_ci  \
2020-0

  warn('Non-invertible starting MA parameters found.'
  warn('Too few observations to estimate starting parameters%s.'


=== Metrics ===
  district        rmse best_order best_seasonal_order  stationary
0   WARDHA  609.580748  (0, 0, 1)       (0, 0, 2, 12)        True

=== Forecast Data ===
           district       date  actual     forecast    lower_ci     upper_ci
2020-06-01   WARDHA 2020-06-01    1435  1333.098187  505.751652  2160.444721
2020-07-01   WARDHA 2020-07-01    1452  1108.411428  -49.464432  2266.287288
2020-08-01   WARDHA 2020-08-01    1358   845.023458 -312.852402  2002.899318
2020-09-01   WARDHA 2020-09-01    1360   792.073099 -365.802761  1949.948959
2020-10-01   WARDHA 2020-10-01    1306   818.453996 -339.421864  1976.329856




=== Metrics ===
  district        rmse best_order best_seasonal_order  stationary
0   WASHIM  158.819219  (1, 0, 1)       (1, 0, 0, 12)        True

=== Forecast Data ===
           district       date  actual     forecast     lower_ci     upper_ci
2020-06-01   WASHIM 2020-06-01    1583  1827.177199  1485.256349  2169.098049
2020-07-01   WASHIM 2020-07-01    1629  1796.934424  1455.009900  2138.858949
2020-08-01   WASHIM 2020-08-01    1502  1644.282046  1302.353846  1986.210246
2020-09-01   WASHIM 2020-09-01    1599  1619.319685  1277.387811  1961.251560
2020-10-01   WASHIM 2020-10-01    1501  1542.513258  1200.577708  1884.448807
=== Metrics ===
   district        rmse best_order best_seasonal_order  stationary
0  YAVATMAL  263.422179  (0, 1, 1)       (0, 0, 0, 12)        True

=== Forecast Data ===
            district       date  actual     forecast     lower_ci     upper_ci
2020-06-01  YAVATMAL 2020-06-01    3616  3838.685775  3168.260226  4509.111324
2020-07-01  YAVATMAL 2020-07-0

ValueError: There are no more samples after a first-order seasonal differencing. See http://alkaline-ml.com/pmdarima/seasonal-differencing-issues.html for a more in-depth explanation and potential work-arounds.