In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pmdarima import auto_arima
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error
from statsmodels.tsa.stattools import adfuller


In [2]:
data = pd.read_csv("../data/HMIS_DATA_CORRECTED_17_21/mh_dist17_21_with_IDs_date_correction.csv")
data = data[(data['indicator_type'] == 'Total [(A+B) or (C+D)]')]
data['date'] = pd.to_datetime(data['date'])
data = data.set_index('date')
data.index = pd.DatetimeIndex(data.index)



In [3]:
def arima_forecast_district(series, district_name, seasonal=False, max_p=5, max_q=5):
    """
    Complete ARIMA forecasting with data storage and plot saving
    """
    # Create directory structure
    os.makedirs('ARIMA', exist_ok=True)
    
    # 1. Stationarity Check
    adf_result = adfuller(series.dropna())
    is_stationary = adf_result[1] < 0.05

    # 2. Train-Test Split (80:20)
    train_size = int(len(series) * 0.8)
    train, test = series.iloc[:train_size], series.iloc[train_size:]

    # 3. Auto-ARIMA Modeling
    model = auto_arima(
        train,
        start_p=0, start_q=0,
        max_p=max_p, max_q=max_q,
        d=None,
        seasonal=seasonal,
        trace=False,
        error_action='ignore',
        suppress_warnings=True,
        stepwise=True
    )
    best_order = model.order

    # 4. Model Fitting
    arima_model = ARIMA(train, order=best_order)
    model_fit = arima_model.fit()
    
    # 5. Forecasting with Confidence Intervals
    forecast_result = model_fit.get_forecast(steps=len(test))
    forecast = forecast_result.predicted_mean
    conf_int = forecast_result.conf_int()

    # 6. Create forecast DataFrame
    forecast_df = pd.DataFrame({
        'district': district_name,
        'date': test.index,
        'actual': test.values,
        'forecast': forecast.values,
        'lower_ci': conf_int.iloc[:, 0],
        'upper_ci': conf_int.iloc[:, 1]
    })

    # 7. Save forecasts to CSV (append mode)
    forecast_csv_path = os.path.join('ARIMA', 'arima_forecasts.csv')
    write_header = not os.path.exists(forecast_csv_path)
    forecast_df.to_csv(forecast_csv_path, 
                      mode='a', 
                      header=write_header, 
                      index=False)

    # 8. Calculate and save metrics
    rmse = np.sqrt(mean_squared_error(test, forecast))
    metrics_df = pd.DataFrame([{
        'district': district_name,
        'rmse': rmse,
        'best_order': str(best_order),
        'stationary': is_stationary
    }])

    metrics_csv_path = os.path.join('ARIMA', 'arima_results.csv')
    write_header_metrics = not os.path.exists(metrics_csv_path)
    metrics_df.to_csv(metrics_csv_path, 
                     mode='a', 
                     header=write_header_metrics, 
                     index=False)

    # 9. Save plot
    plt.figure(figsize=(12, 6))
    plt.plot(train, label='Training Data')
    plt.plot(test, label='Actual Values', color='navy')
    plt.plot(forecast, label='Forecast', color='darkorange')
    plt.fill_between(forecast.index, 
                    conf_int.iloc[:, 0],
                    conf_int.iloc[:, 1],
                    color='orange', alpha=0.1)
    plt.title(f'ARIMA{best_order} Forecast for {district_name}\nRMSE: {rmse:.2f}')
    plt.legend()
    plt.savefig(os.path.join('ARIMA', f'ARIMA_forecast_{district_name}.png'))
    plt.close()

    return {
        'district': district_name,
        'forecast_df': forecast_df,
        'metrics_df': metrics_df
    }

In [5]:
districts = data['district'].unique()
rmse_values = []
districts

array(['AHMEDNAGAR', 'AKOLA', 'AMRAVATI', 'AURANGABAD', 'BEED',
       'BHANDARA', 'BULDHANA', 'CHANDRAPUR', 'DHULE', 'GADCHIROLI',
       'GONDIA', 'HINGOLI', 'JALGAON', 'JALNA', 'KOLHAPUR', 'LATUR',
       'MUMBAI', 'NAGPUR', 'NANDED', 'NANDURBAR', 'NASHIK', 'OSMANABAD',
       'PALGHAR', 'PARBHANI', 'PUNE', 'RAIGAD', 'RATNAGIRI', 'SANGLI',
       'SATARA', 'SINDHUDURG', 'SOLAPUR', 'THANE', 'WARDHA', 'WASHIM',
       'YAVATMAL', 'MUMBAI SUBURBAN'], dtype=object)

In [6]:

def run_for_each_district():
    results = {}
    
    for district in districts:
        district_data = data[data['district'] == district]
        ts = district_data["I1"].asfreq('MS')
        
        results = arima_forecast_district(ts, district)
    
        # Show results
        print("=== Metrics ===")
        print(results['metrics_df'])
        print("\n=== Forecast Data ===")
        print(results['forecast_df'].head())
    
    return results
run_for_each_district()

=== Metrics ===
     district        rmse best_order  stationary
0  AHMEDNAGAR  546.308856  (0, 0, 0)        True

=== Forecast Data ===
              district       date  actual     forecast     lower_ci  \
2020-06-01  AHMEDNAGAR 2020-06-01    6875  6938.184219  5882.996413   
2020-07-01  AHMEDNAGAR 2020-07-01    6478  6938.184219  5882.996413   
2020-08-01  AHMEDNAGAR 2020-08-01    5975  6938.184219  5882.996413   
2020-09-01  AHMEDNAGAR 2020-09-01    6550  6938.184219  5882.996413   
2020-10-01  AHMEDNAGAR 2020-10-01    6471  6938.184219  5882.996413   

               upper_ci  
2020-06-01  7993.372026  
2020-07-01  7993.372026  
2020-08-01  7993.372026  
2020-09-01  7993.372026  
2020-10-01  7993.372026  
=== Metrics ===
  district        rmse best_order  stationary
0    AKOLA  284.176113  (0, 0, 0)        True

=== Forecast Data ===
           district       date  actual     forecast     lower_ci    upper_ci
2020-06-01    AKOLA 2020-06-01    2999  2706.236806  2306.445172  3106.0

  warn('Non-stationary starting autoregressive parameters'
  warn('Non-invertible starting MA parameters found.'


=== Metrics ===
   district        rmse best_order  stationary
0  BHANDARA  117.589124  (2, 0, 2)       False

=== Forecast Data ===
            district       date  actual     forecast     lower_ci     upper_ci
2020-06-01  BHANDARA 2020-06-01    1341  1454.647423  1232.346041  1676.948805
2020-07-01  BHANDARA 2020-07-01    1262  1369.914724  1147.492486  1592.336962
2020-08-01  BHANDARA 2020-08-01    1185  1296.726555  1074.447190  1519.005919
2020-09-01  BHANDARA 2020-09-01    1200  1255.773739  1033.748754  1477.798724
2020-10-01  BHANDARA 2020-10-01    1233  1258.633401  1036.704616  1480.562187
=== Metrics ===
   district        rmse best_order  stationary
0  BULDHANA  361.922044  (2, 0, 0)        True

=== Forecast Data ===
            district       date  actual     forecast     lower_ci     upper_ci
2020-06-01  BULDHANA 2020-06-01    4320  4352.093807  3749.293783  4954.893832
2020-07-01  BULDHANA 2020-07-01    3900  4087.309504  3453.745092  4720.873917
2020-08-01  BULDHANA 20

{'district': 'MUMBAI SUBURBAN',
 'forecast_df':                    district       date  actual      forecast      lower_ci  \
 2021-01-01  MUMBAI SUBURBAN 2021-01-01   13242  13038.458054  10138.540247   
 2021-02-01  MUMBAI SUBURBAN 2021-02-01   13444  12745.939612   8873.528387   
 2021-03-01  MUMBAI SUBURBAN 2021-03-01   13294  12487.070685   7997.802391   
 
                 upper_ci  
 2021-01-01  15938.375860  
 2021-02-01  16618.350837  
 2021-03-01  16976.338979  ,
 'metrics_df':           district        rmse best_order  stationary
 0  MUMBAI SUBURBAN  627.123738  (1, 0, 0)        True}

In [16]:
pd.DataFrame(rmse_values)

Unnamed: 0,0,1
0,AHMEDNAGAR,546.308856
1,AKOLA,284.176113
2,472.281209,AMRAVATI
3,AURANGABAD,629.832046
4,453.447573,BEED
5,BHANDARA,117.589124
6,BULDHANA,361.922044
7,CHANDRAPUR,399.417906
8,403.910176,DHULE
9,GADCHIROLI,347.797524
