In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit


In [2]:
data = pd.read_csv("../data/HMIS_DATA_CORRECTED_17_21/mh_dist17_21_with_IDs_date_correction.csv")
data = data[(data['indicator_type'] == 'Total [(A+B) or (C+D)]')]
data['date'] = pd.to_datetime(data['date'])
data = data.set_index('date')
data.index = pd.DatetimeIndex(data.index)

In [3]:
def bayesian_ridge_regression_lags_only(
    series,
    district_name,
    max_lags=3,
    differencing=True,
    n_iter=300,
    alpha_1=1e-6,
    alpha_2=1e-6,
    lambda_1=1e-6,
    lambda_2=1e-6,
    compute_scores=True,
    random_state=None
):
    """
    Bayesian Ridge Regression for time series forecasting with:
    - Probabilistic uncertainty estimates
    - Automatic complexity control
    - Lag features only
    """
    # Create directory structure
    os.makedirs('BayesianRidgeRegression', exist_ok=True)
    
    # 1. Stationarity and Differencing
    original_series = series.copy()
    d = 0
    if differencing:
        adf_result = adfuller(series.dropna())
        if adf_result[1] > 0.05:
            d = 1
            series = series.diff().dropna()

    # 2. Feature Engineering - Lag features
    df = pd.DataFrame({'y': series.astype(float)})
    for lag in range(1, max_lags + 1):
        df[f'lag_{lag}'] = df['y'].shift(lag).astype(float)
    df = df.dropna()

    # 3. Train-Test Split
    train_size = int(len(df) * 0.8)
    train = df.iloc[:train_size]
    test = df.iloc[train_size:]

    X_train = train.drop(columns=['y'])
    y_train = train['y']
    X_test = test.drop(columns=['y'])
    y_test = test['y']

    # 4. Model Training with Bayesian Ridge
    model = BayesianRidge(
        n_iter=n_iter,
        alpha_1=alpha_1,
        alpha_2=alpha_2,
        lambda_1=lambda_1,
        lambda_2=lambda_2,
        compute_score=compute_scores,
    )
    model.fit(X_train, y_train)

    # 5. Forecasting with Uncertainty
    pred_test, pred_test_std = model.predict(X_test, return_std=True)

    # 6. Inverse Differencing
    if d == 1:
        last_train_value = original_series.iloc[len(original_series) - len(test) - 1]
        pred_test = np.cumsum(pred_test) + last_train_value
        y_test = original_series.iloc[-len(test):]

    # 7. Create forecast DataFrame with uncertainty
    forecast_df = pd.DataFrame({
        'district': district_name,
        'date': y_test.index,
        'actual': y_test.values,
        'forecast_mean': pred_test,
        'forecast_std': pred_test_std
    })

    # 8. Save forecasts
    forecast_csv_path = os.path.join('BayesianRidgeRegression', 'bayesian_forecasts.csv')
    write_header = not os.path.exists(forecast_csv_path)
    forecast_df.to_csv(forecast_csv_path, mode='a', header=write_header, index=False)

    # 9. Calculate metrics and uncertainty statistics
    rmse = np.sqrt(mean_squared_error(forecast_df['actual'], forecast_df['forecast_mean']))
    log_marginal_likelihood = model.scores_[-1] if compute_scores else np.nan
    
    # Calculate coefficient statistics
    coef_means = model.coef_
    coef_stds = np.sqrt(np.diag(model.sigma_))
    
    metrics_df = pd.DataFrame([{
        'district': district_name,
        'rmse': rmse,
        'log_marginal_likelihood': log_marginal_likelihood,
        'coefficient_means': coef_means.tolist(),
        'coefficient_stds': coef_stds.tolist(),
        'intercept_mean': model.intercept_,
        'intercept_std': np.nan,
        'effective_params': model.lambda_ / model.alpha_,
        'differencing': d,
        'n_iter': n_iter
    }])

    metrics_csv_path = os.path.join('BayesianRidgeRegression', 'bayesian_metrics.csv')
    write_header_metrics = not os.path.exists(metrics_csv_path)
    metrics_df.to_csv(metrics_csv_path, mode='a', header=write_header_metrics, index=False)

    # 10. Plotting with uncertainty bands
    plt.figure(figsize=(12, 6))
    plt.plot(original_series.index, original_series.values, label='Original Series')
    plt.plot(forecast_df['date'], forecast_df['forecast_mean'], label='Mean Forecast', color='darkorange')
    plt.fill_between(
        forecast_df['date'],
        forecast_df['forecast_mean'] - 1.96*forecast_df['forecast_std'],
        forecast_df['forecast_mean'] + 1.96*forecast_df['forecast_std'],
        color='orange',
        alpha=0.2,
        label='95% Confidence'
    )
    plt.title(
        f'Bayesian Ridge Forecast for {district_name}\n'
        f'RMSE: {rmse:.2f} | Log ML: {log_marginal_likelihood:.2f}'
    )
    plt.legend()
    plt.savefig(os.path.join('BayesianRidgeRegression', f'bayesian_forecast_{district_name}.png'))
    plt.close()

    return {
        'district': district_name,
        'forecast_df': forecast_df,
        'metrics_df': metrics_df,
        'model': model,
        'coef_means': coef_means,
        'coef_stds': coef_stds,
        'intercept_mean': model.intercept_,
        'intercept_std': np.nan
    }

In [4]:
districts = data['district'].unique()
rmse_values = []

def run_for_each_district():
    results = {}
    
    for district in districts:
        district_data = data[data['district'] == district]
        ts = district_data["I8"].asfreq('MS')
        
        results = bayesian_ridge_regression_lags_only(
            ts,
            district,
            max_lags=4,
            n_iter=500,
            alpha_1=1e-5,
            lambda_1=1e-3
        )
        
    
        # Show results
        print("=== Metrics ===")
        print(results['metrics_df'])
        print("\n=== Forecast Data ===")
        print(results['forecast_df'].head())
    
    return results
run_for_each_district()



=== Metrics ===
     district         rmse  log_marginal_likelihood  \
0  AHMEDNAGAR  1045.384759              -266.157275   

                                   coefficient_means  \
0  [-0.055915173252454466, -0.006594617063588331,...   

                                    coefficient_stds  intercept_mean  \
0  [0.07040739967804341, 0.07050449682920758, 0.0...       96.344732   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      6.115652e+07             1     500  

=== Forecast Data ===
              district       date  actual  forecast_mean  forecast_std
date                                                                  
2020-07-01  AHMEDNAGAR 2020-07-01    5982    6492.087600    612.678322
2020-08-01  AHMEDNAGAR 2020-08-01    5559    6593.683125    611.559116
2020-09-01  AHMEDNAGAR 2020-09-01    5903    6723.393704    604.721683
2020-10-01  AHMEDNAGAR 2020-10-01    5719    6817.608441    605.047265
2020-11-01  AHMEDNAGAR 2020-11-01    5427    693



=== Metrics ===
   district        rmse  log_marginal_likelihood  \
0  AMRAVATI  292.646152              -240.833523   

                                   coefficient_means  \
0  [0.3684088953023246, -0.02632047231256236, -0....   

                                    coefficient_stds  intercept_mean  \
0  [0.136352628553879, 0.14699826003869187, 0.145...      915.156101   

   intercept_std  effective_params  differencing  n_iter  
0            NaN     640834.074308             0     500  

=== Forecast Data ===
            district       date  actual  forecast_mean  forecast_std
date                                                                
2020-07-01  AMRAVATI 2020-07-01  1905.0    2216.932911    332.368996
2020-08-01  AMRAVATI 2020-08-01  1776.0    2127.382064    318.792893
2020-09-01  AMRAVATI 2020-09-01  1794.0    2115.743981    308.944319
2020-10-01  AMRAVATI 2020-10-01  1814.0    2029.386770    298.231796
2020-11-01  AMRAVATI 2020-11-01  1896.0    2009.075392    296.3942



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0     BEED  596.362735              -238.333383   

                                   coefficient_means  \
0  [-0.005286221023545177, -0.0006380683692904613...   

                                    coefficient_stds  intercept_mean  \
0  [0.025148835180151558, 0.02515016700752817, 0....       50.027619   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      1.106426e+08             1     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01     BEED 2020-07-01    3407    3503.759268    268.592213
2020-08-01     BEED 2020-08-01    2907    3553.050828    268.124848
2020-09-01     BEED 2020-09-01    3136    3603.789146    268.174427
2020-10-01     BEED 2020-10-01    3117    3654.397620    268.180197
2020-11-01     BEED 2020-11-01    2842    3703.163969    267.991545
=== Me



=== Metrics ===
   district        rmse  log_marginal_likelihood  \
0  BULDHANA  188.372013              -226.861608   

                                   coefficient_means  \
0  [-0.19418255354068545, -0.06683891318722619, 0...   

                                    coefficient_stds  intercept_mean  \
0  [0.1116713480544039, 0.11302558480038329, 0.11...       27.416055   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      1.458839e+06             1     500  

=== Forecast Data ===
            district       date  actual  forecast_mean  forecast_std
date                                                                
2020-07-01  BULDHANA 2020-07-01    1439    1532.948691    187.122351
2020-08-01  BULDHANA 2020-08-01    1345    1579.217526    186.312541
2020-09-01  BULDHANA 2020-09-01    1602    1630.849234    186.415162
2020-10-01  BULDHANA 2020-10-01    1459    1613.551684    186.695794
2020-11-01  BULDHANA 2020-11-01    1389    1650.160172    187.0590



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0    DHULE  679.649229              -245.775946   

                                   coefficient_means  \
0  [-0.17826493238467264, -0.10618656256634618, 0...   

                                    coefficient_stds  intercept_mean  \
0  [0.11619745497694281, 0.11908945340657139, 0.1...       67.158842   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      4.103243e+06             1     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01    DHULE 2020-07-01    3317    3574.484502    329.696409
2020-08-01    DHULE 2020-08-01    2764    3718.012836    330.377316
2020-09-01    DHULE 2020-09-01    3344    3890.461809    335.283148
2020-10-01    DHULE 2020-10-01    3020    3905.606630    335.896388
2020-11-01    DHULE 2020-11-01    2908    3944.453038    335.093475
=== Me



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0   GONDIA  119.867312              -225.049381   

                                   coefficient_means  \
0  [0.3795506593733345, -0.0221733431181515, 0.09...   

                                    coefficient_stds  intercept_mean  \
0  [0.1390737809972628, 0.14817134556889042, 0.14...      908.999293   

   intercept_std  effective_params  differencing  n_iter  
0            NaN     320866.547263             0     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01   GONDIA 2020-07-01  1454.0    1461.817659    351.246757
2020-08-01   GONDIA 2020-08-01  1331.0    1429.817023    348.992316
2020-09-01   GONDIA 2020-09-01  1309.0    1369.915249    341.614159
2020-10-01   GONDIA 2020-10-01  1341.0    1361.510003    327.141367
2020-11-01   GONDIA 2020-11-01  1444.0    1372.504608    316.648291
=== Me



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0    JALNA  226.509423              -233.514934   

                                   coefficient_means  \
0  [0.5813788269446596, -0.05528607363758603, 0.2...   

                                    coefficient_stds  intercept_mean  \
0  [0.14258469008641025, 0.16417381130412506, 0.1...      818.194486   

   intercept_std  effective_params  differencing  n_iter  
0            NaN     215002.571532             0     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01    JALNA 2020-07-01  2128.0    1840.119262    384.718927
2020-08-01    JALNA 2020-08-01  1604.0    1943.152000    403.723260
2020-09-01    JALNA 2020-09-01  1731.0    1570.480138    394.494363
2020-10-01    JALNA 2020-10-01  1698.0    1745.297468    383.752306
2020-11-01    JALNA 2020-11-01  1869.0    1555.008248    379.067572
=== Me



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0    LATUR  563.170007              -241.805607   

                                   coefficient_means  \
0  [-0.0013529203558331736, -0.000404701609560308...   

                                    coefficient_stds  intercept_mean  \
0  [0.013271728493739266, 0.013271823776079958, 0...       70.628176   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      4.968853e+08             1     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01    LATUR 2020-07-01    3500    3854.291187    296.815365
2020-08-01    LATUR 2020-08-01    3274    3925.311167    296.825355
2020-09-01    LATUR 2020-09-01    3634    3996.445857    296.777769
2020-10-01    LATUR 2020-10-01    3408    4066.860867    296.807583
2020-11-01    LATUR 2020-11-01    3363    4137.746625    296.821167
=== Me



=== Metrics ===
  district         rmse  log_marginal_likelihood  \
0   NAGPUR  1230.944181              -272.107458   

                                   coefficient_means  \
0  [-0.2743230018280001, -0.18557832148219133, 0....   

                                    coefficient_stds  intercept_mean  \
0  [0.13553686900015208, 0.1402494378087789, 0.14...       85.241813   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      8.768603e+06             1     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01   NAGPUR 2020-07-01    5816    5469.592915    730.429052
2020-08-01   NAGPUR 2020-08-01    4461    5617.495414    695.563832
2020-09-01   NAGPUR 2020-09-01    4485    6137.772217    722.650564
2020-10-01   NAGPUR 2020-10-01    4837    6482.238350    704.870027
2020-11-01   NAGPUR 2020-11-01    5721    6285.262388    705.551960
=== 



=== Metrics ===
    district         rmse  log_marginal_likelihood  \
0  NANDURBAR  1375.567976              -241.223893   

                                   coefficient_means  \
0  [-0.3011047904314968, 0.15918270595637454, 0.0...   

                                    coefficient_stds  intercept_mean  \
0  [0.13470760184092792, 0.14315908802617477, 0.1...       66.821082   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      1.543506e+06             1     500  

=== Forecast Data ===
             district       date  actual  forecast_mean  forecast_std
date                                                                 
2020-07-01  NANDURBAR 2020-07-01    3193    3618.157898    284.730711
2020-08-01  NANDURBAR 2020-08-01    2585    3759.827886    288.050139
2020-09-01  NANDURBAR 2020-09-01    2762    3949.565626    299.953359
2020-10-01  NANDURBAR 2020-10-01    2322    3862.816565    293.460948
2020-11-01  NANDURBAR 2020-11-01    2311    4086.488643 



=== Metrics ===
    district        rmse  log_marginal_likelihood  \
0  OSMANABAD  449.430486              -234.423022   

                                   coefficient_means  \
0  [-0.33728557059304964, -0.47210028604208437, -...   

                                    coefficient_stds  intercept_mean  \
0  [0.1547070035667847, 0.1637848707880108, 0.159...        25.66973   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      422738.83042             1     500  

=== Forecast Data ===
             district       date  actual  forecast_mean  forecast_std
date                                                                 
2020-07-01  OSMANABAD 2020-07-01    1856    1954.675659    227.497190
2020-08-01  OSMANABAD 2020-08-01    1675    2173.085101    227.304687
2020-09-01  OSMANABAD 2020-09-01    1793    2351.847720    232.986029
2020-10-01  OSMANABAD 2020-10-01    1701    2434.523867    232.039977
2020-11-01  OSMANABAD 2020-11-01    1857    2452.070324   



=== Metrics ===
   district        rmse  log_marginal_likelihood  \
0  PARBHANI  201.608717              -214.812099   

                                   coefficient_means  \
0  [-0.015670141715620343, -0.007627041502659375,...   

                                    coefficient_stds  intercept_mean  \
0  [0.048225267205823215, 0.04812325644001451, 0....       15.109423   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      7.064381e+06             1     500  

=== Forecast Data ===
            district       date  actual  forecast_mean  forecast_std
date                                                                
2020-07-01  PARBHANI 2020-07-01    1441    1559.512176    133.635844
2020-08-01  PARBHANI 2020-08-01    1144    1576.400150    133.699153
2020-09-01  PARBHANI 2020-09-01    1478    1594.241477    134.472652
2020-10-01  PARBHANI 2020-10-01    1580    1608.834143    135.388115
2020-11-01  PARBHANI 2020-11-01    1311    1626.570550    135.3533



=== Metrics ===
  district         rmse  log_marginal_likelihood  \
0   RAIGAD  1827.987996               -256.21688   

                                   coefficient_means  \
0  [-0.22752049515865258, -0.08797424505744648, 0...   

                                    coefficient_stds  intercept_mean  \
0  [0.1251239233962866, 0.12582995794246385, 0.13...      -26.193298   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      3.994216e+06             1     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01   RAIGAD 2020-07-01    1563    1189.448213    474.261670
2020-08-01   RAIGAD 2020-08-01    1516    1374.534324    466.925824
2020-09-01   RAIGAD 2020-09-01    1568    1326.561047    427.315976
2020-10-01   RAIGAD 2020-10-01    1569    1302.798071    427.197363
2020-11-01   RAIGAD 2020-11-01    2801    1265.063380    426.870875
=== 



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0   SANGLI  311.005251              -232.266027   

                                   coefficient_means  \
0  [-0.1290702903985413, -0.0432250870782345, 0.0...   

                                    coefficient_stds  intercept_mean  \
0  [0.1119265162663729, 0.11437289805522842, 0.11...       30.086389   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      2.115628e+06             1     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01   SANGLI 2020-07-01    1914    1766.514428    221.364371
2020-08-01   SANGLI 2020-08-01    1544    1720.036820    222.266657
2020-09-01   SANGLI 2020-09-01    1534    1842.059594    224.482714
2020-10-01   SANGLI 2020-10-01    1492    1885.701063    222.292586
2020-11-01   SANGLI 2020-11-01    1581    1871.648680    221.700848
=== Me



=== Metrics ===
     district       rmse  log_marginal_likelihood  \
0  SINDHUDURG  88.311832              -176.066576   

                                   coefficient_means  \
0  [0.00777431221249118, -0.03916801703616647, -0...   

                                    coefficient_stds  intercept_mean  \
0  [0.08096318009582673, 0.0810126646126468, 0.08...        8.829283   

   intercept_std  effective_params  differencing  n_iter  
0            NaN     212325.027929             1     500  

=== Forecast Data ===
              district       date  actual  forecast_mean  forecast_std
date                                                                  
2020-07-01  SINDHUDURG 2020-07-01     500     431.564202     42.604752
2020-08-01  SINDHUDURG 2020-08-01     376     442.702443     43.029192
2020-09-01  SINDHUDURG 2020-09-01     367     444.381999     44.242340
2020-10-01  SINDHUDURG 2020-10-01     361     451.834528     44.059229
2020-11-01  SINDHUDURG 2020-11-01     322     470.28



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0   WARDHA  175.147948              -216.652335   

                                   coefficient_means  \
0  [0.06618262735130617, -0.004822405221734008, -...   

                                    coefficient_stds  intercept_mean  \
0  [0.07654670146894761, 0.07671074912005561, 0.0...     1387.323155   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      1.839439e+06             0     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01   WARDHA 2020-07-01  1258.0    1380.105868    237.326835
2020-08-01   WARDHA 2020-08-01  1219.0    1380.636805    231.560300
2020-09-01   WARDHA 2020-09-01  1186.0    1384.069283    223.087774
2020-10-01   WARDHA 2020-10-01  1151.0    1387.160348    215.317067
2020-11-01   WARDHA 2020-11-01  1033.0    1386.429097    211.686276
=== Me



=== Metrics ===
   district        rmse  log_marginal_likelihood  \
0  YAVATMAL  337.123054              -252.394013   

                                   coefficient_means  \
0  [0.22966774768749817, 0.18659379553309444, 0.1...   

                                    coefficient_stds  intercept_mean  \
0  [0.13052287005289992, 0.13333982932146451, 0.1...     1176.995952   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      2.184172e+06             0     500  

=== Forecast Data ===
            district       date  actual  forecast_mean  forecast_std
date                                                                
2020-07-01  YAVATMAL 2020-07-01  3187.0    3102.802364    578.720726
2020-08-01  YAVATMAL 2020-08-01  2613.0    3107.205390    585.523668
2020-09-01  YAVATMAL 2020-09-01  2629.0    2969.922610    562.951334
2020-10-01  YAVATMAL 2020-10-01  2797.0    2864.561734    541.173314
2020-11-01  YAVATMAL 2020-11-01  2682.0    2853.864777    537.4263

{'district': 'MUMBAI SUBURBAN',
 'forecast_df':                    district       date  actual  forecast_mean  forecast_std
 date                                                                       
 2021-02-01  MUMBAI SUBURBAN 2021-02-01    9380   11106.686699    755.107842
 2021-03-01  MUMBAI SUBURBAN 2021-03-01    9076   11698.350871    397.179698,
 'metrics_df':           district         rmse  log_marginal_likelihood  \
 0  MUMBAI SUBURBAN  2220.154392               -39.707873   
 
                                    coefficient_means  \
 0  [-0.5932117462261333, 0.3697421682894404, -0.0...   
 
                                     coefficient_stds  intercept_mean  \
 0  [0.24369439621683497, 0.3627965832852178, 0.18...      700.229865   
 
    intercept_std  effective_params  differencing  n_iter  
 0            NaN     511401.665001             1     500  ,
 'model': BayesianRidge(alpha_1=1e-05, compute_score=True, lambda_1=0.001, n_iter=500),
 'coef_means': array([-0.59321175