In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit


In [2]:
data = pd.read_csv("../data/HMIS_DATA_CORRECTED_17_21/mh_dist17_21_with_IDs_date_correction.csv")
data = data[(data['indicator_type'] == 'Total [(A+B) or (C+D)]')]
data['date'] = pd.to_datetime(data['date'])
data = data.set_index('date')
data.index = pd.DatetimeIndex(data.index)

In [3]:
def bayesian_ridge_regression_lags_only(
    series,
    district_name,
    max_lags=3,
    differencing=True,
    n_iter=300,
    alpha_1=1e-6,
    alpha_2=1e-6,
    lambda_1=1e-6,
    lambda_2=1e-6,
    compute_scores=True,
    random_state=None
):
    """
    Bayesian Ridge Regression for time series forecasting with:
    - Probabilistic uncertainty estimates
    - Automatic complexity control
    - Lag features only
    """
    # Create directory structure
    os.makedirs('BayesianRidgeRegression', exist_ok=True)
    
    # 1. Stationarity and Differencing
    original_series = series.copy()
    d = 0
    if differencing:
        adf_result = adfuller(series.dropna())
        if adf_result[1] > 0.05:
            d = 1
            series = series.diff().dropna()

    # 2. Feature Engineering - Lag features
    df = pd.DataFrame({'y': series.astype(float)})
    for lag in range(1, max_lags + 1):
        df[f'lag_{lag}'] = df['y'].shift(lag).astype(float)
    df = df.dropna()

    # 3. Train-Test Split
    train_size = int(len(df) * 0.8)
    train = df.iloc[:train_size]
    test = df.iloc[train_size:]

    X_train = train.drop(columns=['y'])
    y_train = train['y']
    X_test = test.drop(columns=['y'])
    y_test = test['y']

    # 4. Model Training with Bayesian Ridge
    model = BayesianRidge(
        n_iter=n_iter,
        alpha_1=alpha_1,
        alpha_2=alpha_2,
        lambda_1=lambda_1,
        lambda_2=lambda_2,
        compute_score=compute_scores,
    )
    model.fit(X_train, y_train)

    # 5. Forecasting with Uncertainty
    pred_test, pred_test_std = model.predict(X_test, return_std=True)

    # 6. Inverse Differencing
    if d == 1:
        last_train_value = original_series.iloc[len(original_series) - len(test) - 1]
        pred_test = np.cumsum(pred_test) + last_train_value
        y_test = original_series.iloc[-len(test):]

    # 7. Create forecast DataFrame with uncertainty
    forecast_df = pd.DataFrame({
        'district': district_name,
        'date': y_test.index,
        'actual': y_test.values,
        'forecast_mean': pred_test,
        'forecast_std': pred_test_std
    })

    # 8. Save forecasts
    forecast_csv_path = os.path.join('BayesianRidgeRegression', 'bayesian_forecasts.csv')
    write_header = not os.path.exists(forecast_csv_path)
    forecast_df.to_csv(forecast_csv_path, mode='a', header=write_header, index=False)

    # 9. Calculate metrics and uncertainty statistics
    rmse = np.sqrt(mean_squared_error(forecast_df['actual'], forecast_df['forecast_mean']))
    log_marginal_likelihood = model.scores_[-1] if compute_scores else np.nan
    
    # Calculate coefficient statistics
    coef_means = model.coef_
    coef_stds = np.sqrt(np.diag(model.sigma_))
    
    metrics_df = pd.DataFrame([{
        'district': district_name,
        'rmse': rmse,
        'log_marginal_likelihood': log_marginal_likelihood,
        'coefficient_means': coef_means.tolist(),
        'coefficient_stds': coef_stds.tolist(),
        'intercept_mean': model.intercept_,
        'intercept_std': np.nan,
        'effective_params': model.lambda_ / model.alpha_,
        'differencing': d,
        'n_iter': n_iter
    }])

    metrics_csv_path = os.path.join('BayesianRidgeRegression', 'bayesian_metrics.csv')
    write_header_metrics = not os.path.exists(metrics_csv_path)
    metrics_df.to_csv(metrics_csv_path, mode='a', header=write_header_metrics, index=False)

    # 10. Plotting with uncertainty bands
    plt.figure(figsize=(12, 6))
    plt.plot(original_series.index, original_series.values, label='Original Series')
    plt.plot(forecast_df['date'], forecast_df['forecast_mean'], label='Mean Forecast', color='darkorange')
    plt.fill_between(
        forecast_df['date'],
        forecast_df['forecast_mean'] - 1.96*forecast_df['forecast_std'],
        forecast_df['forecast_mean'] + 1.96*forecast_df['forecast_std'],
        color='orange',
        alpha=0.2,
        label='95% Confidence'
    )
    plt.title(
        f'Bayesian Ridge Forecast for {district_name}\n'
        f'RMSE: {rmse:.2f} | Log ML: {log_marginal_likelihood:.2f}'
    )
    plt.legend()
    plt.savefig(os.path.join('BayesianRidgeRegression', f'bayesian_forecast_{district_name}.png'))
    plt.close()

    return {
        'district': district_name,
        'forecast_df': forecast_df,
        'metrics_df': metrics_df,
        'model': model,
        'coef_means': coef_means,
        'coef_stds': coef_stds,
        'intercept_mean': model.intercept_,
        'intercept_std': np.nan
    }

In [4]:
districts = data['district'].unique()
rmse_values = []

def run_for_each_district():
    results = {}
    
    for district in districts:
        district_data = data[data['district'] == district]
        ts = district_data["I55"].asfreq('MS')
        
        results = bayesian_ridge_regression_lags_only(
            ts,
            district,
            max_lags=4,
            n_iter=500,
            alpha_1=1e-5,
            lambda_1=1e-3
        )
        
    
        # Show results
        print("=== Metrics ===")
        print(results['metrics_df'])
        print("\n=== Forecast Data ===")
        print(results['forecast_df'].head())
    
    return results
run_for_each_district()



=== Metrics ===
     district         rmse  log_marginal_likelihood  \
0  AHMEDNAGAR  1147.891774              -297.098709   

                                   coefficient_means  \
0  [0.00017421912766629884, 0.0001022266202467690...   

                                    coefficient_stds  intercept_mean  \
0  [0.010292224337625017, 0.010291754452258777, 0...     5655.984167   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      1.299915e+10             0     500  

=== Forecast Data ===
              district       date  actual  forecast_mean  forecast_std
date                                                                  
2020-07-01  AHMEDNAGAR 2020-07-01  6225.0    5658.771976   1181.417269
2020-08-01  AHMEDNAGAR 2020-08-01  6609.0    5658.745849   1182.084431
2020-09-01  AHMEDNAGAR 2020-09-01  7268.0    5658.646842   1182.533193
2020-10-01  AHMEDNAGAR 2020-10-01  6799.0    5659.056467   1183.212084
2020-11-01  AHMEDNAGAR 2020-11-01  5248.0    565



=== Metrics ===
   district        rmse  log_marginal_likelihood  \
0  AMRAVATI  527.651504              -239.703052   

                                   coefficient_means  \
0  [-0.007606445821520719, 0.0030314872396111966,...   

                                    coefficient_stds  intercept_mean  \
0  [0.04252063062931969, 0.04250811006620522, 0.0...       24.378582   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      4.041172e+07             1     500  

=== Forecast Data ===
            district       date  actual  forecast_mean  forecast_std
date                                                                
2020-07-01  AMRAVATI 2020-07-01    2772    3202.748144    280.387128
2020-08-01  AMRAVATI 2020-08-01    3081    3223.599557    280.882549
2020-09-01  AMRAVATI 2020-09-01    3483    3240.977250    280.463103
2020-10-01  AMRAVATI 2020-10-01    3505    3252.171338    280.963113
2020-11-01  AMRAVATI 2020-11-01    3484    3284.033425    279.4565



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0     BEED  442.698264              -264.140774   

                                   coefficient_means  \
0  [0.2171443244509102, 0.0684702810123648, 0.137...   

                                    coefficient_stds  intercept_mean  \
0  [0.12034162883958527, 0.12173035763030711, 0.1...     1408.530135   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      5.760513e+06             0     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01     BEED 2020-07-01  2942.0    2258.671375    623.146922
2020-08-01     BEED 2020-08-01  2934.0    2373.551204    646.818421
2020-09-01     BEED 2020-09-01  2943.0    2405.086669    664.895862
2020-10-01     BEED 2020-10-01  3112.0    2484.632788    679.850148
2020-11-01     BEED 2020-11-01  2812.0    2482.154956    703.024312
=== Me



=== Metrics ===
   district        rmse  log_marginal_likelihood  \
0  BULDHANA  335.927304              -222.987274   

                                   coefficient_means  \
0  [-0.08447610585665564, -0.0506408284928104, 0....   

                                    coefficient_stds  intercept_mean  \
0  [0.0877370098756473, 0.08809716255351273, 0.09...        0.920084   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      2.672108e+06             1     500  

=== Forecast Data ===
            district       date  actual  forecast_mean  forecast_std
date                                                                
2020-07-01  BULDHANA 2020-07-01    1172    1142.734926    173.521008
2020-08-01  BULDHANA 2020-08-01    1332    1153.016933    173.549406
2020-09-01  BULDHANA 2020-09-01    1442    1135.737636    168.795508
2020-10-01  BULDHANA 2020-10-01    1664    1118.300832    168.845832
2020-11-01  BULDHANA 2020-11-01    1582    1096.287463    170.1368



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0    DHULE  337.579786              -247.141961   

                                   coefficient_means  \
0  [0.024454924530857933, 0.0028575816183715984, ...   

                                    coefficient_stds  intercept_mean  \
0  [0.05786613652853339, 0.05772012184962978, 0.0...     1643.796592   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      2.064883e+07             0     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01    DHULE 2020-07-01  1662.0    1620.992432    328.438130
2020-08-01    DHULE 2020-08-01  2372.0    1612.291342    333.751150
2020-09-01    DHULE 2020-09-01  2023.0    1622.308519    349.670478
2020-10-01    DHULE 2020-10-01  2028.0    1611.575241    357.338840
2020-11-01    DHULE 2020-11-01  1809.0    1596.466538    360.874968
=== Me



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0   GONDIA  202.753151              -219.242504   

                                   coefficient_means  \
0  [0.2645186669738886, -0.03550823490719274, -0....   

                                    coefficient_stds  intercept_mean  \
0  [0.12523192623372634, 0.12973626788572282, 0.1...      722.175549   

   intercept_std  effective_params  differencing  n_iter  
0            NaN     399143.377424             0     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01   GONDIA 2020-07-01   973.0     810.214660    205.988081
2020-08-01   GONDIA 2020-08-01   997.0     805.463453    217.730847
2020-09-01   GONDIA 2020-09-01   986.0     807.459482    222.561766
2020-10-01   GONDIA 2020-10-01  1065.0     805.890284    220.945873
2020-11-01   GONDIA 2020-11-01   934.0     820.359383    228.535329
=== Me



=== Metrics ===
  district         rmse  log_marginal_likelihood  \
0  JALGAON  3998.720655              -248.392255   

                                   coefficient_means  \
0  [-0.012689782123584112, -0.0020838964018897246...   

                                    coefficient_stds  intercept_mean  \
0  [0.03600448296213644, 0.03606382657864819, 0.0...       82.226191   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      9.519612e+07             1     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01  JALGAON 2020-07-01    3714    4387.628473    361.172948
2020-08-01  JALGAON 2020-08-01   15827    4483.990045    361.791043
2020-09-01  JALGAON 2020-09-01    5302    4410.640485    565.571365
2020-10-01  JALGAON 2020-10-01    4900    4605.088995    677.984654
2020-11-01  JALGAON 2020-11-01    4163    4643.609459    680.150504
=== 



=== Metrics ===
   district         rmse  log_marginal_likelihood  \
0  KOLHAPUR  1915.220901              -276.206558   

                                   coefficient_means  \
0  [-0.00032801608990627365, -0.00080153295329744...   

                                    coefficient_stds  intercept_mean  \
0  [0.014406883814397762, 0.014407503223970703, 0...      167.601967   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      3.187755e+09             1     500  

=== Forecast Data ===
            district       date  actual  forecast_mean  forecast_std
date                                                                
2020-07-01  KOLHAPUR 2020-07-01    6281    6855.858615    819.001647
2020-08-01  KOLHAPUR 2020-08-01    6879    7025.078869    819.009548
2020-09-01  KOLHAPUR 2020-09-01    7432    7192.912820    816.376193
2020-10-01  KOLHAPUR 2020-10-01    6246    7360.189047    816.394833
2020-11-01  KOLHAPUR 2020-11-01    5778    7527.102964    816.56



=== Metrics ===
  district         rmse  log_marginal_likelihood  \
0   MUMBAI  1446.833971               -287.98189   

                                   coefficient_means  \
0  [0.00019493856996600364, 0.0004358592067448613...   

                                    coefficient_stds  intercept_mean  \
0  [0.012330653094712462, 0.012331245534253424, 0...     -152.876715   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      8.712924e+09             1     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01   MUMBAI 2020-07-01    2667    2468.255921   1155.606521
2020-08-01   MUMBAI 2020-08-01    2879    2312.569351   1155.595916
2020-09-01   MUMBAI 2020-09-01    3817    2160.270817   1154.183983
2020-10-01   MUMBAI 2020-10-01    3638    2007.577533   1154.185765
2020-11-01   MUMBAI 2020-11-01    4551    1854.962865   1154.186789
=== 



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0   NANDED  892.929641              -273.228394   

                                   coefficient_means  \
0  [0.6497461932324495, 0.08801087962945608, -0.1...   

                                    coefficient_stds  intercept_mean  \
0  [0.15830040932161876, 0.18712164069743759, 0.1...       863.77926   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      1.895657e+06             0     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01   NANDED 2020-07-01  3902.0    2521.458696    620.240837
2020-08-01   NANDED 2020-08-01  4032.0    3803.221093    708.056787
2020-09-01   NANDED 2020-09-01  4237.0    3966.796153    716.710196
2020-10-01   NANDED 2020-10-01  4756.0    3802.767282    730.912839
2020-11-01   NANDED 2020-11-01  4347.0    4423.717985    724.176735
=== Me



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0   NASHIK  1194.25349              -260.577451   

                                   coefficient_means  \
0  [-0.146180118886697, -0.17291566209681591, 0.1...   

                                    coefficient_stds  intercept_mean  \
0  [0.1318465194464965, 0.13317584453695103, 0.14...      164.900242   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      5.601197e+06             1     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01   NASHIK 2020-07-01    7484    7061.705286    568.121655
2020-08-01   NASHIK 2020-08-01    7551    7040.943988    571.341254
2020-09-01   NASHIK 2020-09-01    8376    6956.303385    512.574036
2020-10-01   NASHIK 2020-10-01    7227    7174.125899    526.221984
2020-11-01   NASHIK 2020-11-01    6907    7288.025322    548.426401
=== Me



=== Metrics ===
  district       rmse  log_marginal_likelihood  \
0  PALGHAR  477.49577              -248.277664   

                                   coefficient_means  \
0  [-0.00043356278545325937, 3.461710581228284e-0...   

                                    coefficient_stds  intercept_mean  \
0  [0.011670059713678096, 0.01167000238748939, 0....       -8.882578   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      9.419786e+08             1     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01  PALGHAR 2020-07-01    4269    4408.063488    359.031708
2020-08-01  PALGHAR 2020-08-01    4229    4399.278784    359.035421
2020-09-01  PALGHAR 2020-09-01    4522    4390.236432    359.028947
2020-10-01  PALGHAR 2020-10-01    5385    4381.155024    359.041981
2020-11-01  PALGHAR 2020-11-01    5076    4371.978057    359.161825
=== Metr



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0     PUNE  1174.40653              -302.858646   

                                   coefficient_means  \
0  [0.6049145853248633, 0.053640877941311865, -0....   

                                    coefficient_stds  intercept_mean  \
0  [0.13604163720355558, 0.115890844583045, 0.104...     3005.139214   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      1.262903e+07             0     500  

=== Forecast Data ===
           district       date   actual  forecast_mean  forecast_std
date                                                                
2020-07-01     PUNE 2020-07-01  11790.0   11895.808147   2092.005068
2020-08-01     PUNE 2020-08-01  12579.0   11611.974237   2136.923513
2020-09-01     PUNE 2020-09-01  14094.0   11895.362296   2149.233064
2020-10-01     PUNE 2020-10-01  14381.0   13273.168789   2233.025495
2020-11-01     PUNE 2020-11-01  14329.0   13166.736695   2279.935038



=== Metrics ===
    district       rmse  log_marginal_likelihood  \
0  RATNAGIRI  401.11828              -235.445143   

                                   coefficient_means  \
0  [0.3061995791825715, 0.07113084819506228, 0.19...   

                                    coefficient_stds  intercept_mean  \
0  [0.13568200585676748, 0.13988510802980977, 0.1...      409.404821   

   intercept_std  effective_params  differencing  n_iter  
0            NaN     656531.135138             0     500  

=== Forecast Data ===
             district       date  actual  forecast_mean  forecast_std
date                                                                 
2020-07-01  RATNAGIRI 2020-07-01  1387.0    1204.523987    272.388039
2020-08-01  RATNAGIRI 2020-08-01  2246.0    1277.362429    280.212385
2020-09-01  RATNAGIRI 2020-09-01  1755.0    1531.352697    330.970807
2020-10-01  RATNAGIRI 2020-10-01  1658.0    1461.676389    340.969411
2020-11-01  RATNAGIRI 2020-11-01  1155.0    1572.882516    3



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0   SATARA  962.662489              -248.055444   

                                   coefficient_means  \
0  [-0.132676194293479, 0.24539814767630685, -0.1...   

                                    coefficient_stds  intercept_mean  \
0  [0.1364319629425789, 0.1555703196596868, 0.162...      -24.367855   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      1.774306e+06             1     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01   SATARA 2020-07-01    1934    1408.506205    399.234499
2020-08-01   SATARA 2020-08-01    2767    1646.626892    400.493749
2020-09-01   SATARA 2020-09-01    2016    1120.352906    400.069994
2020-10-01   SATARA 2020-10-01    2412    1486.235575    367.556008
2020-11-01   SATARA 2020-11-01    2455    1103.969231    368.254029
=== Me



=== Metrics ===
  district         rmse  log_marginal_likelihood  \
0  SOLAPUR  1542.228881               -246.60055   

                                   coefficient_means  \
0  [-0.011847062828973671, -0.012185638919725208,...   

                                    coefficient_stds  intercept_mean  \
0  [0.04868141210664134, 0.04876660237992273, 0.0...        18.57639   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      4.598675e+07             1     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01  SOLAPUR 2020-07-01    2470    2300.504590    346.913863
2020-08-01  SOLAPUR 2020-08-01    1918    2321.620120    346.728002
2020-09-01  SOLAPUR 2020-09-01    2558    2363.453481    347.045073
2020-10-01  SOLAPUR 2020-10-01    2463    2375.619616    347.254465
2020-11-01  SOLAPUR 2020-11-01    3694    2396.898466    343.212908
=== 



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0   WARDHA  204.713415              -229.593031   

                                   coefficient_means  \
0  [-0.05018692399133392, -0.020814263103176526, ...   

                                    coefficient_stds  intercept_mean  \
0  [0.06952992710072865, 0.06961560276363121, 0.0...       20.624198   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      7.225715e+06             1     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01   WARDHA 2020-07-01    1131    1106.092989    205.481229
2020-08-01   WARDHA 2020-08-01    1326    1120.936332    205.465626
2020-09-01   WARDHA 2020-09-01    1431    1130.945173    205.563393
2020-10-01   WARDHA 2020-10-01    1425    1139.296348    205.720990
2020-11-01   WARDHA 2020-11-01    1342    1158.770466    205.639150
=== Me



=== Metrics ===
   district        rmse  log_marginal_likelihood  \
0  YAVATMAL  500.346237              -268.358064   

                                   coefficient_means  \
0  [0.13610079067647174, 0.0014311377579161837, -...   

                                    coefficient_stds  intercept_mean  \
0  [0.09438222069989387, 0.09364014843948175, 0.0...     2899.098963   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      1.858053e+07             0     500  

=== Forecast Data ===
            district       date  actual  forecast_mean  forecast_std
date                                                                
2020-07-01  YAVATMAL 2020-07-01  4215.0    3124.008142    657.290905
2020-08-01  YAVATMAL 2020-08-01  3800.0    3315.079779    713.575405
2020-09-01  YAVATMAL 2020-09-01  3624.0    3267.196511    728.062166
2020-10-01  YAVATMAL 2020-10-01  3708.0    3212.340655    741.056735
2020-11-01  YAVATMAL 2020-11-01  3219.0    3185.697655    762.9369

{'district': 'MUMBAI SUBURBAN',
 'forecast_df':                    district       date  actual  forecast_mean  forecast_std
 date                                                                       
 2021-02-01  MUMBAI SUBURBAN 2021-02-01    3958    5377.220479    881.390890
 2021-03-01  MUMBAI SUBURBAN 2021-03-01    4827    5485.373364    880.705417,
 'metrics_df':           district         rmse  log_marginal_likelihood  \
 0  MUMBAI SUBURBAN  1106.264492               -40.993208   
 
                                    coefficient_means  \
 0  [0.0004577890775305479, -0.001090077856734878,...   
 
                                     coefficient_stds  intercept_mean  \
 0  [0.030557320589525618, 0.030554889196201532, 0...      106.331803   
 
    intercept_std  effective_params  differencing  n_iter  
 0            NaN      8.247671e+08             1     500  ,
 'model': BayesianRidge(alpha_1=1e-05, compute_score=True, lambda_1=0.001, n_iter=500),
 'coef_means': array([ 0.00045779