In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from sklearn.linear_model import BayesianRidge
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit


In [2]:
data = pd.read_csv("../data/HMIS_DATA_CORRECTED_17_21/mh_dist17_21_with_IDs_date_correction.csv")
data = data[(data['indicator_type'] == 'Total [(A+B) or (C+D)]')]
data['date'] = pd.to_datetime(data['date'])
data = data.set_index('date')
data.index = pd.DatetimeIndex(data.index)

In [9]:
def bayesian_ridge_regression_lags_only(
    series,
    district_name,
    max_lags=3,
    differencing=True,
    n_iter=300,
    alpha_1=1e-6,
    alpha_2=1e-6,
    lambda_1=1e-6,
    lambda_2=1e-6,
    compute_scores=True,
    random_state=None
):
    """
    Bayesian Ridge Regression for time series forecasting with:
    - Probabilistic uncertainty estimates
    - Automatic complexity control
    - Lag features only
    """
    # Create directory structure
    os.makedirs('BayesianRidgeRegression', exist_ok=True)
    
    # 1. Stationarity and Differencing
    original_series = series.copy()
    d = 0
    if differencing:
        adf_result = adfuller(series.dropna())
        if adf_result[1] > 0.05:
            d = 1
            series = series.diff().dropna()

    # 2. Feature Engineering - Lag features
    df = pd.DataFrame({'y': series.astype(float)})
    for lag in range(1, max_lags + 1):
        df[f'lag_{lag}'] = df['y'].shift(lag).astype(float)
    df = df.dropna()

    # 3. Train-Test Split
    train_size = int(len(df) * 0.8)
    train = df.iloc[:train_size]
    test = df.iloc[train_size:]

    X_train = train.drop(columns=['y'])
    y_train = train['y']
    X_test = test.drop(columns=['y'])
    y_test = test['y']

    # 4. Model Training with Bayesian Ridge
    model = BayesianRidge(
        n_iter=n_iter,
        alpha_1=alpha_1,
        alpha_2=alpha_2,
        lambda_1=lambda_1,
        lambda_2=lambda_2,
        compute_score=compute_scores,
    )
    model.fit(X_train, y_train)

    # 5. Forecasting with Uncertainty
    pred_test, pred_test_std = model.predict(X_test, return_std=True)

    # 6. Inverse Differencing
    if d == 1:
        last_train_value = original_series.iloc[len(original_series) - len(test) - 1]
        pred_test = np.cumsum(pred_test) + last_train_value
        y_test = original_series.iloc[-len(test):]

    # 7. Create forecast DataFrame with uncertainty
    forecast_df = pd.DataFrame({
        'district': district_name,
        'date': y_test.index,
        'actual': y_test.values,
        'forecast_mean': pred_test,
        'forecast_std': pred_test_std
    })

    # 8. Save forecasts
    forecast_csv_path = os.path.join('BayesianRidgeRegression', 'bayesian_forecasts.csv')
    write_header = not os.path.exists(forecast_csv_path)
    forecast_df.to_csv(forecast_csv_path, mode='a', header=write_header, index=False)

    # 9. Calculate metrics and uncertainty statistics
    rmse = np.sqrt(mean_squared_error(forecast_df['actual'], forecast_df['forecast_mean']))
    log_marginal_likelihood = model.scores_[-1] if compute_scores else np.nan
    
    # Calculate coefficient statistics
    coef_means = model.coef_
    coef_stds = np.sqrt(np.diag(model.sigma_))
    
    metrics_df = pd.DataFrame([{
        'district': district_name,
        'rmse': rmse,
        'log_marginal_likelihood': log_marginal_likelihood,
        'coefficient_means': coef_means.tolist(),
        'coefficient_stds': coef_stds.tolist(),
        'intercept_mean': model.intercept_,
        'intercept_std': np.nan,
        'effective_params': model.lambda_ / model.alpha_,
        'differencing': d,
        'n_iter': n_iter
    }])

    metrics_csv_path = os.path.join('BayesianRidgeRegression', 'bayesian_metrics.csv')
    write_header_metrics = not os.path.exists(metrics_csv_path)
    metrics_df.to_csv(metrics_csv_path, mode='a', header=write_header_metrics, index=False)

    # 10. Plotting with uncertainty bands
    plt.figure(figsize=(12, 6))
    plt.plot(original_series.index, original_series.values, label='Original Series')
    plt.plot(forecast_df['date'], forecast_df['forecast_mean'], label='Mean Forecast', color='darkorange')
    plt.fill_between(
        forecast_df['date'],
        forecast_df['forecast_mean'] - 1.96*forecast_df['forecast_std'],
        forecast_df['forecast_mean'] + 1.96*forecast_df['forecast_std'],
        color='orange',
        alpha=0.2,
        label='95% Confidence'
    )
    plt.title(
        f'Bayesian Ridge Forecast for {district_name}\n'
        f'RMSE: {rmse:.2f} | Log ML: {log_marginal_likelihood:.2f}'
    )
    plt.legend()
    plt.savefig(os.path.join('BayesianRidgeRegression', f'bayesian_forecast_{district_name}.png'))
    plt.close()

    return {
        'district': district_name,
        'forecast_df': forecast_df,
        'metrics_df': metrics_df,
        'model': model,
        'coef_means': coef_means,
        'coef_stds': coef_stds,
        'intercept_mean': model.intercept_,
        'intercept_std': np.nan
    }

In [None]:
districts = data['district'].unique()
rmse_values = []

def run_for_each_district():
    results = {}
    
    for district in districts:
        district_data = data[data['district'] == district]
        ts = district_data["I1"].asfreq('MS')
        
        results = bayesian_ridge_regression_lags_only(
            ts,
            district,
            max_lags=4,
            n_iter=500,
            alpha_1=1e-5,
            lambda_1=1e-3
        )
        
    
        # Show results
        print("=== Metrics ===")
        print(results['metrics_df'])
        print("\n=== Forecast Data ===")
        print(results['forecast_df'].head())
    
    return results
run_for_each_district()



=== Metrics ===
     district        rmse  log_marginal_likelihood  \
0  AHMEDNAGAR  547.094447              -269.217875   

                                   coefficient_means  \
0  [-0.00014123796678755662, -2.204828891672853e-...   

                                    coefficient_stds  intercept_mean  \
0  [0.012739564876249108, 0.012740198082995886, 0...     6907.043918   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      1.720346e+09             0     500  

=== Forecast Data ===
              district       date  actual  forecast_mean  forecast_std
date                                                                  
2020-07-01  AHMEDNAGAR 2020-07-01  6478.0    6893.374185    559.060073
2020-08-01  AHMEDNAGAR 2020-08-01  5975.0    6892.695115    560.033636
2020-09-01  AHMEDNAGAR 2020-09-01  6550.0    6893.660781    557.179375
2020-10-01  AHMEDNAGAR 2020-10-01  6471.0    6894.402562    555.110651
2020-11-01  AHMEDNAGAR 2020-11-01  6107.0    6895.



=== Metrics ===
   district        rmse  log_marginal_likelihood  \
0  AMRAVATI  426.503318              -247.782894   

                                   coefficient_means  \
0  [0.23000123180310356, -0.11171823602840758, -0...   

                                    coefficient_stds  intercept_mean  \
0  [0.12723276144892515, 0.12913136571165096, 0.1...     4020.051024   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      1.992296e+06             0     500  

=== Forecast Data ===
            district       date  actual  forecast_mean  forecast_std
date                                                                
2020-07-01  AMRAVATI 2020-07-01  3459.0    3803.632266    958.485216
2020-08-01  AMRAVATI 2020-08-01  3049.0    3722.250701    930.657371
2020-09-01  AMRAVATI 2020-09-01  3325.0    3696.561622    879.788002
2020-10-01  AMRAVATI 2020-10-01  3191.0    3867.660075    851.144841
2020-11-01  AMRAVATI 2020-11-01  3502.0    3873.343765    805.8808



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0     BEED  280.911221              -264.455487   

                                   coefficient_means  \
0  [0.24688269543818533, 0.08815133983931156, 0.0...   

                                    coefficient_stds  intercept_mean  \
0  [0.13165033247703475, 0.13849686693670388, 0.1...     3496.789317   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      3.838194e+06             0     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01     BEED 2020-07-01  3821.0    4066.886515   1059.732149
2020-08-01     BEED 2020-08-01  3261.0    3603.945171   1066.651751
2020-09-01     BEED 2020-09-01  3819.0    3551.997845    961.860362
2020-10-01     BEED 2020-10-01  3751.0    3851.003405    905.409301
2020-11-01     BEED 2020-11-01  3332.0    3909.794776    886.096351
=== Me



=== Metrics ===
   district       rmse  log_marginal_likelihood  \
0  BULDHANA  368.44708              -245.744043   

                                   coefficient_means  \
0  [0.0968790496273067, -0.035588737572419586, -0...   

                                    coefficient_stds  intercept_mean  \
0  [0.08960611321297346, 0.09071620602297056, 0.0...     4532.280613   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      5.069301e+06             0     500  

=== Forecast Data ===
            district       date  actual  forecast_mean  forecast_std
date                                                                
2020-07-01  BULDHANA 2020-07-01  3900.0    4160.546123    809.074027
2020-08-01  BULDHANA 2020-08-01  3612.0    4091.033228    792.648140
2020-09-01  BULDHANA 2020-09-01  4073.0    4067.263719    778.836033
2020-10-01  BULDHANA 2020-10-01  3614.0    4179.822447    757.890322
2020-11-01  BULDHANA 2020-11-01  3484.0    4171.413614    724.139960



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0    DHULE  375.369294              -248.983933   

                                   coefficient_means  \
0  [0.1118075826774507, 0.018723418187600427, -0....   

                                    coefficient_stds  intercept_mean  \
0  [0.08775546663245197, 0.08636778432858143, 0.0...     3299.339994   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      7.907918e+06             0     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01    DHULE 2020-07-01  3857.0    3769.355726    658.199246
2020-08-01    DHULE 2020-08-01  3076.0    3740.998858    671.178517
2020-09-01    DHULE 2020-09-01  3376.0    3656.717744    656.457945
2020-10-01    DHULE 2020-10-01  3157.0    3678.062041    635.286834
2020-11-01    DHULE 2020-11-01  3146.0    3675.261272    604.261159
=== Me



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0   GONDIA  163.339502              -225.353098   

                                   coefficient_means  \
0  [0.27662783857412054, 0.11996477126996427, -0....   

                                    coefficient_stds  intercept_mean  \
0  [0.13235300075373813, 0.13651433609668281, 0.1...     1413.975337   

   intercept_std  effective_params  differencing  n_iter  
0            NaN     405590.402643             0     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01   GONDIA 2020-07-01  1703.0    1650.042817    381.734060
2020-08-01   GONDIA 2020-08-01  1383.0    1621.562662    380.665628
2020-09-01   GONDIA 2020-09-01  1445.0    1557.971407    354.883254
2020-10-01   GONDIA 2020-10-01  1472.0    1566.027669    340.010887
2020-11-01   GONDIA 2020-11-01  1519.0    1597.022947    331.719018
=== Me



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0    JALNA  238.585729              -244.967321   

                                   coefficient_means  \
0  [0.05093419559819565, -0.009665473893038203, -...   

                                    coefficient_stds  intercept_mean  \
0  [0.0648663252801161, 0.06492184433273274, 0.06...     3447.810809   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      1.406420e+07             0     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01    JALNA 2020-07-01  3234.0    3578.830965    544.577647
2020-08-01    JALNA 2020-08-01  3247.0    3545.938080    533.401803
2020-09-01    JALNA 2020-09-01  3401.0    3552.425813    520.065670
2020-10-01    JALNA 2020-10-01  3244.0    3565.537098    508.720835
2020-11-01    JALNA 2020-11-01  3164.0    3555.393876    490.358088
=== Me



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0    LATUR  330.909022              -248.218882   

                                   coefficient_means  \
0  [0.014494177865175858, -0.09043609242653247, -...   

                                    coefficient_stds  intercept_mean  \
0  [0.08242690929152881, 0.08173442677770523, 0.0...     4109.868883   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      8.974734e+06             0     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01    LATUR 2020-07-01  3694.0    3877.687492    731.097802
2020-08-01    LATUR 2020-08-01  3498.0    3901.813241    726.151232
2020-09-01    LATUR 2020-09-01  3654.0    3925.989691    699.908669
2020-10-01    LATUR 2020-10-01  3751.0    3943.020332    680.508728
2020-11-01    LATUR 2020-11-01  3673.0    3922.474410    671.232278
=== Me



=== Metrics ===
  district         rmse  log_marginal_likelihood  \
0   NAGPUR  1196.305067              -276.758966   

                                   coefficient_means  \
0  [0.07267758971604243, 0.01761318415150683, -0....   

                                    coefficient_stds  intercept_mean  \
0  [0.07606217167348618, 0.07648096794084933, 0.0...     6315.173791   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      5.755473e+07             0     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01   NAGPUR 2020-07-01  7018.0    7057.372923   1182.992692
2020-08-01   NAGPUR 2020-08-01  5585.0    7028.297383   1211.708114
2020-09-01   NAGPUR 2020-09-01  5078.0    6939.038245   1187.525802
2020-10-01   NAGPUR 2020-10-01  5618.0    6870.109008   1108.961156
2020-11-01   NAGPUR 2020-11-01  6429.0    6899.836120   1058.400468
=== 



=== Metrics ===
    district        rmse  log_marginal_likelihood  \
0  OSMANABAD  266.905709              -239.995235   

                                   coefficient_means  \
0  [0.009448038525606018, -0.0008913153378621369,...   

                                    coefficient_stds  intercept_mean  \
0  [0.03005852386843852, 0.030100774455687276, 0....     2436.123661   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      5.646632e+07             0     500  

=== Forecast Data ===
             district       date  actual  forecast_mean  forecast_std
date                                                                 
2020-07-01  OSMANABAD 2020-07-01  2273.0    2449.910422    282.617119
2020-08-01  OSMANABAD 2020-08-01  1935.0    2445.698781    281.680562
2020-09-01  OSMANABAD 2020-09-01  2241.0    2443.333294    272.717579
2020-10-01  OSMANABAD 2020-10-01  2207.0    2447.539554    267.012465
2020-11-01  OSMANABAD 2020-11-01  2274.0    2447.945029   



=== Metrics ===
   district       rmse  log_marginal_likelihood  \
0  PARBHANI  352.01802              -245.875234   

                                   coefficient_means  \
0  [0.003063498976610599, 0.0004223185407808872, ...   

                                    coefficient_stds  intercept_mean  \
0  [0.01755878473059538, 0.017559493819945593, 0....     3062.583507   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      2.374141e+08             0     500  

=== Forecast Data ===
            district       date  actual  forecast_mean  forecast_std
date                                                                
2020-07-01  PARBHANI 2020-07-01  3186.0    3072.970936    295.059336
2020-08-01  PARBHANI 2020-08-01  2743.0    3071.974100    295.583234
2020-09-01  PARBHANI 2020-09-01  3099.0    3070.294172    293.968931
2020-10-01  PARBHANI 2020-10-01  2920.0    3071.338684    293.629741
2020-11-01  PARBHANI 2020-11-01  2538.0    3071.191698    291.446823



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0   RAIGAD  946.107765              -253.840703   

                                   coefficient_means  \
0  [-0.2870758789292527, -0.06945805678099214, -0...   

                                    coefficient_stds  intercept_mean  \
0  [0.13229079192121562, 0.13682854541204198, 0.1...       21.604127   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      3.580114e+06             1     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01   RAIGAD 2020-07-01    3465    3987.086476    400.817064
2020-08-01   RAIGAD 2020-08-01    3013    4153.057556    405.729051
2020-09-01   RAIGAD 2020-09-01    3412    4371.628021    413.252623
2020-10-01   RAIGAD 2020-10-01    3704    4347.172163    413.840384
2020-11-01   RAIGAD 2020-11-01    3998    4270.412618    413.262992
=== Me



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0   SANGLI  272.743322              -237.143584   

                                   coefficient_means  \
0  [0.2779239968907206, -0.03119610209982461, 0.1...   

                                    coefficient_stds  intercept_mean  \
0  [0.13184992730843814, 0.13350196370096726, 0.1...     2301.161794   

   intercept_std  effective_params  differencing  n_iter  
0            NaN     906859.500596             0     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01   SANGLI 2020-07-01  4264.0    4266.143925    851.503434
2020-08-01   SANGLI 2020-08-01  3874.0    4254.748940    865.735573
2020-09-01   SANGLI 2020-09-01  3891.0    4134.898450    836.446454
2020-10-01   SANGLI 2020-10-01  3891.0    4134.040913    813.313540
2020-11-01   SANGLI 2020-11-01  3821.0    4067.904538    790.265037
=== Me



=== Metrics ===
     district        rmse  log_marginal_likelihood  \
0  SINDHUDURG  104.569212              -203.308065   

                                   coefficient_means  \
0  [0.0004459034139938646, 5.4010855933466905e-05...   

                                    coefficient_stds  intercept_mean  \
0  [0.011498285902748732, 0.011498800647098769, 0...      674.361557   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      4.893987e+07             0     500  

=== Forecast Data ===
              district       date  actual  forecast_mean  forecast_std
date                                                                  
2020-07-01  SINDHUDURG 2020-07-01   721.0     674.407418     82.552147
2020-08-01  SINDHUDURG 2020-08-01   549.0     674.282643     82.639355
2020-09-01  SINDHUDURG 2020-09-01   579.0     674.223411     82.316722
2020-10-01  SINDHUDURG 2020-10-01   530.0     674.232507     82.077834
2020-11-01  SINDHUDURG 2020-11-01   476.0     674.



=== Metrics ===
  district         rmse  log_marginal_likelihood  \
0    THANE  1539.748517              -279.862544   

                                   coefficient_means  \
0  [0.07839739741932383, -0.003560983536722004, -...   

                                    coefficient_stds  intercept_mean  \
0  [0.09128182580291455, 0.09121958514694915, 0.0...    15996.646452   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      2.889873e+07             0     500  

=== Forecast Data ===
           district       date   actual  forecast_mean  forecast_std
date                                                                
2020-07-01    THANE 2020-07-01  12050.0   14128.231540   2365.322323
2020-08-01    THANE 2020-08-01  12215.0   13851.776276   2371.853786
2020-09-01    THANE 2020-09-01  13770.0   13839.688723   2340.147084
2020-10-01    THANE 2020-10-01  13402.0   14114.230268   2372.853582
2020-11-01    THANE 2020-11-01  13995.0   14228.003715   2341.0822



=== Metrics ===
  district        rmse  log_marginal_likelihood  \
0   WASHIM  148.618951              -234.481868   

                                   coefficient_means  \
0  [0.2547399861941074, -0.2425647374619671, 0.08...   

                                    coefficient_stds  intercept_mean  \
0  [0.1369639989819333, 0.14091457760391307, 0.13...      1769.57716   

   intercept_std  effective_params  differencing  n_iter  
0            NaN     578549.771337             0     500  

=== Forecast Data ===
           district       date  actual  forecast_mean  forecast_std
date                                                               
2020-07-01   WASHIM 2020-07-01  1629.0    1613.352895    457.250352
2020-08-01   WASHIM 2020-08-01  1502.0    1679.792431    455.400971
2020-09-01   WASHIM 2020-09-01  1599.0    1603.190002    444.327916
2020-10-01   WASHIM 2020-10-01  1501.0    1699.102234    431.429055
2020-11-01   WASHIM 2020-11-01  1453.0    1632.067308    426.992480
=== Me



=== Metrics ===
          district        rmse  log_marginal_likelihood  \
0  MUMBAI SUBURBAN  547.126116               -45.383661   

                                   coefficient_means  \
0  [0.17370178045390328, 0.20029270664928311, 0.1...   

                                    coefficient_stds  intercept_mean  \
0  [0.17819497277613575, 0.17831596175993802, 0.1...     4419.859653   

   intercept_std  effective_params  differencing  n_iter  
0            NaN      1.431852e+06             0     500  

=== Forecast Data ===
                   district       date   actual  forecast_mean  forecast_std
date                                                                        
2021-02-01  MUMBAI SUBURBAN 2021-02-01  13444.0   13771.678830   1838.051175
2021-03-01  MUMBAI SUBURBAN 2021-03-01  13294.0   13994.942622   1834.523871


{'district': 'MUMBAI SUBURBAN',
 'forecast_df':                    district       date   actual  forecast_mean  forecast_std
 date                                                                        
 2021-02-01  MUMBAI SUBURBAN 2021-02-01  13444.0   13771.678830   1838.051175
 2021-03-01  MUMBAI SUBURBAN 2021-03-01  13294.0   13994.942622   1834.523871,
 'metrics_df':           district        rmse  log_marginal_likelihood  \
 0  MUMBAI SUBURBAN  547.126116               -45.383661   
 
                                    coefficient_means  \
 0  [0.17370178045390328, 0.20029270664928311, 0.1...   
 
                                     coefficient_stds  intercept_mean  \
 0  [0.17819497277613575, 0.17831596175993802, 0.1...     4419.859653   
 
    intercept_std  effective_params  differencing  n_iter  
 0            NaN      1.431852e+06             0     500  ,
 'model': BayesianRidge(alpha_1=1e-05, compute_score=True, lambda_1=0.001, n_iter=500),
 'coef_means': array([0.1737017

: 