In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from sklearn.linear_model import ElasticNet, ElasticNetCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit


In [2]:
data = pd.read_csv("../data/HMIS_DATA_CORRECTED_17_21/mh_dist17_21_with_IDs_date_correction.csv")
data = data[(data['indicator_type'] == 'Total [(A+B) or (C+D)]')]
data['date'] = pd.to_datetime(data['date'])
data = data.set_index('date')
data.index = pd.DatetimeIndex(data.index)

In [3]:
def elastic_net_regression_lags_only(
    series,
    district_name,
    max_lags=3,
    differencing=True,
    alpha=1.0,
    l1_ratio=0.5,  # Mix ratio (0=Ridge, 1=LASSO)
    use_cv=False,
    alphas=np.logspace(-3, 2, 20),
    l1_ratios=[0.1, 0.5, 0.7, 0.9, 0.95, 0.99, 1],  # Tested if use_cv=True
    cv=5,
    random_state=42
):
    """
    Elastic Net regression (L1/L2 regularization) for time series forecasting.
    Supports both fixed parameters and cross-validated hyperparameter tuning.
    """
    # Create directory structure
    os.makedirs('ElasticNetRegression', exist_ok=True)
    
    # 1. Stationarity and Differencing
    original_series = series.copy()
    d = 0
    if differencing:
        adf_result = adfuller(series.dropna())
        if adf_result[1] > 0.05:
            d = 1
            series = series.diff().dropna()

    # 2. Feature Engineering - Lag features
    df = pd.DataFrame({'y': series.astype(float)})
    for lag in range(1, max_lags + 1):
        df[f'lag_{lag}'] = df['y'].shift(lag).astype(float)
    df = df.dropna()

    # 3. Train-Test Split
    train_size = int(len(df) * 0.8)
    train = df.iloc[:train_size]
    test = df.iloc[train_size:]

    X_train = train.drop(columns=['y'])
    y_train = train['y']
    X_test = test.drop(columns=['y'])
    y_test = test['y']

    # 4. Model Training
    if use_cv:
        model = ElasticNetCV(
            alphas=alphas,
            l1_ratio=l1_ratios,
            cv=TimeSeriesSplit(n_splits=5),
            max_iter=10000,
            random_state=random_state
        )
        model.fit(X_train, y_train)
        best_alpha = model.alpha_
        best_l1 = model.l1_ratio_
    else:
        model = ElasticNet(
            alpha=alpha,
            l1_ratio=l1_ratio,
            max_iter=10000,
            random_state=random_state
        )
        model.fit(X_train, y_train)
        best_alpha = alpha
        best_l1 = l1_ratio

    # 5. Forecasting
    pred_test = model.predict(X_test)

    # 6. Inverse Differencing
    if d == 1:
        last_train_value = original_series.iloc[len(original_series) - len(test) - 1]
        pred_test = np.cumsum(pred_test) + last_train_value
        y_test = original_series.iloc[-len(test):]
    
    # 7. Forecast DataFrame
    forecast_df = pd.DataFrame({
        'district': district_name,
        'date': y_test.index,
        'actual': y_test.values,
        'forecast': pred_test
    })

    # 8. Save Forecasts
    forecast_csv_path = os.path.join('ElasticNetRegression', 'elasticnet_forecasts.csv')
    write_header = not os.path.exists(forecast_csv_path)
    forecast_df.to_csv(forecast_csv_path, mode='a', header=write_header, index=False)

    # 9. Metrics Calculation
    rmse = np.sqrt(mean_squared_error(forecast_df['actual'], forecast_df['forecast']))
    r_squared = model.score(X_train, y_train)
    
    metrics_data = {
        'district': district_name,
        'rmse': rmse,
        'r_squared': r_squared,
        'coefficients': model.coef_.tolist(),
        'intercept': model.intercept_,
        'final_alpha': best_alpha,
        'final_l1_ratio': best_l1,
        'differencing': d
    }
    
    if use_cv:
        metrics_data.update({
            'alphas_tried': alphas.tolist(),
            'l1_ratios_tried': l1_ratios
        })

    metrics_df = pd.DataFrame([metrics_data])

    metrics_csv_path = os.path.join('ElasticNetRegression', 'elasticnet_metrics.csv')
    write_header_metrics = not os.path.exists(metrics_csv_path)
    metrics_df.to_csv(metrics_csv_path, mode='a', header=write_header_metrics, index=False)

    # 10. Plotting
    plt.figure(figsize=(12, 6))
    plt.plot(original_series.index, original_series.values, label='Original Series')
    plt.plot(forecast_df['date'], forecast_df['forecast'], label='Forecast', color='darkorange')
    plt.title(
        f'Elastic Net Forecast for {district_name}\n'
        f'Alpha: {best_alpha:.4f}, L1 Ratio: {best_l1:.2f} | '
        f'RMSE: {rmse:.2f}, R²: {r_squared:.2f}'
    )
    plt.legend()
    plt.savefig(os.path.join('ElasticNetRegression', f'elasticnet_forecast_{district_name}.png'))
    plt.close()

    return {
        'district': district_name,
        'forecast_df': forecast_df,
        'metrics_df': metrics_df,
        'model_coefficients': model.coef_,
        'model_intercept': model.intercept_,
        'best_alpha': best_alpha,
        'best_l1_ratio': best_l1
    }

In [4]:
districts = data['district'].unique()
rmse_values = []

def run_for_each_district():
    results = {}
    
    for district in districts:
        district_data = data[data['district'] == district]
        ts = district_data["I8"].asfreq('MS')
        
        results = elastic_net_regression_lags_only(
            ts,
            district,
            use_cv=True,
            alphas=np.logspace(-3, 1, 30),
            l1_ratios=[0.1, 0.3, 0.5, 0.7, 0.9, 1],
            cv=5
        )
        
    
        # Show results
        print("=== Metrics ===")
        print(results['metrics_df'])
        print("\n=== Forecast Data ===")
        print(results['forecast_df'].head())
    
    return results
run_for_each_district()

=== Metrics ===
     district         rmse  r_squared  \
0  AHMEDNAGAR  1779.505275    0.14262   

                                        coefficients   intercept  final_alpha  \
0  [-0.39700954793011056, -0.20203358894671658, -...  154.590092         10.0   

   final_l1_ratio  differencing  \
0             1.0             1   

                                        alphas_tried  \
0  [0.001, 0.0013738237958832624, 0.0018873918221...   

                l1_ratios_tried  
0  [0.1, 0.3, 0.5, 0.7, 0.9, 1]  

=== Forecast Data ===
     district       date  actual     forecast
0  AHMEDNAGAR 2020-07-01    5982  6556.290117
1  AHMEDNAGAR 2020-08-01    5559  6970.792117
2  AHMEDNAGAR 2020-09-01    5903  7459.921293
3  AHMEDNAGAR 2020-10-01    5719  7617.673318
4  AHMEDNAGAR 2020-11-01    5427  7834.678953
=== Metrics ===
  district        rmse  r_squared  \
0    AKOLA  287.364895    0.10274   

                                        coefficients  intercept  final_alpha  \
0  [-0.259796973

  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(
  model = cd_fast.enet_coordinate_descent(


=== Metrics ===
          district         rmse  r_squared  \
0  MUMBAI SUBURBAN  1672.411352   0.678944   

                                        coefficients    intercept  \
0  [-0.9641694349722668, -0.6259684977420369, -0....  1505.719205   

   final_alpha  final_l1_ratio  differencing  \
0         10.0             0.5             1   

                                        alphas_tried  \
0  [0.001, 0.0013738237958832624, 0.0018873918221...   

                l1_ratios_tried  
0  [0.1, 0.3, 0.5, 0.7, 0.9, 1]  

=== Forecast Data ===
          district       date  actual      forecast
0  MUMBAI SUBURBAN 2021-02-01    9380  10248.180020
1  MUMBAI SUBURBAN 2021-03-01    9076  11276.041571


{'district': 'MUMBAI SUBURBAN',
 'forecast_df':           district       date  actual      forecast
 0  MUMBAI SUBURBAN 2021-02-01    9380  10248.180020
 1  MUMBAI SUBURBAN 2021-03-01    9076  11276.041571,
 'metrics_df':           district         rmse  r_squared  \
 0  MUMBAI SUBURBAN  1672.411352   0.678944   
 
                                         coefficients    intercept  \
 0  [-0.9641694349722668, -0.6259684977420369, -0....  1505.719205   
 
    final_alpha  final_l1_ratio  differencing  \
 0         10.0             0.5             1   
 
                                         alphas_tried  \
 0  [0.001, 0.0013738237958832624, 0.0018873918221...   
 
                 l1_ratios_tried  
 0  [0.1, 0.3, 0.5, 0.7, 0.9, 1]  ,
 'model_coefficients': array([-0.96416943, -0.6259685 , -0.45154936]),
 'model_intercept': 1505.7192046900993,
 'best_alpha': 10.0,
 'best_l1_ratio': 0.5}