In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from sklearn.linear_model import Lasso
from sklearn.linear_model import LassoCV
from sklearn.model_selection import TimeSeriesSplit

from sklearn.metrics import mean_squared_error

In [None]:
# Load the HMIS CSV extract and keep only the rows whose indicator_type is the
# 'Total [(A+B) or (C+D)]' aggregate.
data = pd.read_csv("../data/HMIS_DATA_CORRECTED_17_21/mh_dist17_21_with_IDs_date_correction.csv")

# .copy() makes the filtered frame independent of the frame it was sliced from,
# so the column assignment below is unambiguous and does not trigger pandas'
# SettingWithCopyWarning.
data = data[data['indicator_type'] == 'Total [(A+B) or (C+D)]'].copy()

# Parse the date column and use it as the index; later cells call .asfreq('MS')
# on per-district slices, which needs a datetime-like index.
data['date'] = pd.to_datetime(data['date'])
data = data.set_index('date')
data.index = pd.DatetimeIndex(data.index)

In [None]:
def lasso_regression_lags_only_cv(
    series,
    district_name,
    max_lags=3,
    differencing=True,
    alphas=np.logspace(-3, 1, 20),  # Range of alpha values to try
    cv=5,  # Number of cross-validation folds
    random_state=42
):
    """
    LASSO regression for time series forecasting with lag features only,
    using cross-validated alpha (L1 regularization strength).

    Pipeline: optional first differencing (applied when the ADF p-value is
    above 0.05), lag-feature construction, chronological 80/20 split,
    LassoCV fitted with a TimeSeriesSplit, prediction on the hold-out tail,
    and (if differenced) reconstruction of the forecasts on the level scale.

    Parameters
    ----------
    series : pandas.Series
        Series to model. Missing values are dropped before the ADF test,
        after differencing and after building the lag columns.
    district_name : str
        Label written to the output rows, the plot title and the PNG name.
    max_lags : int
        Number of lag columns (lag_1 .. lag_<max_lags>) used as features.
    differencing : bool
        If True, run the ADF test and difference once when it does not
        reject a unit root at the 5% level.
    alphas : array-like of float
        Candidate regularisation strengths handed to LassoCV.
    cv : int
        Requested number of TimeSeriesSplit splits. It is reduced when the
        training window is too short to support that many splits.
    random_state : int
        Forwarded to LassoCV.

    Returns
    -------
    dict
        Keys: 'district', 'forecast_df', 'metrics_df', 'model_coefficients',
        'model_intercept', 'best_alpha', 'alphas_tried'.

    Raises
    ------
    ValueError
        If, after lagging and dropping missing rows, the hold-out split is
        empty or the training split cannot support at least 2 CV splits.

    Side effects
    ------------
    Creates the 'LASSORegression' directory if needed, appends one block of
    rows to 'lasso_forecasts_lags_only_cv.csv' and one row to
    'lasso_metrics_lags_only_cv.csv' inside it, and writes one PNG there.
    Because the CSVs are opened in append mode, re-running adds duplicate rows.
    """
    # Accept any array-like grid (list, tuple, ndarray); .tolist() further
    # down requires an ndarray.
    alphas = np.asarray(alphas, dtype=float)

    # Create directory structure
    os.makedirs('LASSORegression', exist_ok=True)

    # 1. Stationarity and Differencing
    original_series = series.copy()
    d = 0
    if differencing:
        adf_result = adfuller(series.dropna())
        # adf_result[1] is the p-value; above 0.05 the unit-root null is not
        # rejected, so the series is differenced once.
        if adf_result[1] > 0.05:
            d = 1
            series = series.diff().dropna()

    # 2. Feature Engineering - Lag features only
    df = pd.DataFrame({'y': series.astype(float)})
    for lag in range(1, max_lags + 1):
        df[f'lag_{lag}'] = df['y'].shift(lag)
    df = df.dropna()

    # 3. Train-Test Split (time-based)
    train_size = int(len(df) * 0.8)
    train = df.iloc[:train_size]
    test = df.iloc[train_size:]
    if test.empty:
        raise ValueError(
            f"{district_name}: no hold-out rows left after building "
            f"{max_lags} lag(s) from {len(original_series)} observation(s)."
        )

    X_train = train.drop(columns=['y'])
    y_train = train['y']
    X_test = test.drop(columns=['y'])
    y_test = test['y']

    # 4. Model Training with LASSO + Cross-Validation
    # TimeSeriesSplit needs n_splits >= 2 and n_splits + 1 <= n_samples, so the
    # requested `cv` is honoured when possible and shrunk for short series
    # instead of failing inside scikit-learn with a less helpful message.
    n_splits = min(int(cv), len(X_train) - 1)
    if n_splits < 2:
        raise ValueError(
            f"{district_name}: only {len(X_train)} training row(s) after "
            f"lagging; at least 3 are required for time-series cross-validation."
        )
    model = LassoCV(
        alphas=alphas,
        cv=TimeSeriesSplit(n_splits=n_splits),
        random_state=random_state,
        max_iter=10000,
    )
    model.fit(X_train, y_train)
    best_alpha = model.alpha_

    # 5. Forecasting
    pred_test = model.predict(X_test)

    # 6. Inverse Differencing if applied
    if d == 1:
        if original_series.index.is_unique:
            # Align by label: rows dropped as NaN above shift positions, so
            # counting from the end of the undropped series can pick the wrong
            # anchor/actuals when the input has missing values.
            anchor_pos = original_series.index.get_loc(test.index[0]) - 1
            y_test = original_series.loc[test.index]
        else:
            # Labels are ambiguous; fall back to positional alignment.
            anchor_pos = len(original_series) - len(test) - 1
            y_test = original_series.iloc[-len(test):]
        last_train_value = original_series.iloc[anchor_pos]
        # NOTE(review): the predicted differences are one-step-ahead (the lag
        # features are observed values), yet they are accumulated from a single
        # anchor, so level errors compound over the hold-out window and gaps
        # inside the window are not accounted for. Confirm this is intended.
        pred_test = np.cumsum(pred_test) + last_train_value

    # 7. Create forecast DataFrame
    forecast_df = pd.DataFrame({
        'district': district_name,
        'date': y_test.index,
        'actual': y_test.values,
        'forecast': pred_test
    })

    # 8. Save forecasts to CSV (header only when the file is first created)
    forecast_csv_path = os.path.join('LASSORegression', 'lasso_forecasts_lags_only_cv.csv')
    write_header = not os.path.exists(forecast_csv_path)
    forecast_df.to_csv(forecast_csv_path, mode='a', header=write_header, index=False)

    # 9. Calculate metrics
    rmse = np.sqrt(mean_squared_error(forecast_df['actual'], forecast_df['forecast']))
    # In-sample R^2 on the modelled (possibly differenced) scale. It is not a
    # hold-out score and is not on the same scale as `rmse` when d == 1.
    r_squared = model.score(X_train, y_train)
    metrics_df = pd.DataFrame([{
        'district': district_name,
        'rmse': rmse,
        'r_squared': r_squared,
        'coefficients': model.coef_.tolist(),
        'intercept': model.intercept_,
        'best_alpha': best_alpha,
        'alphas_tried': alphas.tolist(),
        'differencing': d
    }])

    metrics_csv_path = os.path.join('LASSORegression', 'lasso_metrics_lags_only_cv.csv')
    write_header_metrics = not os.path.exists(metrics_csv_path)
    metrics_df.to_csv(metrics_csv_path, mode='a', header=write_header_metrics, index=False)

    # 10. Plotting (explicit figure handle so exactly this figure is saved and closed)
    fig, ax = plt.subplots(figsize=(12, 6))
    ax.plot(original_series.index, original_series.values, label='Original Series')
    ax.plot(forecast_df['date'], forecast_df['forecast'], label='Forecast', color='darkorange')
    ax.set_title(
        f'LASSO (CV) Regression (Lags Only) for {district_name}\n'
        f'Best alpha: {best_alpha:.4f} | RMSE: {rmse:.2f}, R²: {r_squared:.2f}'
    )
    ax.legend()
    fig.savefig(os.path.join('LASSORegression', f'LASSO_CV_forecast_lags_only_{district_name}.png'))
    plt.close(fig)

    return {
        'district': district_name,
        'forecast_df': forecast_df,
        'metrics_df': metrics_df,
        'model_coefficients': model.coef_,
        'model_intercept': model.intercept_,
        'best_alpha': best_alpha,
        'alphas_tried': alphas
    }

In [None]:
# Distinct values of the 'district' column; the per-district runner
# iterates over this array.
districts = pd.unique(data['district'])

# Module-level RMSE list. Nothing visible in this notebook section appends to
# it or reads it at module scope.
rmse_values = list()

def run_for_each_district():
    """Fit the lag-only LASSO model on column "I40" for every district.

    For each entry of the module-level `districts`, the district's rows are
    selected from `data`, column "I40" is resampled to month-start frequency
    and passed to `lasso_regression_lags_only_cv`. Metrics and the head of
    the forecast table are printed for each district that fits.

    Returns
    -------
    dict
        Maps each successfully modelled district to the result dict returned
        by `lasso_regression_lags_only_cv`. Previously the variable was
        overwritten on every iteration, so only the last district survived.
    """
    results = {}

    for district in districts:
        district_data = data[data['district'] == district]
        ts = district_data["I40"].asfreq('MS')

        try:
            district_result = lasso_regression_lags_only_cv(
                ts,
                district,
                max_lags=5,
                differencing=True,
                alphas=np.logspace(-3, 1, 30),  # Try more alphas if you want
                cv=5
            )
        except ValueError as exc:
            # A district with too few usable observations (e.g. the
            # "number of folds ... greater than the number of samples" error)
            # should not abort the remaining districts; report it and move on.
            print(f"Skipping {district}: {exc}")
            continue

        results[district] = district_result

        # Show results
        print("=== Metrics ===")
        print(district_result['metrics_df'])
        print("\n=== Forecast Data ===")
        print(district_result['forecast_df'].head())

    return results
run_for_each_district()

=== Metrics ===
     district        rmse  r_squared  \
0  AHMEDNAGAR  531.925517    0.08101   

                                        coefficients    intercept  best_alpha  \
0  [-0.038596158529227924, 0.015475550011280096, ...  8385.528295        10.0   

                                        alphas_tried  differencing  
0  [0.001, 0.0013738237958832624, 0.0018873918221...             0  

=== Forecast Data ===
     district       date  actual     forecast
0  AHMEDNAGAR 2020-07-01  6478.0  6810.782296
1  AHMEDNAGAR 2020-08-01  5975.0  6566.470638
2  AHMEDNAGAR 2020-09-01  6550.0  6942.109824
3  AHMEDNAGAR 2020-10-01  6471.0  7081.510105
4  AHMEDNAGAR 2020-11-01  6107.0  7158.542236
=== Metrics ===
  district        rmse  r_squared  \
0    AKOLA  295.673095    0.19196   

                                        coefficients    intercept  best_alpha  \
0  [0.16268711223932864, -0.17554680829749805, -0...  3712.028172        10.0   

                                        alphas_tr

ValueError: Cannot have number of folds=6 greater than the number of samples=5.

In [None]:

from sklearn.preprocessing import StandardScaler

def normalize_series(series):
    """Standardise a pandas Series with a newly fitted StandardScaler.

    Returns a tuple ``(scaled, scaler)``: ``scaled`` is a Series carrying the
    transformed values on the input's index, and ``scaler`` is the fitted
    StandardScaler so callers can invert the transform later.
    """
    # StandardScaler works on 2-D input, so present the values as one column.
    column = series.values.reshape(-1, 1)
    fitted_scaler = StandardScaler()
    standardized = fitted_scaler.fit_transform(column)
    scaled = pd.Series(standardized.flatten(), index=series.index)
    return scaled, fitted_scaler

def run_for_each_district():
    """Fit the lag-only LASSO model on standardised column "I8" per district.

    For each entry of the module-level `districts`, column "I8" is resampled
    to month-start frequency, standardised with `normalize_series`, modelled
    with `lasso_regression_lags_only_cv`, and the resulting actual/forecast
    columns are mapped back to the original scale before an RMSE on that
    scale is computed and printed.

    Note: this definition reuses the name of the earlier I40 runner and
    replaces it once this cell has executed.

    Returns
    -------
    (dict, list)
        The dict maps each successfully modelled district to its result dict
        (with 'forecast_df' on the original scale and an added
        'rmse_original_scale' column in 'metrics_df'). Previously only the
        last district's result was returned because the variable was
        overwritten on every iteration. The list holds the original-scale
        RMSEs in the same order as the dict's keys.
    """
    results = {}
    rmse_values = []

    for district in districts:
        district_data = data[data['district'] == district]
        ts = district_data["I8"].asfreq('MS')

        # Normalize the time series data
        # NOTE(review): the scaler is fitted on the full series, including the
        # rows that later form the hold-out window, so the hold-out evaluation
        # sees statistics from its own period. Confirm this is acceptable.
        scaled_ts, scaler = normalize_series(ts)

        # Run LASSO on normalized data
        # NOTE: the forecast/metric rows that lasso_regression_lags_only_cv
        # appends to its CSV files are written before the back-transform below,
        # i.e. they are on the normalised scale.
        try:
            district_result = lasso_regression_lags_only_cv(
                scaled_ts,
                district,
                max_lags=5,
                differencing=True,
                alphas=np.logspace(-3, 1, 30),
                cv=5
            )
        except ValueError as exc:
            # Do not let one district with too little usable data abort the
            # whole run; report it and continue with the next one.
            print(f"\n=== {district} ===")
            print(f"Skipped: {exc}")
            continue

        # Convert forecasts back to original scale
        forecast_df = district_result['forecast_df']
        forecast_df['actual'] = scaler.inverse_transform(
            forecast_df[['actual']]
        ).flatten()
        forecast_df['forecast'] = scaler.inverse_transform(
            forecast_df[['forecast']]
        ).flatten()

        # Recalculate RMSE in original scale
        rmse = np.sqrt(mean_squared_error(
            forecast_df['actual'],
            forecast_df['forecast']
        ))
        rmse_values.append(rmse)

        # Update results with transformed data
        district_result['forecast_df'] = forecast_df
        district_result['metrics_df']['rmse_original_scale'] = rmse  # Add new metric
        results[district] = district_result

        print(f"\n=== {district} ===")
        print(f"Normalized RMSE: {district_result['metrics_df']['rmse'].values[0]:.2f}")
        print(f"Original Scale RMSE: {rmse:.2f}")
        print("Forecast Preview:")
        print(forecast_df.head())

    return results, rmse_values

# Execute the analysis
final_results, district_rmses = run_for_each_district()



=== AHMEDNAGAR ===
Normalized RMSE: 1.02
Original Scale RMSE: 546.43
Forecast Preview:
     district       date  actual     forecast
0  AHMEDNAGAR 2020-07-01  6478.0  6892.470588
1  AHMEDNAGAR 2020-08-01  5975.0  6892.470588
2  AHMEDNAGAR 2020-09-01  6550.0  6892.470588
3  AHMEDNAGAR 2020-10-01  6471.0  6892.470588
4  AHMEDNAGAR 2020-11-01  6107.0  6892.470588

=== AKOLA ===
Normalized RMSE: 1.33
Original Scale RMSE: 289.97
Forecast Preview:
  district       date  actual     forecast
0    AKOLA 2020-07-01  3121.0  2698.117647
1    AKOLA 2020-08-01  3029.0  2698.117647
2    AKOLA 2020-09-01  2763.0  2698.117647
3    AKOLA 2020-10-01  2737.0  2698.117647
4    AKOLA 2020-11-01  2700.0  2698.117647

=== AMRAVATI ===
Normalized RMSE: 1.13
Original Scale RMSE: 381.81
Forecast Preview:
   district       date  actual     forecast
0  AMRAVATI 2020-07-01  3459.0  3761.748022
1  AMRAVATI 2020-08-01  3049.0  3664.970695
2  AMRAVATI 2020-09-01  3325.0  3619.811042
3  AMRAVATI 2020-10-01  3191.0  3