In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from statsmodels.stats.diagnostic import acorr_ljungbox

from statsmodels.stats.stattools import durbin_watson
from statsmodels.graphics.tsaplots import plot_acf
from scipy.stats import shapiro
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_squared_error
import statsmodels.api as sm

In [2]:
data = pd.read_csv("../data/HMIS_DATA_CORRECTED_17_21/mh_dist17_21_with_IDs_date_correction.csv")
data = data[(data['indicator_type'] == 'Total [(A+B) or (C+D)]')]
data['date'] = pd.to_datetime(data['date'])
data = data.set_index('date')
data.index = pd.DatetimeIndex(data.index)

In [3]:
def linear_regression_lags_only(series, district_name, max_lags=3, differencing=True):
    """
    Simplified linear regression for time series forecasting with:
    - Lag features only
    - Ordinary Least Squares (OLS) regression
    - Optional differencing for stationarity
    """
    # Create directory structure
    os.makedirs('LinearRegression', exist_ok=True)
    
    # 1. Stationarity and Differencing
    original_series = series.copy()
    d = 0
    if differencing:
        adf_result = adfuller(series.dropna())
        if adf_result[1] > 0.05:
            d = 1
            series = series.diff().dropna()

    # 2. Feature Engineering - Lag features only
    df = pd.DataFrame({'y': series.astype(float)})
    for lag in range(1, max_lags + 1):
        df[f'lag_{lag}'] = df['y'].shift(lag).astype(float)
    df = df.dropna()

    # 3. Train-Test Split (time-based)
    train_size = int(len(df) * 0.8)
    train = df.iloc[:train_size]
    test = df.iloc[train_size:]

    X_train = train.drop(columns=['y'])
    y_train = train['y']
    X_test = test.drop(columns=['y'])
    y_test = test['y']

    # 4. Model Training with OLS
    X_train_sm = sm.add_constant(X_train)
    model = sm.OLS(y_train, X_train_sm).fit()

    # 5. Forecasting
    X_test_sm = sm.add_constant(X_test)
    pred_test = model.predict(X_test_sm)

    # 6. Inverse Differencing if applied
    if d == 1:
        last_train_value = original_series.iloc[len(original_series) - len(test) - 1]
        pred_test = pred_test.cumsum() + last_train_value
        y_test = original_series.iloc[-len(test):]
    
    # 7. Create forecast DataFrame
    forecast_df = pd.DataFrame({
        'district': district_name,
        'date': y_test.index,
        'actual': y_test.values,
        'forecast': pred_test.values
    })

    # 8. Save forecasts to CSV
    forecast_csv_path = os.path.join('LinearRegression', 'linear_forecasts_lags_only.csv')
    write_header = not os.path.exists(forecast_csv_path)
    forecast_df.to_csv(forecast_csv_path, mode='a', header=write_header, index=False)

    # 9. Calculate metrics
    rmse = np.sqrt(mean_squared_error(forecast_df['actual'], forecast_df['forecast']))
    metrics_df = pd.DataFrame([{
        'district': district_name,
        'rmse': rmse,
        'r_squared': model.rsquared,
        'params': model.params.to_dict(),
        'differencing': d
    }])

    metrics_csv_path = os.path.join('LinearRegression', 'linear_metrics_lags_only.csv')
    write_header_metrics = not os.path.exists(metrics_csv_path)
    metrics_df.to_csv(metrics_csv_path, mode='a', header=write_header_metrics, index=False)

    # 10. Plotting
    plt.figure(figsize=(12, 6))
    plt.plot(original_series.index, original_series.values, label='Original Series')
    plt.plot(forecast_df['date'], forecast_df['forecast'], label='Forecast', color='darkorange')
    plt.title(f'Linear Regression (Lags Only) Forecast for {district_name}\nRMSE: {rmse:.2f}, R²: {model.rsquared:.2f}')
    plt.legend()
    plt.savefig(os.path.join('LinearRegression', f'Linear_forecast_lags_only_{district_name}.png'))
    plt.close()

    return {
        'district': district_name,
        'forecast_df': forecast_df,
        'metrics_df': metrics_df,
        'model_summary': model.summary().as_text()
    }

In [4]:
districts = data['district'].unique()
rmse_values = []

def run_for_each_district():
    results = {}
    
    for district in districts:
        district_data = data[data['district'] == district]
        ts = district_data["I55"].asfreq('MS')
        
        results = linear_regression_lags_only(ts, district)
    
        # Show results
        print("=== Metrics ===")
        print(results['metrics_df'])
        print("\n=== Forecast Data ===")
        print(results['forecast_df'].head())
    
    return results
run_for_each_district()

=== Metrics ===
     district         rmse  r_squared  \
0  AHMEDNAGAR  1097.847303   0.016063   

                                              params  differencing  
0  {'const': 4645.802543119852, 'lag_1': 0.051969...             0  

=== Forecast Data ===
     district       date  actual     forecast
0  AHMEDNAGAR 2020-07-01  6225.0  5693.196308
1  AHMEDNAGAR 2020-08-01  6609.0  5740.042167
2  AHMEDNAGAR 2020-09-01  7268.0  5735.792717
3  AHMEDNAGAR 2020-10-01  6799.0  5826.873600
4  AHMEDNAGAR 2020-11-01  5248.0  5859.107910
=== Metrics ===
  district        rmse  r_squared  \
0    AKOLA  493.705353   0.281245   

                                              params  differencing  
0  {'const': 107.65066799576812, 'lag_1': -0.5185...             1  

=== Forecast Data ===
  district       date  actual     forecast
0    AKOLA 2020-07-01    1999  2631.615871
1    AKOLA 2020-08-01    1927  2624.163672
2    AKOLA 2020-09-01    2604  2688.552528
3    AKOLA 2020-10-01    2342  2462.7181

  warn("omni_normtest is not valid with less than 8 observations; %i "


{'district': 'MUMBAI SUBURBAN',
 'forecast_df':           district       date  actual     forecast
 0  MUMBAI SUBURBAN 2021-02-01    3958  6918.119819
 1  MUMBAI SUBURBAN 2021-03-01    4827  9077.671903,
 'metrics_df':           district         rmse  r_squared  \
 0  MUMBAI SUBURBAN  3662.684874   0.785455   
 
                                               params  differencing  
 0  {'const': 704.0110694687002, 'lag_1': -0.41231...             1  ,