In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

In [2]:
data = pd.read_csv("../data/HMIS_DATA_CORRECTED_17_21/mh_dist17_21_with_IDs_date_correction.csv")
data = data[(data['indicator_type'] == 'Total [(A+B) or (C+D)]')]
data['date'] = pd.to_datetime(data['date'])
data = data.set_index('date')
data.index = pd.DatetimeIndex(data.index)

In [3]:
def knn_regression_lags_only(
    series,
    district_name,
    max_lags=3,
    differencing=True,
    n_neighbors=5,
    weights='uniform',
    algorithm='auto',
    use_cv=False,
    cv_params=None,
    cv=5
):
    """
    K-Nearest Neighbors regression for time series forecasting with:
    - Lag features only
    - Optional differencing for stationarity
    - Optional hyperparameter tuning via cross-validation
    """
    # 1. Directory structure
    os.makedirs('KNNRegression', exist_ok=True)
    
    # 2. Stationarity and Differencing
    original_series = series.copy()
    d = 0
    if differencing:
        adf_result = adfuller(series.dropna())
        if adf_result[1] > 0.05:
            d = 1
            series = series.diff().dropna()

    # 3. Feature Engineering - Lag features only
    df = pd.DataFrame({'y': series.astype(float)})
    for lag in range(1, max_lags + 1):
        df[f'lag_{lag}'] = df['y'].shift(lag).astype(float)
    df = df.dropna()

    # 4. Train-Test Split (time-based)
    train_size = int(len(df) * 0.8)
    train = df.iloc[:train_size]
    test = df.iloc[train_size:]

    X_train = train.drop(columns=['y'])
    y_train = train['y']
    X_test = test.drop(columns=['y'])
    y_test = test['y']

    # 5. Model Training (with optional CV)
    if use_cv:
        param_grid = cv_params or {
            'n_neighbors': [3, 5, 7, 9],
            'weights': ['uniform', 'distance'],
            'algorithm': ['auto', 'ball_tree', 'kd_tree']
        }
        grid_search = GridSearchCV(
            KNeighborsRegressor(),
            param_grid,
            cv=cv,
            scoring='neg_mean_squared_error'
        )
        grid_search.fit(X_train, y_train)
        model = grid_search.best_estimator_
        best_params = grid_search.best_params_
    else:
        model = KNeighborsRegressor(
            n_neighbors=n_neighbors,
            weights=weights,
            algorithm=algorithm
        )
        model.fit(X_train, y_train)
        best_params = {
            'n_neighbors': n_neighbors,
            'weights': weights,
            'algorithm': algorithm
        }

    # 6. Forecasting
    pred_test = model.predict(X_test)

    # 7. Inverse Differencing if applied
    if d == 1:
        last_train_value = original_series.iloc[len(original_series) - len(test) - 1]
        pred_test = np.cumsum(pred_test) + last_train_value
        y_test = original_series.iloc[-len(test):]

    # 8. Create forecast DataFrame
    forecast_df = pd.DataFrame({
        'district': district_name,
        'date': y_test.index,
        'actual': y_test.values,
        'forecast': pred_test
    })

    # 9. Save forecasts to CSV
    forecast_csv_path = os.path.join('KNNRegression', 'knn_forecasts.csv')
    forecast_df.to_csv(
        forecast_csv_path,
        mode='a',
        header=not os.path.exists(forecast_csv_path),
        index=False
    )

    # 10. Calculate metrics
    rmse = np.sqrt(mean_squared_error(forecast_df['actual'], forecast_df['forecast']))
    r_squared = model.score(X_train, y_train)
    metrics_data = {
        'district': district_name,
        'rmse': rmse,
        'r_squared': r_squared,
        'best_params': best_params,
        'differencing': d
    }
    metrics_df = pd.DataFrame([metrics_data])

    metrics_csv_path = os.path.join('KNNRegression', 'knn_metrics.csv')
    metrics_df.to_csv(
        metrics_csv_path,
        mode='a',
        header=not os.path.exists(metrics_csv_path),
        index=False
    )

    # 11. Plotting
    plt.figure(figsize=(12, 6))
    plt.plot(original_series.index, original_series.values, label='Original Series')
    plt.plot(forecast_df['date'], forecast_df['forecast'], label='KNN Forecast', color='darkorange')
    plt.title(f'KNN Regression Forecast for {district_name}\nRMSE: {rmse:.2f}, R²: {r_squared:.2f}')
    plt.legend()
    plt.savefig(os.path.join('KNNRegression', f'knn_forecast_{district_name}.png'))
    plt.close()

    return {
        'district': district_name,
        'forecast_df': forecast_df,
        'metrics_df': metrics_df,
        'best_params': best_params
    }

In [4]:
districts = data['district'].unique()
rmse_values = []

def run_for_each_district():
    results = {}
    
    for district in districts:
        district_data = data[data['district'] == district]
        ts = district_data["I40"].asfreq('MS')
        
        results = knn_regression_lags_only(
            ts,
            district,
            max_lags=5,
            use_cv=True,
            cv_params={
                'n_neighbors': [3, 5, 7, 9],
                'weights': ['uniform', 'distance']
            }
        )

        
    
        # Show results
        print("=== Metrics ===")
        print(results['metrics_df'])
        print("\n=== Forecast Data ===")
        print(results['forecast_df'].head())
    
    return results
run_for_each_district()

=== Metrics ===
     district      rmse  r_squared                               best_params  \
0  AHMEDNAGAR  18.33896   0.083483  {'n_neighbors': 7, 'weights': 'uniform'}   

   differencing  
0             0  

=== Forecast Data ===
     district       date  actual   forecast
0  AHMEDNAGAR 2020-07-01    40.0  37.714286
1  AHMEDNAGAR 2020-08-01    55.0  37.714286
2  AHMEDNAGAR 2020-09-01    36.0  39.857143
3  AHMEDNAGAR 2020-10-01    82.0  36.428571
4  AHMEDNAGAR 2020-11-01    49.0  48.142857
=== Metrics ===
  district       rmse  r_squared                                best_params  \
0    AKOLA  10.775273        1.0  {'n_neighbors': 9, 'weights': 'distance'}   

   differencing  
0             0  

=== Forecast Data ===
  district       date  actual   forecast
0    AKOLA 2020-07-01    34.0  44.883932
1    AKOLA 2020-08-01    45.0  44.580450
2    AKOLA 2020-09-01    54.0  44.752158
3    AKOLA 2020-10-01    48.0  47.731520
4    AKOLA 2020-11-01    39.0  50.204471
=== Metrics ===
   d

ValueError: Cannot have number of splits n_splits=5 greater than the number of samples: n_samples=4.