In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

In [2]:
data = pd.read_csv("../data/HMIS_DATA_CORRECTED_17_21/mh_dist17_21_with_IDs_date_correction.csv")
data = data[(data['indicator_type'] == 'Total [(A+B) or (C+D)]')]
data['date'] = pd.to_datetime(data['date'])
data = data.set_index('date')
data.index = pd.DatetimeIndex(data.index)

In [3]:
def gradient_boosting_regression_lags_only(
    series,
    district_name,
    max_lags=3,
    differencing=True,
    n_estimators=100,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
    use_cv=False,
    cv_params=None,
    cv=5
):
    """
    Gradient Boosting regression for time series forecasting with:
    - Sequential boosting with decision trees
    - Time-series aware cross-validation
    - Early stopping (via n_iter_no_change)
    - Feature importance analysis
    """
    # 1. Directory setup
    os.makedirs('GradientBoostingRegression', exist_ok=True)
    
    # 2. Stationarity handling
    original_series = series.copy()
    d = 0
    if differencing:
        adf_result = adfuller(series.dropna())
        if adf_result[1] > 0.05:
            d = 1
            series = series.diff().dropna()

    # 3. Create lag features
    df = pd.DataFrame({'y': series.astype(float)})
    for lag in range(1, max_lags + 1):
        df[f'lag_{lag}'] = df['y'].shift(lag).astype(float)
    df = df.dropna()

    # 4. Temporal split
    train_size = int(len(df) * 0.8)
    train = df.iloc[:train_size]
    test = df.iloc[train_size:]

    X_train = train.drop(columns=['y'])
    y_train = train['y']
    X_test = test.drop(columns=['y'])
    y_test = test['y']

    # 5. Model training with temporal CV
    if use_cv:
        param_grid = cv_params or {
            'n_estimators': [100, 200],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3, 5],
            'subsample': [0.8, 1.0],
            'min_samples_split': [2, 5]
        }
        tscv = TimeSeriesSplit(n_splits=cv)
        grid_search = GridSearchCV(
            GradientBoostingRegressor(
                random_state=random_state,
                n_iter_no_change=10,
                validation_fraction=0.1
            ),
            param_grid,
            cv=tscv,
            scoring='neg_mean_squared_error',
            n_jobs=-1
        )
        grid_search.fit(X_train, y_train)
        model = grid_search.best_estimator_
        best_params = grid_search.best_params_
    else:
        model = GradientBoostingRegressor(
            n_estimators=n_estimators,
            learning_rate=learning_rate,
            max_depth=max_depth,
            random_state=random_state,
            n_iter_no_change=10,
            validation_fraction=0.1
        )
        model.fit(X_train, y_train)
        best_params = {
            'n_estimators': model.n_estimators_,
            'learning_rate': learning_rate,
            'max_depth': max_depth
        }

    # 6. Forecasting
    pred_test = model.predict(X_test)

    # 7. Inverse differencing
    if d == 1:
        last_train_value = original_series.iloc[len(original_series) - len(test) - 1]
        pred_test = np.cumsum(pred_test) + last_train_value
        y_test = original_series.iloc[-len(test):]

    # 8. Save forecasts
    forecast_df = pd.DataFrame({
        'district': district_name,
        'date': y_test.index,
        'actual': y_test.values,
        'forecast': pred_test
    })
    forecast_csv_path = os.path.join('GradientBoostingRegression', 'gb_forecasts.csv')
    forecast_df.to_csv(forecast_csv_path, mode='a', header=not os.path.exists(forecast_csv_path), index=False)

    # 9. Calculate metrics
    rmse = np.sqrt(mean_squared_error(forecast_df['actual'], forecast_df['forecast']))
    metrics_data = {
        'district': district_name,
        'rmse': rmse,
        'r_squared': model.score(X_train, y_train),
        'best_params': best_params,
        'differencing': d,
        'best_iteration': model.n_estimators_,
        'feature_importances': model.feature_importances_.tolist()
    }
    metrics_df = pd.DataFrame([metrics_data])
    metrics_csv_path = os.path.join('GradientBoostingRegression', 'gb_metrics.csv')
    metrics_df.to_csv(metrics_csv_path, mode='a', header=not os.path.exists(metrics_csv_path), index=False)

    # 10. Visualization
    plt.figure(figsize=(12, 6))
    plt.plot(original_series.index, original_series.values, label='Original Series')
    plt.plot(forecast_df['date'], forecast_df['forecast'], label='GB Forecast', color='darkblue')
    plt.title(f'Gradient Boosting Forecast for {district_name}\nRMSE: {rmse:.2f}, Trees: {model.n_estimators_}')
    plt.legend()
    plt.savefig(os.path.join('GradientBoostingRegression', f'gb_forecast_{district_name}.png'))
    plt.close()

    # Feature importance plot
    features = X_train.columns
    importances = model.feature_importances_
    plt.figure(figsize=(10, 6))
    plt.barh(features, importances, color='darkblue')
    plt.title(f'Feature Importances - {district_name}')
    plt.savefig(os.path.join('GradientBoostingRegression', f'gb_importances_{district_name}.png'))
    plt.close()

    return {
        'district': district_name,
        'forecast_df': forecast_df,
        'metrics_df': metrics_df,
        'feature_importances': dict(zip(features, importances)),
        'training_stages': model.train_score_
    }

In [4]:
districts = data['district'].unique()
rmse_values = []

def run_for_each_district():
    results = {}
    
    for district in districts:
        district_data = data[data['district'] == district]
        ts = district_data["I48"].asfreq('MS')
        
        results = gradient_boosting_regression_lags_only(
            ts,
            district,
            use_cv=True,
            cv_params={
                'learning_rate': [0.01, 0.05, 0.1],
                'max_depth': [3, 5, 7],
                'subsample': [0.8, 1.0]
            },
            cv=3
        )

        
    
        # Show results
        print("=== Metrics ===")
        print(results['metrics_df'])
        print("\n=== Forecast Data ===")
        print(results['forecast_df'].head())
    
    return results
run_for_each_district()

=== Metrics ===
     district       rmse  r_squared  \
0  AHMEDNAGAR  131.67707   0.635088   

                                         best_params  differencing  \
0  {'learning_rate': 0.05, 'max_depth': 7, 'subsa...             0   

   best_iteration                                feature_importances  
0              13  [0.40171873765838656, 0.29066800645184176, 0.3...  

=== Forecast Data ===
     district       date  actual    forecast
0  AHMEDNAGAR 2020-07-01   354.0  437.885471
1  AHMEDNAGAR 2020-08-01   391.0  442.254642
2  AHMEDNAGAR 2020-09-01   417.0  437.885471
3  AHMEDNAGAR 2020-10-01   343.0  476.769914
4  AHMEDNAGAR 2020-11-01   468.0  422.365099
=== Metrics ===
  district       rmse  r_squared  \
0    AKOLA  87.290967   0.676997   

                                         best_params  differencing  \
0  {'learning_rate': 0.01, 'max_depth': 7, 'subsa...             0   

   best_iteration                                feature_importances  
0             100  [0.277436

{'district': 'MUMBAI SUBURBAN',
 'forecast_df':           district       date  actual    forecast
 0  MUMBAI SUBURBAN 2021-02-01   607.0  787.548211
 1  MUMBAI SUBURBAN 2021-03-01   618.0  796.270886,
 'metrics_df':           district        rmse  r_squared  \
 0  MUMBAI SUBURBAN  179.413161   0.399637   
 
                                          best_params  differencing  \
 0  {'learning_rate': 0.05, 'max_depth': 3, 'subsa...             0   
 
    best_iteration                                feature_importances  
 0              16  [0.17223432341679512, 0.09940442906009306, 0.7...  ,
 'feature_importances': {'lag_1': 0.17223432341679512,
  'lag_2': 0.09940442906009306,
  'lag_3': 0.7283612475231118},
 'training_stages': array([20713.75381944, 15567.18815486,  7525.67008341, 13077.40728062,
        11523.50748301, 14605.70790428,  5485.60144539, 12288.53478308,
        17156.13093837, 15483.40817188,  6986.66474337,  6324.54335319,
         7953.507557  ,  7091.47146989, 10410.82