In [3]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from statsmodels.tsa.stattools import adfuller
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from xgboost import XGBRegressor

In [4]:
# Load the CSV, keep only the rows whose indicator_type is the
# 'Total [(A+B) or (C+D)]' category, parse the date column and use it as the
# index. Every step returns a new frame, so nothing is assigned into a filtered
# slice (which is what triggers pandas' SettingWithCopyWarning), and set_index
# on a datetime64 column already yields a DatetimeIndex.
data = (
    pd.read_csv("../data/HMIS_DATA_CORRECTED_17_21/mh_dist17_21_with_IDs_date_correction.csv")
    .loc[lambda frame: frame['indicator_type'] == 'Total [(A+B) or (C+D)]']
    .assign(date=lambda frame: pd.to_datetime(frame['date']))
    .set_index('date')
)

In [None]:
def xgboost_forecast_lags(
    series,
    district_name,
    max_lags=12,
    differencing=True,
    use_cv=True,
    cv_params=None,
    custom_params=None
):
    """
    Fit an XGBoost regressor on lagged values of ``series`` and evaluate it on
    the last 20% of the usable rows.

    The lag features of the test rows are built from observed values, so every
    forecast is a one-step-ahead prediction.

    Parameters
    ----------
    series : pd.Series
        Values to model. Its index supplies the forecast dates and the plot
        x-axis. Rows with missing values are dropped when the lag table is built.
    district_name : str
        Label stored in the outputs and used in the output file names.
    max_lags : int, default 12
        Number of lag columns (``lag_1`` .. ``lag_{max_lags}``) used as features.
    differencing : bool, default True
        If True, run an ADF test and model first differences when the test's
        p-value is above 0.05.
    use_cv : bool, default True
        If True, tune hyper-parameters with ``GridSearchCV`` over a
        ``TimeSeriesSplit``; otherwise fit a single model.
    cv_params : dict, optional
        Parameter grid for the search; a small built-in grid is used if omitted.
    custom_params : dict, optional
        Keyword arguments for ``XGBRegressor``. When given (and non-empty) they
        replace the built-in defaults rather than being merged with them.

    Returns
    -------
    tuple
        ``(metrics, forecast_df)``. ``metrics`` is a dict with the keys
        ``district``, ``rmse``, ``best_params``, ``differencing`` (0 or 1) and
        ``feature_importances``; ``forecast_df`` has the columns ``district``,
        ``date``, ``actual`` and ``forecast``. Returns ``(None, None)`` after
        printing a message when a ``ValueError`` or ``IndexError`` is raised,
        e.g. because too few samples remain, so a calling loop can continue.

    Side effects
    ------------
    On success writes ``{district_name}_forecast.csv`` and
    ``{district_name}_forecast.png`` into the ``XGBoost_Forecasts`` directory
    (created on demand, relative to the current working directory).
    """
    try:
        # 1. Stationarity check and optional first differencing
        original_series = series.copy()
        d = 0
        if differencing:
            adf_result = adfuller(series.dropna())
            if adf_result[1] > 0.05:  # element 1 of the result is the p-value
                d = 1
                series = series.diff().dropna()
                if len(series) < 5:  # Check after differencing
                    raise ValueError(f"Series too short after differencing: {len(series)}")

        # 2. Create lag features with validation
        df = pd.DataFrame({'y': series.astype(float)})
        for lag in range(1, max_lags + 1):
            df[f'lag_{lag}'] = df['y'].shift(lag)
        df = df.dropna()

        if len(df) < 10:
            raise ValueError(f"Only {len(df)} samples after lag features")

        # 3. Temporal split validation (no shuffling: the test set is the tail)
        train_size = int(len(df) * 0.8)
        if train_size < 5 or (len(df) - train_size) < 2:
            raise ValueError(f"Insufficient split: {train_size} train, {len(df)-train_size} test")

        train = df.iloc[:train_size]
        test = df.iloc[train_size:]

        X_train = train.drop(columns=['y'])
        y_train = train['y']
        X_test = test.drop(columns=['y'])
        y_test = test['y']

        # 4. Model configuration
        xgb_params = custom_params or {
            'n_estimators': 500,
            'learning_rate': 0.05,
            'max_depth': 3,
            'subsample': 0.8,
            'colsample_bytree': 0.8,
            'random_state': 42
        }
        # Quiet by default, but let a caller-supplied 'verbosity' win instead of
        # colliding with a second verbosity= keyword (which raises TypeError).
        estimator_kwargs = {'verbosity': 0, **xgb_params}

        if use_cv:
            n_splits = min(5, max(2, len(X_train)//2))  # Ensure 2 ≤ splits ≤5
            param_grid = cv_params or {
                'n_estimators': [100, 300],
                'learning_rate': [0.01, 0.05],
                'max_depth': [3, 5]
            }
            tscv = TimeSeriesSplit(n_splits=n_splits)

            grid_search = GridSearchCV(
                XGBRegressor(**estimator_kwargs),
                param_grid,
                cv=tscv,
                scoring='neg_mean_squared_error',
                n_jobs=-1
            )
            grid_search.fit(X_train, y_train)
            model = grid_search.best_estimator_
            best_params = grid_search.best_params_
        else:
            model = XGBRegressor(**estimator_kwargs)
            model.fit(X_train, y_train)
            best_params = xgb_params

        # 5. Forecasting and post-processing
        pred_test = model.predict(X_test)

        if d == 1:
            # Each prediction is a one-step-ahead *change*, so add it to the
            # observed level one row earlier. Look the rows up by index label,
            # not by position: rows dropped for missing values would otherwise
            # shift the positions away from the test dates. A row survives
            # diff().dropna() only if both its own and the previous value are
            # present, so neither lookup can return NaN.
            previous_level = original_series.shift(1).loc[test.index]
            pred_test = previous_level.to_numpy(dtype=float) + pred_test
            y_test = original_series.loc[test.index]

        # 6. Metrics (computed before anything is written, so a failure here
        #    does not leave a CSV behind for a district reported as skipped)
        rmse = np.sqrt(mean_squared_error(y_test, pred_test))

        forecast_df = pd.DataFrame({
            'district': district_name,
            'date': y_test.index,
            'actual': y_test.values,
            'forecast': pred_test
        })

        # 7. Save results
        os.makedirs('XGBoost_Forecasts', exist_ok=True)
        forecast_df.to_csv(
            os.path.join('XGBoost_Forecasts', f'{district_name}_forecast.csv'),
            index=False
        )

        # 8. Visualization
        fig, ax = plt.subplots(figsize=(14, 7))
        try:
            ax.plot(original_series.index, original_series, label='Original')
            ax.plot(forecast_df['date'], forecast_df['forecast'], label='Forecast', alpha=0.7)
            ax.set_title(f'{district_name} Forecast (RMSE: {rmse:.2f})')
            ax.legend()
            fig.savefig(os.path.join('XGBoost_Forecasts', f'{district_name}_forecast.png'))
        finally:
            # Always release the figure; this function is called once per district.
            plt.close(fig)

        return {
            'district': district_name,
            'rmse': rmse,
            'best_params': best_params,
            'differencing': d,
            'feature_importances': dict(zip(X_train.columns, model.feature_importances_))
        }, forecast_df

    except (ValueError, IndexError) as e:
        print(f"⚠️ Skipping {district_name}: {str(e)}")
        return None, None


In [8]:
districts = data['district'].unique()
rmse_values = []

# Usage example: fit and evaluate one model per district.


def run_for_each_district():
    """
    Run ``xgboost_forecast_lags`` on the "I1" column of every district.

    Iterates over the module-level ``districts`` array, slices ``data`` to the
    district, re-indexes its "I1" series to month-start frequency and fits a
    6-lag model with cross-validated hyper-parameters. Metrics and forecasts
    of each successful run are printed.

    Returns
    -------
    dict
        Maps district name to the metrics dict returned by
        ``xgboost_forecast_lags``. Districts that were skipped (the forecaster
        returned ``(None, None)`` and printed its own message) are left out.
    """
    results = {}

    for district in districts:
        district_data = data[data['district'] == district]
        # asfreq('MS') puts the series on a regular month-start grid; months
        # absent from the data become NaN rows.
        ts = district_data["I1"].asfreq('MS')

        # Unpack into separate names so the accumulator is not overwritten by
        # the metrics of the district currently being processed.
        metrics, forecast = xgboost_forecast_lags(
            series=ts,  # pd.Series with DatetimeIndex
            district_name=district,
            max_lags=6,
            use_cv=True,
            cv_params={
                'n_estimators': [200, 500],
                'learning_rate': [0.05, 0.1],
                'max_depth': [3, 5]
            },
            custom_params={
                'subsample': 0.8,
                'colsample_bytree': 0.8,
                'random_state': 42
            }
        )

        if metrics is None:
            # The forecaster has already reported why this district was skipped.
            continue

        results[district] = metrics

        # Show results
        print("=== Metrics ===")
        print(metrics)
        print("\n=== Forecast Data ===")
        print(forecast)

    return results
run_for_each_district()

=== Metrics ===
{'district': 'AHMEDNAGAR', 'rmse': 480.6463336265987, 'best_params': {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200}, 'differencing': 0, 'feature_importances': {'lag_1': 0.068087116, 'lag_2': 0.13447188, 'lag_3': 0.18072307, 'lag_4': 0.19512717, 'lag_5': 0.19026463, 'lag_6': 0.23132613}}

=== Forecast Data ===
     district       date  actual     forecast
0  AHMEDNAGAR 2020-07-01  6478.0  6725.376465
1  AHMEDNAGAR 2020-08-01  5975.0  6330.391113
2  AHMEDNAGAR 2020-09-01  6550.0  7082.198242
3  AHMEDNAGAR 2020-10-01  6471.0  6924.736328
4  AHMEDNAGAR 2020-11-01  6107.0  6851.355957
5  AHMEDNAGAR 2020-12-01  7460.0  7047.118652
6  AHMEDNAGAR 2021-01-01  6692.0  7061.560547
7  AHMEDNAGAR 2021-02-01  7023.0  6649.207031
8  AHMEDNAGAR 2021-03-01  6274.0  6907.833984
=== Metrics ===
{'district': 'AKOLA', 'rmse': 260.95722324245725, 'best_params': {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 500}, 'differencing': 0, 'feature_importances': {'lag_1': 0.227

In [7]:
# !pip install "dask<2025.1"
