In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import lightgbm as lgb
from statsmodels.tsa.stattools import adfuller
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.transformations.series.summarize import WindowSummarizer
from sktime.forecasting.compose import make_reduction
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [2]:
data = pd.read_csv("../data/HMIS_DATA_CORRECTED_17_21/mh_dist17_21_with_IDs_date_correction.csv")
data = data[(data['indicator_type'] == 'Total [(A+B) or (C+D)]')]
data['date'] = pd.to_datetime(data['date'])
data = data.set_index('date')
data.index = pd.DatetimeIndex(data.index)

In [3]:
def lightgbm_forecast_lags(series, district_name, 
                          max_lags=6, 
                          window_features=None,
                          differencing=True,
                          use_cv=True,
                          custom_params=None):
    """
    LightGBM time series forecasting with proper temporal validation
    """
    # Create output directory
    os.makedirs('LightGBM_Forecasts', exist_ok=True)
    
    # 1. Stationarity handling
    original_series = series.copy()
    d = 0
    if differencing:
        adf_result = adfuller(series.dropna())
        if adf_result[1] > 0.05:
            d = 1
            series = series.diff().dropna()

    # 2. Temporal split using sktime's proper method
    y_train, y_test = temporal_train_test_split(series, test_size=0.2)

    # 3. Create feature engineering pipeline
    features = []
    if max_lags > 0:
        features.append(("lags", WindowSummarizer(
            lag_feature={"lag": list(range(1, max_lags+1))},
            truncate="bfill"
        )))
    
    if window_features:
        features.append(("window", WindowSummarizer(
            lag_feature={
                "mean": [[1, window_features['mean_window']]],
                "std": [[1, window_features['std_window']]]
            },
            truncate="bfill"
        )))

    # 4. LightGBM regressor setup
    regressor = lgb.LGBMRegressor(
        **custom_params or {
            'objective': 'regression',
            'metric': 'rmse',
            'num_leaves': 31,
            'learning_rate': 0.05,
            'n_estimators': 500,
            'random_state': 42
        }
    )
    
    # 5. Create forecasting pipeline
    forecaster = make_reduction(
        estimator=Pipeline([
            ("features", Pipeline(features)),
            ("regressor", regressor)
        ]),
        window_length=max_lags,
        strategy="recursive"
    )

    # 6. Hyperparameter tuning with temporal CV
    if use_cv:
        param_grid = {
            'estimator__regressor__num_leaves': [15, 31, 63],
            'estimator__regressor__learning_rate': [0.01, 0.05, 0.1],
            'estimator__regressor__min_child_samples': [10, 20, 50]
        }
        
        forecaster = GridSearchCV(
            forecaster,
            param_grid,
            cv=TimeSeriesSplit(n_splits=5),
            scoring='neg_mean_squared_error',
            n_jobs=-1
        )

    # 7. Training
    forecaster.fit(y_train)
    
    # 8. Forecasting
    fh = np.arange(len(y_test)) + 1  # Forecast horizon
    y_pred = forecaster.predict(fh)
    
    # 9. Inverse differencing
    if d == 1:
        last_train_value = original_series.iloc[-len(y_test)-1]
        y_pred = pd.Series(np.cumsum(y_pred) + last_train_value, index=y_test.index)
        y_test = original_series.iloc[-len(y_test):]

    # 10. Create results dataframe
    forecast_df = pd.DataFrame({
        'district': district_name,
        'date': y_test.index,
        'actual': y_test.values,
        'forecast': y_pred.values
    })

    # 11. Save results
    forecast_df.to_csv(
        f'LightGBM_Forecasts/{district_name}_forecast.csv',
        index=False
    )

    # 12. Calculate metrics
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    metrics = {
        'district': district_name,
        'rmse': rmse,
        'best_params': getattr(forecaster, 'best_params_', None),
        'differencing': d
    }

    # 13. Visualization
    plt.figure(figsize=(14, 7))
    plt.plot(original_series.index, original_series, label='Original')
    plt.plot(forecast_df['date'], forecast_df['forecast'], 
            label='LightGBM Forecast', alpha=0.7)
    plt.title(f'{district_name} Forecast\nRMSE: {rmse:.2f}')
    plt.legend()
    plt.savefig(f'LightGBM_Forecasts/{district_name}_forecast_plot.png')
    plt.close()

    return metrics, forecast_df

In [4]:
districts = data['district'].unique()
rmse_values = []

# Usage Example


def run_for_each_district():
    results = {}
    
    for district in districts:
        district_data = data[data['district'] == district]
        ts = district_data["I48"].asfreq('MS')
        
        results, forecast = lightgbm_forecast_lags(
        series=ts,
        district_name=district,
        max_lags=3,
        window_features={'mean_window': 3, 'std_window': 6},
        custom_params={
            'num_leaves': 63,
            'learning_rate': 0.1,
            'n_estimators': 1000
        }
    )

        
    
        # Show results
        print("=== Metrics ===")
        print(results)
        print("\n=== Forecast Data ===")
        print(forecast)
    
    return results
run_for_each_district()

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000087 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 470.600000
=== Metrics ===
{'district': 'AHMEDNAGAR', 'rmse': 130.2921199604947, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
     district       date  actual    forecast
0  AHMEDNAGAR 2020-06-01     378  396.556197
1  AHMEDNAGAR 2020-07-01     354  449.200639
2  AHMEDNAGAR 2020-08-01     391  476.997212
3  AHMEDNAGAR 2020-09-01     417  480.586661
4  AHMEDNAGAR 2020-10-01     343  513.225232
5  AHMEDNAGAR 2020-11-01     468  536.021537
6  AHMEDNAGAR 2020-12-01     305  536.021537
7  AHMEDNAGAR 2021-01-01     372  467.010

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000205 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 455.971429
=== Metrics ===
{'district': 'AKOLA', 'rmse': 79.92392978946668, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
  district       date  actual    forecast
0    AKOLA 2020-06-01     339  396.198962
1    AKOLA 2020-07-01     407  425.646844
2    AKOLA 2020-08-01     306  420.816961
3    AKOLA 2020-09-01     504  476.122899
4    AKOLA 2020-10-01     493  521.828043
5    AKOLA 2020-11-01     514  459.190623
6    AKOLA 2020-12-01     450  429.713131
7    AKOLA 2021-01-01     481  460.775708
8    AKOLA 2021-02-01     41

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000257 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75
[LightGBM] [Info] Number of data points in the train set: 34, number of used features: 6
[LightGBM] [Info] Start training from score -12.000000
=== Metrics ===
{'district': 'AMRAVATI', 'rmse': 962.3824856769169, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 1}

=== Forecast Data ===
   district       date  actual    forecast
0  AMRAVATI 2020-06-01     380  220.682770
1  AMRAVATI 2020-07-01     470  186.872171
2  AMRAVATI 2020-08-01     482  -38.568215
3  AMRAVATI 2020-09-01     586  -61.657467
4  AMRAVATI 2020-10-01     819 -284.895560
5  AMRAVATI 2020-11-01     609 -305.378911
6  AMRAVATI 2020-12-01     634 -528.617004
7  AMRAVATI 2021-01-01     472 -549.100355
8  AMRAVATI 2021

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000265 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 34, number of used features: 6
[LightGBM] [Info] Start training from score -6.500000
=== Metrics ===
{'district': 'AURANGABAD', 'rmse': 220.01835028954503, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 1}

=== Forecast Data ===
     district       date  actual    forecast
0  AURANGABAD 2020-06-01     426  526.547173
1  AURANGABAD 2020-07-01     378  579.894927
2  AURANGABAD 2020-08-01     729  529.479642
3  AURANGABAD 2020-09-01     857  517.711219
4  AURANGABAD 2020-10-01     889  475.849045
5  AURANGABAD 2020-11-01     837  551.999703
6  AURANGABAD 2020-12-01     522  588.692645
7  AURANGABAD 2021-01-01     685  634.274

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000200 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 360.142857
=== Metrics ===
{'district': 'BEED', 'rmse': 49.7737991557493, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
  district       date  actual    forecast
0     BEED 2020-06-01     546  491.258327
1     BEED 2020-07-01     465  491.258327
2     BEED 2020-08-01     554  487.897845
3     BEED 2020-09-01     485  491.258327
4     BEED 2020-10-01     495  491.258327
5     BEED 2020-11-01     472  491.258327
6     BEED 2020-12-01     561  491.258327
7     BEED 2021-01-01     439  491.258327
8     BEED 2021-02-01     414 

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000160 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 298.485714
=== Metrics ===
{'district': 'BHANDARA', 'rmse': 90.19503329881378, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
   district       date  actual    forecast
0  BHANDARA 2020-06-01     250  281.734030
1  BHANDARA 2020-07-01     267  326.925708
2  BHANDARA 2020-08-01     323  375.852033
3  BHANDARA 2020-09-01     344  391.947873
4  BHANDARA 2020-10-01     397  349.592405
5  BHANDARA 2020-11-01     285  341.112953
6  BHANDARA 2020-12-01     316  333.794943
7  BHANDARA 2021-01-01     194  341.936729
8  BHANDARA 2021

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000199 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 72
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 232.400000
=== Metrics ===
{'district': 'BULDHANA', 'rmse': 51.04420742337745, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
   district       date  actual    forecast
0  BULDHANA 2020-06-01     197  202.357360
1  BULDHANA 2020-07-01     186  202.357360
2  BULDHANA 2020-08-01     211  229.926813
3  BULDHANA 2020-09-01     266  250.158694
4  BULDHANA 2020-10-01     280  271.465333
5  BULDHANA 2020-11-01     328  222.869273
6  BULDHANA 2020-12-01     268  237.968200
7  BULDHANA 2021-01-01     168  238.143155
8  BULDHANA 2021

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000181 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75
[LightGBM] [Info] Number of data points in the train set: 34, number of used features: 6
[LightGBM] [Info] Start training from score -2.147059
=== Metrics ===
{'district': 'CHANDRAPUR', 'rmse': 212.10241777734572, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 1}

=== Forecast Data ===
     district       date  actual    forecast
0  CHANDRAPUR 2020-06-01     327  293.551139
1  CHANDRAPUR 2020-07-01     445  300.373812
2  CHANDRAPUR 2020-08-01     502  282.406660
3  CHANDRAPUR 2020-09-01     592  235.657997
4  CHANDRAPUR 2020-10-01     628  303.986453
5  CHANDRAPUR 2020-11-01     562  365.798826
6  CHANDRAPUR 2020-12-01     467  316.061131
7  CHANDRAPUR 2021-01-01     385  259.988

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000259 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 373.457143
=== Metrics ===
{'district': 'DHULE', 'rmse': 107.8282405200693, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
  district       date  actual    forecast
0    DHULE 2020-06-01     227  348.589047
1    DHULE 2020-07-01     106  328.063078
2    DHULE 2020-08-01     142  236.384268
3    DHULE 2020-09-01     210  309.467372
4    DHULE 2020-10-01     167  304.668314
5    DHULE 2020-11-01     356  328.063078
6    DHULE 2020-12-01     294  304.668314
7    DHULE 2021-01-01     398  304.668314
8    DHULE 2021-02-01     33

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000216 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 72
[LightGBM] [Info] Number of data points in the train set: 34, number of used features: 6
[LightGBM] [Info] Start training from score -1.323529
=== Metrics ===
{'district': 'GADCHIROLI', 'rmse': 82.66615428212576, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 1}

=== Forecast Data ===
     district       date  actual    forecast
0  GADCHIROLI 2020-06-01     318  293.839873
1  GADCHIROLI 2020-07-01     294  309.121610
2  GADCHIROLI 2020-08-01     439  325.537906
3  GADCHIROLI 2020-09-01     482  364.043112
4  GADCHIROLI 2020-10-01     431  387.892371
5  GADCHIROLI 2020-11-01     434  336.177609
6  GADCHIROLI 2020-12-01     399  297.205315
7  GADCHIROLI 2021-01-01     360  279.8628

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000192 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73
[LightGBM] [Info] Number of data points in the train set: 34, number of used features: 6
[LightGBM] [Info] Start training from score -2.411765
=== Metrics ===
{'district': 'GONDIA', 'rmse': 56.69478873174892, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 1}

=== Forecast Data ===
  district       date  actual    forecast
0   GONDIA 2020-06-01     261  262.850161
1   GONDIA 2020-07-01     335  286.126483
2   GONDIA 2020-08-01     304  293.766579
3   GONDIA 2020-09-01     282  286.963006
4   GONDIA 2020-10-01     335  263.635144
5   GONDIA 2020-11-01     425  310.575041
6   GONDIA 2020-12-01     365  375.125532
7   GONDIA 2021-01-01     314  338.212005
8   GONDIA 2021-02-01     25

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000133 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 170.514286
=== Metrics ===
{'district': 'HINGOLI', 'rmse': 84.54568522276735, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
  district       date  actual    forecast
0  HINGOLI 2020-06-01      91  195.723694
1  HINGOLI 2020-07-01     138  237.854998
2  HINGOLI 2020-08-01      86  163.055484
3  HINGOLI 2020-09-01     165  188.302432
4  HINGOLI 2020-10-01     164  234.178560
5  HINGOLI 2020-11-01     125  218.320049
6  HINGOLI 2020-12-01      79  191.700451
7  HINGOLI 2021-01-01     296  224.390233
8  HINGOLI 2021-02-01     

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000151 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 306.800000
=== Metrics ===
{'district': 'JALGAON', 'rmse': 115.73984739568512, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
  district       date  actual    forecast
0  JALGAON 2020-06-01     281  353.320001
1  JALGAON 2020-07-01     244  393.654842
2  JALGAON 2020-08-01     292  385.743107
3  JALGAON 2020-09-01     362  385.743107
4  JALGAON 2020-10-01     415  448.081521
5  JALGAON 2020-11-01     469  384.955850
6  JALGAON 2020-12-01     593  384.955850
7  JALGAON 2021-01-01     340  403.907083
8  JALGAON 2021-02-01    

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000238 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 265.400000
=== Metrics ===
{'district': 'JALNA', 'rmse': 60.69053819390452, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
  district       date  actual    forecast
0    JALNA 2020-06-01     159  247.164295
1    JALNA 2020-07-01     260  247.164295
2    JALNA 2020-08-01     218  238.588587
3    JALNA 2020-09-01     164  238.588587
4    JALNA 2020-10-01     226  238.588587
5    JALNA 2020-11-01     211  238.588587
6    JALNA 2020-12-01     163  238.588587
7    JALNA 2021-01-01     184  238.588587
8    JALNA 2021-02-01     16

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000277 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 72
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 378.057143
=== Metrics ===
{'district': 'KOLHAPUR', 'rmse': 87.43742087700168, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
   district       date  actual    forecast
0  KOLHAPUR 2020-06-01     356  424.329807
1  KOLHAPUR 2020-07-01     383  431.236503
2  KOLHAPUR 2020-08-01     292  416.025836
3  KOLHAPUR 2020-09-01     390  415.651062
4  KOLHAPUR 2020-10-01     332  415.651062
5  KOLHAPUR 2020-11-01     381  415.651062
6  KOLHAPUR 2020-12-01     365  415.651062
7  KOLHAPUR 2021-01-01     395  415.651062
8  KOLHAPUR 2021

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000048 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 286.828571
=== Metrics ===
{'district': 'LATUR', 'rmse': 105.7331129316178, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
  district       date  actual    forecast
0    LATUR 2020-06-01     371  280.077986
1    LATUR 2020-07-01     345  302.482593
2    LATUR 2020-08-01     387  334.246919
3    LATUR 2020-09-01     426  334.246919
4    LATUR 2020-10-01     448  334.246919
5    LATUR 2020-11-01     462  334.246919
6    LATUR 2020-12-01     367  334.246919
7    LATUR 2021-01-01     570  334.246919
8    LATUR 2021-02-01     32

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000299 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73
[LightGBM] [Info] Number of data points in the train set: 34, number of used features: 6
[LightGBM] [Info] Start training from score -33.264706
=== Metrics ===
{'district': 'MUMBAI', 'rmse': 452.8400400784729, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 1}

=== Forecast Data ===
  district       date  actual     forecast
0   MUMBAI 2020-06-01     663   659.939248
1   MUMBAI 2020-07-01     513   620.125533
2   MUMBAI 2020-08-01     579   741.337088
3   MUMBAI 2020-09-01     683   800.484771
4   MUMBAI 2020-10-01     763   950.560017
5   MUMBAI 2020-11-01     583  1009.707701
6   MUMBAI 2020-12-01     934  1210.025974
7   MUMBAI 2021-01-01     474  1128.127574
8   MUMBAI 2021-02

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000245 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 825.542857
=== Metrics ===
{'district': 'NAGPUR', 'rmse': 353.08275698552853, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
  district       date  actual     forecast
0   NAGPUR 2020-06-01     610   877.389942
1   NAGPUR 2020-07-01     617   939.692384
2   NAGPUR 2020-08-01     638  1039.693091
3   NAGPUR 2020-09-01     488   797.087049
4   NAGPUR 2020-10-01     603   634.285710
5   NAGPUR 2020-11-01     560   544.612001
6   NAGPUR 2020-12-01     819   656.405591
7   NAGPUR 2021-01-01     599   877.389942
8   NAGPUR 2021-0

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000250 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 393.828571
=== Metrics ===
{'district': 'NANDED', 'rmse': 131.14862667807523, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
  district       date  actual    forecast
0   NANDED 2020-06-01     287  419.522177
1   NANDED 2020-07-01     261  495.315376
2   NANDED 2020-08-01     400  443.399527
3   NANDED 2020-09-01     435  390.636107
4   NANDED 2020-10-01     391  402.783192
5   NANDED 2020-11-01     412  452.203826
6   NANDED 2020-12-01     334  481.714852
7   NANDED 2021-01-01     287  390.636107
8   NANDED 2021-02-01     

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000161 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 452.285714
=== Metrics ===
{'district': 'NANDURBAR', 'rmse': 79.38149112008671, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
    district       date  actual    forecast
0  NANDURBAR 2020-06-01     434  338.617433
1  NANDURBAR 2020-07-01     480  454.353506
2  NANDURBAR 2020-08-01     533  518.672360
3  NANDURBAR 2020-09-01     497  586.137505
4  NANDURBAR 2020-10-01     568  496.256150
5  NANDURBAR 2020-11-01     511  426.948335
6  NANDURBAR 2020-12-01     505  403.643839
7  NANDURBAR 2021-01-01     342  387.868187
8  NAN

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000246 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 1003.542857
=== Metrics ===
{'district': 'NASHIK', 'rmse': 150.63085767168744, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
  district       date  actual     forecast
0   NASHIK 2020-06-01     800   892.213263
1   NASHIK 2020-07-01     975  1040.379361
2   NASHIK 2020-08-01    1023  1051.566373
3   NASHIK 2020-09-01    1143   991.663460
4   NASHIK 2020-10-01    1264   911.105115
5   NASHIK 2020-11-01    1037   849.505577
6   NASHIK 2020-12-01     976   872.118318
7   NASHIK 2021-01-01     882   881.825884
8   NASHIK 2021-

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000195 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 70
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 170.800000
=== Metrics ===
{'district': 'OSMANABAD', 'rmse': 47.31400830016783, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
    district       date  actual    forecast
0  OSMANABAD 2020-06-01     182  187.879561
1  OSMANABAD 2020-07-01     177  187.583396
2  OSMANABAD 2020-08-01     234  162.865924
3  OSMANABAD 2020-09-01     190  147.036580
4  OSMANABAD 2020-10-01     248  147.853385
5  OSMANABAD 2020-11-01     189  174.650009
6  OSMANABAD 2020-12-01     200  210.681678
7  OSMANABAD 2021-01-01     138  199.842249
8  OSM

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000198 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 787.342857
=== Metrics ===
{'district': 'PALGHAR', 'rmse': 162.73805978540085, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
  district       date  actual    forecast
0  PALGHAR 2020-06-01     808  733.737663
1  PALGHAR 2020-07-01     784  699.297994
2  PALGHAR 2020-08-01     785  731.067773
3  PALGHAR 2020-09-01     795  721.270582
4  PALGHAR 2020-10-01    1020  714.791173
5  PALGHAR 2020-11-01    1006  744.408746
6  PALGHAR 2020-12-01     948  721.270582
7  PALGHAR 2021-01-01     784  714.791173
8  PALGHAR 2021-02-01    

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000205 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 76
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 197.400000
=== Metrics ===
{'district': 'PARBHANI', 'rmse': 89.77614995843207, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
   district       date  actual    forecast
0  PARBHANI 2020-06-01     140  119.631055
1  PARBHANI 2020-07-01     257  152.007355
2  PARBHANI 2020-08-01     232  152.007355
3  PARBHANI 2020-09-01     291  193.315542
4  PARBHANI 2020-10-01     315  194.120455
5  PARBHANI 2020-11-01     205  179.792716
6  PARBHANI 2020-12-01     179  166.582682
7  PARBHANI 2021-01-01     287  126.856697
8  PARBHANI 2021

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000145 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 1863.028571
=== Metrics ===
{'district': 'PUNE', 'rmse': 732.0585657746469, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
  district       date  actual     forecast
0     PUNE 2020-06-01    1284  2047.409067
1     PUNE 2020-07-01    1326  2080.066377
2     PUNE 2020-08-01    1529  2015.231616
3     PUNE 2020-09-01    1439  2080.163365
4     PUNE 2020-10-01    1349  2080.163365
5     PUNE 2020-11-01    1578  2080.163365
6     PUNE 2020-12-01    1364  2080.163365
7     PUNE 2021-01-01    1384  2080.163365
8     PUNE 2021-02-

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000180 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 358.600000
=== Metrics ===
{'district': 'RAIGAD', 'rmse': 76.71765100844038, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
  district       date  actual    forecast
0   RAIGAD 2020-06-01     425  412.915105
1   RAIGAD 2020-07-01     420  441.707262
2   RAIGAD 2020-08-01     459  387.514307
3   RAIGAD 2020-09-01     475  403.037698
4   RAIGAD 2020-10-01     491  387.514307
5   RAIGAD 2020-11-01     465  405.457670
6   RAIGAD 2020-12-01     444  403.037698
7   RAIGAD 2021-01-01     377  405.457670
8   RAIGAD 2021-02-01     2

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000141 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 64
[LightGBM] [Info] Number of data points in the train set: 34, number of used features: 6
[LightGBM] [Info] Start training from score 1.000000
=== Metrics ===
{'district': 'RATNAGIRI', 'rmse': 75.68748286442825, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 1}

=== Forecast Data ===
    district       date  actual    forecast
0  RATNAGIRI 2020-06-01     415  255.014949
1  RATNAGIRI 2020-07-01     203  255.924597
2  RATNAGIRI 2020-08-01     179  241.181672
3  RATNAGIRI 2020-09-01     255  226.035003
4  RATNAGIRI 2020-10-01     253  255.899554
5  RATNAGIRI 2020-11-01     257  287.914502
6  RATNAGIRI 2020-12-01     209  280.141367
7  RATNAGIRI 2021-01-01     184  248.949306
8  RATNA

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000204 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 432.885714
=== Metrics ===
{'district': 'SANGLI', 'rmse': 64.9307025764279, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
  district       date  actual    forecast
0   SANGLI 2020-06-01     481  410.286247
1   SANGLI 2020-07-01     547  448.185976
2   SANGLI 2020-08-01     406  324.553220
3   SANGLI 2020-09-01     461  431.022585
4   SANGLI 2020-10-01     373  401.390031
5   SANGLI 2020-11-01     410  390.273826
6   SANGLI 2020-12-01     432  389.735909
7   SANGLI 2021-01-01     300  383.313490
8   SANGLI 2021-02-01     29

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000245 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 458.314286
=== Metrics ===
{'district': 'SATARA', 'rmse': 135.8562276199418, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
  district       date  actual    forecast
0   SATARA 2020-06-01     350  496.725612
1   SATARA 2020-07-01     347  475.341068
2   SATARA 2020-08-01     349  465.750879
3   SATARA 2020-09-01     446  449.585340
4   SATARA 2020-10-01     494  477.507322
5   SATARA 2020-11-01     518  482.680091
6   SATARA 2020-12-01     814  480.202933
7   SATARA 2021-01-01     577  461.131318
8   SATARA 2021-02-01     4

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000241 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 71
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 119.857143
=== Metrics ===
{'district': 'SINDHUDURG', 'rmse': 20.59636675594302, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
     district       date  actual    forecast
0  SINDHUDURG 2020-06-01     116  102.548191
1  SINDHUDURG 2020-07-01     121  107.391231
2  SINDHUDURG 2020-08-01      97  114.845204
3  SINDHUDURG 2020-09-01     141  115.757251
4  SINDHUDURG 2020-10-01     144  112.400139
5  SINDHUDURG 2020-11-01     117  104.141768
6  SINDHUDURG 2020-12-01     117  114.845204
7  SINDHUDURG 2021-01-01      86  110.151

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000153 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 72
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 537.771429
=== Metrics ===
{'district': 'SOLAPUR', 'rmse': 103.08037978704976, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
  district       date  actual    forecast
0  SOLAPUR 2020-06-01     338  437.698761
1  SOLAPUR 2020-07-01     410  460.509254
2  SOLAPUR 2020-08-01     405  519.883434
3  SOLAPUR 2020-09-01     654  460.509254
4  SOLAPUR 2020-10-01     535  460.509254
5  SOLAPUR 2020-11-01     622  500.018867
6  SOLAPUR 2020-12-01     481  460.509254
7  SOLAPUR 2021-01-01     416  460.509254
8  SOLAPUR 2021-02-01    

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000120 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 75
[LightGBM] [Info] Number of data points in the train set: 34, number of used features: 6
[LightGBM] [Info] Start training from score -4.382353
=== Metrics ===
{'district': 'THANE', 'rmse': 375.45237954059536, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 1}

=== Forecast Data ===
  district       date  actual     forecast
0    THANE 2020-06-01     749  1073.372853
1    THANE 2020-07-01     708   987.549554
2    THANE 2020-08-01     893  1069.387013
3    THANE 2020-09-01     916  1225.759866
4    THANE 2020-10-01    1156  1193.978085
5    THANE 2020-11-01     848  1217.747605
6    THANE 2020-12-01    1031  1208.714214
7    THANE 2021-01-01     949  1269.230418
8    THANE 2021-02-

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000105 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 73
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 218.971429
=== Metrics ===
{'district': 'WARDHA', 'rmse': 74.29566760490735, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
  district       date  actual    forecast
0   WARDHA 2020-06-01     180  217.256448
1   WARDHA 2020-07-01     179  270.092797
2   WARDHA 2020-08-01     192  257.255583
3   WARDHA 2020-09-01     251  241.970443
4   WARDHA 2020-10-01     241  215.148054
5   WARDHA 2020-11-01     231  266.244788
6   WARDHA 2020-12-01     310  256.051993
7   WARDHA 2021-01-01     166  252.878996
8   WARDHA 2021-02-01     1

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000251 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 70
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 127.371429
=== Metrics ===
{'district': 'WASHIM', 'rmse': 45.30720599046441, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
  district       date  actual    forecast
0   WASHIM 2020-06-01     107  171.765700
1   WASHIM 2020-07-01     154  179.761825
2   WASHIM 2020-08-01     127  164.942770
3   WASHIM 2020-09-01     120  157.769066
4   WASHIM 2020-10-01     113  157.769066
5   WASHIM 2020-11-01     154  157.769066
6   WASHIM 2020-12-01     123  157.769066
7   WASHIM 2021-01-01     105  157.769066
8   WASHIM 2021-02-01     1

 nan nan nan nan nan nan nan nan nan]


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000156 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 74
[LightGBM] [Info] Number of data points in the train set: 35, number of used features: 6
[LightGBM] [Info] Start training from score 215.028571
=== Metrics ===
{'district': 'YAVATMAL', 'rmse': 31.188946678458368, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
   district       date  actual    forecast
0  YAVATMAL 2020-06-01     152  116.439654
1  YAVATMAL 2020-07-01     148  155.341672
2  YAVATMAL 2020-08-01     201  155.341672
3  YAVATMAL 2020-09-01     225  155.341672
4  YAVATMAL 2020-10-01     182  155.341672
5  YAVATMAL 2020-11-01     174  155.341672
6  YAVATMAL 2020-12-01     163  155.341672
7  YAVATMAL 2021-01-01     146  155.341672
8  YAVATMAL 202

27 fits failed out of a total of 135.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
27 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\nauti\AppData\Local\Programs\Python\Python311\Lib\site-packages\sklearn\model_selection\_validation.py", line 893, in _fit_and_score
    estimator.fit(X_train, **fit_params)
  File "c:\Users\nauti\AppData\Local\Programs\Python\Python311\Lib\site-packages\sktime\forecasting\base\_base.py", line 396, in fit
    self._fit(y=y_inner, X=X_inner, fh=fh)
  File "c:\Users\nauti\AppData\Local\Programs\Python\Python311\Lib\site-packages\sktime\forecasting\compose\_reduce.py", line 972, in _fit
    self.estimator_.fit(Xt, yt)
  File "c:\Users\nauti\AppData\Local\Programs\P

[LightGBM] [Info] Total Bins 0
[LightGBM] [Info] Number of data points in the train set: 6, number of used features: 0
[LightGBM] [Info] Start training from score 838.000000
=== Metrics ===
{'district': 'MUMBAI SUBURBAN', 'rmse': 217.07295240694236, 'best_params': {'estimator__regressor__learning_rate': 0.01, 'estimator__regressor__min_child_samples': 10, 'estimator__regressor__num_leaves': 15}, 'differencing': 0}

=== Forecast Data ===
          district       date  actual  forecast
0  MUMBAI SUBURBAN 2021-01-01     639     838.0
1  MUMBAI SUBURBAN 2021-02-01     607     838.0
2  MUMBAI SUBURBAN 2021-03-01     618     838.0


{'district': 'MUMBAI SUBURBAN',
 'rmse': 217.07295240694236,
 'best_params': {'estimator__regressor__learning_rate': 0.01,
  'estimator__regressor__min_child_samples': 10,
  'estimator__regressor__num_leaves': 15},
 'differencing': 0}

In [5]:
# !pip install "dask<2025.1"
