## One sku at a time

### Import libraries

In [2]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from lightgbm.callback import early_stopping, log_evaluation
from datetime import timedelta
import os

### Load and Prepare Data

In [3]:
# Load preprocessed data
current_file = os.path.abspath("__file__")
code_dir = os.path.dirname(current_file)
data_dir = os.path.join(os.path.dirname(code_dir), "Data", "Processed")

sales_model = pd.read_pickle(os.path.join(data_dir, "lgbm_state_evaluation_data.pkl"))

In [4]:
# Configuration
R = 7   # review period
L = 3   # forecast lag
horizon = R + L  # forecast window
features = ['lag_7', 'lag_28', 'rmean_7', 'rmean_28',
            'avg_sell_price', 'day', 'weekday', 'month', 'year']


In [None]:
# sku_list = [
#     "FOODS_3_819", "FOODS_3_090",
#     "HOBBIES_1_234", "HOUSEHOLD_1_118"
# ]

In [21]:
# Filter for the SKU of interest
sku_id = "HOBBIES_1_234"
state_id = "TX"

sku_df = sales_model[(sales_model['item_id'] == sku_id) & (sales_model['state_id'] == state_id)].copy().sort_values("date")
# sku_df = sales_model[(sales_model['item_id'] == sku_id)].copy().sort_values("date")

# Drop rows with missing lag or rmean values
sku_df = sku_df.dropna(subset=features + ['sales'])
sku_df.head()

Unnamed: 0,state_id,item_id,d,date,wm_yr_wk,sales,lag_7,lag_28,rmean_7,rmean_28,avg_sell_price,day,weekday,month,year
9146020,TX,HOBBIES_1_234,d_29,2011-02-26,11105,0,0.0,0.0,0.0,0.0,0.3,26,5,2,2011
9146021,TX,HOBBIES_1_234,d_30,2011-02-27,11105,0,0.0,0.0,0.0,0.0,0.3,27,6,2,2011
9146022,TX,HOBBIES_1_234,d_31,2011-02-28,11105,0,0.0,0.0,0.0,0.0,0.3,28,0,2,2011
9146023,TX,HOBBIES_1_234,d_32,2011-03-01,11105,0,0.0,0.0,0.0,0.0,0.3,1,1,3,2011
9146024,TX,HOBBIES_1_234,d_33,2011-03-02,11105,0,0.0,0.0,0.0,0.0,0.3,2,2,3,2011


### Set up review indices

In [11]:
# Set up rolling forecast indices
T = len(sku_df)
start_idx = T - 393
end_idx = T - 1
review_indices = list(range(end_idx, start_idx - 1, -R))
review_indices = sorted(review_indices)

print(f"Total rolling points: {len(review_indices)}")
print(f"Review indices: {review_indices}")

Total rolling points: 57
Review indices: [1520, 1527, 1534, 1541, 1548, 1555, 1562, 1569, 1576, 1583, 1590, 1597, 1604, 1611, 1618, 1625, 1632, 1639, 1646, 1653, 1660, 1667, 1674, 1681, 1688, 1695, 1702, 1709, 1716, 1723, 1730, 1737, 1744, 1751, 1758, 1765, 1772, 1779, 1786, 1793, 1800, 1807, 1814, 1821, 1828, 1835, 1842, 1849, 1856, 1863, 1870, 1877, 1884, 1891, 1898, 1905, 1912]


### Rolling window forecasts

In [22]:
forecasts = []

for review_idx in review_indices:
    train_data = sku_df.iloc[:review_idx + 1]
    test_data = sku_df.iloc[review_idx + 1 : review_idx + 1 + horizon]

    if len(test_data) < horizon:
        continue  

    X_train = train_data[features]
    y_train = train_data['sales']

    model = LGBMRegressor(
        objective='regression',
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        random_state=42
    )
    model.fit(X_train, y_train)

    X_forecast = test_data[features]
    y_pred = model.predict(X_forecast)

    for i, pred in enumerate(y_pred):
        forecasts.append({
            'review_index': review_idx, # index of the review point
            'review_day': sku_df.iloc[review_idx]['date'], # date of the review point
            'forecast_day': f'F{i+1}', # forecast day (F1, F2, ...)
            'forecast': pred, # predicted sales
            # 'forecast_index': review_idx + i + 1, # index of the forecast day
            # forcast date
            'forecast_d': sku_df.iloc[review_idx + i + 1]['date'] if (review_idx + i + 1) < len(sku_df) else None, # forecast day (d_1, d_2, 
            

            
        })


rolling_lgbm_df = pd.DataFrame(forecasts)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000340 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 469
[LightGBM] [Info] Number of data points in the train set: 1521, number of used features: 8
[LightGBM] [Info] Start training from score 4.051282
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000080 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 469
[LightGBM] [Info] Number of data points in the train set: 1528, number of used features: 8
[LightGBM] [Info] Start training from score 4.047775
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000216 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 469
[LightGBM] [Info] Number of data points in the train set: 1535, number of used features: 8
[LightGBM] [Info] Start training fro

### Save forecast result

In [23]:
# Save result
forecast_folder = "Point Forecasts"
os.makedirs(forecast_folder, exist_ok=True)
forecast_csv_path = os.path.join(forecast_folder, "lgbm_HOBBIES_1_234_TX_forecast.csv")
rolling_lgbm_df.to_csv(forecast_csv_path, index=False)

# Preview
print("✅ Rolling forecast complete.")
print(rolling_lgbm_df.head())

✅ Rolling forecast complete.
   review_index review_day forecast_day  forecast forecast_d
0          1520 2015-04-26           F1  0.124753 2015-04-27
1          1520 2015-04-26           F2  2.902038 2015-04-28
2          1520 2015-04-26           F3  3.278507 2015-04-29
3          1520 2015-04-26           F4  6.115380 2015-04-30
4          1520 2015-04-26           F5  3.907282 2015-05-01


## Loop through SKUs

In [25]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from lightgbm.callback import early_stopping, log_evaluation
from datetime import timedelta
import os

# Load preprocessed data
current_file = os.path.abspath("__file__")
code_dir = os.path.dirname(current_file)
data_dir = os.path.join(os.path.dirname(code_dir), "Data", "Processed")

sales_model = pd.read_pickle(os.path.join(data_dir, "lgbm_state_evaluation_data.pkl"))


# Configuration
R = 7   # review period
L = 3   # forecast lag
horizon = R + L  # forecast window
features = ['lag_7', 'lag_28', 'rmean_7', 'rmean_28',
            'avg_sell_price', 'day', 'weekday', 'month', 'year']

# Initialize variables
sku_list = [
    "FOODS_3_819", "FOODS_3_090",
    "HOBBIES_1_234", "HOUSEHOLD_1_118"
]
state_id = "TX"

# Loop through each SKU
for sku in sku_list:
    sku_id = sku
    
    sku_df = sales_model[(sales_model['item_id'] == sku_id) & (sales_model['state_id'] == state_id)].copy().sort_values("date")
    sku_df = sku_df.dropna(subset=features + ['sales'])
    
    # Set up rolling forecast indices
    T = len(sku_df)
    start_idx = T - 393
    end_idx = T - 1
    review_indices = list(range(end_idx, start_idx - 1, -R))
    review_indices = sorted(review_indices)

    # Initialize model
    forecasts = []

    for review_idx in review_indices:
        train_data = sku_df.iloc[:review_idx + 1]
        test_data = sku_df.iloc[review_idx + 1 : review_idx + 1 + horizon]

        if len(test_data) < horizon:
            continue  

        X_train = train_data[features]
        y_train = train_data['sales']

        model = LGBMRegressor(
            objective='regression',
            n_estimators=100,
            learning_rate=0.1,
            max_depth=6,
            random_state=42
        )
        model.fit(X_train, y_train)

        X_forecast = test_data[features]
        y_pred = model.predict(X_forecast)

        for i, pred in enumerate(y_pred):
            forecasts.append({
                'review_index': review_idx, # index of the review point
                'review_day': sku_df.iloc[review_idx]['date'], # date of the review point
                'forecast_day': f'F{i+1}', # forecast day (F1, F2, ...)
                'forecast': pred, # predicted sales
                'forecast_d': sku_df.iloc[review_idx + i + 1]['date'] if (review_idx + i + 1) < len(sku_df) else None
            })

    rolling_lgbm_df = pd.DataFrame(forecasts)

    # Save result
    forecast_folder = "Point Forecasts"
    os.makedirs(forecast_folder, exist_ok=True)
    forecast_csv_path = os.path.join(forecast_folder, "lgbm_"+sku+"_TX"+"_forecast.csv")
    rolling_lgbm_df.to_csv(forecast_csv_path, index=False)

    # Preview
    print("✅ Rolling forecast complete.")
    print(rolling_lgbm_df.head())

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000178 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 443
[LightGBM] [Info] Number of data points in the train set: 1521, number of used features: 8
[LightGBM] [Info] Start training from score 4.518738
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000269 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 439
[LightGBM] [Info] Number of data points in the train set: 1528, number of used features: 8
[LightGBM] [Info] Start training from score 4.507199
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000125 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 439
[LightGBM] [Info] Number of data points in the train set: 1535, number of used features: 8
[LightGBM] [Info] Start training fro