## One sku at a time

### Import libraries

In [2]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from lightgbm.callback import early_stopping, log_evaluation
from datetime import timedelta
import os

### Load and Prepare Data

In [3]:
# Load preprocessed data
current_file = os.path.abspath("__file__")
code_dir = os.path.dirname(current_file)
data_dir = os.path.join(os.path.dirname(code_dir), "Data", "Processed")

sales_model = pd.read_pickle(os.path.join(data_dir, "lgbm_state_evaluation_data.pkl"))

In [4]:
# Configuration
R = 7   # review period
L = 3   # forecast lag
horizon = R + L  # forecast window
features = ['lag_7', 'lag_28', 'rmean_7', 'rmean_28',
            'avg_sell_price', 'day', 'weekday', 'month', 'year']


In [None]:
# sku_list = [
#     "FOODS_3_819", "FOODS_3_090",
#     "HOBBIES_1_234", "HOUSEHOLD_1_118"
# ]

In [13]:
# Filter for the SKU of interest
sku_id = "HOUSEHOLD_1_118"
state_id = "TX"

sku_df = sales_model[(sales_model['item_id'] == sku_id) & (sales_model['state_id'] == state_id)].copy().sort_values("date")
# sku_df = sales_model[(sales_model['item_id'] == sku_id)].copy().sort_values("date")

# Drop rows with missing lag or rmean values
sku_df = sku_df.dropna(subset=features + ['sales'])
sku_df.head()

Unnamed: 0,state_id,item_id,d,date,wm_yr_wk,sales,lag_7,lag_28,rmean_7,rmean_28,avg_sell_price,day,weekday,month,year
10023352,TX,HOUSEHOLD_1_118,d_29,2011-02-26,11105,28,12.0,6.0,19.857143,17.5,0.97,26,5,2,2011
10023353,TX,HOUSEHOLD_1_118,d_30,2011-02-27,11105,14,41.0,27.0,22.142857,18.285714,0.97,27,6,2,2011
10023354,TX,HOUSEHOLD_1_118,d_31,2011-02-28,11105,18,24.0,11.0,18.285714,17.821429,0.97,28,0,2,2011
10023355,TX,HOUSEHOLD_1_118,d_32,2011-03-01,11105,5,17.0,15.0,17.428571,18.071429,0.97,1,1,3,2011
10023356,TX,HOUSEHOLD_1_118,d_33,2011-03-02,11105,6,19.0,5.0,15.714286,17.714286,0.97,2,2,3,2011


### Set up review indices

In [6]:
# Set up rolling forecast indices
T = len(sku_df)
start_idx = T - 393
end_idx = T - 1
review_indices = list(range(end_idx, start_idx - 1, -R))
review_indices = sorted(review_indices)

print(f"Total rolling points: {len(review_indices)}")
print(f"Review indices: {review_indices}")

NameError: name 'sku_df' is not defined

### Rolling window forecasts

In [15]:
forecasts = []

for review_idx in review_indices:
    train_data = sku_df.iloc[:review_idx + 1]
    test_data = sku_df.iloc[review_idx + 1 : review_idx + 1 + horizon]

    if len(test_data) < horizon:
        continue  

    X_train = train_data[features]
    y_train = train_data['sales']

    model = LGBMRegressor(
        objective='regression',
        n_estimators=100,
        learning_rate=0.1,
        max_depth=6,
        random_state=42
    )
    model.fit(X_train, y_train)

    X_forecast = test_data[features]
    y_pred = model.predict(X_forecast)

    for i, pred in enumerate(y_pred):
        forecasts.append({
            'review_index': review_idx, # index of the review point
            'review_day': sku_df.iloc[review_idx]['date'], # date of the review point
            'forecast_day': f'F{i+1}', # forecast day (F1, F2, ...)
            'forecast': pred, # predicted sales
            # 'forecast_index': review_idx + i + 1, # index of the forecast day
            # forcast date
            'forecast_d': sku_df.iloc[review_idx + i + 1]['date'] if (review_idx + i + 1) < len(sku_df) else None, # forecast day (d_1, d_2, 
            

            
        })


rolling_lgbm_df = pd.DataFrame(forecasts)



[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000626 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 581
[LightGBM] [Info] Number of data points in the train set: 1521, number of used features: 8
[LightGBM] [Info] Start training from score 19.075608
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000093 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 581
[LightGBM] [Info] Number of data points in the train set: 1528, number of used features: 8
[LightGBM] [Info] Start training from score 19.060209
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000113 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 581
[LightGBM] [Info] Number of data points in the train set: 1535, number of used features: 8
[LightGBM] [Info] Start training f

### Save forecast result

In [9]:
# Save result
forecast_folder = "Point Forecasts"
os.makedirs(forecast_folder, exist_ok=True)
forecast_csv_path = os.path.join(forecast_folder, "lgbm_HOBBIES_1_234_TX_forecast.csv")
rolling_lgbm_df.to_csv(forecast_csv_path, index=False)

# Preview
print("✅ Rolling forecast complete.")
print(rolling_lgbm_df.head(12))

✅ Rolling forecast complete.
    review_index review_day forecast_day  forecast forecast_d
0           1520 2015-04-26           F1  0.124753 2015-04-27
1           1520 2015-04-26           F2  2.902038 2015-04-28
2           1520 2015-04-26           F3  3.278507 2015-04-29
3           1520 2015-04-26           F4  6.115380 2015-04-30
4           1520 2015-04-26           F5  3.907282 2015-05-01
5           1520 2015-04-26           F6  5.631702 2015-05-02
6           1520 2015-04-26           F7  4.969626 2015-05-03
7           1520 2015-04-26           F8  3.490837 2015-05-04
8           1520 2015-04-26           F9  1.183999 2015-05-05
9           1520 2015-04-26          F10  5.461150 2015-05-06
10          1527 2015-05-03           F1  2.660322 2015-05-04
11          1527 2015-05-03           F2  0.648952 2015-05-05


## Loop through SKUs

In [7]:
import pandas as pd
import numpy as np
from lightgbm import LGBMRegressor
from lightgbm.callback import early_stopping, log_evaluation
from datetime import timedelta
from sklearn.metrics import mean_squared_error
import os

# Load preprocessed data
current_file = os.path.abspath("__file__")
code_dir = os.path.dirname(current_file)
data_dir = os.path.join(os.path.dirname(code_dir), "Data", "Processed")

sales_model = pd.read_pickle(os.path.join(data_dir, "lgbm_state_evaluation_data.pkl"))


In [3]:
sales_model.tail()

Unnamed: 0,state_id,item_id,d,date,wm_yr_wk,sales,lag_7,lag_28,rmean_7,rmean_28,avg_sell_price,day,weekday,month,year
17754322,WI,HOUSEHOLD_2_516,d_1937,2016-05-18,11616,0,0.0,0.0,0.428571,0.321429,5.94,18,2,5,2016
17754323,WI,HOUSEHOLD_2_516,d_1938,2016-05-19,11616,0,0.0,1.0,0.428571,0.321429,5.94,19,3,5,2016
17754324,WI,HOUSEHOLD_2_516,d_1939,2016-05-20,11616,1,0.0,0.0,0.428571,0.285714,5.94,20,4,5,2016
17754325,WI,HOUSEHOLD_2_516,d_1940,2016-05-21,11617,0,1.0,0.0,0.571429,0.321429,5.94,21,5,5,2016
17754326,WI,HOUSEHOLD_2_516,d_1941,2016-05-22,11617,0,1.0,0.0,0.428571,0.321429,5.94,22,6,5,2016


In [8]:


# Configuration
R = 7   # review period
L = 3   # forecast lag
horizon = R + L  # forecast window
features = ['lag_7', 'lag_28', 'rmean_7', 'rmean_28',
            'avg_sell_price', 'day', 'weekday', 'month', 'year']

# Initialize variables
sku_list = [
    "FOODS_3_819", "FOODS_3_090",
    "HOBBIES_1_234", "HOUSEHOLD_1_118"
]
state_id = "TX"

sku_rmse_list = []

# Loop through each SKU
for sku in sku_list:
    sku_id = sku
    
    sku_df = sales_model[(sales_model['item_id'] == sku_id) & (sales_model['state_id'] == state_id)].copy().sort_values("date")
    sku_df = sku_df.dropna(subset=features + ['sales'])
    
    # Set up rolling forecast indices
    T = len(sku_df)
    start_idx = T - 393
    end_idx = T - 1
    review_indices = list(range(end_idx, start_idx - 1, -R))
    review_indices = sorted(review_indices)

    # Initialize model
    forecasts = []

    for review_idx in review_indices:
        train_data = sku_df.iloc[:review_idx + 1]
        test_data = sku_df.iloc[review_idx + 1 : review_idx + 1 + horizon]

        if len(test_data) < horizon:
            continue  

        X_train = train_data[features]
        y_train = train_data['sales']

        model = LGBMRegressor(
            objective='regression',
            n_estimators=100,
            learning_rate=0.1,
            max_depth=6,
            random_state=42
        )
        model.fit(X_train, y_train)

        X_forecast = test_data[features]
        y_pred = model.predict(X_forecast)

        for i, pred in enumerate(y_pred):
            forecasts.append({
                'review_index': review_idx, # index of the review point
                'review_day': sku_df.iloc[review_idx]['date'], # date of the review point
                'forecast_day': f'F{i+1}', # forecast day (F1, F2, ...)
                'forecast': pred, # predicted sales
                'forecast_d': sku_df.iloc[review_idx + i + 1]['date'] if (review_idx + i + 1) < len(sku_df) else None,
                'actual': sku_df.iloc[review_idx + i + 1]['sales'] if (review_idx + i + 1) < len(sku_df) else None
            })

    rolling_lgbm_df = pd.DataFrame(forecasts)

    # Calculate RMSE for each forecast
    rmse = np.sqrt(np.mean((rolling_lgbm_df['actual'] - rolling_lgbm_df['forecast'])**2))
    nrmse = rmse / rolling_lgbm_df['actual'].mean()
    sku_rmse_list.append({
        'sku_id': sku_id,
        'rmse': rmse,
        'nrmse': nrmse
    })

    # Save result
    forecast_folder = "Point Forecasts"
    os.makedirs(forecast_folder, exist_ok=True)
    forecast_csv_path = os.path.join(forecast_folder, "lgbm_"+sku+"_TX"+"_forecast.csv")
    rolling_lgbm_df.to_csv(forecast_csv_path, index=False)

    # Preview
    print("✅ Rolling forecast complete.")
    print(rolling_lgbm_df.head())

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000269 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 443
[LightGBM] [Info] Number of data points in the train set: 1521, number of used features: 8
[LightGBM] [Info] Start training from score 4.518738
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000123 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 439
[LightGBM] [Info] Number of data points in the train set: 1528, number of used features: 8
[LightGBM] [Info] Start training from score 4.507199
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000122 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 439
[LightGBM] [Info] Number of data points in the train set: 1535,

#### Get NRMSE and RMSE

In [10]:
# Print RMSE for each SKU
rmse_df = pd.DataFrame(sku_rmse_list)
# format to only 2 decimal places
rmse_df['rmse'] = rmse_df['rmse'].apply(lambda x: f"{x:.2f}")
rmse_df['nrmse'] = rmse_df['nrmse'].apply(lambda x: f"{x:.2f}")
rmse_df



Unnamed: 0,sku_id,rmse,nrmse
0,FOODS_3_819,1.63,0.97
1,FOODS_3_090,50.4,0.34
2,HOBBIES_1_234,12.58,1.1
3,HOUSEHOLD_1_118,6.27,0.41


In [11]:
## Get ARIMA NRMSE
y_bar = rolling_lgbm_df['actual'].mean()
ARIMA_rmse = {
    'FOODS_3_819': 1.69,
    'FOODS_3_090': 71.73,
    'HOBBIES_1_234': 11.44,
    'HOUSEHOLD_1_118': 6.19
}
ARIMA_nrmse = {sku: rmse / y_bar for sku, rmse in ARIMA_rmse.items()}

# save ARIMA NRMSE and RMSE result as a dataframe
arima_rmse_df = pd.DataFrame({
    'sku_id': list(ARIMA_nrmse.keys()),
    'arima_rmse': list(ARIMA_rmse.values()),
    'arima_nrmse': list(ARIMA_nrmse.values())
})
# format to only 2 decimal places
arima_rmse_df['arima_rmse'] = arima_rmse_df['arima_rmse'].apply(lambda x: f"{x:.2f}")
arima_rmse_df['arima_nrmse'] = arima_rmse_df['arima_nrmse'].apply(lambda x: f"{x:.2f}")
arima_rmse_df

Unnamed: 0,sku_id,arima_rmse,arima_nrmse
0,FOODS_3_819,1.69,0.11
1,FOODS_3_090,71.73,4.64
2,HOBBIES_1_234,11.44,0.74
3,HOUSEHOLD_1_118,6.19,0.4


In [12]:
# combine ARIMA and LGBM RMSE results
combined_rmse_df = pd.merge(rmse_df, arima_rmse_df, on='sku_id', how='left')
combined_rmse_df = combined_rmse_df.rename(columns={'rmse': 'LGBM RMSE', 'nrmse': 'LGBM NRMSE'})
combined_rmse_df = combined_rmse_df.rename(columns={'arima_rmse': 'ARIMA RMSE', 'arima_nrmse': 'ARIMA NRMSE'})
combined_rmse_df = combined_rmse_df.rename(columns={'sku_id': 'Product'})
combined_rmse_df

Unnamed: 0,Product,LGBM RMSE,LGBM NRMSE,ARIMA RMSE,ARIMA NRMSE
0,FOODS_3_819,1.63,0.97,1.69,0.11
1,FOODS_3_090,50.4,0.34,71.73,4.64
2,HOBBIES_1_234,12.58,1.1,11.44,0.74
3,HOUSEHOLD_1_118,6.27,0.41,6.19,0.4


In [14]:
# Save forecast acuracy results
import matplotlib.pyplot as plt
import dataframe_image as dfi
plt.rcParams['font.family'] = 'Times New Roman'

output_dir = os.path.join(os.path.dirname(code_dir), "Visual Output")
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, 'forecast_accuracy_result.png')
dfi.export(combined_rmse_df, output_path, table_conversion='matplotlib')


In [15]:
# Save the combined RMSE DataFrame to a CSV file under 'CODE' folder, same as current script
combined_rmse_csv_path = os.path.join(code_dir, "forecast_accuracy_result.csv")

combined_rmse_df.to_csv(combined_rmse_csv_path, index=False)
print("✅ Forecast accuracy results saved.")



✅ Forecast accuracy results saved.
