# Step 1 | Platform Setup

## Step 1.1 | Check Environment

1. Open Anaconda Prompt
2. conda activate tf-gpu
3. cd "C:\Users\FaithanTo\Desktop\MSBA 6421 (001) Predictive Analytics\m5-forecasting-accuracy"
4. jupyter notebook

In [1]:
# List the python executables Windows can find; the first hit should be the tf-gpu environment.
!where python

C:\Anaconda\envs\tf-gpu\python.exe
C:\Anaconda\python.exe


In [2]:
# Sanity-check the active interpreter and what each deep-learning framework
# reports about the GPU before any real work starts.
import sys
import tensorflow as tf
import torch

print(sys.executable)
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))
print(torch.__version__)

# Only ask for the device name when CUDA is usable: requesting device 0 on a
# machine without a working CUDA runtime raises instead of printing anything.
cuda_available = torch.cuda.is_available()
print(cuda_available)
if cuda_available:
    print(torch.cuda.get_device_name(0))
else:
    print("No CUDA device available")

C:\Anaconda\envs\tf-gpu\python.exe
2.10.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
1.13.1+cu116
True
NVIDIA GeForce RTX 3050 4GB Laptop GPU


## Step 1.2 | Import Libraries

In [3]:
import polars as pl
import pandas as pd
import numpy as np
import random
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import gc
import warnings
import os
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn as nn
from torch.optim import Adam
from tqdm import tqdm
from multiprocess import Pool, cpu_count
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing
import pickle
import joblib
import glob
import psutil
import os
from m5_wrmsse import wrmsse

In [4]:
# Notebook-wide runtime settings.

# Blank out CUDA_VISIBLE_DEVICES (presumably to keep this preprocessing kernel on the CPU).
# NOTE(review): the environment-check cell has already queried the GPU through
# TensorFlow and PyTorch in this kernel, so this assignment may come too late
# to change what those libraries see - confirm.
os.environ["CUDA_VISIBLE_DEVICES"] = ""

# Silence the two warning categories that would otherwise clutter cell output.
for noisy_category in (FutureWarning, RuntimeWarning):
    warnings.filterwarnings('ignore', category=noisy_category)

# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)

print("NumPy version:", np.__version__)

NumPy version: 1.26.4


# Step 2 | Load Data

In [5]:
# Folder that holds the feature-engineered parquet file. It defaults to the
# original project directory; set the M5_DATA_DIR environment variable before
# starting the kernel to run the notebook from another machine or location.
DATA_DIR = os.environ.get(
    "M5_DATA_DIR",
    "C:\\Users\\FaithanTo\\Desktop\\MSBA 6421 (001) Predictive Analytics\\m5-forecasting-accuracy",
)

# Load the output of the feature-engineering step and preview the first rows.
df = pd.read_parquet(os.path.join(DATA_DIR, "Step2_FeatureEngineering_LSTM.parquet"))
df.head()

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,sales,date,weekday,month,year,event_name_1,event_type_1,snap_CA,snap_TX,snap_WI,sell_price,d_num,store_item_rolling_mean_7d,store_item_rolling_std_30d,store_total_sales_7d,store_avg_item_sales_ratio,store_total_sales,store_cat_rolling_mean_14d,state_dept_rolling_mean_30d,state_cat_prev_week_sales,state_cat_weekly_growth,country_sales,sales_LY,is_new_id
0,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,3,2011-01-29,Saturday,1,2011,No Event,No Event Type,0,0,0,2.0,1,3.0,2.12132,4337.0,0.000692,4337,3239.0,1157.0,10101.0,0.175131,32631,3.0,1
1,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,0,2011-01-30,Sunday,1,2011,No Event,No Event Type,0,0,0,2.0,2,1.5,2.12132,8492.0,0.0,4155,3188.0,1149.5,10101.0,0.175131,31749,3.0,1
2,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,0,2011-01-31,Monday,1,2011,No Event,No Event Type,0,0,0,2.0,3,1.0,1.732051,11308.0,0.0,2816,2794.666748,1028.666626,10101.0,0.175131,23783,3.0,1
3,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,1,2011-02-01,Tuesday,2,2011,No Event,No Event Type,1,1,0,2.0,4,1.0,1.414214,14359.0,0.000328,3051,2660.5,957.25,10101.0,0.175131,25412,3.0,1
4,FOODS_1_001_CA_1_evaluation,FOODS_1_001,FOODS_1,FOODS,CA_1,CA,4,2011-02-02,Wednesday,2,2011,No Event,No Event Type,1,0,1,2.0,5,1.6,1.81659,16989.0,0.001521,2630,2534.800049,911.599976,10101.0,0.175131,19146,3.0,1


# Step 3 | Encoding

## Step 3.1 | Categorize Columns for Processing

### Step 3.1.1 | Identify Relevant Columns

In [6]:
# Trim the frame to the columns used for modelling and report how many remain.

# Drop the raw date column. errors='ignore' keeps the cell safe to re-run:
# without it a second execution raises KeyError because 'date' is already gone.
df = df.drop(columns=['date'], errors='ignore')

# Copy d_num for modeling: the copy is min-max scaled later, while the
# original d_num is kept unscaled as a key column.
df["d_num_scaled"] = df["d_num"]

# Count the number of columns
print("Number of columns:", df.shape[1])

Number of columns: 30


In [7]:
# Show the remaining column names as a plain Python list.

print("Column names:", list(df.columns))

Column names: ['id', 'item_id', 'dept_id', 'cat_id', 'store_id', 'state_id', 'sales', 'weekday', 'month', 'year', 'event_name_1', 'event_type_1', 'snap_CA', 'snap_TX', 'snap_WI', 'sell_price', 'd_num', 'store_item_rolling_mean_7d', 'store_item_rolling_std_30d', 'store_total_sales_7d', 'store_avg_item_sales_ratio', 'store_total_sales', 'store_cat_rolling_mean_14d', 'state_dept_rolling_mean_30d', 'state_cat_prev_week_sales', 'state_cat_weekly_growth', 'country_sales', 'sales_LY', 'is_new_id', 'd_num_scaled']


### Step 3.1.2 | Categorize Columns

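The label-encoded integers are meant to be used as lookup indices for embedding layers, so the network (e.g., an LSTM) does not treat them as ordered quantities - any relationship between categories is learned in the embedding space. Fed directly as a plain numeric input, the same integers would imply an artificial ordering.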

In [8]:
# Column roles: each list below names the columns handled by one encoding step.

# Value the model predicts
target_col = 'sales'

# Row keys (left unscaled and unencoded)
key_cols = ['id', 'd_num']

# Categorical features: label-encoded here, intended for embedding layers
categorical_cols = [
    'item_id',
    'dept_id',
    'cat_id',
    'store_id',
    'state_id',
    'year',
    'event_name_1',
    'event_type_1',
]

# Calendar features encoded as sine/cosine pairs
cycle_cols = [
    'weekday',
    'month',
]

# 0/1 indicator columns
binary_cols = [
    'snap_CA',
    'snap_TX',
    'snap_WI',
    'is_new_id',
]

# Continuous features standardised with StandardScaler
numerical_cols_ss = [
    'sell_price',
    'sales_LY',
    'store_item_rolling_mean_7d',
    'store_item_rolling_std_30d',
    'store_total_sales_7d',
    'store_avg_item_sales_ratio',
    'store_total_sales',
    'store_cat_rolling_mean_14d',
    'state_dept_rolling_mean_30d',
    'state_cat_prev_week_sales',
    'state_cat_weekly_growth',
    'country_sales',
]

# Continuous features rescaled with MinMaxScaler
numerical_cols_mm = [
    'd_num_scaled',
]

## Step 3.2 | Normalize Numerical Columns

In [9]:
# Standardise the continuous features in place and save the fitted scaler.
# Caution: the scaled values overwrite the original columns, so running this
# cell a second time refits on already-scaled data and overwrites the pickle.
# NOTE(review): the scaler is fitted on every row of df; if a validation/test
# split is made later, confirm that fitting on all rows is intended.
scaler_ss = StandardScaler().fit(df[numerical_cols_ss])
df[numerical_cols_ss] = scaler_ss.transform(df[numerical_cols_ss])
joblib.dump(scaler_ss, 'standard_scaler_v3.pkl')

['standard_scaler_v3.pkl']

In [10]:
# Rescale the day-counter copy with a min-max scaler and save the fitted scaler.
# Caution: the column is overwritten, so a second run refits on scaled values
# and replaces the saved pickle.
scaler_mm = MinMaxScaler().fit(df[numerical_cols_mm])
df[numerical_cols_mm] = scaler_mm.transform(df[numerical_cols_mm])
joblib.dump(scaler_mm, 'minmax_scaler_v3.pkl')

['minmax_scaler_v3.pkl']

## Step 3.3 | Label Encode Categorical Columns

In [11]:
# Replace every categorical column with integer codes, keeping one fitted
# LabelEncoder per column so codes can be translated back to labels.
label_encoders = {}

for col in categorical_cols:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    label_encoders[col] = le

# Persist the encoders the same way the scalers are persisted; without this
# file the integer codes in the exported parquet cannot be decoded (or new
# data encoded consistently) once the kernel is gone.
joblib.dump(label_encoders, 'label_encoders_v3.pkl')

## Step 3.4 | Cyclical Encoding

### Step 3.4.1 | Weekday

In [12]:
# Map weekday names to numbers (Monday = 0, Sunday = 6)

weekday_map = {
    'Monday': 0, 'Tuesday': 1, 'Wednesday': 2,
    'Thursday': 3, 'Friday': 4, 'Saturday': 5, 'Sunday': 6
}

# Series.map turns any label missing from the dictionary into NaN without
# complaint (this also happens when the cell is re-run on already-mapped
# values). Fail here with a clear message instead of in a later cell.
unknown_weekdays = set(df['weekday'].dropna().unique()) - set(weekday_map)
if unknown_weekdays:
    raise ValueError(
        f"Unexpected values in 'weekday' (has this cell already been run?): {sorted(unknown_weekdays, key=str)}"
    )

df['weekday'] = df['weekday'].map(weekday_map)

In [13]:
# Encode the weekday as a point on the unit circle so that Sunday (6) and
# Monday (0) end up next to each other.

# Cast to int first (the mapped column may not have an integer dtype),
# then turn the 0-6 index into an angle over a 7-day cycle.
weekday_angle = 2 * np.pi * df['weekday'].astype(int) / 7

df['weekday_sin'] = np.sin(weekday_angle)
df['weekday_cos'] = np.cos(weekday_angle)

# The sine/cosine pair replaces the original column.
df = df.drop(columns='weekday')
del weekday_angle

### Step 3.4.2 | Month

In [14]:
# Encode the month as a point on the unit circle.

# Subtracting 1 puts the first month at angle 0 on a 12-step cycle.
month_angle = 2 * np.pi * (df['month'] - 1) / 12

df['month_sin'] = np.sin(month_angle)
df['month_cos'] = np.cos(month_angle)

# The sine/cosine pair replaces the original column.
df = df.drop(columns='month')
del month_angle

# Step 4 | Export File

## Step 4.1 | Review Final DataFrame

In [15]:
# Preview the first 10,000 rows; the notebook display truncates this to the first and last few rows.
df.iloc[:10000]

Unnamed: 0,id,item_id,dept_id,cat_id,store_id,state_id,sales,year,event_name_1,event_type_1,snap_CA,snap_TX,snap_WI,sell_price,d_num,store_item_rolling_mean_7d,store_item_rolling_std_30d,store_total_sales_7d,store_avg_item_sales_ratio,store_total_sales,store_cat_rolling_mean_14d,state_dept_rolling_mean_30d,state_cat_prev_week_sales,state_cat_weekly_growth,country_sales,sales_LY,is_new_id,d_num_scaled,weekday_sin,weekday_cos,month_sin,month_cos
0,FOODS_1_001_CA_1_evaluation,0,0,0,0,0,3,0,19,2,0,0,0,-0.693997,1,0.551939,0.574769,-2.431641,0.304447,0.674762,1.696726,-0.545545,1.464391,-0.031893,-0.249161,0.458554,1,0.000000,-0.974928,-0.222521,0.000000,1.000000
1,FOODS_1_001_CA_1_evaluation,0,0,0,0,0,0,0,19,2,0,0,0,-0.693997,2,0.109190,0.574769,-1.920052,-0.273864,0.536610,1.648795,-0.548824,1.464391,-0.031893,-0.367965,0.458554,1,0.000515,-0.781831,0.623490,0.000000,1.000000
2,FOODS_1_001_CA_1_evaluation,0,0,0,0,0,0,0,19,2,0,0,0,-0.693997,3,-0.038393,0.370301,-1.573328,-0.273864,-0.479796,1.279138,-0.601656,1.464391,-0.031893,-1.440972,0.458554,1,0.001031,0.000000,1.000000,0.000000,1.000000
3,FOODS_1_001_CA_1_evaluation,0,0,0,0,0,1,0,19,2,1,1,0,-0.693997,4,-0.038393,0.203353,-1.197670,0.000159,-0.301412,1.153047,-0.632881,1.464391,-0.031893,-1.221549,0.458554,1,0.001546,0.781831,0.623490,0.500000,0.866025
4,FOODS_1_001_CA_1_evaluation,0,0,0,0,0,4,0,19,2,1,0,1,-0.693997,5,0.138707,0.414706,-0.873848,0.997689,-0.620984,1.034914,-0.652841,1.464391,-0.031893,-2.065569,0.458554,1,0.002062,0.974928,-0.222521,0.500000,0.866025
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,FOODS_1_001_TX_2_evaluation,0,0,0,5,1,0,0,19,2,0,1,1,-0.693997,291,0.045940,-0.137711,0.037040,-0.273864,-0.222468,1.084307,-0.852653,0.049425,-0.031893,-1.047922,-0.260066,1,0.149485,0.781831,0.623490,-0.866025,0.500000
9996,FOODS_1_001_TX_2_evaluation,0,0,0,5,1,1,0,19,2,0,0,0,-0.693997,292,0.088107,-0.134958,0.026205,0.017441,-0.438806,1.090886,-0.851487,0.352269,-0.031893,-1.421441,-0.260066,1,0.150000,0.974928,-0.222521,-0.866025,0.500000
9997,FOODS_1_001_TX_2_evaluation,0,0,0,5,1,0,0,19,2,0,0,0,-0.693997,293,-0.038393,-0.134958,0.035809,-0.273864,-0.340125,1.074842,-0.851370,0.074026,-0.031893,-1.531086,-0.260066,1,0.150515,0.433884,-0.900969,-0.866025,0.500000
9998,FOODS_1_001_TX_2_evaluation,0,0,0,5,1,1,0,19,2,0,0,0,-0.693997,294,-0.038393,-0.133002,-0.013318,0.007349,-0.360620,1.055240,-0.850992,0.413346,-0.031893,-1.074054,-0.260066,1,0.151031,-0.433884,-0.900969,-0.866025,0.500000


## Step 4.2 | Export to Parquet

In [16]:
# Write the fully encoded frame to a parquet file, leaving out the row index.

encoded_output_file = 'Step3_Encoding_LSTM_v3.parquet'
df.to_parquet(encoded_output_file, index=False)