# Step 1 | Platform Setup

## Step 1.1 | Check Environment

1. Open Anaconda Prompt
2. conda activate tf-gpu
3. cd "C:\Users\FaithanTo\Desktop\MSBA 6421 (001) Predictive Analytics\m5-forecasting-accuracy"
4. jupyter notebook

In [1]:
!where python

C:\Anaconda\envs\tf-gpu\python.exe
C:\Anaconda\python.exe


In [2]:
import sys
import tensorflow as tf
import torch

# Report the interpreter path, framework versions, and GPU visibility for this kernel.
print(sys.executable)
print(tf.__version__)
print(tf.config.list_physical_devices('GPU'))
print(torch.__version__)
print(torch.cuda.is_available())
# NOTE(review): get_device_name(0) presumably raises when no CUDA device is present —
# confirm before running this cell on a CPU-only machine.
print(torch.cuda.get_device_name(0))

C:\Anaconda\envs\tf-gpu\python.exe
2.10.0
[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]
1.13.1+cu116
True
NVIDIA GeForce RTX 3050 4GB Laptop GPU


## Step 1.2 | Import Libraries

In [3]:
import polars as pl
import pandas as pd
import numpy as np
import random
import time
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import gc
import warnings
import os
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn as nn
from torch.optim import Adam
from tqdm import tqdm
from multiprocess import Pool, cpu_count
from concurrent.futures import ProcessPoolExecutor, as_completed
import multiprocessing
import pickle
import joblib
import glob
import psutil
import os
from m5_wrmsse import wrmsse
import csv
import concurrent.futures
import threading

In [4]:
# An empty CUDA_VISIBLE_DEVICES is the conventional way to hide every GPU from CUDA libraries.
# NOTE(review): torch.cuda was already queried in the environment-check cell, so this may have
# no effect on the running kernel — confirm whether CPU-only execution is really intended,
# given that later cells still pick "cuda" whenever torch.cuda.is_available() is true.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
# Silence FutureWarning / RuntimeWarning categories for the rest of the session.
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=RuntimeWarning)
# Show every column when a DataFrame is displayed.
pd.set_option('display.max_columns', None)
print("NumPy version:", np.__version__)

NumPy version: 1.26.4


# Step 2 | Training

## Step 2.1 | Model Setup

In [5]:
# === RMSE Loss ===
class RMSELoss(nn.Module):
    """Root-mean-squared-error loss: the square root of ``nn.MSELoss``."""

    def __init__(self):
        super(RMSELoss, self).__init__()
        # MSELoss with its default settings; the square root is applied in forward().
        self.mse = nn.MSELoss()

    def forward(self, y_pred, y_true):
        """Return the square root of the MSE between ``y_pred`` and ``y_true``."""
        mean_squared_error = self.mse(y_pred, y_true)
        return mean_squared_error.sqrt()

In [6]:
# === LSTM Model with Embeddings ===
class LSTMForecast(nn.Module):
    """
    LSTM forecaster that embeds two id columns before the recurrent stack.

    The first two feature columns of the input are treated as integer ids
    (column 0 -> ``item_emb``, column 1 -> ``dept_emb``). Their embeddings are
    concatenated with the remaining feature columns, passed through a stacked
    LSTM, and the last time step is mapped to ``output_len`` values by a
    two-layer head with a LeakyReLU in between.

    Args:
        input_size (int): Number of raw feature columns, including the two id columns.
        hidden_size (int): LSTM hidden width.
        num_layers (int): Number of stacked LSTM layers.
        output_len (int): Number of values produced per sequence.
        item_vocab_size (int): Number of rows in the item embedding table.
        dept_vocab_size (int): Number of rows in the department embedding table.
        item_emb_dim (int): Width of each item embedding vector.
        dept_emb_dim (int): Width of each department embedding vector.
        dropout (float): Dropout passed to ``nn.LSTM``.
    """

    def __init__(self, input_size, hidden_size, num_layers, output_len,
                 item_vocab_size, dept_vocab_size, item_emb_dim=6, dept_emb_dim=3, dropout=0.4):
        super().__init__()

        # Lookup tables for the two id columns.
        self.item_emb = nn.Embedding(num_embeddings=item_vocab_size, embedding_dim=item_emb_dim)
        self.dept_emb = nn.Embedding(num_embeddings=dept_vocab_size, embedding_dim=dept_emb_dim)

        # The two raw id columns are replaced by their embeddings, so the LSTM
        # sees (input_size - 2) plain features plus both embedding widths.
        embedded_width = item_emb_dim + dept_emb_dim
        lstm_input_size = (input_size - 2) + embedded_width

        self.lstm = nn.LSTM(
            input_size=lstm_input_size,
            hidden_size=hidden_size,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )

        # Prediction head: hidden -> 128 -> output_len.
        self.fc1 = nn.Linear(hidden_size, 128)
        self.leaky_relu = nn.LeakyReLU(negative_slope=0.01)
        self.fc2 = nn.Linear(128, output_len)

    def forward(self, x):
        """Map a batch-first 3-D input (id columns first) to ``(batch, output_len)`` predictions."""
        # Columns 0 and 1 carry the item and department ids; the rest are plain features.
        id_columns, other_feats = x[:, :, :2], x[:, :, 2:]
        item_vectors = self.item_emb(id_columns[:, :, 0].long())
        dept_vectors = self.dept_emb(id_columns[:, :, 1].long())

        # Same ordering along the feature axis as the embeddings were declared.
        lstm_input = torch.cat((item_vectors, dept_vectors, other_feats), dim=2)

        sequence_out, _state = self.lstm(lstm_input)
        final_step = sequence_out[:, -1, :]  # only the last time step feeds the head
        return self.fc2(self.leaky_relu(self.fc1(final_step)))

In [7]:
# Prefer the GPU when PyTorch reports one as available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [8]:
# Instantiate the forecaster and move it to `device`.
# With these arguments the LSTM input width is 29 - 2 + 6 + 3 = 36
# (two id columns replaced by their embeddings).
model = LSTMForecast(
    input_size=29,  # assuming 29 original features
    hidden_size=128,
    num_layers=3,
    output_len=28,
    item_vocab_size=3049,
    dept_vocab_size=7,
    item_emb_dim=6,
    dept_emb_dim=3,
    dropout=0.4
).to(device)

In [9]:
# === Optimizer ===
# NOTE(review): both train_lstm_v3 definitions below build their own Adam optimizer
# internally, so this module-level optimizer / learning_rate pair does not appear to be
# used by the visible training code — confirm before tuning it here.
learning_rate = 0.0005  # You can adjust this as needed
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

## Step 2.2 | Load Data (Streaming Method)

In [10]:
# === 1. Load a Single Training Batch ===
def load_batch(batch_dir, batch_idx):
    """
    Read one on-disk training chunk and return it as float32 tensors.

    Args:
        batch_dir (str): Directory holding the ``X_batch_<idx>.npy`` / ``Y_batch_<idx>.npy`` pairs
        batch_idx (int): Index of the chunk to read

    Returns:
        Tuple of torch.FloatTensors: (features, targets)

    Raises:
        FileNotFoundError: If either file of the pair is absent.
    """
    X_path = os.path.join(batch_dir, f"X_batch_{batch_idx}.npy")
    y_path = os.path.join(batch_dir, f"Y_batch_{batch_idx}.npy")

    # Both halves of the pair must be present before anything is read.
    pair_present = os.path.exists(X_path) and os.path.exists(y_path)
    if not pair_present:
        raise FileNotFoundError(f"Missing batch files: {X_path} or {y_path}")

    # from_numpy wraps the loaded array; .float() then casts to float32.
    features = torch.from_numpy(np.load(X_path)).float()
    targets = torch.from_numpy(np.load(y_path)).float()
    return features, targets

In [11]:
# === 2. Wrap (X, y) into a DataLoader ===
def make_dataloader(X_tensor, y_tensor, batch_size, shuffle=True, num_workers=4, pin_memory=True):
    """
    Wraps X and y tensors into a PyTorch DataLoader.

    The defaults reproduce the previous hard-coded behaviour: samples ARE
    shuffled (the old docstring wrongly said otherwise), four worker
    processes are used, and batches are placed in pinned memory.

    Args:
        X_tensor (Tensor): Feature tensor
        y_tensor (Tensor): Target tensor
        batch_size (int): Batch size for DataLoader
        shuffle (bool): Whether to draw samples in random order
        num_workers (int): Number of loader worker processes (0 loads in the calling process)
        pin_memory (bool): Whether the loader should place batches in pinned memory

    Returns:
        DataLoader: PyTorch DataLoader object
    """
    dataset = TensorDataset(X_tensor, y_tensor)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        pin_memory=pin_memory,
    )

In [12]:
# === 3. Load and Wrap Validation Set ===
def get_val_loader(val_dir, batch_size):
    """
    Build an unshuffled, pin-memory DataLoader over the whole validation set.

    Args:
        val_dir (str): Directory holding ``X_val_final.npy`` and ``y_val_final.npy``
        batch_size (int): Batch size for the validation DataLoader

    Returns:
        DataLoader: PyTorch DataLoader for validation

    Raises:
        FileNotFoundError: If either validation file is absent.
    """
    X_path = os.path.join(val_dir, "X_val_final.npy")
    y_path = os.path.join(val_dir, "y_val_final.npy")

    files_present = os.path.exists(X_path) and os.path.exists(y_path)
    if not files_present:
        raise FileNotFoundError(f"Missing validation files: {X_path} or {y_path}")

    # Load each array and cast to float32 tensors.
    features = torch.from_numpy(np.load(X_path)).float()
    targets = torch.from_numpy(np.load(y_path)).float()

    return DataLoader(
        TensorDataset(features, targets),
        batch_size=batch_size,
        shuffle=False,
        pin_memory=True,
    )

## Step 2.3 | Training Loop (Subfunctions)

In [13]:
def save_epoch_metrics(log_path, epoch, train_loss, val_loss):
    """Append one epoch's metrics to the CSV log, writing the header first if the file is new."""
    pending_rows = [[epoch, train_loss, val_loss]]
    # Existence must be checked before opening: append mode creates the file.
    if not os.path.exists(log_path):
        pending_rows.insert(0, ["epoch", "avg_train_loss", "avg_val_loss"])

    with open(log_path, mode='a', newline='') as log_file:
        csv.writer(log_file).writerows(pending_rows)

In [14]:
def plot_loss_curve(history, plot_path):
    """
    Draw train and validation loss against epoch and save the figure to ``plot_path``.

    ``history`` is iterated as ``(epoch, train_loss, val_loss)`` triples.
    """
    epochs = [epoch for epoch, _train, _val in history]
    train_losses = [train for _epoch, train, _val in history]
    val_losses = [val for _epoch, _train, val in history]

    plt.figure()
    for series, label in ((train_losses, "Train RMSE"), (val_losses, "Val RMSE")):
        plt.plot(epochs, series, label=label)
    plt.xlabel("Epoch")
    plt.ylabel("RMSE Loss")
    plt.title("Training vs. Validation Loss")
    plt.legend()
    plt.grid(True)
    plt.savefig(plot_path)
    # Close the figure so repeated calls do not accumulate open figures.
    plt.close()

In [15]:
def save_model_checkpoint(model, optimizer, epoch, loss, path):
    """
    Serialise a resumable checkpoint to ``path`` with ``torch.save``.

    The saved dict holds ``epoch``, ``model_state_dict``,
    ``optimizer_state_dict`` and ``val_loss`` (the ``loss`` argument).
    """
    torch.save(
        dict(
            epoch=epoch,
            model_state_dict=model.state_dict(),
            optimizer_state_dict=optimizer.state_dict(),
            val_loss=loss,
        ),
        path,
    )

In [16]:
class AsyncBatchLoader:
    """
    Sequential chunk reader that prefetches the next chunk on a background thread.

    ``get_next_batch()`` returns chunk ``_current_idx`` (loaded via ``load_batch``)
    and immediately schedules the following one on a single-worker thread pool,
    so disk I/O overlaps with whatever the caller does between calls.

    Args:
        batch_dir (str): Directory passed through to ``load_batch``.
        total_batches (int): Number of chunks in one pass; indices run 0..total_batches-1.
        preload_ahead (int): Stored for compatibility; only one chunk is prefetched.
    """

    def __init__(self, batch_dir, total_batches, preload_ahead=1):
        self.batch_dir = batch_dir
        self.total_batches = total_batches
        self.executor = concurrent.futures.ThreadPoolExecutor(max_workers=1)
        self.preload_ahead = preload_ahead
        self.lock = threading.Lock()

        self._current_idx = 0
        self._future = None
        # Index the pending future was submitted for, so a stale prefetch can be detected.
        self._future_idx = None

    def _load_next(self, idx):
        return load_batch(self.batch_dir, idx)

    def _schedule(self, idx):
        """Submit the load of chunk ``idx`` and remember which index is in flight."""
        self._future = self.executor.submit(self._load_next, idx)
        self._future_idx = idx

    def get_next_batch(self):
        """
        Return the ``(X, y)`` pair for the current index and prefetch the next one.

        If loading fails, the exception from ``load_batch`` propagates, but the
        loader still advances to the next index. Previously a failed load left
        the same failed future in place, so every later call re-raised the same
        error and the rest of the pass was lost.
        """
        with self.lock:
            # (Re)submit when nothing is pending, or when the pending prefetch is
            # for a different index (e.g. after the caller reset _current_idx).
            if self._future is None or self._future_idx != self._current_idx:
                if self._future is not None:
                    self._future.cancel()
                self._schedule(self._current_idx)

            pending = self._future
            try:
                return pending.result()
            finally:
                # Advance on success AND on failure so one bad chunk cannot wedge the loader.
                self._current_idx += 1
                if self._current_idx < self.total_batches:
                    self._schedule(self._current_idx)
                else:
                    self._future = None
                    self._future_idx = None

    def done(self):
        return self._current_idx >= self.total_batches

    def reset(self):
        """Rewind to chunk 0 for a new pass, discarding any pending prefetch."""
        with self.lock:
            if self._future is not None:
                self._future.cancel()
            self._future = None
            self._future_idx = None
            self._current_idx = 0

    def shutdown(self, wait=True):
        """Stop the background worker thread."""
        self.executor.shutdown(wait=wait)

## Step 2.4 | Training Loop (Main)

In [17]:
# Force a garbage-collection pass; the cell output is the number of unreachable objects found.
gc.collect()

0

In [18]:
def prepare_training_environment(model, optimizer, resume_path, device):
    """
    Optionally restore model/optimizer state from a checkpoint and place the model on ``device``.

    Returns:
        tuple: ``(model, optimizer, start_epoch, best_val_loss)``. Without a usable
        checkpoint these are epoch 1 and ``inf``; with one, ``start_epoch`` is the
        stored epoch + 1 and ``best_val_loss`` is the stored ``val_loss`` (``inf`` if absent).
    """
    can_resume = bool(resume_path) and os.path.exists(resume_path)
    if not can_resume:
        return model.to(device), optimizer, 1, float("inf")

    print(f"🔁 Resuming from checkpoint: {resume_path}")
    saved = torch.load(resume_path, map_location=device)
    model.load_state_dict(saved['model_state_dict'])
    optimizer.load_state_dict(saved['optimizer_state_dict'])
    next_epoch = saved['epoch'] + 1
    best_so_far = saved.get('val_loss', float("inf"))

    return model.to(device), optimizer, next_epoch, best_so_far

In [19]:
def get_randomized_batches(group_dir, seed=42):
    """
    Return the chunk indices found in ``group_dir`` in a reproducible shuffled order.

    Indices are parsed from file names of the form ``X_batch_<int>.npy``; every
    other directory entry is ignored.

    Args:
        group_dir (str): Directory to scan.
        seed (int): Seed for the shuffle; the same seed always yields the same order.

    Returns:
        list[int]: Shuffled chunk indices.

    Raises:
        ValueError: If a matching file name does not contain an integer index.
    """
    all_ids = sorted(
        int(f.replace("X_batch_", "").replace(".npy", ""))
        for f in os.listdir(group_dir)
        if f.startswith("X_batch_") and f.endswith(".npy")
    )
    # A private Random instance yields the same permutation as seeding the module-level
    # generator, without resetting the process-wide RNG state as a side effect.
    random.Random(seed).shuffle(all_ids)
    return all_ids

In [20]:
def train_on_batch(model, train_loader, criterion, optimizer, device):
    """
    Run one optimisation pass over ``train_loader`` using an ``is_new_id``-weighted RMSE.

    Args:
        model: Network called as ``model(X_batch)``.
        train_loader: Iterable yielding ``(X_batch, y_batch)`` pairs.
        criterion: Accepted but not used; the loss is computed inline below.
        optimizer: Optimizer whose ``zero_grad``/``step`` run once per mini-batch.
        device: Device each mini-batch is moved to.

    Returns:
        float: Sum (not mean) of the per-mini-batch loss values.

    NOTE(review): relies on a module-level ``input_cols`` list that is not defined in
    any visible cell — confirm it exists before calling this function.
    """
    total_loss = 0.0

    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        output = model(X_batch)
        
        # === Loss weighting based on is_new_id ===
        # Positions whose is_new_id feature equals 1 get weight 0.25, all others 1.0.
        with torch.no_grad():
            # NOTE(review): the mask keeps X_batch's second (time) axis, while squared_errors
            # below follows the model output's shape; the product presumably only broadcasts
            # when those two lengths agree — confirm the intended weight shape.
            is_new_id_mask = X_batch[:, :, input_cols.index("is_new_id")]
            weights = torch.where(is_new_id_mask == 1, 0.25, 1.0)
        
        # Compute unweighted loss
        squared_errors = (output - y_batch) ** 2
        
        # Apply weights and compute weighted RMSE
        weighted_mse = (weights * squared_errors).mean()
        loss = torch.sqrt(weighted_mse)

        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    
    return total_loss

In [21]:
def evaluate_model(model, val_loader, criterion, device):
    """
    Return the mean of the per-batch ``criterion`` values over ``val_loader``.

    Switches the model to eval mode (it is not switched back) and runs
    without gradient tracking.
    """
    model.eval()
    batch_losses = []
    with torch.no_grad():
        for features, targets in val_loader:
            predictions = model(features.to(device))
            batch_losses.append(criterion(predictions, targets.to(device)).item())
    # Average over batches, not samples.
    return sum(batch_losses, 0.0) / len(val_loader)

In [22]:
def log_and_save(epoch, train_loss, val_loss, history, model, optimizer, checkpoint_path, save_best_path, log_path, best_val_loss):
    """
    Record one epoch: extend ``history``, append to the CSV log, write the rolling
    checkpoint, and save the weights to ``save_best_path`` when ``val_loss`` improves.

    Returns:
        The best validation loss after this epoch (unchanged if there was no improvement).
    """
    history.append((epoch, train_loss, val_loss))
    save_epoch_metrics(log_path, epoch, train_loss, val_loss)
    save_model_checkpoint(model, optimizer, epoch, val_loss, checkpoint_path)

    improved = val_loss < best_val_loss
    if not improved:
        return best_val_loss

    print(f"💾 New best model! ({best_val_loss:.4f} → {val_loss:.4f})")
    torch.save(model.state_dict(), save_best_path)
    return val_loss

In [23]:
def log_debug_stats(epoch, batch_idx, y, val_loader, output=None, y_batch=None):
    """
    Print diagnostic statistics about targets and, optionally, one mini-batch's RMSE.

    On the very first chunk (``epoch == 1`` and ``batch_idx == 0``) the min/max/mean of
    ``y`` and of the first validation batch's targets are printed. Whenever both
    ``output`` and ``y_batch`` are given, their RMSE and the target range are printed too.
    """
    is_first_chunk = epoch == 1 and batch_idx == 0
    if is_first_chunk:
        print("📊 Train y sample stats —")
        print("  Min:", y.min().item(), "Max:", y.max().item(), "Mean:", y.mean().item())

        # Only the first validation batch is inspected.
        _val_features, val_targets = next(iter(val_loader))
        print("📊 Val y sample stats —")
        print("  Min:", val_targets.min().item(),
              "Max:", val_targets.max().item(),
              "Mean:", val_targets.mean().item())

    if output is None or y_batch is None:
        return

    preds = output.detach().cpu().numpy()
    targets = y_batch.detach().cpu().numpy()
    residuals = preds - targets
    batch_rmse = np.sqrt(np.mean(residuals ** 2))
    print(f"\n🔬 Full batch RMSE: {batch_rmse:.4f}")
    print(f"Batch target stats — Min: {targets.min()}, Max: {targets.max()}, Mean: {targets.mean():.4f}")

In [24]:
def clear_batch_from_memory(*args):
    """
    Run a garbage-collection pass and release cached CUDA memory.

    Args:
        *args: Accepted for backward compatibility and otherwise ignored. A
            function cannot drop the caller's references: the earlier
            ``for var in args: del var`` loop only unbound its own loop variable
            and freed nothing. Callers must ``del`` their own names (or let them
            go out of scope) before calling this for the memory to be reclaimable.
    """
    gc.collect()
    if torch.cuda.is_available():
        # Returns cached, currently unused blocks held by PyTorch's CUDA allocator.
        torch.cuda.empty_cache()

In [25]:
def train_lstm_v3(
    model,
    train_dir,
    val_dir,
    num_epochs,
    total_batches,
    batch_size,
    save_best_path,
    checkpoint_path,
    log_path,
    plot_path,
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu"),
    resume_path=None,
    validate_every=1,  # how often to run validation (in epochs)
):
    """
    Train ``model`` on the on-disk chunks in ``train_dir``, validating against ``val_dir``.

    Fixes relative to the previous revision of this cell, which could not run:
      * removed a stray pasted block that used ``X_batch``, ``y_batch`` and
        ``epoch_loss`` before they were defined;
      * validation now goes through ``get_val_loader`` / ``evaluate_model`` instead of
        the never-created ``X_val_tensor`` / ``y_val_tensor``;
      * logging and plotting use the helpers that exist in this notebook
        (``save_epoch_metrics``, ``plot_loss_curve``) instead of undefined names;
      * the progress message no longer uses an invalid f-string format spec;
      * ``resume_path`` and ``batch_size`` are honoured;
      * checkpoints are written with ``save_model_checkpoint`` so they can be resumed.

    Args:
        model: Network to train; moved to ``device``.
        train_dir (str): Directory of ``X_batch_<i>.npy`` / ``Y_batch_<i>.npy`` chunks.
        val_dir (str): Directory holding ``X_val_final.npy`` / ``y_val_final.npy``.
        num_epochs (int): Last epoch number to run (epochs are 1-based).
        total_batches (int): Expected number of chunk files in ``train_dir``.
        batch_size (int): Mini-batch size for both training and validation loaders.
        save_best_path (str): Where the best model ``state_dict`` is written.
        checkpoint_path (str): Where the resumable checkpoint is written every epoch.
        log_path (str): CSV file receiving one row per epoch.
        plot_path (str): Image file receiving the final loss plot.
        device: Device to train on.
        resume_path (str | None): Checkpoint written by ``save_model_checkpoint`` to resume from.
        validate_every (int): Run validation every this many epochs.

    Raises:
        AssertionError: If the number of chunks found differs from ``total_batches``.

    NOTE(review): the criterion is MSE, while ``plot_loss_curve`` labels its axes "RMSE"
    and checkpoints written by the legacy RMSE-based loop store ``val_loss`` on the RMSE
    scale — confirm which metric is intended before comparing runs or resuming across them.
    NOTE(review): ``resume_path`` must point at a dict-style checkpoint; a file holding a
    bare ``state_dict`` (as the previous revision wrote) will fail to load.
    """
    print("🚀 Starting training...")
    model = model.to(device)
    criterion = nn.MSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

    # Restores weights/optimizer state when resume_path exists; otherwise epoch 1 and inf.
    model, optimizer, start_epoch, best_val_loss = prepare_training_environment(
        model, optimizer, resume_path, device
    )

    # === DISCOVER AND SHUFFLE CHUNK INDICES ONCE ===
    batch_groups = get_randomized_batches(train_dir)
    assert len(batch_groups) == total_batches, f"Expected {total_batches} batches but got {len(batch_groups)}"

    # === LOAD VALIDATION SET ONCE ===
    val_loader = get_val_loader(val_dir, batch_size=batch_size)

    history = []

    for epoch in range(start_epoch, num_epochs + 1):
        model.train()
        epoch_loss = 0.0
        num_steps = 0
        start_time = time.time()

        for i in batch_groups:
            X_tensor, y_tensor = load_batch(train_dir, i)
            train_loader = make_dataloader(X_tensor, y_tensor, batch_size)

            for X_mini, y_mini in train_loader:
                X_mini = X_mini.to(device, non_blocking=True)
                y_mini = y_mini.to(device, non_blocking=True)

                optimizer.zero_grad()
                outputs = model(X_mini)
                loss = criterion(outputs, y_mini)
                loss.backward()
                optimizer.step()

                epoch_loss += loss.item()
                num_steps += 1

            # Drop this chunk before the next one is read to keep peak memory down.
            del X_tensor, y_tensor, train_loader
            if torch.cuda.is_available():
                torch.cuda.empty_cache()

        # Average over optimizer steps so the value is comparable to the per-batch
        # validation mean; max() guards against an empty training set.
        avg_epoch_loss = epoch_loss / max(num_steps, 1)

        # === Optional Validation ===
        if epoch % validate_every == 0:
            val_loss = evaluate_model(model, val_loader, criterion, device)
        else:
            val_loss = None

        # === Save Best Model ===
        if val_loss is not None and val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), save_best_path)

        # === Save Checkpoint ===
        # Store the best loss so far: the resume helper reads 'val_loss' back as best_val_loss.
        save_model_checkpoint(model, optimizer, epoch, best_val_loss, checkpoint_path)

        # === Logging ===
        # csv writes None as an empty field; the plot gets NaN for skipped validations.
        save_epoch_metrics(log_path, epoch, avg_epoch_loss, val_loss)
        history.append((epoch, avg_epoch_loss, float("nan") if val_loss is None else val_loss))

        val_text = f"{val_loss:.4f}" if val_loss is not None else "Skipped"
        elapsed = time.time() - start_time
        print(f"✅ Epoch {epoch}/{num_epochs} - Train Loss: {avg_epoch_loss:.4f} - Val Loss: {val_text} - Time: {elapsed:.1f}s")

    # === Plot Loss Curves ===
    plot_loss_curve(history, plot_path)
    print("🏁 Training complete.")

In [26]:
# Force a garbage-collection pass before the long-running training call below.
gc.collect()

0

In [None]:
# === Run training ===
# NOTE(review): resume_path names the same file as checkpoint_path, so a checkpoint left
# by an earlier run will be picked up if the training function honours resume_path —
# delete the file or pass None for a clean start.
train_lstm_v3(
    model=model,
    train_dir="sequence_chunks_v3",        # training data batches
    val_dir="val_sequences_polars",         # validation sequences
    num_epochs=5,
    total_batches=305,
    batch_size=32,
    save_best_path="best_model_lstm_v3.pth",
    checkpoint_path="model_latest.pth",
    log_path="metrics_lstm_v3.csv",
    plot_path="loss_plot_lstm_v3.png",
    resume_path="model_latest.pth"         # or None to start from scratch
)

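# Old Code (legacy training loop; running the next cell redefines `train_lstm_v3` and shadows the version above)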

In [34]:
def train_lstm_v3(
    model, 
    train_dir, 
    val_dir, 
    num_epochs=5, 
    total_batches=305, 
    batch_size=32,
    save_best_path="best_model_lstm_v3.pth",
    checkpoint_path="model_latest.pth",
    log_path="metrics_lstm_v3.csv",
    plot_path="loss_plot_lstm_v3.png",
    resume_path=None  # Optional: resume from checkpoint
):
    """
    Legacy RMSE training loop that streams chunks ``0..total_batches-1`` through ``AsyncBatchLoader``.

    This definition has the same name as the training function defined earlier in
    the notebook, so running this cell replaces that one.

    Changes relative to the previous revision of this cell:
      * the average train loss is divided by the number of optimizer steps; it was
        previously the sum of mini-batch losses divided by the number of chunk files,
        which is not on the same scale as the validation mean;
      * per-chunk cleanup no longer deletes loop variables that do not exist when a
        chunk yields no mini-batches;
      * the prefetch executor is shut down even if training raises;
      * resume, validation and bookkeeping reuse the notebook's helpers
        (``prepare_training_environment``, ``evaluate_model``, ``log_and_save``),
        which contain the same logic that was inlined here.

    Args:
        model: Network to train.
        train_dir (str): Directory of ``X_batch_<i>.npy`` / ``Y_batch_<i>.npy`` chunks.
        val_dir (str): Directory holding the validation arrays.
        num_epochs (int): Last epoch number to run (epochs are 1-based).
        total_batches (int): Number of chunks read per epoch.
        batch_size (int): Mini-batch size for training and validation.
        save_best_path (str): Where the best model ``state_dict`` is written.
        checkpoint_path (str): Where the resumable checkpoint is written every epoch.
        log_path (str): CSV file receiving one row per epoch.
        plot_path (str): Image file receiving the final loss plot.
        resume_path (str | None): Checkpoint to resume from, if it exists.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    print(f"📦 Training on device: {device}")
    model.to(device)

    criterion = RMSELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
    val_loader = get_val_loader(val_dir, batch_size=batch_size)

    history = []

    # === Resume logic ===
    model, optimizer, start_epoch, best_val_loss = prepare_training_environment(
        model, optimizer, resume_path, device
    )

    async_loader = AsyncBatchLoader(train_dir, total_batches)
    try:
        for epoch in range(start_epoch, num_epochs + 1):
            # Rewind the loader; also drop any pending prefetch left over from a failed load.
            async_loader._current_idx = 0
            async_loader._future = None

            model.train()
            epoch_train_loss = 0.0
            num_steps = 0
            print(f"\n🚀 Epoch {epoch}/{num_epochs}")

            for batch_idx in tqdm(range(total_batches), desc=f"Epoch {epoch}"):
                try:
                    X, y = async_loader.get_next_batch()
                except FileNotFoundError as e:
                    print(f"⚠️ Skipping batch: {e}")
                    continue

                train_loader = make_dataloader(X, y, batch_size)
                for X_batch, y_batch in train_loader:
                    X_batch = X_batch.to(device)
                    y_batch = y_batch.to(device)

                    optimizer.zero_grad()
                    output = model(X_batch)
                    loss = criterion(output, y_batch)
                    loss.backward()
                    optimizer.step()
                    epoch_train_loss += loss.item()
                    num_steps += 1

                # 🚮 Clear batch from memory
                # Only names that are always bound here are deleted; the inner-loop
                # variables are simply rebound on the next chunk.
                del X, y, train_loader
                gc.collect()
                if torch.cuda.is_available():
                    torch.cuda.empty_cache()

            # Mean loss per optimizer step; max() guards against every chunk being skipped.
            avg_train_loss = epoch_train_loss / max(num_steps, 1)
            print(f"📉 Avg Train Loss: {avg_train_loss:.4f}")

            # === Validation ===
            avg_val_loss = evaluate_model(model, val_loader, criterion, device)
            print(f"🧪 Validation Loss: {avg_val_loss:.4f}")

            # === Save Metrics, Plot, and Checkpoints ===
            best_val_loss = log_and_save(
                epoch, avg_train_loss, avg_val_loss, history, model, optimizer,
                checkpoint_path, save_best_path, log_path, best_val_loss
            )
    finally:
        # Always stop the background loader thread, even when training raises.
        async_loader.executor.shutdown(wait=True)

    # === Final loss plot ===
    plot_loss_curve(history, plot_path)
    print(f"📊 Loss plot saved to {plot_path}")

In [None]:
# Calls whichever train_lstm_v3 was defined most recently in the kernel; after the
# legacy cell directly above has run, that is the legacy loop. Arguments not passed
# here (batch_size and the output paths) take that function's defaults.
train_lstm_v3(
    model,
    train_dir="sequence_chunks_v3",
    val_dir="val_sequences_polars",
    num_epochs=5,
    total_batches=305,
    resume_path="model_latest.pth"
)

# Step 3 | Evaluation

In [36]:
# Rebuild the architecture with the same constructor arguments used for training so the
# saved state_dict's keys and tensor shapes match. This rebinds `model` to a new,
# untrained instance that has not yet been moved to a device.
model = LSTMForecast(
    input_size=29,  # assuming 29 original features
    hidden_size=128,
    num_layers=3,
    output_len=28,
    item_vocab_size=3049,
    dept_vocab_size=7,
    item_emb_dim=6,
    dept_emb_dim=3,
    dropout=0.4
)

In [37]:
# Load the best weights saved during training; map_location remaps the stored tensors
# to CUDA when available and to the CPU otherwise.
model.load_state_dict(torch.load("best_model_lstm_v3.pth", map_location=torch.device("cuda" if torch.cuda.is_available() else "cpu")))
# Switch to inference mode (disables dropout); as the last expression, the cell also displays the module.
model.eval()

LSTMForecast(
  (item_emb): Embedding(3049, 6)
  (dept_emb): Embedding(7, 3)
  (lstm): LSTM(36, 128, num_layers=3, batch_first=True, dropout=0.4)
  (fc1): Linear(in_features=128, out_features=128, bias=True)
  (leaky_relu): LeakyReLU(negative_slope=0.01)
  (fc2): Linear(in_features=128, out_features=28, bias=True)
)

In [38]:
# Re-derive the device and move the freshly loaded model onto it.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

LSTMForecast(
  (item_emb): Embedding(3049, 6)
  (dept_emb): Embedding(7, 3)
  (lstm): LSTM(36, 128, num_layers=3, batch_first=True, dropout=0.4)
  (fc1): Linear(in_features=128, out_features=128, bias=True)
  (leaky_relu): LeakyReLU(negative_slope=0.01)
  (fc2): Linear(in_features=128, out_features=28, bias=True)
)

In [39]:
# Load 56-day input sequences and item IDs
X_eval = np.load("eval_sequences_polars/X_eval_final.npy")  # shape: (num_items, 56, num_features)
# allow_pickle=True permits object arrays, which means unpickling — only load files you trust.
# NOTE(review): `ids` is not referenced by any later visible cell; the same file is loaded
# again as `ids_eval` further down.
ids = np.load("eval_sequences_polars/ids_eval_final.npy", allow_pickle=True)

In [41]:
# Prepare batched DataLoader
batch_size = 32  # or smaller if needed
dataset = TensorDataset(torch.tensor(X_eval, dtype=torch.float32))
# shuffle=False keeps predictions in the same row order as X_eval.
loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

# Repeats the eval-mode / device setup so this cell does not depend on the previous two.
model.eval()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Run batched inference
all_preds = []

# no_grad: gradients are not needed for inference.
with torch.no_grad():
    # The dataset wraps a single tensor, so each batch arrives as a 1-tuple.
    for (X_batch,) in loader:
        X_batch = X_batch.to(device)
        preds = model(X_batch).cpu().numpy()
        all_preds.append(preds)

# Stack the per-batch arrays back into one array, one row per input sequence.
y_pred = np.vstack(all_preds)  # shape: (num_items, 28)

In [45]:
# Pair each prediction row with its id, using columns F1..F28.
# Relies on ids_eval_final.npy being in the same row order as X_eval_final.npy —
# TODO confirm against the script that wrote both files.
ids_eval = np.load("eval_sequences_polars/ids_eval_final.npy", allow_pickle=True)
df_eval = pd.DataFrame(y_pred, columns=[f"F{i}" for i in range(1, 29)])
df_eval.insert(0, "id", ids_eval)

In [46]:
# Take the actual values for days d_1914..d_1941 from the evaluation sales file;
# these become the "_validation" rows of the submission (no model predictions involved).
sales = pd.read_csv("Kaggle Files/sales_train_evaluation.csv")
val_cols = [f"d_{d}" for d in range(1914, 1942)]  # 28 days
df_val = sales[["id"] + val_cols].copy()

# Rename to F1–F28 to match Kaggle format
df_val.columns = ["id"] + [f"F{i}" for i in range(1, 29)]
# Rewrite the id suffix so these rows are identified as the validation half.
df_val["id"] = df_val["id"].str.replace("_evaluation", "_validation")

In [48]:
# Stack the validation rows above the model's prediction rows and write the submission file.
submission = pd.concat([df_val, df_eval], axis=0)
# index=False: the concatenated frame's index repeats and is not part of the output.
submission.to_csv("submission.csv", index=False)