# Improvement

In this notebook we attempt to improve the results obtained in the paper. Our first strategy is to
train the models on the full dataset rather than on 200k records; let's see how
that goes. The code that follows is copied from the previous notebook, the only
change being that we do not sample 200k records. This is why we have commented it less: all the details are already in Notebook 1.

In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import torch

# Read relevant columns from the Rossmann store sales dataset
relevant_columns = ["Store", "DayOfWeek", "Date", "Sales", "Promo"]
dataset = pd.read_csv("rossmann-store-sales/train.csv",
                      usecols=relevant_columns)

# Filter out records with zero sales. The explicit copy makes `dataset` an
# independent frame, so the column assignments below do not raise pandas'
# SettingWithCopyWarning.
dataset = dataset[dataset["Sales"] != 0].copy()

# Extract year, month, day from the "Date" column (split on "-", in that order)
dataset[["Year", "Month", "Day"]] = dataset["Date"].str.split("-", expand=True)
dataset = dataset.drop(columns=["Date"])

# Now merge dataset with store states information. A left join keeps every
# sales record and preserves their original order.
state_df = pd.read_csv("rossmann-store-sales/store_states.csv")
dataset = pd.merge(dataset, state_df, how="left", on="Store")
del state_df

# Encode categorical variables using LabelEncoder. This just assigns a number
# to each category. The same encoder object is re-fitted for every column.
label_encoder = LabelEncoder()
for col in dataset.columns.difference(["Sales"]):  # The target variable is not encoded
    dataset[col] = label_encoder.fit_transform(dataset[col])

# Select the relevant columns in the order the models expect: features first,
# target ("Sales") last
dataset = dataset[["Store", "DayOfWeek", "Day",
                   "Month", "Year", "Promo", "State", "Sales"]]

# Randomly permuted copy of all rows, converted to a float tensor for PyTorch
shuffled_set = dataset.sample(frac=1)
shuffled_set = torch.tensor(shuffled_set.values, dtype=torch.float)

# Create a temporal set for evaluation purposes. This is the same as the
# dataset but in reverse order, as the dataset is in reverse chronological order
temporal_set = dataset.iloc[::-1].copy()
temporal_set = torch.tensor(temporal_set.values, dtype=torch.float)

del dataset  # We don't need the dataset anymore

# An output encoder for normalization of the target variable. 

class OutputEncoder():
    """Log-scale normaliser for a strictly positive target.

    ``encode`` maps a value ``y`` to ``log(y) / log(max_output)`` and
    ``decode`` applies the inverse, ``exp(z * log(max_output))``. Both run
    under ``torch.no_grad()``, so their results never carry gradients.

    Args:
        max_output: Largest target value, used as the base of the scaling.
            May be a tensor or a plain Python number.
    """

    def __init__(self, max_output):
        # torch.log only accepts tensors, so wrap plain numbers; tensors are
        # stored untouched to keep their dtype and device.
        if not torch.is_tensor(max_output):
            max_output = torch.tensor(float(max_output))
        self.max_output = max_output

    def encode(self, output):
        """Return ``log(output) / log(max_output)`` for a tensor of raw targets."""
        with torch.no_grad():
            return torch.log(output) / torch.log(self.max_output)

    def decode(self, output):
        """Invert ``encode``: map normalised values back to the raw scale."""
        with torch.no_grad():
            return torch.exp(output * torch.log(self.max_output))


# Build the output encoder from the largest target value, then rescale the
# target (last) column of both tensors in place
output_encoder = OutputEncoder(temporal_set[:, -1].max())
shuffled_set[:, -1] = output_encoder.encode(shuffled_set[:, -1])
temporal_set[:, -1] = output_encoder.encode(temporal_set[:, -1])

# Function for train-test split

def test_train_split(dataset):
    split_threshold = int(0.9 * dataset.size(0)) # 90% of the dataset is used for training

    X_train = dataset[:split_threshold, :-1].long()  # We don't need the target variable for the input. We also convert to long dtype because in PyTorch, embedding layers typically expect input of type torch.long (or torch.int64). This is because embedding layers are designed to work with discrete indices, such as those used to represent categories or words.
    X_test = dataset[split_threshold:, :-1].long()

    y_train = dataset[:split_threshold, -1] # We only need the target variable for the output
    y_test = dataset[split_threshold:, -1]

    # NOTE: This is changed from previous notebook. Now we are no longer selecting only 200k rows
    return X_train, y_train, X_test, y_test


# Parameters for embedding layers: feature name -> (number of categories,
# embedding dimension). The first number is used as the size of the embedding
# table and as the one-hot width; the second is the embedding vector length.
# The models pair the i-th entry with column i of the input tensor, so the
# order of the entries here must follow the order of the feature columns.
parameters = {
    "store": (1115, 10),
    "day_of_week": (7, 6),
    "day": (31, 10),
    "month": (12, 6),
    "year": (3, 2),
    "promotion": (2, 1),
    "state": (12, 6)
}

# Neural network with embedding layers


class EmbeddingNN(torch.nn.Module):
    """Feed-forward regressor that feeds each categorical column through its
    own learned embedding table before the dense layers."""

    def __init__(self):
        super().__init__()
        # One embedding table per entry of `parameters`, in dictionary order
        self.emb_layers = torch.nn.ModuleList([
            torch.nn.Embedding(num_categories, emb_dim)
            for num_categories, emb_dim in parameters.values()
        ])
        # The dense part receives all embedding vectors side by side
        input_size = sum(emb_dim for _, emb_dim in parameters.values())
        self.feed_forward = torch.nn.Sequential(
            # Hidden layer 1: concatenated embeddings -> 1000 units
            torch.nn.Linear(input_size, 1000),
            torch.nn.ReLU(),
            # Hidden layer 2: 1000 -> 500 units
            torch.nn.Linear(1000, 500),
            torch.nn.ReLU(),
            # Output layer: 500 -> 1 unit
            torch.nn.Linear(500, 1),
            # Sigmoid bounds the prediction to (0, 1). This is a regression
            # output on the scale of the log-normalised target, not a class
            # probability.
            torch.nn.Sigmoid()
        )

    def forward(self, X):
        """Embed column i of ``X`` with the i-th table and return the
        network output for the concatenated embeddings."""
        looked_up = []
        for column, emb in enumerate(self.emb_layers):
            looked_up.append(emb(X[:, column]))
        # Join the per-feature vectors along the feature axis
        return self.feed_forward(torch.cat(looked_up, dim=1))

# Neural network with one-hot encoding


class OneHotNN(torch.nn.Module):
    """Feed-forward regressor that one-hot encodes every categorical column
    before the dense layers."""

    def __init__(self):
        super().__init__()

        # The dense part receives one indicator per category of every feature
        input_size = sum(num_categories
                         for num_categories, _ in parameters.values())

        self.feed_forward = torch.nn.Sequential(
            # Hidden layer 1: one-hot vector -> 1000 units
            torch.nn.Linear(input_size, 1000),
            torch.nn.ReLU(),
            # Hidden layer 2: 1000 -> 500 units
            torch.nn.Linear(1000, 500),
            torch.nn.ReLU(),
            # Output layer: 500 -> 1 unit
            torch.nn.Linear(500, 1),
            # Sigmoid bounds the prediction to (0, 1). This is a regression
            # output on the scale of the log-normalised target, not a class
            # probability.
            torch.nn.Sigmoid()
        )

    def forward(self, X):
        """One-hot encode column i of ``X`` with the i-th category count and
        return the network output for the concatenated encoding."""
        encoded_columns = []
        for column, (num_categories, _) in enumerate(parameters.values()):
            indicator = torch.nn.functional.one_hot(X[:, column], num_categories)
            encoded_columns.append(indicator.float())
        # Join the per-feature indicators along the feature axis
        return self.feed_forward(torch.cat(encoded_columns, dim=1))

# Model training function


def train_model(model, X, y, epochs=10, batch_size=128, lr=0.001):
    """Train ``model`` in place with Adam on the mean absolute error.

    Mini-batches are consecutive slices of ``X`` / ``y`` taken in the order
    given; the data is not reshuffled between epochs.

    Args:
        model: ``torch.nn.Module`` producing one prediction per input row.
        X: Input tensor, first dimension indexes samples.
        y: Target tensor aligned with ``X``.
        epochs: Number of full passes over the data.
        batch_size: Number of samples per optimisation step.
        lr: Adam learning rate.

    Returns:
        None. The model is left in training mode.
    """
    # Loss function: Mean Absolute Error, optimizer: Adam
    loss_fn = torch.nn.L1Loss()
    optim = torch.optim.Adam(model.parameters(), lr=lr)

    total_samples = len(X)

    model.train()
    for _ in range(epochs):
        # Iterating over the dataset in mini-batches
        for start in range(0, total_samples, batch_size):
            inputs = X[start:start + batch_size]
            targets = y[start:start + batch_size]
            # Clear the gradients accumulated by the previous step
            optim.zero_grad()
            # Forward pass. The output is reshaped to the target's shape
            # instead of using a bare squeeze(): squeeze() would also drop the
            # batch axis of a one-sample batch, and L1Loss would then broadcast
            # mismatched shapes.
            outputs = model(inputs).reshape(targets.shape)
            loss = loss_fn(outputs, targets)
            # Backward pass: gradient of the loss w.r.t. the model parameters
            loss.backward()
            # Update model parameters using the optimizer
            optim.step()

# Mean Absolute Percentage Error (MAPE)


def MAPE(y_pred, y_true):
    """Return the mean absolute percentage error of ``y_pred`` against
    ``y_true``, as a fraction (0.1 means 10%)."""
    relative_error = (y_true - y_pred) / y_true
    return relative_error.abs().mean()

# Model evaluation

def evaluate(cls, dataset, n_models=5):
    """Train an ensemble of ``cls`` models and score it on the held-out rows.

    The data is split with ``test_train_split``, ``n_models`` independent
    models are trained with ``train_model``, their decoded predictions are
    averaged, and the average is compared with the decoded true targets.

    Args:
        cls: Zero-argument callable returning a fresh model (e.g. a
            ``torch.nn.Module`` subclass).
        dataset: 2-D tensor with features first and the encoded target in the
            last column.
        n_models: Size of the ensemble.

    Returns:
        The MAPE of the ensemble prediction, as a 0-d tensor.
    """
    X_train, y_train, X_test, y_test = test_train_split(dataset)

    # Create and train multiple models for ensemble learning
    models = [cls() for _ in range(n_models)]
    for model in models:
        train_model(model, X_train, y_train)

    # Evaluate each model on the test set and store predictions. This is pure
    # inference, so autograd is switched off to avoid building a graph over
    # the whole test set.
    y_preds = []
    with torch.no_grad():
        for model in models:
            model.eval()
            # Reshape to the target's shape rather than squeeze(), which would
            # also collapse the batch axis of a single-row test set
            y_pred = model(X_test).reshape(y_test.shape)
            # Map the prediction back to the original target scale
            y_preds.append(output_encoder.decode(y_pred))

    # Average the decoded predictions of all ensemble members
    y_pred = torch.mean(torch.stack(y_preds), dim=0)
    # Decode true values using the output encoder
    y_true = output_encoder.decode(y_test)
    # Return the MAPE score for the ensemble predictions
    return MAPE(y_pred, y_true)


# Train and score each model type on the shuffled split first, then on the
# temporal split; every call trains a fresh ensemble.
score = evaluate(OneHotNN, shuffled_set)
print(f"Shuffled OneHotNN: {score:.3f}")
score = evaluate(EmbeddingNN, shuffled_set)
print(f"Shuffled EmbeddingNN: {score:.3f}")
score = evaluate(OneHotNN, temporal_set)
print(f"Temporal OneHotNN: {score:.3f}")
score = evaluate(EmbeddingNN, temporal_set)
print(f"Temporal EmbeddingNN: {score:.3f}")

Shuffled OneHotNN: 0.065
Shuffled EmbeddingNN: 0.072
Temporal OneHotNN: 0.094
Temporal EmbeddingNN: 0.106


We obtained the following results, with the numbers representing the `MAPE` scores.

|  | OneHotNN | EmbeddingNN |
| --- | --- | --- |
| Shuffled Data | 0.065 | 0.072 |
| Temporal Data | 0.094 | 0.106 |

We see improvements to all the results except temporal with embeddings. The
reason may be that with temporal data, the train and test sets are fundamentally
different, so adding more training data makes the model overfit and reduces
generalizability.

Let's try to add more columns. Looking at all options, `Customers` seems like
the best choice, since intuitively it makes sense that it would be a strong
predictor of `Sales`. We also apply feature scaling to this column because its
range is very large.

In [2]:
from sklearn.preprocessing import StandardScaler

# Add Customers column
relevant_columns = ["Store", "DayOfWeek",
                    "Date", "Sales", "Promo", "Customers"]
dataset = pd.read_csv("rossmann-store-sales/train.csv",
                      usecols=relevant_columns)

# Scale Customers by z-score
# NOTE(review): the scaler is fitted before the zero-sales rows are removed
# below, so its mean/variance also reflect rows that are never used.
scaler = StandardScaler()
dataset["Customers"] = scaler.fit_transform(dataset[["Customers"]])

# Filter out records with zero sales. The explicit copy makes `dataset` an
# independent frame, so the column assignments below do not raise pandas'
# SettingWithCopyWarning.
dataset = dataset[dataset["Sales"] != 0].copy()

# Extract year, month, day from the "Date" column (split on "-", in that order)
dataset[["Year", "Month", "Day"]] = dataset["Date"].str.split("-", expand=True)
dataset = dataset.drop(columns=["Date"])

# Now merge dataset with store states information
state_df = pd.read_csv("rossmann-store-sales/store_states.csv")
dataset = pd.merge(dataset, state_df, how="left", on="Store")
del state_df

# Encode categorical variables using LabelEncoder
label_encoder = LabelEncoder()
# Customers column is not categorical, so it is left as a scaled number
for col in dataset.columns.difference(["Sales", "Customers"]):
    dataset[col] = label_encoder.fit_transform(dataset[col])

# Select the relevant columns: categorical features first, then Customers,
# with the target ("Sales") last
# NOTE(review): test_train_split casts every feature column with `.long()`,
# which truncates the z-scored Customers values to integers before they reach
# the models - confirm whether that is intended.
dataset = dataset[["Store", "DayOfWeek", "Day", "Month",
                   "Year", "Promo", "State", "Customers", "Sales"]]


# Randomly permuted copy of all rows, converted to a float tensor
shuffled_set = dataset.sample(frac=1)
shuffled_set = torch.tensor(shuffled_set.values, dtype=torch.float)

# Create a temporal set for evaluation purposes (rows in reversed order)
temporal_set = dataset.iloc[::-1].copy()
temporal_set = torch.tensor(temporal_set.values, dtype=torch.float)
# Memory clean up
del dataset

# Create an output encoder for normalization of the target variable
output_encoder = OutputEncoder(torch.max(temporal_set[:, -1]))

# Normalize the target variable in both sets
temporal_set[:, -1] = output_encoder.encode(temporal_set[:, -1])
shuffled_set[:, -1] = output_encoder.encode(shuffled_set[:, -1])

# Neural network with embedding layers

class EmbeddingNN(torch.nn.Module):
    """Embedding-based regressor with one extra numeric input.

    Columns ``0 .. len(parameters) - 1`` of the input are categorical indices
    that go through their own embedding table; the last column (Customers) is
    appended to the embeddings as a plain number.
    """

    def __init__(self):
        # Zero-argument super() keeps working even after this class name is
        # rebound by re-running the notebook cell
        super().__init__()
        # One embedding table per entry of `parameters`, in dictionary order
        emb_list = [torch.nn.Embedding(n, d) for n, d in parameters.values()]
        self.emb_layers = torch.nn.ModuleList(emb_list)

        # Total embedding width plus one slot for the Customers feature.
        # Importantly, we are not using an embedding layer for Customers.
        input_size = sum(emb_dim for _, emb_dim in parameters.values()) + 1

        self.feed_forward = torch.nn.Sequential(
            torch.nn.Linear(input_size, 1000),
            torch.nn.ReLU(),
            torch.nn.Linear(1000, 500),
            torch.nn.ReLU(),
            torch.nn.Linear(500, 1),
            # Sigmoid bounds the regression output to (0, 1), the scale of the
            # log-normalised target
            torch.nn.Sigmoid()
        )

    def forward(self, X):
        """Return the prediction for ``X``, which may be an integer or a
        floating-point tensor (see the dtype notes below)."""
        # Embed each categorical column with its own table. The explicit
        # `.long()` is a no-op for integer input and lets float input work too.
        embeddings = torch.cat([emb(X[:, i].long())
                                for i, emb in enumerate(self.emb_layers)], dim=1)

        # Append the Customers column, converted explicitly to the embedding
        # dtype instead of relying on implicit type promotion inside torch.cat.
        # NOTE(review): test_train_split casts the features to long, so the
        # z-scored Customers values reach this point already truncated.
        customers = X[:, -1].unsqueeze(1).to(embeddings.dtype)
        embeddings = torch.cat((embeddings, customers), dim=1)

        return self.feed_forward(embeddings)


class OneHotNN(torch.nn.Module):
    """One-hot based regressor with one extra numeric input.

    Columns ``0 .. len(parameters) - 1`` of the input are categorical indices
    that are one-hot encoded; the last column (Customers) is appended to the
    encoding as a plain number.
    """

    def __init__(self):
        # Zero-argument super() keeps working even after this class name is
        # rebound by re-running the notebook cell
        super().__init__()
        # One indicator per category of every feature, plus one slot for the
        # Customers feature
        input_size = sum(num_categories
                         for num_categories, _ in parameters.values()) + 1
        # Feed-forward layers as a sequential model
        self.feed_forward = torch.nn.Sequential(
            torch.nn.Linear(input_size, 1000),
            torch.nn.ReLU(),
            torch.nn.Linear(1000, 500),
            torch.nn.ReLU(),
            torch.nn.Linear(500, 1),
            # Sigmoid bounds the regression output to (0, 1), the scale of the
            # log-normalised target
            torch.nn.Sigmoid()
        )

    def forward(self, X):
        """Return the prediction for ``X``, which may be an integer or a
        floating-point tensor (see the dtype notes below)."""
        # Convert categorical features into one-hot encoding. The explicit
        # `.long()` is a no-op for integer input and lets float input work too.
        one_hot = torch.cat([torch.nn.functional.one_hot(X[:, i].long(), num_emb).float()
                             for i, (num_emb, _) in enumerate(parameters.values())], dim=1)
        # Append the Customers column, converted explicitly to float instead of
        # relying on implicit type promotion inside torch.cat.
        # NOTE(review): test_train_split casts the features to long, so the
        # z-scored Customers values reach this point already truncated.
        customers = X[:, -1].unsqueeze(1).float()
        one_hot = torch.cat((one_hot, customers), dim=1)

        return self.feed_forward(one_hot)


# Train and score the Customers-aware models on the shuffled split first, then
# on the temporal split; every call trains a fresh ensemble.
score = evaluate(OneHotNN, shuffled_set)
print(f"Shuffled OneHotNN: {score:.3f}")
score = evaluate(EmbeddingNN, shuffled_set)
print(f"Shuffled EmbeddingNN: {score:.3f}")
score = evaluate(OneHotNN, temporal_set)
print(f"Temporal OneHotNN: {score:.3f}")
score = evaluate(EmbeddingNN, temporal_set)
print(f"Temporal EmbeddingNN: {score:.3f}")

Shuffled OneHotNN: 0.062
Shuffled EmbeddingNN: 0.067
Temporal OneHotNN: 0.088
Temporal EmbeddingNN: 0.104


We obtained the following results, with the numbers representing the `MAPE` scores.

|  | OneHotNN | EmbeddingNN |
| --- | --- | --- |
| Shuffled Data | 0.062 | 0.067 |
| Temporal Data | 0.088 | 0.104 |

### Comparison with Paper's Results

|  | OneHotNN | EmbeddingNN |
| --- | --- | --- |
| Shuffled Data | 0.070 | 0.070 |
| Temporal Data | 0.101 | 0.093 |


We see improvements over the previous result, although for some categories it
still does not beat the paper's score. The breakdown of all this is explained in our Methodology PDF.