# Improvement

In this notebook we attempt to improve the paper.  Our first strategy is to
train the models for the full dataset rather than 200k records, let's see how
that goes. The code that follows is copied from the previous notebook with only
change being that we did not sample 200k records.

In [1]:
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import torch

relevant_columns = ["Store", "DayOfWeek", "Date", "Sales", "Promo"]
dataset = pd.read_csv("rossmann-store-sales/train.csv", usecols=relevant_columns)

dataset = dataset[dataset["Sales"] != 0]

dataset[["Year", "Month", "Day"]] = dataset["Date"].str.split("-", expand=True)
dataset.drop(columns=["Date"], inplace=True)

state_df = pd.read_csv("rossmann-store-sales/store_states.csv")
dataset = pd.merge(dataset, state_df, how="left", on="Store")
del state_df

label_encoder = LabelEncoder()
for col in dataset.columns.difference(["Sales"]):
    dataset[col] = label_encoder.fit_transform(dataset[col])

dataset = dataset[["Store", "DayOfWeek", "Day", "Month", "Year", "Promo", "State", "Sales"]]


shuffled_set = dataset.sample(frac=1)
shuffled_set = torch.tensor(shuffled_set.values, dtype=torch.float)

temporal_set = dataset.iloc[::-1].copy()
temporal_set = torch.tensor(temporal_set.values, dtype=torch.float)

del dataset

class OutputEncoder():
    def __init__(self, max_output):
        self.max_output = max_output

    def encode(self, output):
        with torch.no_grad():
            return torch.log(output) / torch.log(self.max_output)

    def decode(self, output):
        with torch.no_grad():
            return torch.exp(output * torch.log(self.max_output))

output_encoder = OutputEncoder(torch.max(temporal_set[:, -1]))

temporal_set[:, -1] = output_encoder.encode(temporal_set[:, -1])
shuffled_set[:, -1] = output_encoder.encode(shuffled_set[:, -1])

def test_train_split(dataset):
    split_threshold = int(0.9 * dataset.size(0))

    X_train = dataset[:split_threshold, :-1].long()
    X_test = dataset[split_threshold:, :-1].long()

    y_train = dataset[:split_threshold, -1]
    y_test = dataset[split_threshold:, -1]

    # NOTE: This is changed from previous notebook
    return X_train, y_train, X_test, y_test


parameters = {
    "store": (1115, 10),
    "day_of_week": (7, 6),
    "day": (31, 10),
    "month": (12, 6),
    "year": (3, 2),
    "promotion": (2, 1),
    "state": (12, 6)
}
    
class EmbeddingNN(torch.nn.Module):
    def __init__(self):
        super(EmbeddingNN, self).__init__()

        emb_list = [torch.nn.Embedding(n, d) for n, d in parameters.values()]
        self.emb_layers = torch.nn.ModuleList(emb_list)

        input_size = sum([tuple[1] for tuple in parameters.values()])

        self.feed_forward = torch.nn.Sequential(
            torch.nn.Linear(input_size, 1000),
            torch.nn.ReLU(),
            torch.nn.Linear(1000, 500),
            torch.nn.ReLU(),
            torch.nn.Linear(500, 1),
            torch.nn.Sigmoid()
        )

    def forward(self, X):
        embeddings = torch.cat([emb(X[:, i]) for i, emb in enumerate(self.emb_layers)], dim=1)
        
        return self.feed_forward(embeddings)

class OneHotNN(torch.nn.Module):
    def __init__(self):
        super(OneHotNN, self).__init__()
        input_size = sum([tuple[0] for tuple in parameters.values()])

        self.feed_forward = torch.nn.Sequential(
            torch.nn.Linear(input_size, 1000),
            torch.nn.ReLU(),
            torch.nn.Linear(1000, 500),
            torch.nn.ReLU(),
            torch.nn.Linear(500, 1),
            torch.nn.Sigmoid()
        )

    def forward(self, X):
        one_hot = torch.cat([torch.nn.functional.one_hot(X[:, i], num_emb).float()
                             for i, (num_emb, _) in enumerate(parameters.values())], dim=1)

        return self.feed_forward(one_hot)

def train_model(model, X, y):
    loss_fn = torch.nn.L1Loss()
    optim = torch.optim.Adam(model.parameters(), lr=0.001)

    epochs = 10
    batch_size = 128
    total_samples = len(X)

    model.train()
    for _ in range(epochs):
        for i in range(0, total_samples, batch_size):
            inputs = X[i:i+batch_size]
            targets = y[i:i+batch_size]
            
            optim.zero_grad()
            outputs = model(inputs).squeeze()
            loss = loss_fn(outputs, targets)
            loss.backward()
            optim.step()

def MAPE(y_pred, y_true):
    return torch.mean(torch.abs((y_true - y_pred) / y_true))

def evaluate(cls, dataset):
    X_train, y_train, X_test, y_test = test_train_split(dataset)

    models = [cls() for _ in range(5)]
    for model in models:
        train_model(model, X_train, y_train)

    y_preds = []
    for model in models:
        model.eval()
        y_pred = model(X_test).squeeze()
        y_pred = output_encoder.decode(y_pred)
        y_preds.append(y_pred)

    stacked_preds = torch.stack(y_preds)
    y_pred = torch.mean(stacked_preds, dim=0)

    y_true = output_encoder.decode(y_test)
    return MAPE(y_pred, y_true)

print(f"Shuffled OneHotNN: {evaluate(OneHotNN, shuffled_set):.3f}")
print(f"Shuffled EmbeddingNN: {evaluate(EmbeddingNN, shuffled_set):.3f}")
print(f"Temporal OneHotNN: {evaluate(OneHotNN, temporal_set):.3f}")
print(f"Temporal EmbeddingNN: {evaluate(EmbeddingNN, temporal_set):.3f}")

Shuffled OneHotNN: 0.065
Shuffled EmbeddingNN: 0.072
Temporal OneHotNN: 0.094
Temporal EmbeddingNN: 0.106


We see improvements to all the results except temporal with embeddings. The
reason can be that with temporal data, the test and train test are fundamentally
different and adding more training data makes the model overfit and reduces
generalizability.

Let's try to add more columns. Looking at all options, `Customers` seems like
the best choice, since intuitively it makes sense that it would be a strong
predictor of `Sales`. We also do feature scaling on this column. Because its
range was very large.

In [2]:
from sklearn.preprocessing import StandardScaler

# Add Customers column
relevant_columns = ["Store", "DayOfWeek", "Date", "Sales", "Promo", "Customers"]
dataset = pd.read_csv("rossmann-store-sales/train.csv", usecols=relevant_columns)

# Scale Customers by z-score
scaler = StandardScaler()
dataset["Customers"] = scaler.fit_transform(dataset[["Customers"]])

dataset = dataset[dataset["Sales"] != 0]

dataset[["Year", "Month", "Day"]] = dataset["Date"].str.split("-", expand=True)
dataset.drop(columns=["Date"], inplace=True)

state_df = pd.read_csv("rossmann-store-sales/store_states.csv")
dataset = pd.merge(dataset, state_df, how="left", on="Store")
del state_df

label_encoder = LabelEncoder()
# Customers column is not categorical
for col in dataset.columns.difference(["Sales", "Customers"]):
    dataset[col] = label_encoder.fit_transform(dataset[col])

dataset = dataset[["Store", "DayOfWeek", "Day", "Month", "Year", "Promo", "State", "Customers", "Sales"]]


shuffled_set = dataset.sample(frac=1)
shuffled_set = torch.tensor(shuffled_set.values, dtype=torch.float)

temporal_set = dataset.iloc[::-1].copy()
temporal_set = torch.tensor(temporal_set.values, dtype=torch.float)

del dataset

output_encoder = OutputEncoder(torch.max(temporal_set[:, -1]))

temporal_set[:, -1] = output_encoder.encode(temporal_set[:, -1])
shuffled_set[:, -1] = output_encoder.encode(shuffled_set[:, -1])

class EmbeddingNN(torch.nn.Module):
    def __init__(self):
        super(EmbeddingNN, self).__init__()

        emb_list = [torch.nn.Embedding(n, d) for n, d in parameters.values()]
        self.emb_layers = torch.nn.ModuleList(emb_list)

        # Increase input size to make space for Customers feature
        input_size = sum([tuple[1] for tuple in parameters.values()]) + 1

        self.feed_forward = torch.nn.Sequential(
            torch.nn.Linear(input_size, 1000),
            torch.nn.ReLU(),
            torch.nn.Linear(1000, 500),
            torch.nn.ReLU(),
            torch.nn.Linear(500, 1),
            torch.nn.Sigmoid()
        )

    def forward(self, X):
        embeddings = torch.cat([emb(X[:, i]) for i, emb in enumerate(self.emb_layers)], dim=1)

        # Append Customers column
        embeddings = torch.cat((embeddings, X[:, -1].unsqueeze(1)), dim=1)
        
        return self.feed_forward(embeddings)

class OneHotNN(torch.nn.Module):
    def __init__(self):
        super(OneHotNN, self).__init__()
        # Increase input size to make space for Customers feature
        input_size = sum([tuple[0] for tuple in parameters.values()]) + 1

        self.feed_forward = torch.nn.Sequential(
            torch.nn.Linear(input_size, 1000),
            torch.nn.ReLU(),
            torch.nn.Linear(1000, 500),
            torch.nn.ReLU(),
            torch.nn.Linear(500, 1),
            torch.nn.Sigmoid()
        )
    def forward(self, X):
        one_hot = torch.cat([torch.nn.functional.one_hot(X[:, i], num_emb).float()
                             for i, (num_emb, _) in enumerate(parameters.values())], dim=1)
        # Append Customers column
        one_hot = torch.cat((one_hot, X[:, -1].unsqueeze(1)), dim=1)

        return self.feed_forward(one_hot)

print(f"Shuffled OneHotNN: {evaluate(OneHotNN, shuffled_set):.3f}")
print(f"Shuffled EmbeddingNN: {evaluate(EmbeddingNN, shuffled_set):.3f}")
print(f"Temporal OneHotNN: {evaluate(OneHotNN, temporal_set):.3f}")
print(f"Temporal EmbeddingNN: {evaluate(EmbeddingNN, temporal_set):.3f}")

Shuffled OneHotNN: 0.062
Shuffled EmbeddingNN: 0.067
Temporal OneHotNN: 0.088
Temporal EmbeddingNN: 0.104


We see improvements over the previous result, although for some categories it
still does not beat their score.