## Imports

In [333]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn import preprocessing
import random
# torch.set_num_threads(1)

## Data Preprocessing

In [334]:

file_path = ['rossmann-store-sales/train.csv',
             'rossmann-store-sales/test.csv', 'rossmann-store-sales/store.csv', 'rossmann-store-sales/store_states.csv']
store_df = pd.read_csv(file_path[2], low_memory=False, dtype=str)
store_states_df = pd.read_csv(file_path[3], low_memory=False,  dtype=str)

print(f"Size of the store df: {store_df.shape}")

train_df = pd.read_csv(file_path[0], low_memory=False, dtype=str) # split this into test train for validation ++ we may also need to reverse the order of the data
test_df = pd.read_csv(file_path[1], low_memory=False, dtype=str)
train_df = pd.merge(train_df, store_df, how="inner", on="Store") #  one thing to note is that when they pre process data, they keep the store and test data separate, 
test_df = pd.merge(test_df, store_df, how="inner", on="Store")
train_df['Date'] = pd.to_datetime(train_df['Date'])
train_df['Year'], train_df['Month'], train_df['Day'] = train_df['Date'].dt.year.values, train_df['Date'].dt.month.values, train_df['Date'].dt.day.values
train_df.drop(['Date'], axis=1, inplace=True)
train_df = pd.merge(train_df, store_states_df, how="inner", on="Store")
test_df = pd.merge(test_df, store_states_df, how="inner", on="Store")

print(f"The column names before dropping are : {train_df.columns.tolist()}")
print(f"Size of the train dataset after merging is : {train_df.shape}")
# print(f"Size of the test dataset after merging is : {test_df.shape}")
print(train_df['Store'].dtype)

# helper function to replace nan values if any
train_df.fillna('0', inplace=True)

train_df = train_df[train_df["Sales"]!="0"]
train_df = train_df[train_df["Open"]!=""]
cols_to_drop = ['StateHoliday', 'SchoolHoliday', 'CompetitionOpenSinceMonth', 'StoreType', 'Assortment', 'PromoInterval', 'Promo2SinceWeek', 'Promo2SinceYear', 'Promo2', 'CompetitionOpenSinceYear', 'CompetitionDistance', 'Customers']

train_df.drop(cols_to_drop, axis=1, inplace=True)

train_data_y = pd.DataFrame(train_df['Sales'])
train_data_x = train_df.drop(['Sales'], axis=1)

for feature in train_data_x.columns: # this is to convert the categorical data into numerical data
    label_encoder = preprocessing.LabelEncoder()
    train_data_x.loc[:,feature] = label_encoder.fit_transform(train_data_x[feature].astype(str).fillna("0").values)
train_data_x = train_data_x.reindex(['Open','Store', 'DayOfWeek',  'Promo', 'Year', 'Month', 'Day' ,'State'], axis=1)
# train_data_x = train_data_x.astype(int)
# train_data_y = train_data_y.astype(int)
print(f"Size of the train dataset after merging and preprocessing is : {train_df.shape}")
##############################################################################################################


Size of the store df: (1115, 10)
The column names before dropping are : ['Store', 'DayOfWeek', 'Sales', 'Customers', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'Year', 'Month', 'Day', 'State']
Size of the train dataset after merging is : (1017209, 21)
object
Size of the train dataset after merging and preprocessing is : (844338, 9)


In [335]:
train_data_x.head(10) # shows random 10 instead of first ten like head
# print(train_data_x.shape)

# they kept 8 columns as shown below

Unnamed: 0,Open,Store,DayOfWeek,Promo,Year,Month,Day,State
0,0,0,4,1,2,9,24,4
1,0,0,3,1,2,9,23,4
2,0,0,2,1,2,9,21,4
3,0,0,1,1,2,9,20,4
4,0,0,0,1,2,9,19,4
6,0,0,5,0,2,9,17,4
7,0,0,4,0,2,9,16,4
8,0,0,3,0,2,9,15,4
9,0,0,2,0,2,9,14,4
10,0,0,1,0,2,9,13,4


In [336]:
print(train_data_x.shape) # matches
print(train_data_y.shape)

(844338, 8)
(844338, 1)


In [337]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import pickle

train_ratio = 0.9
shuffle_data = False
one_hot_as_input = False
embeddings_as_input = False
save_embeddings = True
saved_embeddings_fname = "embeddings.pickle"  # set save_embeddings to True to create this file

train_data_x = train_data_x.astype(np.int64)
train_data_y = train_data_y.astype(np.int64)

def split_features(X): # this function takes a numpy array and splits it into a list of np arrays. It returns a list of all the features/columns in the dataset
 # Extract the column with index 0 (store) and append to the list
    X_list = []
    store_index = X[:, 0].reshape(-1, 1)
    X_list.append(np.array(store_index, dtype=np.int64))

    # Extract the column with index 1 (day_of_week) and append to the list
    day_of_week = X[:, 1].reshape(-1, 1)
    X_list.append(np.array(day_of_week, dtype=np.int64))

    # Extract the column with index 2 (promo) and append to the list
    promo = X[:, 2].reshape(-1, 1)
    X_list.append(np.array(promo, dtype=np.int64))

    # Extract the column with index 3 (year) and append to the list
    year = X[:, 3].reshape(-1, 1)
    X_list.append(np.array(year, dtype=np.int64))

    # Extract the column with index 4 (month) and append to the list
    month = X[:, 4].reshape(-1, 1)
    X_list.append(np.array(month, dtype=np.int64))

    # Extract the column with index 5 (day) and append to the list
    day = X[:, 5].reshape(-1, 1)
    X_list.append(np.array(day, dtype=np.int64))
    # Extract the column with index 6 (state) and append to the list
    state = X[:, 6].reshape(-1, 1)
    X_list.append(np.array(state, dtype=np.int64))

    return X_list
 
split_features(train_data_x.values)


# okay they ar enot really suing this/ they only use this to feed embeddings leanred duing NN to other models as input
def embed_features(X, saved_embeddings_fname): # this function creates and saves the embeddings for the categorical data
   f_embeddings = open(saved_embeddings_fname, "rb")
   embeddings = pickle.load(f_embeddings)

   index_embedding_mapping = {1: 0, 2: 1, 4: 2, 5: 3, 6: 4, 7: 5} # this is the mapping of the columns index to the embeddings index. They skipped columns 1 and 3. Opena and Promo because they are 0, 1? I guess binary categories are useless to embed. Embedding these binary features may not provide significant benefits, as their information is already somewhat captured by the binary nature.
   X_embedded = []
   print(X.shape)

   (num_records, num_features) = X.shape
   for record in X: # a record is a row in the dataset
      # print(record)
      embedded_features = []
      for i, feat in enumerate(record): # this is to embed the features
         # print(i, feat)
         feat = int(feat)
         if i not in index_embedding_mapping.keys():
               embedded_features += [feat]
         else:
               embedding_index = index_embedding_mapping[i]
               embedded_features += embeddings[embedding_index][feat].tolist()

      X_embedded.append(embedded_features)

   return pd.DataFrame(X_embedded) # returns the embedded features
   # return np.array(X_embedded) # returns the embedded features

# j = embed_features((np.array(train_data_x)), saved_embeddings_fname)

# print(j)

# print(j.shape) # okay shape matches theirs as well


  

In [338]:

# train_data_x = j
# train_data_y = np.array(train_data_y)

X_train, X_val, y_train, y_val = train_test_split(train_data_x, train_data_y, test_size=0.1, random_state=42) # this is to split the data into train and test sets
print(y_train[:5])
for data in [X_train, X_val]:
    data.drop(['Open'], axis=1, inplace=True)

        Sales
934085   6760
937882   6833
977052   8578
165948   8467
658089   5214


So basically there is an entity embedding layer for each categoircal feature (Store, DOW, etc). OPen and Promo are not inlcuded since they are binary. 
These are the embedding dimesions for different categories used in the paper            
cat_emb_dim={
                'Store': 10,
                'DayOfWeek': 6,
                'Promo': 1,
                'Year': 2,
                'Month': 6,
                'Day': 10,
                'State': 6},}


In [339]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

class PyTorchModel(nn.Module):
    def __init__(self):
        super(PyTorchModel, self).__init__()
        
        # Define Embeddings
        self.store_embedding = nn.Embedding(1115, 10)
        self.dow_embedding = nn.Embedding(7, 6)
        self.year_embedding = nn.Embedding(3, 2)
        self.month_embedding = nn.Embedding(12, 6)
        self.day_embedding = nn.Embedding(31, 10)
        self.state_embedding = nn.Embedding(12, 6)
        
        # Define other layers
        self.promo_layer = nn.Linear(1, 1)
        self.dense1 = nn.Linear(41, 1000)
        self.dense2 = nn.Linear(1000, 500)
        self.dense3 = nn.Linear(500, 1)
        
    def forward(self, input_store, input_dow, input_promo, input_year, input_month, input_day, input_state):
        # Apply embeddings
        input_promo = input_promo.unsqueeze(1)
        output_store = self.store_embedding(input_store.view(-1, 1))
        output_dow = self.dow_embedding(input_dow.view(-1, 1))
        output_year = self.year_embedding(input_year.view(-1, 1))
        output_month = self.month_embedding(input_month.view(-1, 1))
        output_day = self.day_embedding(input_day.view(-1, 1))
        output_state = self.state_embedding(input_state.view(-1, 1))
        
        # Reshape embeddings
        output_store = output_store.view(-1, 10)
        output_dow = output_dow.view(-1, 6)
        output_year = output_year.view(-1, 2)
        output_month = output_month.view(-1, 6)
        output_day = output_day.view(-1, 10)
        output_state = output_state.view(-1, 6)
        
        # print(output_store.shape)
        # print(output_dow.shape)
        # print(input_promo.shape)
                
        # Concatenate embeddings
        concatenated = torch.cat([output_store, output_dow, input_promo, output_year, output_month, output_day, output_state], dim=1)
        # print(concatenated.shape)
        # Feed forward through other layers
        x = F.relu(self.dense1(concatenated))
        x = F.relu(self.dense2(x))
        x = self.dense3(x)
        x = torch.sigmoid(x)
        
        return x

# Example usage
pytorch_model = PyTorchModel()
criterion = nn.L1Loss()
optimizer = optim.Adam(pytorch_model.parameters(), lr=0.001)

# Convert numpy arrays to PyTorch tensors
print(y_train[:5])
max_log_y = max(np.max(np.log(y_train)), np.max(np.log(y_val))) # this is to normalize the data
y_train = np.log(y_train)/max_log_y
# print("After \n",y_train[:5])
y_val = np.log(y_val)/max_log_y
X_train_tensor = torch.from_numpy(np.array(X_train))
y_train_tensor = torch.from_numpy(np.array(y_train))

# Training loop
epochs = 10
batch_size = 128

for epoch in range(epochs):
    for i in range(0, len(X_train), batch_size):
        batch_X = X_train_tensor[i:i+batch_size]
        batch_y = y_train_tensor[i:i+batch_size]
        # print(batch_X.shape)
        # print(batch_y.shape)
        
        optimizer.zero_grad()
        output = pytorch_model(batch_X[:, 0], batch_X[:, 1], batch_X[:, 2], batch_X[:, 3], batch_X[:, 4], batch_X[:, 5], batch_X[:, 6])

        loss = criterion(output.flatten(), batch_y.float())
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}')

# # Prediction example
# with torch.no_grad():
#     output = pytorch_model(*[torch.from_numpy(example).long() for example in features])
#     prediction = output.numpy()
#     print("Prediction:", prediction)


        Sales
934085   6760
937882   6833
977052   8578
165948   8467
658089   5214


  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)


Epoch 1/10, Loss: 0.030744114890694618
Epoch 2/10, Loss: 0.030887659639120102
Epoch 3/10, Loss: 0.03071647882461548
Epoch 4/10, Loss: 0.030838701874017715
Epoch 5/10, Loss: 0.03091205097734928
Epoch 6/10, Loss: 0.030927734449505806
Epoch 7/10, Loss: 0.03092590905725956
Epoch 8/10, Loss: 0.030922962352633476
Epoch 9/10, Loss: 0.030890056863427162
Epoch 10/10, Loss: 0.03087809681892395


NameError: name 'features' is not defined