In [63]:
import pandas as pd
import datasets
import numpy as np
import torch
import torch.nn as nn


In [104]:

# CSV locations, in the order: train, test, store.
file_path = [
    'rossmann-store-sales/train.csv',
    'rossmann-store-sales/test.csv',
    'rossmann-store-sales/store.csv',
]

# Per-store metadata; it is joined onto both the train and the test rows below.
store_df = pd.read_csv(file_path[2], low_memory=False)
print(f"Size of the store df: {store_df.shape}")

# Read each split and attach the store metadata via an inner join on "Store".
train_df = pd.read_csv(file_path[0], low_memory=False).merge(
    store_df, how="inner", on="Store")
test_df = pd.read_csv(file_path[1], low_memory=False).merge(
    store_df, how="inner", on="Store")

print(f"The column names are : {train_df.columns.tolist()}")
print(f"Size of the dataset is : {train_df.shape}")

# Columns that are NOT cast to int64 below (they hold strings / dates).
cols_to_not = ['StoreType', 'Assortment', 'PromoInterval', 'Date']

# Missing-value handling (inspect with `train_df.isnull().sum()`).
# Rows are kept rather than dropped; whether dropping or using the median
# would be better is still an open question.
#
# CompetitionDistance: impute with the column mean. The result is assigned
# back to the column instead of calling an in-place method on the
# `train_df["CompetitionDistance"]` selection: under pandas Copy-on-Write an
# in-place call on that selection does not modify `train_df`, and the
# `fillna(0)` below would then silently impute 0 instead of the mean.
train_df["CompetitionDistance"] = train_df["CompetitionDistance"].fillna(
    train_df["CompetitionDistance"].mean())
# replace with 0 since for the remaining ones, it means there is no Promo
train_df = train_df.fillna(0)

# Encode the state-holiday codes as integers (both 0 and '0' map to 0).
train_df['StateHoliday'] = train_df['StateHoliday'].map(
    {0: 0, '0': 0, 'a': 1, 'b': 2, 'c': 3})
if train_df['StateHoliday'].isna().any():
    # Series.map yields NaN for values missing from the mapping; fail here with
    # a clear message instead of an opaque error in the int64 cast below.
    raise ValueError(
        "StateHoliday contains values other than 0, '0', 'a', 'b', 'c'")

# Target dtype per column: int64 everywhere except the columns in cols_to_not.
column_data_types = {
    col: ('object' if col in cols_to_not else np.int64)
    for col in train_df.columns
}
train_df = train_df.astype(column_data_types)

# Turn every column into an ordered categorical. 'Date' is skipped because it
# is parsed and dropped right below, so categorising it first is wasted work.
# NOTE(review): this also converts numeric columns such as Customers and Sales
# into categoricals ("customers is not categorical hmmm") -- TODO confirm that
# this is intended for the downstream model.
for col in train_df.columns:
    if col == 'Date':
        continue
    train_df[col] = train_df[col].astype('category').cat.as_ordered()

# Split the date into Year / Month / Day feature columns, then drop the original.
train_df['Date'] = pd.to_datetime(train_df['Date'])
train_df['Year'] = train_df['Date'].dt.year
train_df['Month'] = train_df['Date'].dt.month
train_df['Day'] = train_df['Date'].dt.day
train_df = train_df.drop(columns=['Date'])

train_df = train_df.drop_duplicates()

# Separate the target (Sales) from the feature columns.
train_data_y = pd.DataFrame(train_df['Sales'])
train_data_x = train_df.drop(columns=['Sales'])


##############################################################################################################
# print(len(df))
# train_data_y.head(10)

# for col in train_data_x.columns:
#     print(f"\nUnique values in {col} column: ")
#     print(train_data_x[col].unique())

# find nan rows
# print(df.dtypes)

Size of the store df: (1115, 10)
The column names are : ['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval']
Size of the dataset is : (1017209, 18)
{'Store': <class 'numpy.int64'>, 'DayOfWeek': <class 'numpy.int64'>, 'Date': 'object', 'Sales': <class 'numpy.int64'>, 'Customers': <class 'numpy.int64'>, 'Open': <class 'numpy.int64'>, 'Promo': <class 'numpy.int64'>, 'StateHoliday': <class 'numpy.int64'>, 'SchoolHoliday': <class 'numpy.int64'>, 'StoreType': 'object', 'Assortment': 'object', 'CompetitionDistance': <class 'numpy.int64'>, 'CompetitionOpenSinceMonth': <class 'numpy.int64'>, 'CompetitionOpenSinceYear': <class 'numpy.int64'>, 'Promo2': <class 'numpy.int64'>, 'Promo2SinceWeek': <class 'numpy.int64'>, 'Promo2SinceYear': <class 'numpy.int64'>, 'PromoInterval': 'ob

In [105]:
# Show 10 random rows (unlike head, which shows the first ten); random_state is
# fixed so that re-running the notebook displays the same rows.
train_data_x.sample(n=10, random_state=42)

Unnamed: 0,Store,DayOfWeek,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Day
1015206,1113,1,792,1,1,0,0,a,c,9260,0,0,0,0,0,0,2013,4,29
119058,131,5,457,1,0,0,1,c,a,920,7,2015,0,0,0,0,2014,4,25
646655,710,2,368,1,1,0,1,d,a,1500,9,2008,1,14,2011,"Jan,Apr,Jul,Oct",2013,3,26
111043,122,1,710,1,1,0,0,a,c,58260,4,2013,0,0,0,0,2014,1,20
603545,663,6,778,1,0,0,0,a,c,7860,5,2005,0,0,0,0,2015,1,31
955359,1048,5,661,1,1,0,0,d,c,1860,9,2012,1,40,2012,"Jan,Apr,Jul,Oct",2015,1,30
24656,27,3,1139,1,1,0,1,a,a,60,1,2005,1,5,2011,"Jan,Apr,Jul,Oct",2013,8,14
226384,250,4,654,1,1,0,0,d,a,3520,0,0,1,18,2012,"Feb,May,Aug,Nov",2015,4,16
238386,263,1,449,1,1,0,0,a,c,1140,5,2013,1,40,2014,"Jan,Apr,Jul,Oct",2015,6,15
130457,144,2,631,1,0,0,0,a,c,16570,0,0,0,0,0,0,2014,2,11


In [64]:


# Define the neural network class
class EmbeddingNetwork(nn.Module):
    """Embedding lookup, mean over dim 1, then one linear layer.

    Args:
        input_size: Number of rows in the embedding table, i.e. the size of
            the vocabulary / number of distinct category ids.
        embedding_size: Length of each embedding vector. The value is
            arbitrary; it is also the input width of the linear layer.
        output_size: Number of output features of the linear layer.
    """

    def __init__(self, input_size, embedding_size, output_size):
        super().__init__()
        # Lookup table: one learnable vector of length `embedding_size` per id.
        self.embedding = nn.Embedding(input_size, embedding_size)
        # Fully connected (dense) layer: a linear transformation from the
        # averaged embedding to `output_size` values.
        self.fc = nn.Linear(embedding_size, output_size)

    def forward(self, x):
        """Run the network on a tensor of integer indices.

        Args:
            x: Index tensor with at least two dimensions; the embeddings are
                averaged over dim 1 (for a 2-D input, every index in a row).

        Returns:
            The linear layer's output; for a 2-D input of shape (B, N) this
            has shape (B, output_size).

        Raises:
            ValueError: If ``x`` has fewer than two dimensions.
        """
        # Without this guard a 1-D input whose length happens to equal
        # `embedding_size` would be averaged over the embedding axis and still
        # fit the linear layer, silently producing a meaningless result; other
        # 1-D / 0-D inputs would fail later with an obscure shape error.
        if x.dim() < 2:
            raise ValueError(
                "EmbeddingNetwork expects an index tensor with at least 2 "
                f"dimensions, got shape {tuple(x.shape)}")
        embedded = self.embedding(x)
        embedded = embedded.mean(dim=1)  # Average the embeddings
        return self.fc(embedded)

# Create an instance of the neural network
# Hard-coded sizes for this instance:
#   input_size     - size of the input vocabulary
#   embedding_size - size of the embedding vectors
#   output_size    - size of the output
input_size, embedding_size, output_size = 100, 50, 10
model = EmbeddingNetwork(
    input_size=input_size,
    embedding_size=embedding_size,
    output_size=output_size,
)

# Display the layers that make up the model
print(model)


EmbeddingNetwork(
  (embedding): Embedding(100, 50)
  (fc): Linear(in_features=50, out_features=10, bias=True)
)
