In [63]:
import pandas as pd
import datasets
import numpy as np
import torch
import torch.nn as nn


In [90]:

# --- Load data ---------------------------------------------------------------
# CSV locations, in the order: train, test, store metadata.
file_path = ['rossmann-store-sales/train.csv', 'rossmann-store-sales/test.csv', 'rossmann-store-sales/store.csv']
store_df = pd.read_csv(file_path[2], low_memory=False)

print(f"Size of the store df: {store_df.shape}")

train_df = pd.read_csv(file_path[0], low_memory=False)
test_df = pd.read_csv(file_path[1], low_memory=False)

# `column_names` used to come from leftover kernel state and was never defined
# in the notebook, so this cell raised NameError on a fresh kernel.
# NOTE(review): it is taken here to be the raw train.csv columns (captured
# before the store merge), which is consistent with the 'Date' special case
# below and with the store columns staying float — confirm this was the intent.
column_names = train_df.columns.tolist()

# Attach the per-store metadata to every train/test row.
train_df = pd.merge(train_df, store_df, how="inner", on="Store")
test_df = pd.merge(test_df, store_df, how="inner", on="Store")

print(f"The column names are : {train_df.columns.tolist()}")
print(f"Size of the dataset before processing is : {train_df.shape}")

# --- Clean / transform -------------------------------------------------------
# StateHoliday mixes the integer 0 and the strings '0', 'a', 'b', 'c'; map them
# all onto integer codes. Values missing from the mapping become NaN.
train_df['StateHoliday'] = train_df['StateHoliday'].map({0: 0, '0': 0, 'a': 1, 'b': 2, 'c': 3})

# Cast the listed columns to int64, keeping 'Date' as a plain object column.
column_data_types = {col: ('object' if col == 'Date' else np.int64) for col in column_names}
train_df = train_df.astype(column_data_types)

# Every column (including Sales and Customers) becomes an ordered categorical.
# TODO: Customers is not really categorical — decide whether count-like columns
# should stay numeric.
for col in train_df.columns:
    train_df[col] = train_df[col].astype('category').cat.as_ordered()

# Split the date into Year / Month / Day features, then discard the raw date.
train_df['Date'] = pd.to_datetime(train_df['Date'])
train_df['Year'] = train_df['Date'].dt.year.values
train_df['Month'] = train_df['Date'].dt.month.values
train_df['Day'] = train_df['Date'].dt.day.values

# Reassign instead of mutating in place so each step produces a fresh frame.
# TODO: not sure if rows with NaN should be dropped or imputed (median/mean);
# in the recorded run, de-duplication plus dropna reduced the frame from
# 1,017,209 to 324,326 rows.
train_df = (
    train_df
    .drop(columns=['Date'])
    .drop_duplicates()
    .dropna()
)

# Target (Sales) and feature frames.
train_data_y = pd.DataFrame(train_df['Sales'])
train_data_x = train_df.drop(['Sales'], axis=1)

print(f"Size of the dataset after processing is : {train_df.shape}")

train_data_x.head(10)

##############################################################################################################
# print(len(df))

# for col in train_data_x.columns:
#     print(f"\nUnique values in {col} column: ")
#     print(train_data_x[col].unique())

# find nan rows
# print(df.isnull().sum())
# print(df.dtypes)

Size of the store df: (1115, 10)
The column names are : ['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval']
Size of the dataset before processing is : (1017209, 18)
Size of the dataset after processing is : (324326, 20)


Unnamed: 0,Store,DayOfWeek,Customers,Open,Promo,StateHoliday,SchoolHoliday,StoreType,Assortment,CompetitionDistance,CompetitionOpenSinceMonth,CompetitionOpenSinceYear,Promo2,Promo2SinceWeek,Promo2SinceYear,PromoInterval,Year,Month,Day
942,2,5,625,1,1,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",2015,7,31
943,2,4,601,1,1,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",2015,7,30
944,2,3,727,1,1,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",2015,7,29
945,2,2,646,1,1,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",2015,7,28
946,2,1,638,1,1,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",2015,7,27
947,2,7,0,0,0,0,0,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",2015,7,26
948,2,6,316,1,0,0,0,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",2015,7,25
949,2,5,468,1,0,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",2015,7,24
950,2,4,521,1,0,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",2015,7,23
951,2,3,650,1,0,0,1,a,a,570.0,11.0,2007.0,1,13.0,2010.0,"Jan,Apr,Jul,Oct",2015,7,22


In [64]:


# Neural network: embedding lookup -> mean over dim 1 -> linear projection.
class EmbeddingNetwork(nn.Module):
    """Embed integer indices, average the embeddings, and apply a linear layer.

    Args:
        input_size: Number of rows in the embedding table, i.e. the size of
            the vocabulary / number of categories.
        embedding_size: Length of each embedding vector (an arbitrary choice).
        output_size: Number of features produced by the final linear layer.
    """

    def __init__(self, input_size, embedding_size, output_size):
        super().__init__()
        # Lookup table with `input_size` rows and `embedding_size` columns.
        self.embedding = nn.Embedding(num_embeddings=input_size, embedding_dim=embedding_size)
        # Dense layer: linear map from the averaged embedding to the output.
        self.fc = nn.Linear(in_features=embedding_size, out_features=output_size)

    def forward(self, x):
        """Run `x` through the network and return the linear layer's output.

        The embedded tensor is averaged over dimension 1 before the linear
        layer; for an index tensor shaped (batch, sequence) that is the mean
        across the sequence positions.
        """
        pooled = self.embedding(x).mean(dim=1)
        return self.fc(pooled)

# Instantiate the network with: vocabulary size, embedding vector size, output size.
input_size, embedding_size, output_size = 100, 50, 10
model = EmbeddingNetwork(
    input_size=input_size,
    embedding_size=embedding_size,
    output_size=output_size,
)

# Show the layer layout of the model.
print(model)


EmbeddingNetwork(
  (embedding): Embedding(100, 50)
  (fc): Linear(in_features=50, out_features=10, bias=True)
)
