## Imports

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn import preprocessing
import random




## Data Preprocessing

In [34]:

file_path = ['rossmann-store-sales/train.csv',
             'rossmann-store-sales/test.csv', 'rossmann-store-sales/store.csv', 'rossmann-store-sales/store_states.csv']
store_df = pd.read_csv(file_path[2], low_memory=False)
store_states_df = pd.read_csv(file_path[3], low_memory=False)

print(f"Size of the store df: {store_df.shape}")

train_df = pd.read_csv(file_path[0], low_memory=False) # split this into test train for validation ++ we may also need to reverse the order of the data
test_df = pd.read_csv(file_path[1], low_memory=False)
train_df = pd.merge(train_df, store_df, how="inner", on="Store") #  one thing to note is that when they pre process data, they keep the store and test data separate, 
test_df = pd.merge(test_df, store_df, how="inner", on="Store")
train_df = pd.merge(train_df, store_states_df, how="inner", on="Store")
test_df = pd.merge(test_df, store_states_df, how="inner", on="Store")

print(f"The column names before dropping are : {train_df.columns.tolist()}")
print(f"Size of the dataset after merging is : {train_df.shape}")




train_df['Date'] = pd.to_datetime(train_df['Date'])
train_df['Year'], train_df['Month'], train_df['Day'] = train_df['Date'].dt.year.values, train_df['Date'].dt.month.values, train_df['Date'].dt.day.values
train_df.drop(['Date'], axis=1, inplace=True)

# helper function to replace nan values if any
def replace_nan_values(dataframe, replace_with='0'):
    return dataframe.fillna(replace_with) 

replace_nan_values(train_df)
# train_df.fillna('0', inplace=True) # for some reason doing this throws deprecated warning? Even tho its literaaly the same thing as above

train_df = train_df[train_df["Sales"]!="0"]
train_df = train_df[train_df["Open"]!=""]
cols_to_drop = ['Open', 'StateHoliday', 'SchoolHoliday', 'CompetitionOpenSinceMonth', 'StoreType', 'Assortment', 'PromoInterval', 'Promo2SinceWeek', 'Promo2SinceYear', 'Promo2', 'CompetitionOpenSinceYear', 'CompetitionDistance', 'Customers']

train_df.drop(cols_to_drop, axis=1, inplace=True)

train_data_y = pd.DataFrame(train_df['Sales'])
train_data_x = train_df.drop(['Sales'], axis=1)

for feature in train_data_x.columns:
    label_encoder = preprocessing.LabelEncoder()
    train_data_x.loc[:,feature] = label_encoder.fit_transform(train_data_x[feature].astype(str).fillna("0").values)

train_data_x = train_data_x.astype(int)

##############################################################################################################


Size of the store df: (1115, 10)
The column names before dropping are : ['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'State']
Size of the dataset after merging is : (1017209, 19)


In [26]:
train_data_x.sample(10) # shows random 10 instead of first ten like head

Unnamed: 0,Store,DayOfWeek,Promo,State,Year,Month,Day
1013294,126,0,1,6,0,7,19
544295,668,4,1,7,1,10,14
28725,361,6,0,2,2,7,8
380798,470,2,0,6,2,5,17
281523,351,4,0,8,0,9,3
23706,294,3,0,11,0,10,14
685405,842,5,0,6,1,0,25
214608,269,4,1,0,1,9,25
387847,478,0,0,6,0,8,16
137242,175,6,0,6,2,5,0


In [36]:
print(train_data_x.shape)
print(train_data_y.shape)

(1017209, 7)
(1017209, 1)


In [64]:


# Define the neural network class
class EmbeddingNetwork(nn.Module):
    def __init__(self, input_size, embedding_size, output_size):
        super(EmbeddingNetwork, self).__init__()
        self.embedding = nn.Embedding(input_size, embedding_size) # Embedding layer. The size of the embedding is arbitrary, but the number of rows must match the size of the vocabulary/number of categories. The number of columns is the size of the embedding vectors. 
        self.fc = nn.Linear(embedding_size, output_size) # Fully connected layer or the dense layer. This is where the actual computation happens. The actual NN. We pass the output of the embedding layer to the fully connected layer. The fully connected layer is a linear layer, which means that it performs a linear transformation on the input. The input size is the size of the embedding vectors, and the output size is the size of the output.

    def forward(self, x): # The forward method is called when we pass data through the network.
        embedded = self.embedding(x)
        embedded = embedded.mean(dim=1)  # Average the embeddings
        output = self.fc(embedded)
        return output

# not yet defined. This is just arbitrary

# Create an instance of the neural network
input_size = 100  # Size of the input vocabulary
embedding_size = 50  # Size of the embedding vectors
output_size = 10  # Size of the output
model = EmbeddingNetwork(input_size, embedding_size, output_size)

# Print the model architecture
print(model)


EmbeddingNetwork(
  (embedding): Embedding(100, 50)
  (fc): Linear(in_features=50, out_features=10, bias=True)
)


In [23]:
# this is their code. They converted each row into a dictionary and then converted dataset into a list of dictionaries


import pickle
import csv


def csv2dicts(csvfile):
    data = []
    keys = []
    for row_index, row in enumerate(csvfile):
        if row_index == 0:
            keys = row
            print(row)
            continue
        # if row_index % 10000 == 0:
        #     print(row_index)
        data.append({key: value for key, value in zip(keys, row)})
    return data


def set_nan_as_string(data, replace_str='0'):
    for i, x in enumerate(data):
        for key, value in x.items():
            if value == '':
                x[key] = replace_str
        data[i] = x


train_data = "rossmann-store-sales/train.csv"
store_data = "rossmann-store-sales/store.csv"
store_states = 'rossmann-store-sales/store_states.csv'

with open(train_data) as csvfile:
    data = csv.reader(csvfile, delimiter=',')
    with open('train_data.pickle', 'wb') as f:
        data = csv2dicts(data)
        data = data[::-1]
        pickle.dump(data, f, -1)
        print(data[:3])


with open(store_data) as csvfile, open(store_states) as csvfile2:
    data = csv.reader(csvfile, delimiter=',')
    state_data = csv.reader(csvfile2, delimiter=',')
    with open('store_data.pickle', 'wb') as f:
        data = csv2dicts(data)
        state_data = csv2dicts(state_data)
        set_nan_as_string(data)
        for index, val in enumerate(data):
            state = state_data[index]
            val['State'] = state['State']
            data[index] = val
        pickle.dump(data, f, -1)
        print(data[:2])
        
        
import pickle
from datetime import datetime
from sklearn import preprocessing
import numpy as np
import random
random.seed(42)

with open('train_data.pickle', 'rb') as f:
    train_data = pickle.load(f)
    num_records = len(train_data)
with open('store_data.pickle', 'rb') as f:
    store_data = pickle.load(f)


def feature_list(record):
    dt = datetime.strptime(record['Date'], '%Y-%m-%d')
    store_index = int(record['Store'])
    year = dt.year
    month = dt.month
    day = dt.day
    day_of_week = int(record['DayOfWeek'])
    try:
        store_open = int(record['Open'])
    except:
        store_open = 1

    promo = int(record['Promo'])

    return [store_open,  # they only kept these columns
            store_index,
            day_of_week,
            promo,
            year,
            month,
            day,
            store_data[store_index - 1]['State']
            ]


train_data_X = []
train_data_y = []

for record in train_data:
    if record['Sales'] != '0' and record['Open'] != '':
        fl = feature_list(record)
        train_data_X.append(fl)
        train_data_y.append(int(record['Sales']))
print("Number of train datapoints: ", len(train_data_y))

print(min(train_data_y), max(train_data_y))
full_X = train_data_X
full_X = np.array(full_X)
train_data_X = np.array(train_data_X)
print(train_data_X[0])
les = []
for i in range(train_data_X.shape[1]):
    le = preprocessing.LabelEncoder()
    le.fit(full_X[:, i])
    les.append(le)
    train_data_X[:, i] = le.transform(train_data_X[:, i])

with open('les.pickle', 'wb') as f:
    pickle.dump(les, f, -1)

train_data_X = train_data_X.astype(int)
train_data_y = np.array(train_data_y)
print(train_data_X[0])
with open('feature_train_data.pickle', 'wb') as f:
    pickle.dump((train_data_X, train_data_y), f, -1)
    print(train_data_X[0], train_data_y[0])

['Store', 'DayOfWeek', 'Date', 'Sales', 'Customers', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday']
[{'Store': '1115', 'DayOfWeek': '2', 'Date': '2013-01-01', 'Sales': '0', 'Customers': '0', 'Open': '0', 'Promo': '0', 'StateHoliday': 'a', 'SchoolHoliday': '1'}, {'Store': '1114', 'DayOfWeek': '2', 'Date': '2013-01-01', 'Sales': '0', 'Customers': '0', 'Open': '0', 'Promo': '0', 'StateHoliday': 'a', 'SchoolHoliday': '1'}, {'Store': '1113', 'DayOfWeek': '2', 'Date': '2013-01-01', 'Sales': '0', 'Customers': '0', 'Open': '0', 'Promo': '0', 'StateHoliday': 'a', 'SchoolHoliday': '1'}]
['Store', 'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval']
['Store', 'State']
[{'Store': '1', 'StoreType': 'c', 'Assortment': 'a', 'CompetitionDistance': '1270', 'CompetitionOpenSinceMonth': '9', 'CompetitionOpenSinceYear': '2008', 'Promo2': '0', 'Promo2SinceWeek': '0', 'Promo2SinceYear': 