## Imports

In [15]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn import preprocessing
import random


## Data Preprocessing

In [150]:

# Locations of the Rossmann CSV files, in the order [train, test, store, store_states].
file_path = ['rossmann-store-sales/train.csv',
             'rossmann-store-sales/test.csv', 'rossmann-store-sales/store.csv', 'rossmann-store-sales/store_states.csv']
train_csv, test_csv, store_csv, store_states_csv = file_path

# Everything is read as str so that every column can be treated as categorical below.
store_df = pd.read_csv(store_csv, low_memory=False, dtype=str)
store_states_df = pd.read_csv(store_states_csv, low_memory=False, dtype=str)

print(f"Size of the store df: {store_df.shape}")

# TODO: split this into train/validation sets; the row order may also need to be reversed.
train_df = pd.read_csv(train_csv, low_memory=False, dtype=str)
test_df = pd.read_csv(test_csv, low_memory=False, dtype=str)

# Attach the per-store attributes to every sales row.
# Original author's note: the reference preprocessing keeps the store and test data
# separate, whereas here the store table is merged into both frames.
train_df = pd.merge(train_df, store_df, how="inner", on="Store")
test_df = pd.merge(test_df, store_df, how="inner", on="Store")

# Break the date into Year / Month / Day columns and drop the original column.
train_df['Date'] = pd.to_datetime(train_df['Date'])
train_df['Year'] = train_df['Date'].dt.year.values
train_df['Month'] = train_df['Date'].dt.month.values
train_df['Day'] = train_df['Date'].dt.day.values
train_df = train_df.drop(['Date'], axis=1)

# Attach the state of each store.
train_df = pd.merge(train_df, store_states_df, how="inner", on="Store")
test_df = pd.merge(test_df, store_states_df, how="inner", on="Store")
# NOTE(review): from here on only train_df is processed; test_df keeps its raw 'Date'
# column and is neither filled, filtered nor encoded in this cell.

print(f"The column names before dropping are : {train_df.columns.tolist()}")
print(f"Size of the train dataset after merging is : {train_df.shape}")
# print(f"Size of the test dataset after merging is : {test_df.shape}")
print(train_df['Store'].dtype)

# Replace any missing values with the string '0'.
train_df = train_df.fillna('0')

# Keep only rows with non-zero sales.
train_df = train_df[train_df["Sales"] != "0"]
# NOTE(review): this filter cannot remove anything. pandas.read_csv parses empty
# fields as NaN by default and those were just filled with '0', so "Open" is never
# the empty string here. It is kept so the cell's output is unchanged; if the intent
# is to drop rows with a missing "Open", filter on .notna() before the fillna above.
train_df = train_df[train_df["Open"] != ""]

cols_to_drop = ['StateHoliday', 'SchoolHoliday', 'CompetitionOpenSinceMonth', 'StoreType', 'Assortment', 'PromoInterval', 'Promo2SinceWeek', 'Promo2SinceYear', 'Promo2', 'CompetitionOpenSinceYear', 'CompetitionDistance', 'Customers']
train_df = train_df.drop(cols_to_drop, axis=1)

# Separate the target (Sales) from the input features.
train_data_y = pd.DataFrame(train_df['Sales'])
train_data_x = train_df.drop(['Sales'], axis=1)

# Convert every feature to integer category codes. The fitted encoders are kept,
# keyed by column name, so the same value -> code mapping can later be applied to
# other data (e.g. test_df) or inverted.
# Values are cast to str before encoding, so codes follow string sort order
# (for example "10" sorts before "2").
label_encoders = {}
for feature in train_data_x.columns:
    label_encoder = preprocessing.LabelEncoder()
    train_data_x[feature] = label_encoder.fit_transform(train_data_x[feature].astype(str).values)
    label_encoders[feature] = label_encoder

train_data_x = train_data_x.astype(int)
# This reports train_df, which still contains the Sales column.
print(f"Size of the train dataset after merging and preprocessing is : {train_df.shape}")
##############################################################################################################


Size of the store df: (1115, 10)
The column names before dropping are : ['Store', 'DayOfWeek', 'Sales', 'Customers', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'Year', 'Month', 'Day', 'State']
Size of the train dataset after merging is : (1017209, 21)
object
Size of the train dataset after merging and preprocessing is : (844338, 9)


In [151]:
# Preview 10 randomly chosen rows (unlike head(), which returns the first rows).
train_data_x.sample(10)
# print(train_data_x.shape)

Unnamed: 0,Store,DayOfWeek,Open,Promo,Year,Month,Day,State
237110,296,1,0,0,0,1,6,11
761490,934,5,0,0,1,5,14,9
274227,342,4,0,1,0,7,8,4
480791,591,3,0,0,0,0,22,6
244889,306,5,0,0,2,5,5,0
907224,1110,0,0,1,1,5,8,9
134205,170,5,0,0,0,10,24,0
51610,638,1,0,1,1,5,25,9
31471,394,0,0,1,2,4,11,6
376986,466,4,0,1,2,7,29,4


In [152]:
# Report the dimensions of the encoded feature matrix.
print(train_data_x.shape)
# print(train_data_y.shape)

(844338, 8)


In [153]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import pickle

# Configuration values.
# NOTE(review): apart from saved_embeddings_fname, none of these are read by the code
# visible in this section; the descriptions below are inferred from the names and
# should be confirmed against the cells that consume them.
train_ratio = 0.9  # presumably the fraction of rows used for training
shuffle_data = False  # presumably whether rows are shuffled before splitting
one_hot_as_input = False  # presumably selects one-hot encoded inputs
embeddings_as_input = False  # presumably selects embedding-vector inputs
save_embeddings = True  # presumably whether learned embeddings are written to disk
saved_embeddings_fname = "embeddings.pickle"  # set save_embeddings to True to create this file


def split_features(X):
    """Split a 2-D feature matrix into a list of single-column int64 arrays.

    Only the first seven columns (positions 0 to 6) are extracted; any further
    columns of X are ignored.

    Args:
        X: 2-D NumPy array with at least seven columns.

    Returns:
        A list of seven arrays, one per column in positional order, each of
        shape (n_rows, 1) and dtype int64.

    NOTE(review): the previous inline comments labelled positions 0-6 as store,
    day_of_week, promo, year, month, day and state. The train_data_x frame
    displayed in this notebook has eight columns in the order Store, DayOfWeek,
    Open, Promo, Year, Month, Day, State; with that layout position 2 is Open
    and State (position 7) is not returned. Confirm the intended input layout.
    """
    num_columns_used = 7
    return [
        np.array(X[:, position].reshape(-1, 1), dtype=np.int64)
        for position in range(num_columns_used)
    ]
 
# split_features(train_data_x.values)
def embed_features(X, saved_embeddings_fname):
    """Replace selected integer-coded columns of X with saved embedding vectors.

    The embeddings are loaded (not created) from a pickle file. For every row of
    X, each column position listed in the internal position -> table mapping is
    replaced by the vector ``embeddings[table][code]``; every other position is
    kept as a single integer value. The pieces are concatenated in column order.

    Args:
        X: 2-D NumPy array of integer-like category codes, one row per record.
        saved_embeddings_fname: Path of the pickle file holding the embeddings.
            The unpickled object must support ``embeddings[table][code]`` and
            each looked-up entry must provide ``.tolist()``.

    Returns:
        NumPy array built from the per-record concatenated feature lists.

    Raises:
        ValueError: If X is not 2-dimensional.

    NOTE(review): positions 1, 2, 4, 5, 6, 7 are looked up in tables 0-5 while
    positions 0 and 3 pass through unchanged. The train_data_x frame shown in
    this notebook is ordered Store, DayOfWeek, Open, Promo, Year, Month, Day,
    State, so Store passes through raw and DayOfWeek / Open use tables 0 / 1.
    Confirm that this matches the order in which the tables were pickled.
    """
    # SECURITY: pickle.load can execute arbitrary code; only load files from a
    # trusted source.
    # The context manager closes the file even if unpickling fails.
    with open(saved_embeddings_fname, "rb") as f_embeddings:
        embeddings = pickle.load(f_embeddings)

    # column position in X -> index of the embedding table to use for it
    index_embedding_mapping = {1: 0, 2: 1, 4: 2, 5: 3, 6: 4, 7: 5}
    print(X.shape)

    if len(X.shape) != 2:
        raise ValueError(f"X must be 2-dimensional, got shape {X.shape}")

    X_embedded = []
    for record in X:
        embedded_features = []
        for position, code in enumerate(record):
            code = int(code)
            table_index = index_embedding_mapping.get(position)
            if table_index is None:
                # No embedding for this position: keep the raw integer code.
                embedded_features.append(code)
            else:
                embedded_features.extend(embeddings[table_index][code].tolist())
        X_embedded.append(embedded_features)

    return np.array(X_embedded)

# Apply the saved embeddings to the encoded training features and print the result.
# embed_features opens saved_embeddings_fname for reading, so that file must already exist.
j = embed_features((np.array(train_data_x)), saved_embeddings_fname)

print(j)



  

(844338, 8)
[[ 0.00000000e+00 -4.57369201e-02  6.36358187e-03 ...  1.06697967e-02
  -3.60607207e-02  3.40544581e-02]
 [ 0.00000000e+00 -2.90307291e-02  4.89163883e-02 ...  1.06697967e-02
  -3.60607207e-02  3.40544581e-02]
 [ 0.00000000e+00 -4.62355018e-02  3.47966589e-02 ...  1.06697967e-02
  -3.60607207e-02  3.40544581e-02]
 ...
 [ 1.30000000e+02 -4.57369201e-02  6.36358187e-03 ...  1.06697967e-02
  -3.60607207e-02  3.40544581e-02]
 [ 1.30000000e+02 -2.90307291e-02  4.89163883e-02 ...  1.06697967e-02
  -3.60607207e-02  3.40544581e-02]
 [ 1.30000000e+02 -4.62355018e-02  3.47966589e-02 ...  1.06697967e-02
  -3.60607207e-02  3.40544581e-02]]
