# Imports

In [1]:
import numpy
numpy.random.seed(123)
from sklearn import linear_model
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn import neighbors
from sklearn.preprocessing import Normalizer

import pickle

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset

device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Function to fetch previously saved embeddings

In [2]:
def embed_features(X, saved_embeddings_fname):
    # f_embeddings = open("embeddings_shuffled.pickle", "rb")
    f_embeddings = open(saved_embeddings_fname, "rb") # Open the pickle file
    embeddings = pickle.load(f_embeddings) # Load it

    index_embedding_mapping = {1: 0, 2: 1, 4: 2, 5: 3, 6: 4, 7: 5} # The values are the indices of the embedded features
    X_embedded = []

    (num_records, num_features) = X.shape
    for record in X:
        embedded_features = []
        for i, feat in enumerate(record):
            feat = int(feat)
            if i not in index_embedding_mapping.keys():
                embedded_features += [feat]
            else:
                embedding_index = index_embedding_mapping[i]
                embedded_features += embeddings[embedding_index][feat].tolist()

        X_embedded.append(embedded_features)

    return numpy.array(X_embedded)

### This function takes the input data and returns a list of each column 

In [3]:
def split_features(X):
    X_list = []

    store_index = X[..., [1]]
    X_list.append(store_index)

    day_of_week = X[..., [2]]
    X_list.append(day_of_week)

    promo = X[..., [3]]
    X_list.append(promo)

    year = X[..., [4]]
    X_list.append(year)

    month = X[..., [5]]
    X_list.append(month)

    day = X[..., [6]]
    X_list.append(day)

    State = X[..., [7]]
    X_list.append(State)

    return X_list

# Base Model Class

In [4]:
class Model(nn.Module):
    def evaluate(self, X_val, y_val):
        assert(min(y_val) > 0) # All sales are positive so predictions should be positive
        guessed_sales = self.guess(X_val) # Guess is implemented in children classes for inference
        relative_err = numpy.absolute((y_val - guessed_sales) / y_val) 
        result = numpy.sum(relative_err) / len(y_val)
        return result

In [5]:
class LinearModel(Model):
    def __init__(self, X_train, y_train, X_val, y_val):
        self.clf = linear_model.LinearRegression() # this is from sklearn not keras or tensorflow so please don't fail us
        self.clf.fit(X_train, numpy.log(y_train))
        print("Result on validation data: ", self.evaluate(X_val, y_val))

    def guess(self, feature):
        return numpy.exp(self.clf.predict(feature))

In [6]:
class RF(Model):
    def __init__(self, X_train, y_train, X_val, y_val):
        self.clf = RandomForestRegressor(n_estimators=200, verbose=True, max_depth=35, min_samples_split=2,
                                         min_samples_leaf=1) # This is from sklearn not keras or tensorflow so please don't fail us
        self.clf.fit(X_train, numpy.log(y_train))
        print("Result on validation data: ", self.evaluate(X_val, y_val))

    def guess(self, feature):
        return numpy.exp(self.clf.predict(feature))

In [7]:
class HistricalMedian(Model):
    def __init__(self, X_train, y_train, X_val, y_val):
        self.history = {}
        self.feature_index = [1, 2, 3, 4]
        for x, y in zip(X_train, y_train):
            key = tuple(x[self.feature_index])
            self.history.setdefault(key, []).append(y)
        print("Result on validation data: ", self.evaluate(X_val, y_val))

    def guess(self, features):
        features = numpy.array(features)
        features = features[:, self.feature_index]
        guessed_sales = [numpy.median(self.history[tuple(feature)]) for feature in features]
        return numpy.array(guessed_sales)

In [8]:
class KNN(Model):
    def __init__(self, X_train, y_train, X_val, y_val):
        self.normalizer = Normalizer()
        self.normalizer.fit(X_train)
        self.clf = neighbors.KNeighborsRegressor(n_neighbors=10, weights='distance', p=1)
        self.clf.fit(self.normalizer.transform(X_train), numpy.log(y_train))
        print("Result on validation data: ", self.evaluate(self.normalizer.transform(X_val), y_val))

    def guess(self, feature):
        return numpy.exp(self.clf.predict(self.normalizer.transform(feature)))

In [46]:
class NN_with_EntityEmbedding(Model):
    def __init__(self, X_train, y_train, X_val, y_val):
        super().__init__()
        self.epochs = 10
        self.max_log_y = max(torch.max(torch.log(y_train)), torch.max(torch.log(y_val)))
        self.zero = nn.Sequential(nn.Embedding(1115, 10), nn.Flatten())
        self.one = nn.Sequential(nn.Embedding(7, 6), nn.Flatten())
        self.two = nn.Linear(1,1)
        self.three = nn.Sequential(nn.Embedding(3, 2), nn.Flatten())
        self.four = nn.Sequential(nn.Embedding(12, 6), nn.Flatten())
        self.five = nn.Sequential(nn.Embedding(31, 10), nn.Flatten())
        self.six = nn.Sequential(nn.Embedding(21, 6), nn.Flatten())
        self.network = nn.Sequential(
            nn.Linear(41, 1000),
            nn.ReLU(),
            nn.Linear(1000, 500),
            nn.ReLU(),
            nn.Linear(500, 1),
            nn.Sigmoid()
        )
        # self.fit(X_train,y_train, X_val, y_val)

    def forward(self, x):
        zero = self.zero(x[:,[1]])
        one = self.one(x[:,[2]])
        two=self.two(x[:, [3]].float())
        three=self.three(x[:, [4]])
        four=self.four(x[:,[5]])
        five=self.five(x[:, [6]])
        six = self.six(x[:, [7]])
        concat = torch.cat([zero, one, two, three, four, five, six], dim=1)
        output = self.network(concat)
        return output
    
    def _val_for_fit(self, val):
        return torch.log(val)/self.max_log_y
    
    def _val_for_pred(self, val):
        return torch.exp(val*self.max_log_y)
    
    def fit(self, X_train, y_train, X_val, y_val):
        loss_fn = nn.L1Loss().to(device)
        optimizer = optim.Adam(self.parameters(), lr=0.00001)
        train_data = TensorDataset(X_train, self._val_for_fit(y_train))
        train_loader = DataLoader(train_data, batch_size=128, shuffle=False)
        for epoch in range(self.epochs):
            for inputs, targets in train_loader:
                inputs = inputs.to(device)
                targets = targets.to(device)
                outputs = self.forward(inputs).squeeze()
                loss = loss_fn(outputs, targets)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
        
        print("Result on validation data: ", self.evaluate(X_val, y_val))

    def evaluate(self, X_val, y_val):
        assert(min(y_val) > 0) # All sales are positive so predictions should be positive
        guessed_sales = self.guess(X_val) # Guess is implemented in children classes for inference
        relative_err = torch.absolute((y_val - guessed_sales) / y_val) 
        result = torch.sum(relative_err) / len(y_val)
        return result
    
    def guess(self, features):
        with torch.inference_mode():
            result = self.forward(features).flatten()
        return self._val_for_pred(result)

In [47]:
with open('feature_train_data.pickle', 'rb') as f:
    X, y = pickle.load(f)

X = torch.tensor(X)
y = torch.tensor(y)
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

X_train = X_train.to(device)
X_val = X_val.to(device)
y_train = y_train.to(device)
y_val = y_val.to(device)

model = NN_with_EntityEmbedding(X_train, y_train, X_val, y_val)

In [48]:
model.fit(X_train, y_train, X_val, y_val)
# next(model.parameters()).device

RuntimeError: Expected all tensors to be on the same device, but found at least two devices, cpu and cuda:0! (when checking argument for argument index in method wrapper_CUDA__index_select)