## Imports

In [333]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from sklearn import preprocessing
import random
# torch.set_num_threads(1)

## Data Preprocessing

In [334]:

file_path = ['rossmann-store-sales/train.csv',
             'rossmann-store-sales/test.csv', 'rossmann-store-sales/store.csv', 'rossmann-store-sales/store_states.csv']
store_df = pd.read_csv(file_path[2], low_memory=False, dtype=str)
store_states_df = pd.read_csv(file_path[3], low_memory=False,  dtype=str)

print(f"Size of the store df: {store_df.shape}")

train_df = pd.read_csv(file_path[0], low_memory=False, dtype=str) # split this into test train for validation ++ we may also need to reverse the order of the data
test_df = pd.read_csv(file_path[1], low_memory=False, dtype=str)
train_df = pd.merge(train_df, store_df, how="inner", on="Store") #  one thing to note is that when they pre process data, they keep the store and test data separate, 
test_df = pd.merge(test_df, store_df, how="inner", on="Store")
train_df['Date'] = pd.to_datetime(train_df['Date'])
train_df['Year'], train_df['Month'], train_df['Day'] = train_df['Date'].dt.year.values, train_df['Date'].dt.month.values, train_df['Date'].dt.day.values
train_df.drop(['Date'], axis=1, inplace=True)
train_df = pd.merge(train_df, store_states_df, how="inner", on="Store")
test_df = pd.merge(test_df, store_states_df, how="inner", on="Store")

print(f"The column names before dropping are : {train_df.columns.tolist()}")
print(f"Size of the train dataset after merging is : {train_df.shape}")
# print(f"Size of the test dataset after merging is : {test_df.shape}")
print(train_df['Store'].dtype)

# helper function to replace nan values if any
train_df.fillna('0', inplace=True)

train_df = train_df[train_df["Sales"]!="0"]
train_df = train_df[train_df["Open"]!=""]
cols_to_drop = ['StateHoliday', 'SchoolHoliday', 'CompetitionOpenSinceMonth', 'StoreType', 'Assortment', 'PromoInterval', 'Promo2SinceWeek', 'Promo2SinceYear', 'Promo2', 'CompetitionOpenSinceYear', 'CompetitionDistance', 'Customers']

train_df.drop(cols_to_drop, axis=1, inplace=True)

train_data_y = pd.DataFrame(train_df['Sales'])
train_data_x = train_df.drop(['Sales'], axis=1)

for feature in train_data_x.columns: # this is to convert the categorical data into numerical data
    label_encoder = preprocessing.LabelEncoder()
    train_data_x.loc[:,feature] = label_encoder.fit_transform(train_data_x[feature].astype(str).fillna("0").values)
train_data_x = train_data_x.reindex(['Open','Store', 'DayOfWeek',  'Promo', 'Year', 'Month', 'Day' ,'State'], axis=1)
# train_data_x = train_data_x.astype(int)
# train_data_y = train_data_y.astype(int)
print(f"Size of the train dataset after merging and preprocessing is : {train_df.shape}")
##############################################################################################################


Size of the store df: (1115, 10)
The column names before dropping are : ['Store', 'DayOfWeek', 'Sales', 'Customers', 'Open', 'Promo', 'StateHoliday', 'SchoolHoliday', 'StoreType', 'Assortment', 'CompetitionDistance', 'CompetitionOpenSinceMonth', 'CompetitionOpenSinceYear', 'Promo2', 'Promo2SinceWeek', 'Promo2SinceYear', 'PromoInterval', 'Year', 'Month', 'Day', 'State']
Size of the train dataset after merging is : (1017209, 21)
object
Size of the train dataset after merging and preprocessing is : (844338, 9)


In [335]:
train_data_x.head(10) # shows random 10 instead of first ten like head
# print(train_data_x.shape)

# they kept 8 columns as shown below

Unnamed: 0,Open,Store,DayOfWeek,Promo,Year,Month,Day,State
0,0,0,4,1,2,9,24,4
1,0,0,3,1,2,9,23,4
2,0,0,2,1,2,9,21,4
3,0,0,1,1,2,9,20,4
4,0,0,0,1,2,9,19,4
6,0,0,5,0,2,9,17,4
7,0,0,4,0,2,9,16,4
8,0,0,3,0,2,9,15,4
9,0,0,2,0,2,9,14,4
10,0,0,1,0,2,9,13,4


In [336]:
print(train_data_x.shape) # matches
print(train_data_y.shape)

(844338, 8)
(844338, 1)


In [337]:
import numpy as np
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
import pickle

train_ratio = 0.9
shuffle_data = False
one_hot_as_input = False
embeddings_as_input = False
save_embeddings = True
saved_embeddings_fname = "embeddings.pickle"  # set save_embeddings to True to create this file

train_data_x = train_data_x.astype(np.int64)
train_data_y = train_data_y.astype(np.int64)

def split_features(X): # this function takes a numpy array and splits it into a list of np arrays. It returns a list of all the features/columns in the dataset
 # Extract the column with index 0 (store) and append to the list
    X_list = []
    store_index = X[:, 0].reshape(-1, 1)
    X_list.append(np.array(store_index, dtype=np.int64))

    # Extract the column with index 1 (day_of_week) and append to the list
    day_of_week = X[:, 1].reshape(-1, 1)
    X_list.append(np.array(day_of_week, dtype=np.int64))

    # Extract the column with index 2 (promo) and append to the list
    promo = X[:, 2].reshape(-1, 1)
    X_list.append(np.array(promo, dtype=np.int64))

    # Extract the column with index 3 (year) and append to the list
    year = X[:, 3].reshape(-1, 1)
    X_list.append(np.array(year, dtype=np.int64))

    # Extract the column with index 4 (month) and append to the list
    month = X[:, 4].reshape(-1, 1)
    X_list.append(np.array(month, dtype=np.int64))

    # Extract the column with index 5 (day) and append to the list
    day = X[:, 5].reshape(-1, 1)
    X_list.append(np.array(day, dtype=np.int64))
    # Extract the column with index 6 (state) and append to the list
    state = X[:, 6].reshape(-1, 1)
    X_list.append(np.array(state, dtype=np.int64))

    return X_list
 
split_features(train_data_x.values)


# okay they ar enot really suing this/ they only use this to feed embeddings leanred duing NN to other models as input
def embed_features(X, saved_embeddings_fname): # this function creates and saves the embeddings for the categorical data
   f_embeddings = open(saved_embeddings_fname, "rb")
   embeddings = pickle.load(f_embeddings)

   index_embedding_mapping = {1: 0, 2: 1, 4: 2, 5: 3, 6: 4, 7: 5} # this is the mapping of the columns index to the embeddings index. They skipped columns 1 and 3. Opena and Promo because they are 0, 1? I guess binary categories are useless to embed. Embedding these binary features may not provide significant benefits, as their information is already somewhat captured by the binary nature.
   X_embedded = []
   print(X.shape)

   (num_records, num_features) = X.shape
   for record in X: # a record is a row in the dataset
      # print(record)
      embedded_features = []
      for i, feat in enumerate(record): # this is to embed the features
         # print(i, feat)
         feat = int(feat)
         if i not in index_embedding_mapping.keys():
               embedded_features += [feat]
         else:
               embedding_index = index_embedding_mapping[i]
               embedded_features += embeddings[embedding_index][feat].tolist()

      X_embedded.append(embedded_features)

   return pd.DataFrame(X_embedded) # returns the embedded features
   # return np.array(X_embedded) # returns the embedded features

# j = embed_features((np.array(train_data_x)), saved_embeddings_fname)

# print(j)

# print(j.shape) # okay shape matches theirs as well


  

In [338]:

# train_data_x = j
# train_data_y = np.array(train_data_y)

X_train, X_val, y_train, y_val = train_test_split(train_data_x, train_data_y, test_size=0.1, random_state=42) # this is to split the data into train and test sets
print(y_train[:5])
for data in [X_train, X_val]:
    data.drop(['Open'], axis=1, inplace=True)

        Sales
934085   6760
937882   6833
977052   8578
165948   8467
658089   5214


So basically there is an entity embedding layer for each categoircal feature (Store, DOW, etc). OPen and Promo are not inlcuded since they are binary. 
These are the embedding dimesions for different categories used in the paper            
cat_emb_dim={
                'Store': 10,
                'DayOfWeek': 6,
                'Promo': 1,
                'Year': 2,
                'Month': 6,
                'Day': 10,
                'State': 6},}


In [339]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import torch.nn.functional as F

class PyTorchModel(nn.Module):
    def __init__(self):
        super(PyTorchModel, self).__init__()
        
        # Define Embeddings
        self.store_embedding = nn.Embedding(1115, 10)
        self.dow_embedding = nn.Embedding(7, 6)
        self.year_embedding = nn.Embedding(3, 2)
        self.month_embedding = nn.Embedding(12, 6)
        self.day_embedding = nn.Embedding(31, 10)
        self.state_embedding = nn.Embedding(12, 6)
        
        # Define other layers
        self.promo_layer = nn.Linear(1, 1)
        self.dense1 = nn.Linear(41, 1000)
        self.dense2 = nn.Linear(1000, 500)
        self.dense3 = nn.Linear(500, 1)
        
    def forward(self, input_store, input_dow, input_promo, input_year, input_month, input_day, input_state):
        # Apply embeddings
        input_promo = input_promo.unsqueeze(1)
        output_store = self.store_embedding(input_store.view(-1, 1))
        output_dow = self.dow_embedding(input_dow.view(-1, 1))
        output_year = self.year_embedding(input_year.view(-1, 1))
        output_month = self.month_embedding(input_month.view(-1, 1))
        output_day = self.day_embedding(input_day.view(-1, 1))
        output_state = self.state_embedding(input_state.view(-1, 1))
        
        # Reshape embeddings
        output_store = output_store.view(-1, 10)
        output_dow = output_dow.view(-1, 6)
        output_year = output_year.view(-1, 2)
        output_month = output_month.view(-1, 6)
        output_day = output_day.view(-1, 10)
        output_state = output_state.view(-1, 6)
        
        # print(output_store.shape)
        # print(output_dow.shape)
        # print(input_promo.shape)
                
        # Concatenate embeddings
        concatenated = torch.cat([output_store, output_dow, input_promo, output_year, output_month, output_day, output_state], dim=1)
        # print(concatenated.shape)
        # Feed forward through other layers
        x = F.relu(self.dense1(concatenated))
        x = F.relu(self.dense2(x))
        x = self.dense3(x)
        x = torch.sigmoid(x)
        
        return x

# Example usage
pytorch_model = PyTorchModel()
criterion = nn.L1Loss()
optimizer = optim.Adam(pytorch_model.parameters(), lr=0.001)

# Convert numpy arrays to PyTorch tensors
print(y_train[:5])
max_log_y = max(np.max(np.log(y_train)), np.max(np.log(y_val))) # this is to normalize the data
y_train = np.log(y_train)/max_log_y
# print("After \n",y_train[:5])
y_val = np.log(y_val)/max_log_y
X_train_tensor = torch.from_numpy(np.array(X_train))
y_train_tensor = torch.from_numpy(np.array(y_train))

# Training loop
epochs = 10
batch_size = 128

for epoch in range(epochs):
    for i in range(0, len(X_train), batch_size):
        batch_X = X_train_tensor[i:i+batch_size]
        batch_y = y_train_tensor[i:i+batch_size]
        # print(batch_X.shape)
        # print(batch_y.shape)
        
        optimizer.zero_grad()
        output = pytorch_model(batch_X[:, 0], batch_X[:, 1], batch_X[:, 2], batch_X[:, 3], batch_X[:, 4], batch_X[:, 5], batch_X[:, 6])

        loss = criterion(output.flatten(), batch_y.float())
        loss.backward()
        optimizer.step()

    print(f'Epoch {epoch + 1}/{epochs}, Loss: {loss.item()}')

# # Prediction example
# with torch.no_grad():
#     output = pytorch_model(*[torch.from_numpy(example).long() for example in features])
#     prediction = output.numpy()
#     print("Prediction:", prediction)


        Sales
934085   6760
937882   6833
977052   8578
165948   8467
658089   5214


  return F.l1_loss(input, target, reduction=self.reduction)
  return F.l1_loss(input, target, reduction=self.reduction)


Epoch 1/10, Loss: 0.030744114890694618
Epoch 2/10, Loss: 0.030887659639120102
Epoch 3/10, Loss: 0.03071647882461548
Epoch 4/10, Loss: 0.030838701874017715
Epoch 5/10, Loss: 0.03091205097734928
Epoch 6/10, Loss: 0.030927734449505806
Epoch 7/10, Loss: 0.03092590905725956
Epoch 8/10, Loss: 0.030922962352633476
Epoch 9/10, Loss: 0.030890056863427162
Epoch 10/10, Loss: 0.03087809681892395


NameError: name 'features' is not defined

In [None]:



#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Fri Jun  1 17:51:34 2018

@author: raulsanchez
"""

import numpy as np
import pandas as pd
import pickle
from sklearn.utils.validation import check_X_y
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data_utils
from torch.autograd import Variable


class NeuralNet(nn.Module, BaseEstimator, RegressorMixin):
    '''
    Parameters
    ----------
    cat_emb_dim : dict
        Dictionary containing the embedding sizes.

    layers : list
        NN. Layer arquitecture
    drop_out_layers : dict
        Dictionary with layer dropout
    drop_out_emb : float
        embedding drop out
    batch_size : int
        Mini Batch size
    val_idx : list

    allow_cuda : bool

    act_func : string

    lr : float

    alpha : float

    epochs : int
    '''
    def __init__(
        self,
        act_func='relu',
        train_size=.8,
        batch_size=128,
        random_seed=None,
        verbose=True,
        verbose_epoch=100):

        super(NeuralNet, self).__init__()

        # General
        self.act_func = act_func
        self.train_size = train_size
        self.batch_size = int(batch_size)
        self.verbose = verbose
        self.verbose_epoch = verbose_epoch
        self.random_seed = random_seed

        if not(self.random_seed is None):
            torch.manual_seed(self.random_seed)

    def activ_func(self, x):
        '''
        Applies an activation function
        '''

        act_funcs = {
            'relu': F.relu,
            'selu': F.selu}

        return act_funcs[self.act_func](x)

    def make_dataloader(self, X, y=None, shuffle=False, num_workers=8):
        '''
        Wraps a dataloader to iterate over (X, y)
        '''
        if y is not None and isinstance(y, pd.DataFrame):
            y = y.values.ravel()

        check_X_y(X, y)

        loader = data_utils.DataLoader(
            data_utils.TensorDataset(
                torch.from_numpy(X.values).float(),
                torch.from_numpy(y).float() if y is not None else None
            ),
            batch_size=self.batch_size,
            shuffle=shuffle,
            num_workers=num_workers)

        return loader

    def split_train_test(self):
        '''
        Splits Train-Test partitions
        '''

        err_msg = 'X size %s does not match y size %s'
        assert self.X.shape[0] == self.y.shape[0], err_msg % (
            self.X.shape, self.y.shape)

        if (self.train_size < 1) and (self.train_size > 0):
            X_train, X_test, y_train, y_test = train_test_split(
                self.X, self.y, train_size=self.train_size)
        else:
            X_train = self.X
            X_test = self.X
            y_train = self.y
            y_test = self.y

        self.X_train = X_train
        self.X_test = X_test
        self.y_train = y_train
        self.y_test = y_test


class EntEmbNN(NeuralNet):
    '''
    Parameters
    ----------
    cat_emb_dim : dict
        Dictionary containing the embedding sizes.

    layers : list
        NN. Layer arquitecture
    drop_out_layers : dict
        Dictionary with layer dropout
    drop_out_emb : float
        embedding drop out
    batch_size : int
        Mini Batch size
    val_idx : list

    allow_cuda : bool

    act_func : string

    lr : float

    alpha : float

    epochs : int
    '''
    def __init__(
        self,
        cat_emb_dim = {},
        dense_layers = [1000, 500],
        drop_out_layers = [0., 0.],
        drop_out_emb = 0.,
        act_func = 'relu',
        loss_function='MSELoss',
        train_size=1.,
        batch_size=128,
        epochs=10,
        lr=0.001,
        alpha=0.0,
        rand_seed=1,
        allow_cuda=False,
        random_seed=None,
        output_sigmoid=False,
        verbose=True):

        super(EntEmbNN, self).__init__()

        # Model specific params.
        self.cat_emb_dim = cat_emb_dim
        self.dense_layers = dense_layers
        self.output_sigmoid = output_sigmoid

        # Reg. parameters
        self.drop_out_layers = drop_out_layers
        self.drop_out_emb = drop_out_emb
        self.alpha = alpha

        # Training params
        self.act_func = act_func
        self.train_size = train_size
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = lr
        self.loss_function = loss_function

        # Misc
        self.allow_cuda = allow_cuda
        self.verbose = verbose
        self.random_seed = random_seed

        # Internal
        self.embeddings = {}
        self.train_loss = []
        self.train_epoch_loss = []
        self.epochs_reports = []

        self.labelencoders = {}
        self.scaler = None

        self.num_features = None
        self.cat_features = None
        self.layers = {}

    def get_loss(self, loss_name):
        if loss_name == 'SmoothL1Loss':
            return torch.nn.SmoothL1Loss()
        elif loss_name == 'L1Loss':
            return torch.nn.L1Loss()
        elif loss_name == 'MSELoss':
            return torch.nn.MSELoss()
        elif loss_name == 'BCELoss':
            return torch.nn.BCELoss()
        elif loss_name == 'BCEWithLogitsLoss':
            return torch.nn.BCEWithLogitsLoss()
        else:
            print(
                'Invalid Loss name: %s, using default: %s' % (
                    loss_name, 'MSELoss')
            )
            return torch.nn.MSELoss()

    def init_embeddings(self):
        '''
        Initializes the embeddings
        '''

        # Get embedding sizes from categ. features
        for f in self.cat_features:
            le = self.labelencoders[f]

            emb_dim = self.cat_emb_dim[f]

            self.embeddings[f] = nn.Embedding(
                len(le.classes_),
                emb_dim)

            # Weight initialization as original paper
            torch.nn.init.uniform_(
                self.embeddings[f].weight.data,
                a=-.05, b=.05)

            # Add emb. layer to model
            self.add_module(
                '[Emb %s]' % f,
                self.embeddings[f])

    def init_dense_layers(self):
        '''
        Initializes dense layers
        '''

        input_size = (
            # Numb of Embedding neurons in input layer
            sum([
                self.embeddings[f].weight.data.shape[1]
                for f in self.cat_features
            ])
        ) + (
            # Numb of regular neurons for numerical features
            len(self.num_features)
        )

        NN_arquitecture = (
            [input_size]
        ) + (
            self.dense_layers
        ) + (
            [1]
        )

        for layer_idx, current_layer_size in enumerate(NN_arquitecture[:-1]):
            next_layer_size = NN_arquitecture[layer_idx + 1]

            layer_name = 'l%s' % (layer_idx + 1)
            layer = nn.Linear(current_layer_size, next_layer_size)

            self.add_module(layer_name, layer)

            self.layers[layer_name] = layer

    def X_fit(self, X):
        """
        """
        # Identify categorical vs numerical features
        self.cat_features = list(self.cat_emb_dim.keys())
        self.num_features = list(set(
            self.X.columns.tolist()
        ).difference(self.cat_features))

        # Create encoders for categorical features
        self.labelencoders = {}
        for c in self.cat_features:
            le = LabelEncoder()
            le.fit( X[c].astype(str).tolist())
            self.labelencoders[c] = le

    def X_transform(self, X):
        """
        X = X_train.iloc[test_idx]
        """
        X = X.copy()
        for c in self.cat_features:
            codes = X[c].astype(str)

            missin_codes = ~codes.isin(self.labelencoders[c].classes_)

            codes[missin_codes] = self.labelencoders[c].classes_[0]

            X[c] = self.labelencoders[c].transform(
                codes
            )

        X = X[self.cat_features + self.num_features]

        return X

    def X_emd_replace(self, data):
        '''
        Returns the formated X-input, which is composed by the categorical
        embeddings and the respective continuous inputs.
        '''

        ''' Replace embeddings '''
        data_emb = []
        for f_idx, f in enumerate(self.cat_features):
            # Get column feature
            f_data = data[:, f_idx]

            if self.allow_cuda:
                f_data = f_data.cuda()

            # Retrieves the embeddings
            emb_cat = self.embeddings[f](f_data.long())

            #Apply Dropout
            emb_cat = F.dropout(
                emb_cat,
                p=self.drop_out_emb,
                training=self.training)

            data_emb.append(emb_cat)

        ''' Concat numeric features '''
        if len(self.num_features) > 0:
            data_emb.append(
                data[:, len(self.cat_features):]
            )

        return torch.cat(data_emb, 1)

    def fit(self, X, y):
        """

        """

        self.X = X.copy()
        self.y = y.copy()

        self.X_fit(self.X)

        self.split_train_test()

        # Create embeddings and layers
        self.init_embeddings()
        self.init_dense_layers()

        # GPU Flag
        if self.allow_cuda:
            self.cuda()

        self.iterate_n_epochs(epochs=self.epochs)

    def iterate_n_epochs(self, epochs):
        '''
        Makes N training iterations
        epochs = self.epochs
        '''

        self.epoch_cnt = 0
        self.optimizer = torch.optim.Adam(
            self.parameters(),
            lr=self.lr,
            weight_decay=self.alpha
        )

        while(self.epoch_cnt < epochs):
            self.train()
            loss_func = self.get_loss(self.loss_function)

            dataloader = self.make_dataloader(
                # Format X such as categ.first then numeric.
                self.X_transform(self.X_train),
                self.y_train,
                shuffle=True,
            )

            train_epoch_loss = []
            for batch_idx, (x, target) in enumerate(dataloader):
                self.optimizer.zero_grad()

                if self.allow_cuda:
                    x, target = x.cuda(), target.cuda()
                x, target = Variable(x), Variable(target).float()

                self.X_train.iloc[0]
                output = self.forward(x)

                loss = loss_func(
                    output.reshape(1, -1)[0],
                    target.float())

                loss.backward()
                self.optimizer.step()

                train_epoch_loss.append(loss.item())

                # if (batch_idx % self.verbose_epoch) == 0:
                #     if self.verbose:
                #         print('\t\t%s' % (
                #             sum(train_epoch_loss) / len(train_epoch_loss)
                #             )
                #         )

            self.train_epoch_loss.append(
                sum(train_epoch_loss) / len(train_epoch_loss)
            )
            self.train_loss += train_epoch_loss

            self.epoch_cnt += 1
            # self.eval_model()

    def forward(self, x):
        '''
        Forward pass
        x_ = x
        '''

        # Parse batch with embeddings
        x = self.X_emd_replace(x)

        # Forward pass on dense layers
        n_layers = len(self.layers.items())
        for layer_idx in range(n_layers):
            layer_name = 'l%s' % (layer_idx + 1)
            layer = self.layers[layer_name]

            is_inner_layer = (
                layer_idx < len(self.dense_layers)
            )

            x = layer(x)

            # Do not apply act.func on last layer
            if is_inner_layer:

                # Apply dropout
                x = F.dropout(
                    x,
                    p=self.drop_out_layers[layer_idx],
                    training=self.training)

                x = self.activ_func(x)

            elif self.output_sigmoid:
                x = torch.sigmoid(x)

        return x

    def predict_raw(self, X):
        '''
        Predict scores

        self = NNmodel
        X  = self.X_test
        '''

        #Set pytorch model in eval. mode
        self.eval()

        dataloader = self.make_dataloader(self.X_transform(X))

        y_pred = []
        for batch_idx, (x, _) in enumerate(dataloader):
            if self.allow_cuda:
                x = x.cuda()
            x = Variable(x)

            output = self.forward(x)

            if self.allow_cuda:
                output = output.cpu()
            y_pred += output.data.numpy().flatten().tolist()

        y_pred = np.array(y_pred)

        return y_pred

#    def get_embeddings(self):
#
#        embeddings = {}
#        for c in self.cat_features:
#            categ_names = self.X[c].drop_duplicates()
#            categ_codes = categ_names.cat.codes
#            categories = pd.Series(
#                [x for x in categ_names],
#                index=categ_codes.values)
#            categories.sort_index(inplace=True)
#            categories.index = categories.index + 1
#
#            emb = self.embeddings[c].weight.data
#            if self.allow_cuda:
#                emb = emb.cpu()
#
#            emb = pd.DataFrame(
#                emb.numpy(),
#                index=categories.values)
#            emb = emb.add_prefix('latent_')
#            embeddings[c] = emb
#
#        return embeddings
    def dump_embeddings(self, emb_path):
        """
        Dump embeddings to hdf file.
        """
        print('Saving in: %s' % emb_path)
        for emb_name, emb_pytorch in self.embeddings.items():
            emb = pd.DataFrame(
                emb_pytorch.weight.data.numpy(),
                index=self.labelencoders[emb_name].classes_)
            print('\t%s' % emb_name)

            emb.to_hdf(emb_path, key=emb_name)

    def get_data_embeddings(self, X_raw):
        """
        Transforma a X matrix substituing the embeddings on the X matrix.
        """

        data_emb = []
        for emb_name in self.cat_features:

            emb_pytorch = self.embeddings[emb_name]

            emb = pd.DataFrame(
                emb_pytorch.weight.data.numpy(),
                index=self.labelencoders[emb_name].classes_
            ).add_prefix('%s_' % emb_name)

            x = emb.loc[X_raw[emb_name]]
            x = x.reset_index().drop('index', axis=1)
            x.index = X_raw.index

            data_emb.append(x)

        data_emb.append(X_raw[self.num_features])

        X_emb = pd.concat(data_emb, axis=1)

        return X_emb

    def load(self, filename):
        """
        Loads a saved model
        """

        f = open(filename, 'rb')
        tmp_dict = pickle.load(f)
        f.close()

        self.__dict__.update(tmp_dict)

    def save(self, filename):
        """
        Saves as pickle object
        """

        f = open(filename, 'wb')
        pickle.dump(self.__dict__, f, 2)
        f.close()

In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Tue Jun 26 21:54:50 2018

@author: raulsanchez
"""

import pandas as pd
import numpy as np
from sklearn import metrics
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import LabelBinarizer

def RMSLE(y_true, y_pred): 
    """
    Root Mean Squared Logarithmic Error
    """
    n = y_true.shape[0]
    
    return np.sqrt(
        (1 /n)  * (
            ( np.log1p(y_pred) - np.log1p(y_true) ) ** 2
        ).sum()
        )
    
    
def gini(solution, submission):
    '''
    '''                                 
    
    df = sorted(zip(solution, submission), key=lambda x : (x[1], x[0]),  reverse=True)
    random = [float(i+1)/float(len(df)) for i in range(len(df))]                
    totalPos = np.sum([x[0] for x in df])                                       
    cumPosFound = np.cumsum([x[0] for x in df])                                     
    Lorentz = [float(x)/totalPos for x in cumPosFound]                          
    Gini = [l - r for l, r in zip(Lorentz, random)]                             
    return np.sum(Gini)                                                         


def gini_norm(y_pred, y_true):
    '''
    '''
    
    normalized_gini = gini(y_pred, y_true)/gini(y_true, y_true)
    return normalized_gini    

def MAPE(y_true, y_pred):
    '''
    mean absolute percentage error
    '''
    
    abs_err = np.absolute((y_true - y_pred))
    percent_err = abs_err / y_true
    mape = np.sum(percent_err) / len(y_true)
    
    return mape

def RMSPE(y_true, y_pred):
    '''
    Root Mean Square Percentage Error (RMSPE).
    '''
    square_percent_err = ((y_true - y_pred) / y_true) ** 2
    mean_square_percent_err = pd.Series(square_percent_err).mean()
    
    rmspe = np.sqrt(mean_square_percent_err)
    
    return rmspe

def eval_regression(y_true, y_pred):
    reg_metrics = {
        'mean_absolute_error': metrics.mean_absolute_error(
            y_true=y_true, 
            y_pred=y_pred),
        'explained_variance_score': metrics.explained_variance_score(
            y_true=y_true, 
            y_pred=y_pred),
        'mean_squared_error': metrics.mean_squared_error(
            y_true=y_true, 
            y_pred=y_pred),
        'median_absolute_error': metrics.median_absolute_error(
            y_true=y_true, 
            y_pred=y_pred),
        'r2_score': metrics.r2_score(
            y_true=y_true, 
            y_pred=y_pred),
        'gini_normalized': gini_norm(
            y_true=y_true, 
            y_pred=y_pred),
        'mean_absolute_percentage_error': MAPE(
            y_true=y_true, 
            y_pred=y_pred),
        'root_mean_squared_logarithmic_error': RMSLE(
            y_true=y_true, 
            y_pred=y_pred)
        }
        
    return pd.Series(reg_metrics)

    
def classification_report(y_true, y_pred, y_score=None, average='micro'):
    '''
    Params:
    --------
    y_true : 1d array-like, or label indicator array / sparse matrix
    Ground truth (correct) target values.

    y_pred : 1d array-like, or label indicator array / sparse matrix
    Estimated targets as returned by a classifier.
    
    y_score : nd array-like with the probabilities of the classes.
    
    average : str. either 'micro' or 'macro', for more details
        of how they are computed see:
            http://scikit-learn.org/stable/auto_examples/model_selection/plot_roc.html#multiclass-settings
    Return:
    --------
    pd.DataFrame : contains the classification report as pandas.DataFrame 
    
    Example:
    ---------
    from sklearn.metrics import classification_report
    
    from sklearn.model_selection import train_test_split
    from sklearn.ensemble import RandomForestClassifier
    from sklearn.datasets import make_classification
    
    X, y = make_classification(n_samples=5000, n_features=10,
                               n_informative=5, n_redundant=0,
                               n_classes=10, random_state=0, 
                               shuffle=False)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y)
    
    model = RandomForestClassifier(max_depth=2, random_state=0)
    model.fit(X_train, y_train)
    
    sk_report = classification_report(
        digits=6,
        y_true=y_test, 
        y_pred=model.predict(X_test))
    
    report_with_auc = class_report(
        y_true=y_test, 
        y_pred=model.predict(X_test), 
        y_score=model.predict_proba(X_test))
    
    print(sk_report)
    
    Out:
             precision    recall  f1-score   support

          0   0.267101  0.645669  0.377880       127
          1   0.361905  0.290076  0.322034       131
          2   0.408451  0.243697  0.305263       119
          3   0.345455  0.327586  0.336283       116
          4   0.445652  0.333333  0.381395       123
          5   0.413793  0.095238  0.154839       126
          6   0.428571  0.474820  0.450512       139
          7   0.446809  0.169355  0.245614       124
          8   0.302703  0.466667  0.367213       120
          9   0.373333  0.448000  0.407273       125

    avg / total   0.379944  0.351200  0.335989      1250
        
    print(report_with_auc)
    
    Out:
                precision    recall  f1-score  support    pred       AUC
    0             0.267101  0.645669  0.377880    127.0   307.0  0.810550
    1             0.361905  0.290076  0.322034    131.0   105.0  0.777579
    2             0.408451  0.243697  0.305263    119.0    71.0  0.823277
    3             0.345455  0.327586  0.336283    116.0   110.0  0.844390
    4             0.445652  0.333333  0.381395    123.0    92.0  0.811389
    5             0.413793  0.095238  0.154839    126.0    29.0  0.654790
    6             0.428571  0.474820  0.450512    139.0   154.0  0.876458
    7             0.446809  0.169355  0.245614    124.0    47.0  0.777237
    8             0.302703  0.466667  0.367213    120.0   185.0  0.799735
    9             0.373333  0.448000  0.407273    125.0   150.0  0.825959
    avg / total   0.379944  0.351200  0.335989   1250.0  1250.0  0.800534
    
    '''
    
    if y_true.shape != y_pred.shape:
        print("Error! y_true %s is not the same shape as y_pred %s" % (
              y_true.shape,
              y_pred.shape)
        )
        return
    
    lb = LabelBinarizer()
    
    if len(y_true.shape) == 1:
        lb.fit(y_true)
    
    #Value counts of predictions
    labels, cnt = np.unique(
        y_pred,
        return_counts=True)
    n_classes = len(labels)
    all_labels = set(labels).union(np.unique(y_true))
    
    pred_cnt = pd.Series(cnt, index=labels)
    pred_cnt = pred_cnt.loc[
        all_labels
    ].fillna(0)
    
    metrics_summary = precision_recall_fscore_support(
            y_true=y_true,
            y_pred=y_pred,
            labels=list(all_labels))

    avg = list(precision_recall_fscore_support(
            y_true=y_true, 
            y_pred=y_pred,
            average='weighted'))

    metrics_sum_index = ['precision', 'recall', 'f1-score', 'support']
    class_report_df = pd.DataFrame(
        list(metrics_summary),
        index=metrics_sum_index,
        columns=all_labels)
    
    for l in (all_labels - set(class_report_df.columns)):
        class_report_df[l] = 0
    
    support = class_report_df.loc['support']
    total = support.sum() 
    class_report_df['avg / total'] = avg[:-1] + [total]
    
    class_report_df = class_report_df.T
    class_report_df['pred'] = pred_cnt
    class_report_df['pred'].iloc[-1] = total
    
    if not (y_score is None):
        fpr = dict()
        tpr = dict()
        roc_auc = dict()
        for label_it, label in enumerate(labels):
            fpr[label], tpr[label], _ = roc_curve(
                (y_true == label).astype(int), 
                y_score[:, label_it])
            
            roc_auc[label] = auc(fpr[label], tpr[label])
        
        if average == 'micro':
            if n_classes <= 2:
                fpr["avg / total"], tpr["avg / total"], _ = roc_curve(
                    lb.transform(y_true).ravel(), 
                    y_score[:, 1].ravel())
            else:
                fpr["avg / total"], tpr["avg / total"], _ = roc_curve(
                        lb.transform(y_true).ravel(), 
                        y_score.ravel())
            
            roc_auc["avg / total"] = auc(
                fpr["avg / total"], 
                tpr["avg / total"])
        
        elif average == 'macro':
            # First aggregate all false positive rates
            all_fpr = np.unique(np.concatenate([
                fpr[i] for i in labels]
            ))
            
            # Then interpolate all ROC curves at this points
            mean_tpr = np.zeros_like(all_fpr)
            for i in labels:
                mean_tpr += interp(all_fpr, fpr[i], tpr[i])
            
            # Finally average it and compute AUC
            mean_tpr /= n_classes
            
            fpr["macro"] = all_fpr
            tpr["macro"] = mean_tpr
            
            roc_auc["avg / total"] = auc(fpr["macro"], tpr["macro"])
        
        class_report_df['AUC'] = pd.Series(roc_auc)
    
    return class_report_df

In [None]:
#!/usr/bin/env python2
# -*- coding: utf-8 -*-
"""
Created on Thu Aug  9 14:25:31 2018

@author: lsanchez
"""

# from EENN import EntEmbNN
# import eval_utils


class EntEmbNNRegression(EntEmbNN):
    '''
    Parameters
    ----------
    cat_emb_dim : dict
        Dictionary containing the embedding sizes.

    layers : list
        NN. Layer arquitecture
    drop_out_layers : dict
        Dictionary with layer dropout
    drop_out_emb : float
        embedding drop out
    batch_size : int
        Mini Batch size
    val_idx : list

    allow_cuda : bool

    act_func : string

    lr : float

    alpha : float

    epochs : int
    '''

    def __init__(
        self,
        cat_emb_dim = {},
        dense_layers = [1000, 500],
        drop_out_layers = [0., 0.],
        drop_out_emb = 0.,
        act_func = 'relu',
        loss_function='MSELoss',
        train_size=1.,
        batch_size=128,
        epochs=10,
        lr=0.001,
        alpha=0.0,
        rand_seed=1,
        allow_cuda=False,
        random_seed=None,
        output_sigmoid=False,
        verbose=False):

        super(EntEmbNNRegression, self).__init__()

        # Model specific params.
        self.cat_emb_dim = cat_emb_dim
        self.dense_layers = dense_layers
        self.output_sigmoid = output_sigmoid

        # Reg. parameters
        self.drop_out_layers = drop_out_layers
        self.drop_out_emb = drop_out_emb
        self.alpha = alpha

        # Training params
        self.act_func = act_func
        self.train_size = train_size
        self.batch_size = batch_size
        self.epochs = epochs
        self.lr = lr
        self.loss_function = loss_function

        # Misc
        self.allow_cuda = allow_cuda
        self.verbose = verbose
        self.random_seed = random_seed

        # Internal
        self.embeddings = {}
        self.train_loss = []
        self.epochs_reports = []

        self.labelencoders = {}
        self.scaler = None

        self.num_features = None
        self.cat_features = None
        self.layers = {}

    def predict(self, X):
        """
        """

        return self.predict_raw(X)

    def eval_model(self):
        '''
        Model evaluation
        '''

        self.eval()

        test_y_pred = self.predict(self.X_test)

        report = eval_regression(
            y_true=self.y_test,
            y_pred=test_y_pred)

        msg = "\t[%s] Test: MSE:%s MAE: %s gini: %s R2: %s MAPE: %s"

        msg_params = (
            self.epoch_cnt,
            round(report['mean_squared_error'], 6),
            round(report['mean_absolute_error'], 6),
            round(report['gini_normalized'], 6),
            round(report['r2_score'], 6),
            round(report['mean_absolute_percentage_error'], 6))

        self.epochs_reports.append(report)

        if self.verbose:
            print(msg % (msg_params))

In [None]:
import pandas as pd
# import datasets
# import eval_utils
import numpy as np

# from EENNRegression import EntEmbNNRegression

# X, y, X_test, y_test = datasets.get_X_train_test_data()
X, X_test,y, y_test = train_test_split(train_data_x, train_data_y, test_size=0.1)

# This normalization comes from original Entity Emb. original Code
# y = pd.DataFrame(y)
# y_test = pd.DataFrame(y_test)
y_max = max(y.max(axis=None), y_test.max(axis=None))
y = np.log(y) / np.log(y_max)
y_test = np.log(y_test) / np.log(y_max)
# X = pd.DataFrame(X)
# X_test = pd.DataFrame(X_test)
# print(X.shape)
# X_test.head(10)
# y_test.head(10)
for data in [X, X_test]:
    data.drop('Open', inplace=True, axis=1)

In [None]:
models = []
for _ in range(5):
    m = EntEmbNNRegression(
        cat_emb_dim={
            'Store': 10,
            'DayOfWeek': 6,
            'Promo': 1,
            'Year': 2,
            'Month': 6,
            'Day': 10,
            'State': 6},
        alpha=0,
        dense_layers=[1000, 500],
        drop_out_layers=[0., 0.],
        drop_out_emb=0.,
        loss_function='L1Loss',
        train_size=1., 
        verbose=True)

    m.fit(X, y)
    models.append(m)
    print('\n')

test_y_pred = np.array([model.predict(X_test) for model in models])
test_y_pred = test_y_pred.mean(axis=0)

print('MAPE: %s' % MAPE(
    y_true=y_test.values.flatten(),
    y_pred=test_y_pred))













ValueError: estimator requires y to be passed, but the target y is None