In [None]:
import tensorflow as tf
import numpy as np
import copy
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [None]:
def sequence_log_loss(y, y_hat, sequence_lengths, max_sequence_length, eps=1e-15):
    """
    Calculates average log loss on variable length sequences.

    Steps at or beyond each sequence's length are masked out and the summed loss
    is divided by the total number of unmasked steps.

    Args:
        y: Label tensor.  It is multiplied elementwise by a mask of shape
            [batch size, max_sequence_length], so it must be compatible with that shape.
        y_hat: Prediction tensor (probabilities), same shape as y.
        sequence_lengths: Sequence lengths.  Tensor of shape [batch_size].
        max_sequence_length: maximum length of padded sequence tensor.
        eps: Smallest value either probability (y_hat or 1 - y_hat) may take
            before its log is computed.

    Returns:
        Log loss. 0-dimensional tensor.
    """
    y = tf.cast(y, tf.float32)
    y_hat = tf.math.minimum(tf.math.maximum(y_hat, eps), 1.0 - eps)
    # In float32, 1.0 - 1e-15 rounds to exactly 1.0, so the clip above cannot keep
    # y_hat away from 1.  Bound the complement separately; otherwise a saturated
    # prediction gives log(0) = -inf and 0 * -inf = NaN for a positive label.
    one_minus_y_hat = tf.math.maximum(1.0 - y_hat, eps)
    log_losses = y*tf.math.log(y_hat) + (1.0 - y)*tf.math.log(one_minus_y_hat)
    sequence_mask = tf.cast(tf.sequence_mask(sequence_lengths, maxlen=max_sequence_length), tf.float32)
    total_steps = tf.cast(tf.math.reduce_sum(sequence_lengths), tf.float32)
    avg_log_loss = -tf.reduce_sum(log_losses*sequence_mask) / total_steps
    return avg_log_loss
def log_loss(y, y_hat, eps=1e-15):
    """
    Calculates log loss between two tensors.

    Args:
        y: Label tensor.
        y_hat: Prediction tensor (probabilities).
        eps: Smallest value either probability (y_hat or 1 - y_hat) may take
            before its log is computed.

    Returns:
        Log loss. 0-dimensional tensor.
    """
    y = tf.cast(y, tf.float32)
    y_hat = tf.math.minimum(tf.math.maximum(y_hat, eps), 1.0 - eps)
    # In float32, 1.0 - 1e-15 rounds to exactly 1.0, so the clip above cannot keep
    # y_hat away from 1.  Bound the complement separately to avoid log(0) = -inf
    # (and the 0 * -inf = NaN that follows for a positive label).
    one_minus_y_hat = tf.math.maximum(1.0 - y_hat, eps)
    # Named `loss` rather than `log_loss` so the local does not shadow this function.
    loss = -tf.math.reduce_mean(y*tf.math.log(y_hat) + (1.0 - y)*tf.math.log(one_minus_y_hat))
    return loss
def sequence_rmse(y, y_hat, sequence_lengths, max_sequence_length):
    """
    Root mean squared error over the valid steps of padded sequences.

    Squared errors are zeroed wherever the step index is at or beyond the
    sequence's length, summed, and divided by the sum of all sequence lengths
    before the square root is taken.

    Args:
        y: Label tensor.  It is multiplied elementwise by a mask of shape
            [batch size, max_sequence_length], so it must be compatible with that shape.
        y_hat: Prediction tensor, same shape as y.
        sequence_lengths: Sequence lengths.  Tensor of shape [batch_size].
        max_sequence_length: maximum length of padded sequence tensor.

    Returns:
        RMSE. 0-dimensional tensor.
    """
    labels = tf.cast(y, tf.float32)
    valid_steps = tf.cast(
        tf.sequence_mask(sequence_lengths, maxlen=max_sequence_length),
        tf.float32,
    )
    total_steps = tf.cast(tf.math.reduce_sum(sequence_lengths), tf.float32)
    masked_squared_error = tf.square(labels - y_hat)*valid_steps
    mean_squared_error = tf.math.reduce_sum(masked_squared_error) / total_steps
    return tf.math.sqrt(mean_squared_error)

In [None]:
class DataFrame(object):

    """Minimal pd.DataFrame analog for handling n-dimensional numpy matrices with additional
    support for shuffling, batching, and train/test splitting.

    Rows are addressed through ``self.idx``, a permutation of ``range(len(self))`` that
    ``shuffle`` reorders in place; the matrices themselves are never reordered.

    Args:
        columns: List of names corresponding to the matrices in data.
        data: List of n-dimensional data matrices ordered in correspondence with columns.
            All matrices must have the same leading dimension.  Data can also be fed a list of
            instances of np.memmap, in which case RAM usage can be limited to the size of a
            single batch.
    """

    def __init__(self, columns, data):
        assert len(columns) == len(data), 'columns length does not match data length'

        lengths = [mat.shape[0] for mat in data]
        assert len(set(lengths)) == 1, 'all matrices in data must have same first dimension'

        self.length = lengths[0]
        # Keep private list copies: __setitem__ appends to these, and that must not
        # leak into the lists the caller handed in.
        self.columns = list(columns)
        self.data = list(data)
        self.dict = dict(zip(self.columns, self.data))
        self.idx = np.arange(self.length)

    def shapes(self):
        """Return a pd.Series mapping each column name to its matrix shape."""
        return pd.Series(dict(zip(self.columns, [mat.shape for mat in self.data])))

    def dtypes(self):
        """Return a pd.Series mapping each column name to its matrix dtype."""
        return pd.Series(dict(zip(self.columns, [mat.dtype for mat in self.data])))

    def shuffle(self):
        """Shuffle the row order (``self.idx``) in place."""
        np.random.shuffle(self.idx)

    def train_test_split(self, train_size, random_state=None):
        """Split the rows into two new DataFrames.

        Args:
            train_size: Forwarded to sklearn's ``train_test_split``.
            random_state: Seed forwarded to sklearn.  When None, a fresh seed is drawn
                on every call.  (A default of ``np.random.randint(10000)`` in the
                signature would be evaluated once, at definition time, and then
                silently reused by every call.)

        Returns:
            Tuple ``(train_df, test_df)``.
        """
        if random_state is None:
            random_state = np.random.randint(10000)
        # Inside a method the bare name resolves to the module-level sklearn function,
        # not to this method.
        train_idx, test_idx = train_test_split(self.idx, train_size=train_size, random_state=random_state)
        train_df = DataFrame(copy.copy(self.columns), [mat[train_idx] for mat in self.data])
        test_df = DataFrame(copy.copy(self.columns), [mat[test_idx] for mat in self.data])
        return train_df, test_df

    def batch_generator(self, batch_size, shuffle=True, num_epochs=3, allow_smaller_final_batch=False):
        """Yield batches as new DataFrames holding copies of the selected rows.

        Args:
            batch_size: Number of rows per batch.
            shuffle: Reshuffle the row order at the start of every epoch.
            num_epochs: Number of passes over the data.
            allow_smaller_final_batch: When False, a trailing batch with fewer than
                batch_size rows is dropped.
        """
        epoch_num = 0
        while epoch_num < num_epochs:
            if shuffle:
                self.shuffle()

            for i in range(0, self.length, batch_size):
                batch_idx = self.idx[i: i + batch_size]
                if not allow_smaller_final_batch and len(batch_idx) != batch_size:
                    break
                yield DataFrame(columns=copy.copy(self.columns), data=[mat[batch_idx].copy() for mat in self.data])

            epoch_num += 1

    def iterrows(self):
        """Yield every row as a pd.Series, in the current ``self.idx`` order."""
        # Iterate positions, not the entries of self.idx: __getitem__ already maps a
        # position through self.idx, and the entries are numpy integers.
        for position in range(self.length):
            yield self[position]

    def mask(self, mask):
        """Return a new DataFrame with every matrix indexed by ``mask``."""
        return DataFrame(copy.copy(self.columns), [mat[mask] for mat in self.data])

    def __iter__(self):
        """Iterate over ``(column name, matrix)`` pairs."""
        return iter(self.dict.items())

    def __len__(self):
        return self.length

    def __getitem__(self, key):
        """Return a column (str key) or a row as a pd.Series (integer position).

        Raises:
            TypeError: If key is neither a string nor an integer.
        """
        if isinstance(key, str):
            return self.dict[key]

        # np.integer is accepted too: numpy integers are not instances of int.
        if isinstance(key, (int, np.integer)):
            row = self.idx[key]
            return pd.Series(dict(zip(self.columns, [mat[row] for mat in self.data])))

        raise TypeError('key must be a column name or an integer row position, got {}'.format(type(key).__name__))

    def __setitem__(self, key, value):
        """Add a new column or replace an existing one."""
        assert value.shape[0] == len(self), 'matrix first dimension does not match'
        if key not in self.columns:
            self.columns.append(key)
            self.data.append(value)
        else:
            # Keep self.data in step with self.dict; row access, mask() and
            # batch_generator() all read from self.data.
            self.data[self.columns.index(key)] = value
        self.dict[key] = value
class DataReader(object):
    """Loads one ``<column>.npy`` file per column from a directory and serves batches.

    The full set of loaded rows is kept as ``test_df``; ``train_df`` and ``val_df``
    are a 90% / 10% split of those same rows.
    """

    def __init__(self, data_dir):
        data_cols = [
            'user_id',
            'aisle_id',
            'department_id',
            'eval_set',
            'is_ordered_history',
            'index_in_order_history',
            'order_dow_history',
            'order_hour_history',
            'days_since_prior_order_history',
            'order_size_history',
            'order_number_history',
            'num_products_from_aisle_history',
            'history_length',
        ]
        # Memory-map every column instead of reading it fully into RAM.
        data = []
        for col in data_cols:
            npy_path = os.path.join(data_dir, col + '.npy')
            data.append(np.load(npy_path, mmap_mode='r'))

        self.test_df = DataFrame(columns=data_cols, data=data)
        self.train_df, self.val_df = self.test_df.train_test_split(train_size=0.9)

    def train_batch_generator(self, batch_size):
        """Shuffled batches over ``train_df`` for 80 epochs."""
        return self.batch_generator(
            batch_size=batch_size, df=self.train_df, shuffle=True, num_epochs=80, is_test=False)

    def val_batch_generator(self, batch_size):
        """Shuffled batches over ``val_df`` for 80 epochs."""
        return self.batch_generator(
            batch_size=batch_size, df=self.val_df, shuffle=True, num_epochs=80, is_test=False)

    def test_batch_generator(self, batch_size):
        """A single unshuffled pass over ``test_df``, keeping the smaller final batch."""
        return self.batch_generator(
            batch_size=batch_size, df=self.test_df, shuffle=False, num_epochs=1, is_test=True)

    def batch_generator(self, batch_size, df, shuffle=True, num_epochs=10000, is_test=False):
        """Yield batches from ``df`` with the order-context columns shifted one step left.

        Each listed history column is rolled by -1 along axis 1 (with wrap-around), and
        ``next_is_ordered`` is added as ``is_ordered_history`` rolled the same way.
        Outside of test mode ``history_length`` is reduced by one.

        Args:
            batch_size: Rows per batch.
            df: Frame to draw batches from.
            shuffle: Forwarded to ``df.batch_generator``.
            num_epochs: Forwarded to ``df.batch_generator``.
            is_test: When True the final smaller batch is allowed and
                ``history_length`` is left untouched.
        """
        rolled_in_place = (
            'order_dow_history',
            'order_hour_history',
            'days_since_prior_order_history',
            'order_number_history',
        )
        source = df.batch_generator(
            batch_size,
            shuffle=shuffle,
            num_epochs=num_epochs,
            allow_smaller_final_batch=is_test,
        )
        for batch in source:
            for col in rolled_in_place:
                batch[col] = np.roll(batch[col], -1, axis=1)
            batch['next_is_ordered'] = np.roll(batch['is_ordered_history'], -1, axis=1)
            if not is_test:
                batch['history_length'] = batch['history_length'] - 1
            yield batch


In [None]:
class LSTM_layer(tf.keras.layers.Layer):
    """Single-cell LSTM run over a sequence, returning the output at every step.

    Args:
        state_size: Number of units in the LSTM cell.
        return_final_state: When True, ``call`` also returns the state reported by
            the underlying RNN layer.
    """

    def __init__(self, state_size, return_final_state=False):
        super().__init__()
        # NOTE(review): cell_fw is created but not used by call(); kept as-is.
        self.cell_fw = tf.keras.layers.LSTMCell(state_size)
        self.return_final_state = return_final_state
        stacked_cells = [tf.keras.layers.LSTMCell(state_size)]
        self.rnn_layers = tf.keras.layers.RNN(
            cell=stacked_cells,
            return_sequences=True,
            return_state=True,
        )

    def call(self, inputs):
        outputs, output_state = self.rnn_layers(inputs)
        if not self.return_final_state:
            return outputs
        return outputs, output_state
class TCL(tf.keras.layers.Layer):
    """Temporal (1-D) convolution layer with optional causal padding.

    Args:
        output_units: Number of output channels.
        convolution_width: Width of the convolution filter along the time axis.
        causal: When True, the time axis (axis 1) is left-padded with
            ``dilation_rate[0] * (convolution_width - 1)`` zeros so that the VALID
            convolution keeps the input length.
        dilation_rate: One-element sequence holding the dilation; defaults to [32].
        bias: Whether to add a bias vector.
        activation: Optional callable applied after the bias.
        dropout: Optional dropout rate; no dropout is applied when None.
    """

    def __init__(self, output_units, convolution_width, causal=False, dilation_rate=None, bias=True, activation=None, dropout=None):
        super().__init__()
        self.output_units = output_units
        self.convolution_width = convolution_width
        self.causal = causal
        # A None sentinel replaces the former mutable default list shared by all instances.
        self.dilation_rate = [32] if dilation_rate is None else list(dilation_rate)
        self.bias = bias
        self.activation = activation
        self.dropout = dropout
        # Build the dropout sub-layer once, and only when a rate was given, rather
        # than constructing Dropout(None) on every forward pass.
        self.dropout_layer = tf.keras.layers.Dropout(dropout) if dropout is not None else None

    def build(self, inputs_shape):
        # Filter shape: [width, input channels, output channels].
        self.w = tf.Variable(name='weights', initial_value=tf.keras.initializers.VarianceScaling()(shape=[self.convolution_width, inputs_shape[2], self.output_units]), dtype=tf.float32)
        if self.bias:
            self.b = tf.Variable(name='biases', initial_value=tf.constant_initializer(0.1)(shape=[self.output_units]), dtype=tf.float32)

    def call(self, inputs):
        if self.causal:
            shift = self.dilation_rate[0]*(self.convolution_width-1)
            # Match the input dtype so the concat below cannot fail on a dtype mismatch.
            pad = tf.zeros(shape=[tf.shape(inputs)[0], shift, inputs.shape.as_list()[2]], dtype=inputs.dtype)
            inputs = tf.concat([pad, inputs], axis=1)
        z = tf.nn.convolution(inputs, self.w, padding='VALID', dilations=self.dilation_rate)
        if self.bias:
            z = z+self.b
        z = self.activation(z) if self.activation else z
        if self.dropout_layer is not None:
            z = self.dropout_layer(z)
        return z
class Time_distributed_dense_layer(tf.keras.layers.Layer):
    """Dense layer applied to the last axis of a rank-3 input at every time step.

    Args:
        output_units: Size of the output's last axis.
        bias: Whether to add a bias vector.
        activation: Optional callable applied after bias / batch norm.
        batch_norm: When not None, batch normalization is applied and this value is
            passed to it as the ``training`` flag.
        dropout: Optional dropout rate; no dropout is applied when None.
    """

    def __init__(self,output_units, bias=True, activation=None, batch_norm=None, dropout=None):
        super().__init__()
        self.output_units = output_units
        self.bias = bias
        self.activation = activation
        self.batch_norm = batch_norm
        self.dropout = dropout
        # Sub-layers are created once here.  Creating BatchNormalization inside call()
        # would start from fresh scale/offset and moving statistics on every forward
        # pass, so nothing it learned would persist.
        self.batch_norm_layer = tf.keras.layers.BatchNormalization() if batch_norm is not None else None
        self.dropout_layer = tf.keras.layers.Dropout(dropout) if dropout is not None else None

    def build(self, inputs_shape):
        self.w = tf.Variable(name='weights', initial_value=tf.keras.initializers.VarianceScaling()(shape=[inputs_shape[-1], self.output_units]), dtype=tf.float32)
        if self.bias:
            self.b = tf.Variable(name='biases', initial_value=tf.constant_initializer(0.1)(shape=[self.output_units]), dtype=tf.float32)

    def call(self, inputs):
        # Contract the last input axis with the weight matrix for every (i, j) position.
        z = tf.einsum('ijk,kl->ijl', inputs, self.w)
        if self.bias:
            z = z + self.b
        if self.batch_norm_layer is not None:
            z = self.batch_norm_layer(z, training=self.batch_norm)
        z = self.activation(z) if self.activation else z
        if self.dropout_layer is not None:
            z = self.dropout_layer(z)
        return z

class Dense_layer(tf.keras.layers.Layer):
    """Fully connected layer: ``inputs @ w`` plus optional bias, batch norm, activation and dropout.

    Args:
        output_units: Size of the output's last axis.
        bias: Whether to add a bias vector.
        activation: Optional callable applied after bias / batch norm.
        batch_norm: When not None, batch normalization is applied and this value is
            passed to it as the ``training`` flag.
        dropout: Optional dropout rate; no dropout is applied when None.
    """

    def __init__(self, output_units, bias=True, activation=None, batch_norm=None, dropout=None):
        super().__init__()
        self.output_units = output_units
        self.bias = bias
        self.activation = activation
        self.batch_norm = batch_norm
        self.dropout = dropout
        # Sub-layers are created once here.  Creating BatchNormalization inside call()
        # would start from fresh scale/offset and moving statistics on every forward
        # pass, so nothing it learned would persist.
        self.batch_norm_layer = tf.keras.layers.BatchNormalization() if batch_norm is not None else None
        self.dropout_layer = tf.keras.layers.Dropout(dropout) if dropout is not None else None

    def build(self, inputs_shape):
        self.w = tf.Variable(name='weights', initial_value=tf.keras.initializers.VarianceScaling()(shape=[inputs_shape[-1], self.output_units]), dtype=tf.float32)
        if self.bias:
            self.b = tf.Variable(name='biases', initial_value=tf.constant_initializer(0.05)(shape=[self.output_units]), dtype=tf.float32)

    def call(self, inputs):
        z = tf.matmul(inputs, self.w)
        if self.bias:
            z = z+self.b
        if self.batch_norm_layer is not None:
            z = self.batch_norm_layer(z, training=self.batch_norm)
        z = self.activation(z) if self.activation else z
        if self.dropout_layer is not None:
            z = self.dropout_layer(z)
        return z

class Wavenet(tf.keras.layers.Layer):
    """Stack of six gated, causal, dilated convolutions with residual and skip paths.

    Args:
        dilations: Sequence of dilation rates; entries 0 through 5 are used.
        filter_widths: Stored on the instance only; every convolution here uses width 2.
        skip_channels: Channels each layer contributes to the concatenated skip output.
        residual_channels: Channels of the running residual stream.
    """

    def __init__(self, dilations, filter_widths, skip_channels, residual_channels):
        super().__init__()
        self.dilations = dilations
        self.filter_widths = filter_widths
        self.skip_channels = skip_channels
        self.residual_channels = residual_channels

        # Projects the input onto the residual stream.
        self.tddl0 = Time_distributed_dense_layer(output_units=residual_channels, activation=tf.keras.activations.tanh)

        # Six causal convolutions (tcl1..tcl6); each emits filter and gate halves.
        for position in range(6):
            conv = TCL(
                output_units=2*residual_channels,
                convolution_width=2,
                causal=True,
                dilation_rate=[dilations[position]],
            )
            setattr(self, 'tcl%d' % (position + 1), conv)
        self.list_tcl = [
            self.tcl1, self.tcl2, self.tcl3, self.tcl4, self.tcl5, self.tcl6,
        ]

        # Six per-step projections (tddl1..tddl6) producing skip + residual channels.
        for position in range(6):
            dense = Time_distributed_dense_layer(output_units=residual_channels + skip_channels)
            setattr(self, 'tddl%d' % (position + 1), dense)
        self.list_tddl = [
            self.tddl1, self.tddl2, self.tddl3, self.tddl4, self.tddl5, self.tddl6,
        ]

    def call(self, inputs):
        z = self.tddl0(inputs)
        collected_skips = []
        for conv_layer, projection in zip(self.list_tcl, self.list_tddl):
            conv_filter, conv_gate = tf.split(conv_layer(z), 2, axis=2)
            gated = tf.keras.activations.tanh(conv_filter)*tf.keras.activations.sigmoid(conv_gate)
            skips, residuals = tf.split(
                projection(gated),
                [self.skip_channels, self.residual_channels],
                axis=2,
            )
            # The residual half feeds the next layer; the skip half goes to the output.
            z = z + residuals
            collected_skips.append(skips)
        return tf.keras.activations.relu(tf.concat(collected_skips, axis=2))

In [None]:
class rnn_aisle(tf.keras.Model):
    """LSTM model emitting one sigmoid prediction per step of a 100-step history.

    The feature attributes created in ``__init__`` start as ``tf.keras.Input``
    tensors and are overwritten with the batch's arrays on every ``call``.

    Args:
        lstm_size: LSTM state size; also the width of the user embedding.
    """

    def __init__(self, lstm_size):
        super().__init__()
        self.lstm_size = lstm_size

        # One value per example.
        for scalar_name in ('user_id', 'aisle_id', 'department_id', 'history_length'):
            setattr(self, scalar_name, tf.keras.Input(shape=[], dtype=tf.int32))

        # One value per example and time step (100 steps).
        for sequence_name in (
            'is_ordered_history',
            'index_in_order_history',
            'order_dow_history',
            'order_hour_history',
            'days_since_prior_order_history',
            'order_size_history',
            'order_number_history',
            'num_products_from_aisle_history',
            'next_is_ordered',
        ):
            setattr(self, sequence_name, tf.keras.Input(shape=[100], dtype=tf.int32))

        self.keep_prob = tf.keras.Input(shape=[], dtype=tf.float32)
        self.is_training = tf.keras.Input(shape=[], dtype=tf.bool)

        self.lstm = LSTM_layer(lstm_size)
        self.tddl1 = Time_distributed_dense_layer(50, activation=tf.keras.activations.relu)
        self.tddl2 = Time_distributed_dense_layer(1, activation=tf.keras.activations.sigmoid)

        def make_embedding(name, shape):
            # A fresh initializer instance is used for every table.
            return tf.Variable(
                name=name,
                initial_value=tf.keras.initializers.VarianceScaling()(shape=shape),
                dtype=tf.float32,
            )

        self.aisle_embeddings = make_embedding('aisle_embeddings', [250, 50])
        self.department_embeddings = make_embedding('department_embeddings', [50, 10])
        self.user_embeddings = make_embedding('user_embeddings', [207000, self.lstm_size])

    def get_sequence(self):
        """Assemble the per-step input tensor from the current feature attributes.

        Returns:
            Concatenation along axis 2 of the history features, the tiled
            aisle/department embeddings and the tiled user embedding.
        """
        # Static embeddings, repeated across the 100 time steps.
        aisle_and_department = tf.concat(
            [
                tf.nn.embedding_lookup(self.aisle_embeddings, self.aisle_id),
                tf.nn.embedding_lookup(self.department_embeddings, self.department_id),
            ],
            axis=1,
        )
        x_aisle = tf.tile(tf.expand_dims(aisle_and_department, 1), (1, 100, 1))
        user_vectors = tf.nn.embedding_lookup(self.user_embeddings, self.user_id)
        x_user = tf.tile(tf.expand_dims(user_vectors, 1), (1, 100, 1))

        # (attribute name, one-hot depth), listed in concatenation order.
        one_hot_specs = (
            ('is_ordered_history', 2),
            ('index_in_order_history', 20),
            ('order_dow_history', 8),
            ('order_hour_history', 25),
            ('days_since_prior_order_history', 31),
            ('order_size_history', 60),
            ('num_products_from_aisle_history', 50),
            ('order_number_history', 101),
        )
        # (attribute name, divisor) for the scaled scalar copies, in concatenation order.
        scalar_specs = (
            ('index_in_order_history', 20.0),
            ('order_dow_history', 8.0),
            ('order_hour_history', 25.0),
            ('days_since_prior_order_history', 31.0),
            ('order_size_history', 60.0),
            ('order_number_history', 100.0),
            ('num_products_from_aisle_history', 50.0),
        )

        history_features = [
            tf.one_hot(getattr(self, name), depth) for name, depth in one_hot_specs
        ]
        history_features.extend(
            tf.expand_dims(tf.cast(getattr(self, name), tf.float32) / divisor, 2)
            for name, divisor in scalar_specs
        )
        x_history = tf.concat(history_features, axis=2)

        return tf.concat([x_history, x_aisle, x_user], axis=2)

    def call(self, inputs):
        """Run a forward pass.

        Args:
            inputs: Iterable of ``(name, data)`` pairs.  Every pair whose name matches
                an existing attribute replaces that attribute with the data converted
                to an int32 array.

        Returns:
            Per-step predictions with the trailing unit axis squeezed away.
        """
        for placeholder_name, data in inputs:
            if not hasattr(self, placeholder_name):
                continue
            setattr(self, placeholder_name, np.asarray(data, dtype=np.int32))

        x = self.get_sequence()
        lstm_outputs = self.lstm(x)
        # Feed the raw inputs to the dense head alongside the LSTM outputs.
        h = tf.concat([lstm_outputs, x], axis=2)
        self.h_final = self.tddl1(h)
        y_hat = tf.squeeze(self.tddl2(self.h_final), 2)

        # (example index, last valid step) pairs used to pick each sequence's final step.
        example_index = tf.range(tf.shape(self.history_length)[0])
        last_step = self.history_length - 1
        final_temporal_idx = tf.stack([example_index, last_step], axis=1)
        self.final_states = tf.gather_nd(self.h_final, final_temporal_idx)
        self.final_predictions = tf.gather_nd(y_hat, final_temporal_idx)
        self.prediction_tensors = {
            'user_ids': self.user_id,
            'aisle_ids': self.aisle_id,
            'final_states': self.final_states,
            'predictions': self.final_predictions
        }

        return y_hat

In [None]:
# Root directory of the input data.
base_dir = '../input/'
# Load the .npy columns found under <base_dir>/instacartaisle.
dr = DataReader(data_dir=os.path.join(base_dir, 'instacartaisle'))

In [None]:
# Instantiate the model with an LSTM state size of 300.
rnn = rnn_aisle(lstm_size=300)

In [None]:
# Generator of training batches with 128 rows each.
train_dataset = dr.train_batch_generator(128)

In [None]:
# Generator of validation batches with 128 rows each.
val_dataset = dr.val_batch_generator(128)

In [None]:
import time
epochs = 80
optimizer = tf.keras.optimizers.Adam(0.001)
# NOTE(review): step0 is never incremented below, so the `step0 > 60000` caps can
# never fire.  Left unchanged; confirm whether a global step limit was intended.
step0 = 0
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))
    start_time = time.time()

    # Iterate over the batches of the dataset.  train_dataset is one long generator
    # shared by all epochs; each pass here consumes its next 2729 batches.
    train = list()
    pred = list()
    label = list()
    for step, x_batch_train in enumerate(train_dataset):
        with tf.GradientTape() as tape:
            preds = rnn(x_batch_train, training = True)
            pred.append(preds)
            # The model stores the batch's columns as attributes during the call,
            # so labels and lengths are read back from it.
            label.append(rnn.next_is_ordered)
            loss_value = sequence_log_loss(rnn.next_is_ordered, preds, rnn.history_length, 100)
            train.append(loss_value)
        grads = tape.gradient(loss_value, rnn.trainable_weights)
        optimizer.apply_gradients(zip(grads, rnn.trainable_weights))
        # Log every 200 batches.
        if step % 200 == 0:
            print(
                "Training loss (for one batch) at step %d: %.4f"
                % (step, float(loss_value))
            )
            print("Seen so far: %d samples" % ((step + 1) * 128))
        # End the epoch after batch 2728 and report the mean training loss.
        if step % 2728 == 0 and step > 0:
            print(sum(train)/len(train))
            break
        if step0 > 60000:
            break
    if step0 > 60000:
        break
    val = list()
    for step1, x_batch_val in enumerate(val_dataset):
        # Validation is a pure forward pass: run the model in inference mode so that
        # training-only behaviour (e.g. dropout) is never active while scoring.
        val_preds = rnn(x_batch_val, training=False)
        loss_value = sequence_log_loss(rnn.next_is_ordered, val_preds, rnn.history_length, 100)
        val.append(loss_value)
        if step1 % 10 == 0:
            print("validation", step1)
            print(loss_value)
        # End validation after batch 303 and report the mean validation loss.
        if step1 % 303 == 0 and step1 > 0:
            print(sum(val)/len(val))
            break
            