In [1]:
import copy
import os
import time

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split

In [2]:
class DataFrame(object):

    """Minimal pd.DataFrame analog for handling n-dimensional numpy matrices with additional
    support for shuffling, batching, and train/test splitting.

    Rows are addressed through ``self.idx``, an index array that ``shuffle`` permutes in
    place; the underlying matrices are never reordered.

    Args:
        columns: List of names corresponding to the matrices in data.
        data: List of n-dimensional data matrices ordered in correspondence with columns.
            All matrices must have the same leading dimension.  Data can also be fed a list of
            instances of np.memmap, in which case RAM usage can be limited to the size of a
            single batch.
    """

    def __init__(self, columns, data):
        assert len(columns) == len(data), 'columns length does not match data length'

        lengths = [mat.shape[0] for mat in data]
        assert len(set(lengths)) == 1, 'all matrices in data must have same first dimension'

        self.length = lengths[0]
        # columns and data are stored by reference (not copied); __setitem__ mutates them.
        self.columns = columns
        self.data = data
        self.dict = dict(zip(self.columns, self.data))
        self.idx = np.arange(self.length)

    def shapes(self):
        """Return a pd.Series mapping each column name to the shape of its matrix."""
        return pd.Series(dict(zip(self.columns, [mat.shape for mat in self.data])))

    def dtypes(self):
        """Return a pd.Series mapping each column name to the dtype of its matrix."""
        return pd.Series(dict(zip(self.columns, [mat.dtype for mat in self.data])))

    def shuffle(self):
        """Permute the row order in place by shuffling ``self.idx``."""
        np.random.shuffle(self.idx)

    def train_test_split(self, train_size, random_state=None):
        """Split the rows into two new DataFrames.

        Args:
            train_size: Forwarded to sklearn's ``train_test_split``.
            random_state: Seed forwarded to sklearn.  When None (the default) a fresh seed
                is drawn from ``np.random`` on every call.

        Returns:
            Tuple ``(train_df, test_df)``.
        """
        # Draw the seed at call time.  A default of np.random.randint(...) in the signature
        # is evaluated only once, when the class is defined, so every call would silently
        # reuse the same seed.
        if random_state is None:
            random_state = np.random.randint(10000)
        train_idx, test_idx = train_test_split(self.idx, train_size=train_size, random_state=random_state)
        train_df = DataFrame(copy.copy(self.columns), [mat[train_idx] for mat in self.data])
        test_df = DataFrame(copy.copy(self.columns), [mat[test_idx] for mat in self.data])
        return train_df, test_df

    def batch_generator(self, batch_size, shuffle=True, num_epochs=3, allow_smaller_final_batch=False):
        """Yield successive batches as new DataFrames holding copies of the selected rows.

        Args:
            batch_size: Number of rows per batch.
            shuffle: If True, reshuffle the row order at the start of every epoch.
            num_epochs: Number of full passes over the data before the generator stops.
            allow_smaller_final_batch: If False, a trailing batch with fewer than
                ``batch_size`` rows is dropped.
        """
        epoch_num = 0
        while epoch_num < num_epochs:
            if shuffle:
                self.shuffle()

            for i in range(0, self.length, batch_size):
                batch_idx = self.idx[i: i + batch_size]
                if not allow_smaller_final_batch and len(batch_idx) != batch_size:
                    break
                yield DataFrame(columns=copy.copy(self.columns), data=[mat[batch_idx].copy() for mat in self.data])

            epoch_num += 1

    def iterrows(self):
        """Yield every row as a pd.Series, in the current (possibly shuffled) order."""
        # Iterate over positions, not over the values of self.idx: __getitem__ already
        # maps a position through self.idx, so passing idx values would index twice.
        for position in range(self.length):
            yield self[position]

    def mask(self, mask):
        """Return a new DataFrame with ``mask`` applied to every matrix."""
        return DataFrame(copy.copy(self.columns), [mat[mask] for mat in self.data])

    def __iter__(self):
        """Iterate over ``(column_name, matrix)`` pairs."""
        return self.dict.items().__iter__()

    def __len__(self):
        return self.length

    def __getitem__(self, key):
        """Look up a column by name or a row by position.

        Args:
            key: A column name (str), or an integer position into the current row order.
                NumPy integer scalars are accepted as well as Python ints.

        Returns:
            The column's matrix for a str key; a pd.Series mapping column name to that
            row's entry for an integer key.

        Raises:
            KeyError: If a str key is not a known column.
            TypeError: If key is neither a str nor an integer.
        """
        if isinstance(key, str):
            return self.dict[key]

        # np.integer is required in addition to int: values taken from a NumPy index
        # array are np.int64, which is not a subclass of the builtin int.
        if isinstance(key, (int, np.integer)):
            row = self.idx[key]
            return pd.Series(dict(zip(self.columns, [mat[row] for mat in self.data])))

        raise TypeError('key must be a column name or an integer row position, got {}'.format(type(key).__name__))

    def __setitem__(self, key, value):
        """Add a new column or replace an existing one.

        Args:
            key: Column name.
            value: Matrix whose leading dimension equals ``len(self)``.
        """
        assert value.shape[0] == len(self), 'matrix first dimension does not match'
        if key in self.columns:
            # Replace the matrix in self.data as well, so that data-based operations
            # (batching, masking, row lookup) see the new values and not the stale ones.
            self.data[self.columns.index(key)] = value
        else:
            self.columns.append(key)
            self.data.append(value)
        self.dict[key] = value

In [3]:
class DataReader(object):
    """Loads the 'x' and 'y' arrays from ``data_dir`` and serves train/validation batches.

    Attributes:
        train_df, val_df: DataFrames holding a 90% / 10% split of the loaded rows.
        num_products: ``max(x) + 1`` over the full 'x' array.
        product_dist: List of length ``num_products`` with the occurrence count of each
            id in the training 'x' column.
    """

    def __init__(self, data_dir):
        """Memory-map ``x.npy`` and ``y.npy`` from ``data_dir`` and split them.

        Args:
            data_dir: Directory containing ``x.npy`` and ``y.npy``.
        """
        data_cols = ['x', 'y']
        data = [np.load(os.path.join(data_dir, '{}.npy'.format(i)), mmap_mode='r') for i in data_cols]

        df = DataFrame(columns=data_cols, data=data)

        self.train_df, self.val_df = df.train_test_split(train_size=0.9)

        # NOTE(review): only 'x' is inspected here; presumably 'y' draws from the same
        # id range -- confirm against the data, since 'y' is later used as a class label.
        self.num_products = int(df['x'].max()) + 1
        # minlength pads the histogram to num_products entries.  Without it the list is
        # shorter whenever the largest id lands only in the validation split, and it would
        # no longer match the range_max=num_products handed to the candidate sampler.
        self.product_dist = np.bincount(self.train_df['x'], minlength=self.num_products).tolist()

    def train_batch_generator(self, batch_size):
        """Return a shuffled batch generator over the training split (50 epochs)."""
        return self.batch_generator(
            batch_size=batch_size,
            df=self.train_df,
            shuffle=True,
            num_epochs=50
        )

    def val_batch_generator(self, batch_size):
        """Return a shuffled batch generator over the validation split (50 epochs)."""
        return self.batch_generator(
            batch_size=batch_size,
            df=self.val_df,
            shuffle=True,
            num_epochs=50
        )

    def batch_generator(self, batch_size, df, shuffle=True, num_epochs=10000, is_test=False):
        """Delegate to ``df.batch_generator``.

        Args:
            batch_size: Rows per batch.
            df: DataFrame to draw batches from.
            shuffle: Whether to reshuffle at the start of each epoch.
            num_epochs: Number of passes over ``df``.
            is_test: Accepted but currently unused.
        """
        return df.batch_generator(batch_size, shuffle=shuffle, num_epochs=num_epochs)


In [4]:
def sequence_log_loss(y, y_hat, sequence_lengths, max_sequence_length, eps=1e-15):
    """Binary log loss summed over unmasked sequence positions.

    Args:
        y: Binary targets; cast to float32.
        y_hat: Predicted probabilities, broadcastable against ``y``.
        sequence_lengths: Integer tensor of valid lengths, used to build the mask.
        max_sequence_length: ``maxlen`` passed to ``tf.sequence_mask``.
        eps: Floor applied to each logarithm's argument to keep it finite.

    Returns:
        Scalar tensor: negative masked sum of per-position log likelihoods divided by
        ``sum(sequence_lengths)``.
    """
    y = tf.cast(y, tf.float32)
    # Floor each log argument separately.  Clipping y_hat to [eps, 1 - eps] is not enough
    # in float32: 1.0 - 1e-15 rounds to exactly 1.0, so y_hat == 1 would still produce
    # log(0) = -inf (and NaN once multiplied by a zero weight).
    y_hat = tf.math.minimum(tf.math.maximum(y_hat, 0.0), 1.0)
    log_p = tf.math.log(tf.math.maximum(y_hat, eps))
    log_not_p = tf.math.log(tf.math.maximum(1.0 - y_hat, eps))
    log_losses = y*log_p + (1.0 - y)*log_not_p
    sequence_mask = tf.cast(tf.sequence_mask(sequence_lengths, maxlen=max_sequence_length), tf.float32)
    avg_log_loss = -tf.reduce_sum(log_losses*sequence_mask) / tf.cast(tf.math.reduce_sum(sequence_lengths), tf.float32)
    return avg_log_loss
def log_loss(y, y_hat, eps=1e-15):
    """Mean binary log loss.

    Args:
        y: Binary targets; cast to float32.
        y_hat: Predicted probabilities, broadcastable against ``y``.
        eps: Floor applied to each logarithm's argument to keep it finite.

    Returns:
        Scalar tensor holding the mean negative log likelihood.
    """
    y = tf.cast(y, tf.float32)
    # Floor each log argument separately.  Clipping y_hat to [eps, 1 - eps] is not enough
    # in float32: 1.0 - 1e-15 rounds to exactly 1.0, so y_hat == 1 would still produce
    # log(0) = -inf (and NaN once multiplied by a zero weight).
    y_hat = tf.math.minimum(tf.math.maximum(y_hat, 0.0), 1.0)
    log_p = tf.math.log(tf.math.maximum(y_hat, eps))
    log_not_p = tf.math.log(tf.math.maximum(1.0 - y_hat, eps))
    mean_loss = -tf.math.reduce_mean(y*log_p + (1.0 - y)*log_not_p)
    return mean_loss

In [5]:
class SGNS(tf.keras.Model):
    """Embedding model trained with an NCE loss over sampled negative classes.

    (The class name suggests skip-gram with negative sampling.)  Calling the model on a
    batch returns the scalar loss directly rather than predictions.

    Args:
        embedding_dim: Width of each embedding vector.
        negative_samples: Number of negative classes sampled per batch.
        num_products: Number of distinct ids; sets the row count of every weight matrix.
        product_dist: Per-id counts passed as ``unigrams`` to the candidate sampler.
    """
    def __init__(self,embedding_dim, negative_samples, num_products, product_dist):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.negative_samples = negative_samples
        self.num_products = num_products
        self.product_dist = product_dist
        # Symbolic stand-ins; call() overwrites both with the concrete batch arrays.
        self.x = tf.keras.Input(shape=[], dtype = tf.int32)
        self.y = tf.keras.Input(shape=[], dtype = tf.int32)
        self.embeddings = tf.Variable(
            tf.random.uniform([num_products, embedding_dim], -1.0, 1.0))
        self.nce_weights = tf.Variable(
            tf.random.truncated_normal(
                shape=[num_products, embedding_dim],
                stddev=1.0 / np.sqrt(embedding_dim)
            )
        )
        self.nce_biases = tf.Variable(tf.zeros([num_products]))
    def call(self, inputs, training=None):
        """Compute the mean NCE loss for one batch.

        Args:
            inputs: Iterable of ``(name, data)`` pairs.  Entries named 'x' and 'y' are
                converted to int32 arrays and stored on ``self.x`` / ``self.y``; any other
                entry is ignored.
            training: Accepted for Keras compatibility; the computation does not depend
                on it.

        Returns:
            Scalar tensor with the mean NCE loss over the batch.
        """
        for placeholder_name, data in inputs:
            # Only the two known inputs may be assigned.  A bare hasattr() check would let
            # a column named e.g. 'embeddings' or 'name' overwrite model state.
            if placeholder_name in ('x', 'y'):
                setattr(self, placeholder_name, np.asarray(data, dtype=np.int32))
        embedded = tf.nn.embedding_lookup(self.embeddings, self.x)
        # Both the sampler and nce_loss document their class tensor as int64 with shape
        # [batch_size, num_true], so build it once and share it.
        true_classes = tf.cast(tf.reshape(self.y, (-1, 1)), tf.int64)
        sampled_values = tf.nn.fixed_unigram_candidate_sampler(
            true_classes=true_classes,
            num_true=1,
            num_sampled=self.negative_samples,
            unique=True,
            range_max=self.num_products,
            distortion=0.75,
            unigrams=self.product_dist
        )
        loss = tf.reduce_mean(
            tf.nn.nce_loss(
                weights=self.nce_weights,
                biases=self.nce_biases,
                labels=true_classes,
                inputs=embedded,
                num_sampled=self.negative_samples,
                num_classes=self.num_products,
                sampled_values=sampled_values
            )
        )

        self.parameter_tensors = {
            'product_embeddings': self.embeddings
        }
        return loss


In [6]:
import os

# Root of the input data; the reader loads its arrays from the 'instacartsgns' subfolder.
base_dir = '../input/'
dr = DataReader(os.path.join(base_dir, 'instacartsgns'))

In [7]:
# 25-dimensional embeddings, 100 negative samples per batch; vocabulary size and
# sampling distribution come from the data reader.
sgns = SGNS(
    embedding_dim=25,
    negative_samples=100,
    num_products=dr.num_products,
    product_dist=dr.product_dist,
)

In [8]:
# Generator of shuffled training batches, 128 rows each.
train_dataset = dr.train_batch_generator(batch_size=128)

In [10]:
import time

# --- Loop configuration ---------------------------------------------------------------
epochs = 50
batch_size = 128              # keep in sync with the batch size used to build train_dataset
train_steps_per_epoch = 2728  # batch index at which one pass over the training stream is cut off
val_steps_per_epoch = 303     # batch index at which one validation pass is cut off
max_train_steps = 60000       # hard cap on the total number of optimizer steps
train_log_every = 200
val_log_every = 10

optimizer = tf.keras.optimizers.Adam(0.002)

# No earlier cell defines val_dataset, so the validation loop below would fail with a
# NameError on a fresh kernel.  Build it here from the same data reader.
val_dataset = dr.val_batch_generator(batch_size)

step0 = 0
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))
    start_time = time.time()

    # train_dataset is one long generator; each outer iteration consumes the next
    # slice of it, so enumerate() restarts at 0 while the data stream keeps advancing.
    train = list()
    for step, x_batch_train in enumerate(train_dataset):
        step0 += 1
        with tf.GradientTape() as tape:
            loss_value = sgns(x_batch_train, training=True)
        train.append(loss_value)
        grads = tape.gradient(loss_value, sgns.trainable_weights)
        optimizer.apply_gradients(zip(grads, sgns.trainable_weights))
        # Log every train_log_every batches.
        if step % train_log_every == 0:
            print(
                "Training loss (for one batch) at step %d: %.4f"
                % (step, float(loss_value))
            )
            print("Seen so far: %d samples" % ((step + 1) * batch_size))
        if step % train_steps_per_epoch == 0 and step > 0:
            # End of this epoch's slice: report the mean training loss.
            print(sum(train)/len(train))
            break
        if step0 > max_train_steps:
            break
    if step0 > max_train_steps:
        break
    print("Time taken: %.2fs" % (time.time() - start_time))

    # Validation pass: forward only, no gradient tape and no optimizer step.
    val = list()
    for step1, x_batch_val in enumerate(val_dataset):
        loss_value = sgns(x_batch_val, training=False)
        val.append(loss_value)
        if step1 % val_log_every == 0:
            print("validation", step1)
            print(loss_value)
        if step1 % val_steps_per_epoch == 0 and step1 > 0:
            print(sum(val)/len(val))
            break
            