In [1]:
import tensorflow as tf
import numpy as np
import copy

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
class DataFrame(object):
    """Minimal column-oriented container for equally long numpy arrays.

    Each column name in ``columns`` is paired with the array at the same
    position in ``data``. All arrays must share the same first dimension,
    which is the number of rows. Row order for integer access, iteration and
    batching is controlled by the index permutation ``self.idx``, which
    ``shuffle`` reorders in place.
    """

    def __init__(self, columns, data):
        """Store the columns and arrays after validating their lengths.

        Args:
            columns: list of column names.
            data: list of arrays, one per column, with equal first dimension.

        Raises:
            AssertionError: if the number of columns and arrays differ, or if
                the arrays do not all share the same first dimension.
        """
        assert len(columns) == len(data), 'columns length does not match data length'

        lengths = [mat.shape[0] for mat in data]
        assert len(set(lengths)) == 1, 'all matrices in data must have same first dimension'

        self.length = lengths[0]
        self.columns = columns
        self.data = data
        self.dict = dict(zip(self.columns, self.data))
        self.idx = np.arange(self.length)

    def shapes(self):
        """Return a pandas Series mapping each column name to its array shape."""
        return pd.Series(dict(zip(self.columns, [mat.shape for mat in self.data])))

    def dtypes(self):
        """Return a pandas Series mapping each column name to its array dtype."""
        return pd.Series(dict(zip(self.columns, [mat.dtype for mat in self.data])))

    def shuffle(self):
        """Shuffle the row index permutation in place (uses numpy's global RNG)."""
        np.random.shuffle(self.idx)

    def train_test_split(self, train_size, random_state=None):
        """Split the rows into a train and a test DataFrame.

        Args:
            train_size: forwarded to sklearn's ``train_test_split``.
            random_state: seed forwarded to sklearn. When ``None`` a fresh seed
                is drawn on every call. (A default of
                ``np.random.randint(10000)`` in the signature would be
                evaluated only once, when the class is defined, so every call
                would silently reuse the same seed.)

        Returns:
            Tuple ``(train_df, test_df)`` of new DataFrame objects.
        """
        if random_state is None:
            random_state = np.random.randint(10000)
        train_idx, test_idx = train_test_split(self.idx, train_size=train_size, random_state=random_state)
        train_df = DataFrame(copy.copy(self.columns), [mat[train_idx] for mat in self.data])
        test_df = DataFrame(copy.copy(self.columns), [mat[test_idx] for mat in self.data])
        return train_df, test_df

    def batch_generator(self, batch_size, shuffle=True, num_epochs=3, allow_smaller_final_batch=False):
        """Yield DataFrame batches of ``batch_size`` rows for ``num_epochs`` passes.

        Args:
            batch_size: number of rows per batch.
            shuffle: when true, reshuffle the row order at the start of each epoch.
            num_epochs: number of passes over the data before the generator ends.
            allow_smaller_final_batch: when false, a trailing batch with fewer
                than ``batch_size`` rows is dropped.

        Yields:
            A new DataFrame per batch holding copies of the selected rows.
        """
        for _ in range(num_epochs):
            if shuffle:
                self.shuffle()

            for start in range(0, self.length, batch_size):
                batch_idx = self.idx[start: start + batch_size]
                if not allow_smaller_final_batch and len(batch_idx) != batch_size:
                    break
                yield DataFrame(columns=copy.copy(self.columns), data=[mat[batch_idx].copy() for mat in self.data])

    def iterrows(self):
        """Yield every row as a pandas Series, following the current row order."""
        # Iterate over positions; __getitem__ maps each position through
        # self.idx, so passing the idx values themselves would index twice.
        for position in range(self.length):
            yield self[position]

    def mask(self, mask):
        """Return a new DataFrame with ``mask`` applied to every column array."""
        return DataFrame(copy.copy(self.columns), [mat[mask] for mat in self.data])

    def __iter__(self):
        """Iterate over ``(column_name, array)`` pairs."""
        return iter(self.dict.items())

    def __len__(self):
        """Return the number of rows."""
        return self.length

    def __getitem__(self, key):
        """Return a column array for a string key, or a row Series for an integer key.

        Raises:
            TypeError: if ``key`` is neither a string nor an integer.
        """
        if isinstance(key, str):
            return self.dict[key]

        # Accept numpy integer scalars as well as Python ints.
        if isinstance(key, (int, np.integer)):
            return pd.Series(dict(zip(self.columns, [mat[self.idx[key]] for mat in self.data])))

        raise TypeError('key must be a column name (str) or a row position (int), got {}'.format(type(key).__name__))

    def __setitem__(self, key, value):
        """Add a new column or replace an existing one.

        Raises:
            AssertionError: if ``value`` does not have one entry per row.
        """
        assert value.shape[0] == len(self), 'matrix first dimension does not match'
        if key in self.columns:
            # Keep self.data in sync with self.dict when overwriting a column.
            self.data[self.columns.index(key)] = value
        else:
            self.columns.append(key)
            self.data.append(value)
        self.dict[key] = value

In [3]:
class DataReader(object):
    """Loads the ``i``, ``j`` and ``V_ij`` arrays and serves train/validation batches.

    The arrays are read from ``<data_dir>/<name>.npy`` as read-only memory
    maps, wrapped in a DataFrame and split 90/10 into ``train_df`` and
    ``val_df``. Uses the module-level ``os`` name, so ``os`` must be imported
    before the reader is instantiated.
    """

    def __init__(self, data_dir="../input/instacart-nnmf-dataset"):
        data_cols = ['i', 'j', 'V_ij']

        arrays = []
        for col_name in data_cols:
            npy_path = os.path.join(data_dir, '{}.npy'.format(col_name))
            arrays.append(np.load(npy_path, mmap_mode='r'))

        full_df = DataFrame(columns=data_cols, data=arrays)
        self.train_df, self.val_df = full_df.train_test_split(train_size=0.9)

        # Sizes are the largest id in each column plus one.
        # NOTE(review): the original author noted "num_users == num_products";
        # that is not checked here - TODO confirm against the dataset.
        self.num_users = full_df['i'].max() + 1
        self.num_products = full_df['j'].max() + 1

    def train_batch_generator(self, batch_size):
        """Return a shuffled 50-epoch batch generator over the training split."""
        options = dict(df=self.train_df, shuffle=True, num_epochs=50)
        return self.batch_generator(batch_size=batch_size, **options)

    def val_batch_generator(self, batch_size):
        """Return a shuffled 50-epoch batch generator over the validation split."""
        options = dict(df=self.val_df, shuffle=True, num_epochs=50)
        return self.batch_generator(batch_size=batch_size, **options)

    def batch_generator(self, batch_size, df, shuffle=True, num_epochs=10000, is_test=False):
        """Delegate to ``df.batch_generator``.

        ``is_test`` is forwarded as ``allow_smaller_final_batch``, so a short
        trailing batch is only kept when ``is_test`` is true.
        """
        return df.batch_generator(
            batch_size,
            shuffle=shuffle,
            num_epochs=num_epochs,
            allow_smaller_final_batch=is_test,
        )

In [4]:
def sequence_log_loss(y, y_hat, sequence_lengths, max_sequence_length, eps=1e-15):
    """Masked binary log loss averaged over the valid steps of each sequence.

    Args:
        y: labels; cast to float32.
        y_hat: predicted probabilities, same shape as ``y``.
        sequence_lengths: number of valid steps per sequence; positions at or
            beyond the length are masked out of the sum.
        max_sequence_length: ``maxlen`` passed to ``tf.sequence_mask``.
        eps: floor applied to each argument of ``log`` to keep it finite.

    Returns:
        Scalar tensor: negative sum of the masked per-step log likelihoods
        divided by the total of ``sequence_lengths``.
    """
    y = tf.cast(y, tf.float32)
    # Restrict predictions to [0, 1], then floor each log argument separately.
    # Clipping y_hat to [eps, 1 - eps] is not enough: in float32, 1 - 1e-15
    # rounds to exactly 1.0, so log(1 - y_hat) can become -inf and
    # 0 * -inf turns the whole loss into NaN.
    y_hat = tf.math.minimum(tf.math.maximum(y_hat, 0.0), 1.0)
    log_p = tf.math.log(tf.math.maximum(y_hat, eps))
    log_not_p = tf.math.log(tf.math.maximum(1.0 - y_hat, eps))
    log_losses = y*log_p + (1.0 - y)*log_not_p
    sequence_mask = tf.cast(tf.sequence_mask(sequence_lengths, maxlen=max_sequence_length), tf.float32)
    avg_log_loss = -tf.reduce_sum(log_losses*sequence_mask) / tf.cast(tf.math.reduce_sum(sequence_lengths), tf.float32)
    return avg_log_loss
def log_loss(y, y_hat, eps=1e-15):
    """Mean binary log loss (cross-entropy) between labels and probabilities.

    Args:
        y: labels; cast to float32.
        y_hat: predicted probabilities, same shape as ``y``.
        eps: floor applied to each argument of ``log`` to keep it finite.

    Returns:
        Scalar tensor with the mean negative log likelihood.
    """
    y = tf.cast(y, tf.float32)
    # Restrict predictions to [0, 1], then floor each log argument separately.
    # Clipping y_hat to [eps, 1 - eps] is not enough: in float32, 1 - 1e-15
    # rounds to exactly 1.0, so log(1 - y_hat) can become -inf and
    # 0 * -inf turns the whole loss into NaN.
    y_hat = tf.math.minimum(tf.math.maximum(y_hat, 0.0), 1.0)
    log_p = tf.math.log(tf.math.maximum(y_hat, eps))
    log_not_p = tf.math.log(tf.math.maximum(1.0 - y_hat, eps))
    return -tf.math.reduce_mean(y*log_p + (1.0 - y)*log_not_p)

In [5]:
class NNMF(tf.keras.Model):
    """Biased matrix-factorisation model whose forward pass returns the batch RMSE.

    Predictions are ``global_mean + W_bias[i] + H_bias[j] + sum(W[i] * H[j])``,
    where ``W`` has shape ``[num_users, rank]`` and ``H`` has shape
    ``[num_products, rank]``.
    """

    def __init__(self, rank, num_products, num_users):
        super().__init__()
        self.rank = rank
        self.num_users = num_users
        self.num_products = num_products

        # Initial values for the batch fields; ``call`` overwrites these
        # attributes with the arrays of each incoming batch.
        self.i = tf.keras.Input(shape=[], dtype=tf.int32)
        self.j = tf.keras.Input(shape=[], dtype=tf.int32)
        self.V_ij = tf.keras.Input(shape=[], dtype=tf.float32)

        # Trainable parameters: embedding matrices, per-row biases, global offset.
        self.W = tf.Variable(tf.random.truncated_normal([num_users, rank]))
        self.H = tf.Variable(tf.random.truncated_normal([num_products, rank]))
        self.W_bias = tf.Variable(tf.random.truncated_normal([num_users]))
        self.H_bias = tf.Variable(tf.random.truncated_normal([num_products]))
        self.global_mean = tf.Variable(0.0)

    def call(self, inputs):
        """Compute the RMSE between the predictions and ``V_ij`` for one batch.

        Args:
            inputs: iterable of ``(name, data)`` pairs. Every pair whose name
                is an existing attribute of the model is stored on the model
                as a numpy array: float32 for ``"V_ij"``, int32 otherwise.

        Returns:
            Scalar tensor holding the root-mean-squared error of the batch.
        """
        for field_name, values in inputs:
            if not hasattr(self, field_name):
                continue
            target_dtype = np.float32 if field_name == "V_ij" else np.int32
            setattr(self, field_name, np.asarray(values, dtype=target_dtype))

        user_vecs = tf.gather(self.W, self.i)
        product_vecs = tf.gather(self.H, self.j)
        user_bias = tf.gather(self.W_bias, self.i)
        product_bias = tf.gather(self.H_bias, self.j)

        # Row-wise dot product of the user and product embeddings.
        interaction = tf.reduce_sum(user_vecs * product_vecs, axis=1)
        preds = self.global_mean + user_bias + product_bias + interaction

        squared_errors = tf.math.squared_difference(preds, self.V_ij)
        rmse = tf.math.sqrt(tf.math.reduce_mean(squared_errors))

        self.parameter_tensors = {
            'user_embeddings': self.W,
            'product_embeddings': self.H
        }
        return rmse

In [6]:
import os

# Build the data reader on the NNMF dataset folder under the shared input root.
base_dir = '../input/'
dr = DataReader(
    data_dir=os.path.join(base_dir, 'instacart-nnmf-dataset'),
)

In [7]:
# Rank-25 factorisation sized from the ids found by the data reader.
nnmf = NNMF(
    rank=25,
    num_users=dr.num_users,
    num_products=dr.num_products,
)

In [8]:
# One-shot generator of 128-row training batches; it is not restartable.
train_dataset = dr.train_batch_generator(batch_size=128)

In [9]:
# Adam optimizer; note that the training cell below rebinds `optimizer`.
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)


In [10]:
import time

# Loop configuration.
epochs = 50
BATCH_SIZE = 128               # must match the batch size used to build `train_dataset`
TRAIN_STEPS_PER_EPOCH = 2728   # training pass ends at the first positive multiple of this step
VAL_STEPS_PER_EPOCH = 303      # validation pass ends at the first positive multiple of this step
MAX_TOTAL_STEPS = 60000        # hard cap on training steps across all epochs
TRAIN_LOG_EVERY = 200
VAL_LOG_EVERY = 10

optimizer = tf.keras.optimizers.Adam(0.002)

# The validation generator must exist before the loop reads it; without this
# line the validation pass fails with a NameError.
val_dataset = dr.val_batch_generator(BATCH_SIZE)

step0 = 0  # total number of training steps taken so far
for epoch in range(epochs):
    print("\nStart of epoch %d" % (epoch,))
    start_time = time.time()

    # Training pass. `train_dataset` is a generator, so each pass resumes
    # where the previous one stopped.
    train = list()
    for step, x_batch_train in enumerate(train_dataset):
        step0 += 1
        with tf.GradientTape() as tape:
            loss_value = nnmf(x_batch_train, training=True)
        grads = tape.gradient(loss_value, nnmf.trainable_weights)
        optimizer.apply_gradients(zip(grads, nnmf.trainable_weights))
        # Store plain floats rather than tensors.
        train.append(float(loss_value))

        if step % TRAIN_LOG_EVERY == 0:
            print(
                "Training loss (for one batch) at step %d: %.4f"
                % (step, float(loss_value))
            )
            print("Seen so far: %d samples" % ((step + 1) * BATCH_SIZE))
        if step % TRAIN_STEPS_PER_EPOCH == 0 and step > 0:
            print(sum(train)/len(train))
            break
        if step0 > MAX_TOTAL_STEPS:
            break
    if step0 > MAX_TOTAL_STEPS:
        break

    # Validation pass: forward only, no gradient tape, not in training mode.
    val = list()
    for step1, x_batch_val in enumerate(val_dataset):
        loss_value = nnmf(x_batch_val, training=False)
        val.append(float(loss_value))
        if step1 % VAL_LOG_EVERY == 0:
            print("validation", step1)
            print(float(loss_value))
        if step1 % VAL_STEPS_PER_EPOCH == 0 and step1 > 0:
            print(sum(val)/len(val))
            break

    print("Time taken: %.2fs" % (time.time() - start_time))
            