In [2]:
%load_ext autoreload
%autoreload 2

import sys
print(sys.executable)

from pathlib import Path
import random
import numpy as np
np.set_printoptions(edgeitems=30, linewidth=100000, formatter=dict(float=lambda x: "%.3g" % x))
import tensorflow as tf
import pandas as pd
from utils import *
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
from tqdm import tqdm
from collections import Counter
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.datasets import imdb
from tensorflow.keras import Model, regularizers
from tensorflow.keras.layers import Flatten, Dense, Embedding, RNN, GRU, Bidirectional, Layer, Dropout
from tensorflow import keras
from tensorflow_probability import distributions
from tqdm.notebook import tqdm
from collections import defaultdict
import functools
import copy
from itertools import chain
from process_data import load_processed_dataset

# Environment diagnostics: record which GPUs TensorFlow can see and which
# TensorFlow build is active, so a re-run on another machine is comparable.
physical_devices = tf.config.experimental.list_physical_devices('GPU')
print(f"tensorflow version: {tf.__version__}")
# The device list was previously computed but never reported.
print(f"visible GPUs: {len(physical_devices)}")
    
# TENSORFLOW 2 IS A PAIN IN THE ASS

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
/home/lucastong/.pyenv/versions/anaconda3-2019.07/envs/tf2/bin/python
tensorflow version: 2.1.0


In [None]:
class GrumbelSoftmaxActivation(tf.keras.layers.Layer):
    """Gumbel-softmax activation applied along axis 1.

    The input is normalised to log-probabilities over axis 1, perturbed with
    Gumbel(0, 1) noise, divided by the temperature, and normalised again with
    a softmax over axis 1. Noise is drawn on every call; there is no separate
    noise-free inference path.

    Args:
        temp: temperature that divides the perturbed log-probabilities before
            the final softmax.
    """

    def __init__(self, temp):
        super(GrumbelSoftmaxActivation, self).__init__()
        self.temp = temp
        self.gumbel = distributions.Gumbel(0, 1)

    def call(self, values):
        """Return a Gumbel-softmax sample with the same shape as `values`.

        Args:
            values: unnormalised scores; axis 1 is the axis that is normalised.

        Returns:
            A tensor shaped like `values` whose entries along axis 1 sum to 1.
        """
        # log_softmax is log(softmax(x)) computed in one step; it avoids the
        # -inf that log() produces when a softmax probability underflows to 0.
        log_probs = tf.nn.log_softmax(values, axis=1)
        # Use the dynamic shape so sampling also works when a dimension
        # (typically the batch) is unknown at trace time.
        grumbel_sample = self.gumbel.sample(tf.shape(values))
        grumbel_sample = tf.cast(grumbel_sample, log_probs.dtype)
        softmax_input = (log_probs + grumbel_sample) / self.temp
        return tf.nn.softmax(softmax_input, axis=1)

class LocalAttention(tf.keras.layers.Layer):
    """Attention pooling over axis 1 of the input.

    Each position is projected through a tanh Dense layer, scored against a
    learned vector `u`, offset by an additive mask, and softmax-normalised
    over axis 1. The output is the weighted sum of the inputs over that axis.

    Args:
        max_session_len: stored on the layer; not used by the computation.
        embedding_size: width of the tanh projection and length of `u`.
    """

    def __init__(self, max_session_len, embedding_size):
        super(LocalAttention, self).__init__()

        print("max_session_len", max_session_len, "embedding_size", embedding_size)
        self.max_session_len = max_session_len
        self.embedding_size = embedding_size

        self.tanh_layer = Dense(
            self.embedding_size,
            activation='tanh',
            kernel_initializer='glorot_uniform',
            name="tanh_layer")

        # Pass the name by keyword: the positional order of add_weight's
        # arguments is not the same across Keras versions.
        self.u = self.add_weight(name="importance", shape=[self.embedding_size])

    def call(self, values, mask):
        """Pool `values` over axis 1 with learned attention weights.

        Args:
            values: tensor whose axis 0 is the batch and axis 1 the positions
                to pool over; trailing axes hold the per-position features.
            mask: additive mask broadcastable to the per-position scores.
                NOTE(review): callers in this notebook pass 0 for kept
                positions and -inf for padded ones; a row that is entirely
                -inf would yield NaN weights.

        Returns:
            `values` reduced over axis 1 by the attention-weighted sum.
        """
        # Number of feature axes after (batch, position).
        item_dims = len(values.shape[2:])

        projected = self.tanh_layer(values)
        similarity_vector = tf.tensordot(projected, self.u, axes=([2], [0]))

        # Cast so a float64 numpy mask does not clash with float32 scores.
        similarity_vector = similarity_vector + tf.cast(mask, similarity_vector.dtype)

        weights = tf.nn.softmax(similarity_vector, axis=1)

        # Append one singleton axis per feature axis so the weights broadcast
        # over the features; this replaces the former transpose/multiply/
        # transpose round-trip with the same result.
        for _ in range(item_dims):
            weights = tf.expand_dims(weights, axis=-1)

        return tf.math.reduce_sum(values * weights, axis=1)

class TestModel(Model):
    """Session classifier: embedding -> BiGRU -> attention -> dense -> clusters -> logits.

    Forward pass (see `call`):
      1. look up an embedding for every id in the session,
      2. run a bidirectional GRU over the sequence,
      3. pool the GRU outputs with `LocalAttention`,
      4. dense + LeakyReLU (+ dropout when training),
      5. map to `softmax_classes` soft cluster assignments via
         `GrumbelSoftmaxActivation`, then project them to `embedding_size`,
      6. concatenate (4) and (5), dense + LeakyReLU (+ dropout when training),
      7. final linear layer producing `max_embedding_key` logits.

    Args:
        max_embedding_key: number of rows of the id embedding matrix and number
            of output logits. NOTE(review): assumes ids lie in
            [0, max_embedding_key) - confirm against the data pipeline.
        max_session_len: forwarded to `LocalAttention`.
        embedding_size: width of the id embeddings and of the cluster projection.
        gru_size: units per GRU direction; also the attention projection width.
        dense2_size: units of the first dense layer.
        dense3_size: units of the second dense layer.
        softmax_classes: number of soft clusters.
        temp: Gumbel-softmax temperature.
        num_users, user_shape: optional; when both are given a
            `[num_users, user_shape]` user embedding matrix is created.
        num_item, item_shape: optional; when both are given a
            `[num_item, item_shape]` item embedding matrix is created.
        dropout_rate: dropout rate used after both dense layers when training.

    NOTE(review): the user/item matrices are not consumed by `call`; they are
    kept optional because the constructor previously created them. When
    created they are still trainable and therefore part of the L1 term in
    `get_loss`.

    NOTE(review): on TensorFlow versions where `keras.Model` itself defines
    `train_step` / `test_step`, the methods below override them; that only
    matters if `fit` / `evaluate` are used instead of the custom loop.
    """

    def __init__(self, max_embedding_key, max_session_len, embedding_size, gru_size,
                 dense2_size, dense3_size, softmax_classes, temp,
                 num_users=None, user_shape=None, num_item=None, item_shape=None,
                 dropout_rate=.5):
        super(TestModel, self).__init__()

        def uniform_init():
            # Fresh initializer per weight, uniform in [-1, 1).
            return tf.random_uniform_initializer(minval=-1, maxval=1)

        if num_users is not None and user_shape is not None:
            self.user_embedding_mtx = self.add_weight(
                name="user_embedding_mtx",
                initializer=uniform_init(),
                shape=[num_users, user_shape],
            )
        if num_item is not None and item_shape is not None:
            self.item_embedding_mtx = self.add_weight(
                name="item_embedding_mtx",
                initializer=uniform_init(),
                shape=[num_item, item_shape],
            )

        # `call` looks ids up in this matrix; it was previously never created.
        embedding_mtx_shape = [max_embedding_key, embedding_size]
        print("embedding_mtx_shape", embedding_mtx_shape)
        self.word_embedding_mtx = self.add_weight(
            name="word_embedding_mtx",
            initializer=uniform_init(),
            shape=embedding_mtx_shape,
        )

        self.rnn = Bidirectional(GRU(gru_size, return_sequences=True), merge_mode="concat")
        self.attention1 = LocalAttention(max_session_len, gru_size)

        self.dense2 = Dense(
            dense2_size,
            activation="linear",
            kernel_initializer='he_normal'
        )
        self.dense2_act = tf.keras.layers.LeakyReLU()
        # Dropout layers are created once here and driven by the `training`
        # flag in `call`, instead of being instantiated on every forward pass.
        self.dropout2 = Dropout(dropout_rate)

        self.clustering = Dense(
            softmax_classes,
            activation="linear",
            kernel_initializer='GlorotNormal'
        )
        self.clustering_act = GrumbelSoftmaxActivation(temp)
        self.clustering_map = Dense(
            embedding_size,
            activation="linear",
            kernel_initializer='he_normal'
        )
        self.clustering_map_act = tf.keras.layers.LeakyReLU()

        self.dense3 = Dense(
            dense3_size,
            activation="linear",
            kernel_initializer='he_normal'
        )
        self.dense3_act = tf.keras.layers.LeakyReLU()
        self.dropout3 = Dropout(dropout_rate)

        self.logits = Dense(
            max_embedding_key,
            activation="linear",
            kernel_initializer='GlorotNormal'
        )

    def call(self, x, mask, training=False):
        """Run the forward pass.

        Args:
            x: integer ids used as row indices into the embedding matrix.
                NOTE(review): assumed (batch, session_len) - confirm.
            mask: additive attention mask passed through to `LocalAttention`.
            training: when truthy, dropout is active and intermediate shapes
                are printed; otherwise a small uniform jitter is added to the
                logits.

        Returns:
            Tuple `(logits, softmax)` where `softmax = tf.nn.softmax(logits)`.
        """
        def trace(label, tensor):
            # Per-layer shape logging only ever existed on the training path.
            if training:
                print(label, tensor.shape.as_list())

        print("xshape", x.shape.as_list())
        word_embeddings = tf.nn.embedding_lookup(self.word_embedding_mtx, x)
        print("word_embeddings out", word_embeddings.shape.as_list())

        rnn_output = self.rnn(word_embeddings)
        print("rnn shape", rnn_output.shape.as_list())
        attention_output = self.attention1(rnn_output, mask)
        print("attention shape", attention_output.shape.as_list())

        # `training` is passed explicitly: when `call` is invoked directly
        # (as train_step/test_step do) a Dropout layer cannot infer the phase
        # and may silently behave as in inference.
        dense2 = self.dense2_act(self.dense2(attention_output))
        dense2 = self.dropout2(dense2, training=bool(training))
        trace("dense2 shape", dense2)

        clusters = self.clustering_act(self.clustering(dense2))
        trace("clusters shape", clusters)
        clusterout = self.clustering_map_act(self.clustering_map(clusters))
        trace("cluster shape", clusterout)

        cluster_att = tf.concat((dense2, clusterout), axis=1)
        trace("cluster_att shape", cluster_att)

        dense3 = self.dense3_act(self.dense3(cluster_att))
        dense3 = self.dropout3(dense3, training=bool(training))
        trace("dense3 shape", dense3)

        logits = self.logits(dense3)
        trace("logits shape", logits)

        if not training:
            # Original author's note: the jitter was added because top-k
            # categorical accuracy was poor without it (presumably ties
            # between equal logits - not verifiable from this file).
            # tf.shape keeps this working when the batch size is unknown.
            noise = tf.random.uniform(tf.shape(logits), maxval=10e-6)
            logits = logits + noise

        softmax = tf.nn.softmax(logits)
        return logits, softmax

    def get_loss(self, y_true, logits):
        """Sparse categorical cross-entropy plus a scaled mean-L1 penalty.

        The penalty is the sum of absolute values of every trainable variable,
        divided by the total number of trainable scalars and multiplied by 10.

        Args:
            y_true: integer class labels.
            logits: unnormalised scores from `call`.

        Returns:
            Scalar loss tensor.
        """
        ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)(y_true, logits)

        weights = self.trainable_variables
        l1_reg = sum(tf.reduce_sum(tf.math.abs(weight)) for weight in weights)
        num_trainable_variables = float(np.sum([np.prod(var.shape) for var in weights]))
        l1_reg /= num_trainable_variables
        l1_reg *= 10
        return ce_loss + l1_reg

    @tf.function
    def train_step(self, sessions, mask, labels, scores):
        """Run one optimisation step and update `scores` with the predictions.

        NOTE: relies on a notebook-level `optimizer` being defined before the
        first call.

        Args:
            sessions: batch of id sequences.
            mask: additive attention mask for the batch.
            labels: integer targets.
            scores: iterable of Keras metrics, each updated with
                `(labels, predictions)`.

        Returns:
            The softmax predictions for the batch.
        """
        with tf.GradientTape() as tape:
            logits, preds = self.call(sessions, mask, training=True)
            loss = self.get_loss(y_true=labels, logits=logits)

        # Use this instance's variables rather than the notebook global `model`.
        variables = self.trainable_variables
        gradients = tape.gradient(loss, variables)
        # Clip gradients element-wise to stop the NaN problem; leave missing
        # gradients as None instead of crashing inside clip_by_value.
        gradients = [
            grad if grad is None else tf.clip_by_value(grad, -1., 1.)
            for grad in gradients
        ]
        optimizer.apply_gradients(zip(gradients, variables))

        for score in scores:
            score.update_state(labels, tf.cast(preds, tf.float32))

        return preds

    @tf.function
    def test_step(self, sessions, mask, labels, scores):
        """Run inference on one batch and update `scores` with the predictions.

        Args:
            sessions: batch of id sequences.
            mask: additive attention mask for the batch.
            labels: integer targets.
            scores: iterable of Keras metrics, each updated with
                `(labels, predictions)`.

        Returns:
            The softmax predictions for the batch.
        """
        logits, preds = self.call(sessions, mask, training=False)
        for score in scores:
            score.update_state(labels, tf.cast(preds, tf.float32))
        return preds

In [None]:
# ---------------------------------------------------------------------------
# Training cell: builds the model, optimizer and metrics, then runs the
# train/validation loop. `train4`, `val4`, `test4`, `max_embedding_key`,
# `max_session_len`, `batchify` and `mask_length` must already be defined by
# earlier cells / `utils`.
# ---------------------------------------------------------------------------
domain = "international"
train_phase = "0"
test_phase = "4"

batch_size = 2048
print("batch_size", batch_size)

X_train, y_train = train4
X_val, y_val = val4
X_test, y_test = test4

model = TestModel(
    max_embedding_key=max_embedding_key,
    max_session_len=max_session_len,
    embedding_size=128,
    gru_size=128,
    dense2_size=512,
    dense3_size=256,
    softmax_classes=128,
    temp=.01
)

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)

# `TestModel.train_step` reads this notebook-level optimizer.
optimizer = tf.keras.optimizers.Adam(learning_rate=.0003)


def make_topk_metrics():
    """Return fresh accuracy metrics: top-1, top-5, top-10 and top-20."""
    return [tf.keras.metrics.SparseCategoricalAccuracy()] + [
        tf.keras.metrics.SparseTopKCategoricalAccuracy(k=k) for k in (5, 10, 20)
    ]


# Labels line up one-to-one with the metrics returned by make_topk_metrics().
topn_labels = ["acc", "top5 acc", "top10 acc", "top20 acc"]

# One independent metric set per split so their running state never mixes.
train_accs = make_topk_metrics()
val_accs = make_topk_metrics()
test_accs = make_topk_metrics()

EPOCHS = 60
num_x = len(X_train)

# Per-epoch history, keyed by the labels in `topn_labels`.
train_accs_rec = defaultdict(list)
val_accs_rec = defaultdict(list)
test_accs_rec = defaultdict(list)
train_losses_rec = []
val_losses_rec = []

for epoch in tqdm(list(range(EPOCHS))):
    # Report epochs one-based so the header agrees with the footer below.
    print(f"epoch {epoch + 1}/{EPOCHS}")

    # Metrics accumulate across batches; clear them at the start of each epoch.
    for metric in chain(train_accs, val_accs, test_accs):
        metric.reset_states()

    for X, y in tqdm(list(batchify(X_train, y_train, shuffle=True, batch_size=batch_size))):
        # Additive mask: 0 where the position is kept, -inf where it is masked off.
        X, mask = mask_length(X, maskon_vals=0, maskoff_vals=-np.inf)
        model.train_step(tf.constant(X), tf.constant(mask), tf.constant(y), train_accs)

    for X, y in tqdm(list(batchify(X_val, y_val, shuffle=True, batch_size=batch_size))):
        X, mask = mask_length(X, maskon_vals=0, maskoff_vals=-np.inf)
        model.test_step(tf.constant(X), tf.constant(mask), tf.constant(y), val_accs)

    for label, train_acc, val_acc in zip(topn_labels, train_accs, val_accs):
        # Store plain Python floats so the history can be plotted or
        # serialised without holding on to tensors.
        train_result = float(train_acc.result())
        val_result = float(val_acc.result())
        train_accs_rec[label].append(train_result)
        val_accs_rec[label].append(val_result)
        print(f"{label} train: {train_result}")
        print(f"{label} val: {val_result}")

    print(f"Epoch {epoch+1}")