# Import Packages

In [2]:
import argparse
import numpy as np
import pandas as pd
from tqdm import tqdm

In [3]:
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.losses import categorical_crossentropy
from tensorflow.keras.layers import Input, Dense, Dropout, GRU

# Load Data

In [4]:
def load_our_data(path, limit):
    return pd.read_csv(path, nrows = limit, sep="\t")

In [5]:
# set sise of data (number of samples). If None (suggested), full datasets are applied.
limit = None
validation_limit = None
testing_limit = None

if limit == None:
    validation_limit = None
    testing_limit = None
else:
    validation_limit = int(0.2 * limit)
    testing_limit = int(0.2 * limit)

In [6]:
prepared_data_path = "../data/rsc15/prepared/"

train_data = load_our_data(path=f"{prepared_data_path}yoochoose-clicks-100k_train_full.txt", limit=limit)
dev_data = load_our_data(path=f"{prepared_data_path}yoochoose-clicks-100k_train_valid.txt", limit=validation_limit)
test_data = load_our_data(path=f"{prepared_data_path}yoochoose-clicks-100k_test.txt", limit=testing_limit)

In [7]:
train_data

Unnamed: 0,SessionId,ItemId,Time
0,3,214716935,1.396437e+09
1,3,214832672,1.396438e+09
2,6,214701242,1.396796e+09
3,6,214826623,1.396797e+09
4,7,214826835,1.396414e+09
...,...,...,...
70273,31813,214691293,1.396769e+09
70274,31812,214662819,1.396365e+09
70275,31812,214836765,1.396365e+09
70276,31812,214836073,1.396365e+09


In [8]:
train_n_items = len(train_data['ItemId'].unique()) + 1
train_samples_qty = len(train_data['SessionId'].unique()) + 1
test_samples_qty = len(test_data['SessionId'].unique()) + 1

# Hyperparameters

In [26]:
# 512 - Number of sequences running through the network in one pass.
batch_size = 512

# 50 - Embedding dimensions
embed_dim = 300

# The dropout drop probability when training on input. If you're network is overfitting, try decreasing this.
x_drop_probability = 0.00

# The dropout keep probability when training on RNN neurons. If you're network is overfitting, try decreasing this.
rnn_keep_probability = 1.00

# 100 - The number of units in the hidden layers.
rnn_size = 200

# 1
num_layers = 1

# Learning rate for training
# typically 0.0001 up to 1: http://datascience.stackexchange.com/questions/410/choosing-a-learning-rate
# best learning_rate = 0.0025
learning_rate = 0.0025

# 20 epochs
epochs = 20

# save model
save_weights = False

# eval_all_epochs
eval_all_epochs = True

In [10]:
class SessionDataset:
    """Credit to yhs-968/pyGRU4REC."""    
    def __init__(self, data, sep='\t', session_key='SessionId', item_key='ItemId', time_key='Time', n_samples=-1, itemmap=None, time_sort=False):
        """
        Args:
            path: path of the csv file
            sep: separator for the csv
            session_key, item_key, time_key: name of the fields corresponding to the sessions, items, time
            n_samples: the number of samples to use. If -1, use the whole dataset.
            itemmap: mapping between item IDs and item indices
            time_sort: whether to sort the sessions by time or not
        """
        self.df = data
        self.session_key = session_key
        self.item_key = item_key
        self.time_key = time_key
        self.time_sort = time_sort
        self.add_item_indices(itemmap=itemmap)
        self.df.sort_values([session_key, time_key], inplace=True)

        # Sort the df by time, and then by session ID. That is, df is sorted by session ID and
        # clicks within a session are next to each other, where the clicks within a session are time-ordered.

        self.click_offsets = self.get_click_offsets()
        self.session_idx_arr = self.order_session_idx()
        
    def get_click_offsets(self):
        """
        Return the offsets of the beginning clicks of each session IDs,
        where the offset is calculated against the first click of the first session ID.
        """
        offsets = np.zeros(self.df[self.session_key].nunique() + 1, dtype=np.int32)
        # group & sort the df by session_key and get the offset values
        offsets[1:] = self.df.groupby(self.session_key).size().cumsum()

        return offsets

    def order_session_idx(self):
        """ Order the session indices """
        if self.time_sort:
            # starting time for each sessions, sorted by session IDs
            sessions_start_time = self.df.groupby(self.session_key)[self.time_key].min().values
            # order the session indices by session starting times
            session_idx_arr = np.argsort(sessions_start_time)
        else:
            session_idx_arr = np.arange(self.df[self.session_key].nunique())

        return session_idx_arr
    
    def add_item_indices(self, itemmap=None):
        """ 
        Add item index column named "item_idx" to the df
        Args:
            itemmap (pd.DataFrame): mapping between the item Ids and indices
        """
        if itemmap is None:
            item_ids = self.df[self.item_key].unique()  # unique item ids
            item2idx = pd.Series(data=np.arange(len(item_ids)),
                                 index=item_ids)
            itemmap = pd.DataFrame({self.item_key:item_ids,
                                   'item_idx':item2idx[item_ids].values})
        
        self.itemmap = itemmap
        self.df = pd.merge(self.df, self.itemmap, on=self.item_key, how='inner')
        
    @property    
    def items(self):
        return self.itemmap.ItemId.unique()       


In [11]:
class SessionDataLoader:
    """Credit to yhs-968/pyGRU4REC."""    
    def __init__(self, dataset, batch_size=50):
        """
        A class for creating session-parallel mini-batches.
        Args:
            dataset (SessionDataset): the session dataset to generate the batches from
            batch_size (int): size of the batch
        """
        self.dataset = dataset
        self.batch_size = batch_size
        self.done_sessions_counter = 0
        
    def __iter__(self):
        """ Returns the iterator for producing session-parallel training mini-batches.
        Yields:
            input (B,):  Item indices that will be encoded as one-hot vectors later.
            target (B,): a Variable that stores the target item indices
            masks: Numpy array indicating the positions of the sessions to be terminated
        """

        df = self.dataset.df
        session_key='SessionId'
        item_key='ItemId'
        time_key='TimeStamp'
        self.n_items = df[item_key].nunique()+1
        click_offsets = self.dataset.click_offsets
        session_idx_arr = self.dataset.session_idx_arr

        iters = np.arange(self.batch_size)
        maxiter = iters.max()
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters] + 1]
        mask = [] # indicator for the sessions to be terminated
        finished = False        

        while not finished:
            minlen = (end - start).min()
            # Item indices (for embedding) for clicks where the first sessions start
            idx_target = df.item_idx.values[start]
            for i in range(minlen - 1):
                # Build inputs & targets
                idx_input = idx_target
                idx_target = df.item_idx.values[start + i + 1]
                inp = idx_input
                target = idx_target
                yield inp, target, mask
                
            # click indices where a particular session meets second-to-last element
            start = start + (minlen - 1)
            # see if how many sessions should terminate
            mask = np.arange(len(iters))[(end - start) <= 1]
            self.done_sessions_counter = len(mask)
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets) - 1:
                    finished = True
                    break
                # update the next starting/ending point
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter] + 1]

In [27]:
def create_model():   
    emb_size = 50
    hidden_units = 100
    size = emb_size

    inputs = Input(batch_shape=(batch_size, 1, train_n_items))
    gru, gru_states = GRU(hidden_units, stateful=True, return_state=True, name="GRU")(inputs)
    drop2 = Dropout(0.25)(gru)
    predictions = Dense(train_n_items, activation='softmax')(drop2)
    model = Model(inputs=inputs, outputs=[predictions])
    opt = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
    model.compile(loss=categorical_crossentropy, optimizer=opt)
    model.summary()

    filepath='./model_checkpoint.h5'
    checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=2, save_best_only=True, mode='min')
    callbacks_list = []
    return model

In [21]:
def get_metrics(model, train_generator_map, recall_k=20, mrr_k=20):

    test_dataset = SessionDataset(test_data, itemmap=train_generator_map)
    test_generator = SessionDataLoader(test_dataset, batch_size=batch_size)

    n = 0
    rec_sum = 0
    mrr_sum = 0

    print("Evaluating model...")
    for feat, label, mask in test_generator:

        gru_layer = model.get_layer(name="GRU")
        hidden_states = gru_layer.states[0].numpy()
        for elt in mask:
            hidden_states[elt, :] = 0
        gru_layer.reset_states(states=hidden_states)

        target_oh = to_categorical(label, num_classes=train_n_items)
        input_oh  = to_categorical(feat,  num_classes=train_n_items)
        input_oh = np.expand_dims(input_oh, axis=1)

        pred = model.predict(input_oh, batch_size=batch_size)

        for row_idx in range(feat.shape[0]):
            pred_row = pred[row_idx]
            label_row = target_oh[row_idx]

            rec_idx =  pred_row.argsort()[-recall_k:][::-1]
            mrr_idx =  pred_row.argsort()[-mrr_k:][::-1]
            tru_idx = label_row.argsort()[-1:][::-1]

            n += 1

            if tru_idx[0] in rec_idx:
                rec_sum += 1

            if tru_idx[0] in mrr_idx:
                mrr_sum += 1/int((np.where(mrr_idx == tru_idx[0])[0]+1))

    recall = rec_sum/n
    mrr = mrr_sum/n
    return (recall, recall_k), (mrr, mrr_k)

In [22]:
def train_model(model):
    train_dataset = SessionDataset(train_data)
    model_to_train = model

    for epoch in range(1, epochs):
        with tqdm(total=train_samples_qty) as pbar:
            loader = SessionDataLoader(train_dataset, batch_size=batch_size)
            for feat, target, mask in loader:

                gru_layer = model_to_train.get_layer(name="GRU")
                hidden_states = gru_layer.states[0].numpy()
                for elt in mask:
                    hidden_states[elt, :] = 0
                gru_layer.reset_states(states=hidden_states)

                input_oh = to_categorical(feat, num_classes=loader.n_items)
                input_oh = np.expand_dims(input_oh, axis=1)

                target_oh = to_categorical(target, num_classes=loader.n_items)

                tr_loss = model_to_train.train_on_batch(input_oh, target_oh)

                pbar.set_description("Epoch {0}. Loss: {1:.5f}".format(epoch, tr_loss))
                pbar.update(loader.done_sessions_counter)

        if save_weights:
            print("Saving weights...")
            model_to_train.save('./GRU4REC_{}.h5'.format(epoch))

        if eval_all_epochs:
            (rec, rec_k), (mrr, mrr_k) = get_metrics(model_to_train, train_dataset.itemmap)
            print("\t - Recall@{} epoch {}: {:5f}".format(rec_k, epoch, rec))
            print("\t - MRR@{}    epoch {}: {:5f}\n".format(mrr_k, epoch, mrr))

    if not eval_all_epochs:
        (rec, rec_k), (mrr, mrr_k) = get_metrics(model_to_train, train_dataset.itemmap)
        print("\t - Recall@{} epoch {}: {:5f}".format(rec_k, epochs, rec))
        print("\t - MRR@{}    epoch {}: {:5f}\n".format(mrr_k, epochs, mrr))


In [23]:
model = create_model()

Model: "model_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_3 (InputLayer)         [(512, 1, 2934)]          0         
_________________________________________________________________
GRU (GRU)                    [(512, 100), (512, 100)]  910800    
_________________________________________________________________
dropout_2 (Dropout)          (512, 100)                0         
_________________________________________________________________
dense_2 (Dense)              (512, 2934)               296334    
Total params: 1,207,134
Trainable params: 1,207,134
Non-trainable params: 0
_________________________________________________________________


In [28]:
 train_model(model)

Epoch 1. Loss: 5.02643:  96%|██████████▌| 17148/17795 [00:22<00:00, 757.02it/s]


Evaluating model...
	 - Recall@20 epoch 1: 0.498901
	 - MRR@20    epoch 1: 0.251320



Epoch 2. Loss: 4.80492:  96%|██████████▌| 17148/17795 [00:22<00:00, 769.55it/s]


Evaluating model...
	 - Recall@20 epoch 2: 0.526245
	 - MRR@20    epoch 2: 0.264528



Epoch 3. Loss: 4.61273:  96%|██████████▌| 17148/17795 [00:22<00:00, 749.40it/s]


Evaluating model...
	 - Recall@20 epoch 3: 0.545166
	 - MRR@20    epoch 3: 0.276505



Epoch 4. Loss: 4.41084:  96%|██████████▌| 17148/17795 [00:21<00:00, 782.93it/s]


Evaluating model...
	 - Recall@20 epoch 4: 0.561035
	 - MRR@20    epoch 4: 0.287257



Epoch 5. Loss: 4.28362:  96%|██████████▌| 17148/17795 [00:22<00:00, 768.83it/s]


Evaluating model...
	 - Recall@20 epoch 5: 0.571777
	 - MRR@20    epoch 5: 0.295560



Epoch 6. Loss: 4.15981:  96%|██████████▌| 17148/17795 [00:23<00:00, 733.88it/s]


Evaluating model...
	 - Recall@20 epoch 6: 0.584839
	 - MRR@20    epoch 6: 0.302474



Epoch 7. Loss: 4.02473:  96%|██████████▌| 17148/17795 [00:27<00:01, 614.50it/s]


Evaluating model...
	 - Recall@20 epoch 7: 0.594116
	 - MRR@20    epoch 7: 0.308263



Epoch 8. Loss: 3.89509:  96%|██████████▌| 17148/17795 [00:33<00:01, 508.53it/s]


Evaluating model...
	 - Recall@20 epoch 8: 0.603394
	 - MRR@20    epoch 8: 0.312562



Epoch 9. Loss: 3.81041:  96%|██████████▌| 17148/17795 [00:36<00:01, 471.40it/s]


Evaluating model...
	 - Recall@20 epoch 9: 0.609497
	 - MRR@20    epoch 9: 0.317271



Epoch 10. Loss: 3.74212:  96%|█████████▋| 17148/17795 [00:36<00:01, 469.22it/s]


Evaluating model...
	 - Recall@20 epoch 10: 0.616943
	 - MRR@20    epoch 10: 0.320180



Epoch 11. Loss: 3.67324:  96%|█████████▋| 17148/17795 [00:36<00:01, 470.44it/s]


Evaluating model...
	 - Recall@20 epoch 11: 0.621704
	 - MRR@20    epoch 11: 0.324207



Epoch 12. Loss: 3.61325:  96%|█████████▋| 17148/17795 [00:45<00:01, 378.50it/s]


Evaluating model...
	 - Recall@20 epoch 12: 0.625000
	 - MRR@20    epoch 12: 0.326721



Epoch 13. Loss: 3.49706:  96%|█████████▋| 17148/17795 [00:32<00:01, 522.50it/s]


Evaluating model...
	 - Recall@20 epoch 13: 0.628418
	 - MRR@20    epoch 13: 0.327741



Epoch 14. Loss: 3.42895:  96%|█████████▋| 17148/17795 [00:41<00:01, 412.25it/s]


Evaluating model...
	 - Recall@20 epoch 14: 0.632446
	 - MRR@20    epoch 14: 0.329261



Epoch 15. Loss: 3.43630:  96%|█████████▋| 17148/17795 [00:41<00:01, 412.51it/s]


Evaluating model...
	 - Recall@20 epoch 15: 0.636353
	 - MRR@20    epoch 15: 0.331410



Epoch 16. Loss: 3.37538:  96%|█████████▋| 17148/17795 [00:38<00:01, 447.77it/s]


Evaluating model...
	 - Recall@20 epoch 16: 0.638550
	 - MRR@20    epoch 16: 0.331869



Epoch 17. Loss: 3.26416:  96%|█████████▋| 17148/17795 [00:41<00:01, 412.27it/s]


Evaluating model...
	 - Recall@20 epoch 17: 0.639893
	 - MRR@20    epoch 17: 0.333670



Epoch 18. Loss: 3.25009:  96%|█████████▋| 17148/17795 [00:51<00:01, 331.13it/s]


Evaluating model...
	 - Recall@20 epoch 18: 0.642334
	 - MRR@20    epoch 18: 0.334164



Epoch 19. Loss: 3.21333:  96%|█████████▋| 17148/17795 [00:46<00:01, 367.99it/s]


Evaluating model...
	 - Recall@20 epoch 19: 0.642822
	 - MRR@20    epoch 19: 0.335264

