#**Proyecto - Sistemas Recomendadores - IIC3633**

## Implementación en Keras de Session-Based RNNs for Recommendation con soft atenttion

### V2: Implementación de embedding sobre one-hot vectors para entrenamiento más eficiente y modelo más chico

In [1]:
import os
import sys
import subprocess
import math
import pandas as pd
import numpy as np
import sklearn
import psutil
import humanize
import pyreclab
import GPUtil as GPU
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import mean_squared_error
from tensorflow.python.client import device_lib


import tensorflow as tf
config = tf.ConfigProto()
config.gpu_options.allow_growth = True

import warnings
#warnings.filterwarnings("ignore")

import keras
import keras.backend as K
from keras.utils import to_categorical
from keras.losses import cosine_proximity, categorical_crossentropy
from keras.models import Model, Sequential
from keras.initializers import glorot_uniform
from keras.layers.core import Permute, Reshape, RepeatVector
from keras.layers import Input, Dense, Dropout, CuDNNGRU, Embedding, concatenate, Lambda, multiply, merge, Flatten
from keras.callbacks import ModelCheckpoint


Using TensorFlow backend.


In [2]:
# Cargamos dataframes preprocesados de MovieLens20MM
PATH_TO_TRAIN = './data/train.csv' #all_
PATH_TO_DEV = './data/dev.csv'
PATH_TO_TEST = './data/test.csv'

train_data = pd.read_csv(PATH_TO_TRAIN, sep='\t', dtype={'ItemId':np.int64})
dev_data = pd.read_csv(PATH_TO_DEV, sep='\t', dtype={'ItemId':np.int64})
test_data = pd.read_csv(PATH_TO_TEST, sep='\t', dtype={'ItemId': np.int64})

In [3]:
train_data.head()

Unnamed: 0,SessionId,ItemId,Time
0,18,186,1267347706
1,18,858,1236356241
2,18,912,1283426281
3,18,1221,1236356224
4,18,1230,1236293194


In [23]:
class SessionDataset:
    def __init__(self, data, sep='\t', session_key='SessionId', item_key='ItemId', time_key='Time', n_samples=-1, itemmap=None, time_sort=True):
        """
        Args:
            n_samples: the number of samples to use. If -1, use the whole dataset.
            itemmap: mapping between item IDs and item indices
            time_sort: whether to sort the sessions by time or not
        """
        self.df = data
        self.session_key = session_key
        self.item_key = item_key
        self.time_key = time_key
        self.time_sort = time_sort
        self.df.sort_values([session_key, time_key], inplace=True)
        self.add_item_indices(itemmap=itemmap)

        #Sort the df by time, and then by session ID. That is, df is sorted by session ID and
        #clicks within a session are next to each other, where the clicks within a session are time-ordered.

        self.click_offsets = self.get_click_offsets()
        self.session_idx_arr = self.order_session_idx()
        
        
    def get_click_offsets(self):
        """
        Return the offsets of the beginning clicks of each session IDs,
        where the offset is calculated against the first click of the first session ID.
        """
        offsets = np.zeros(self.df[self.session_key].nunique() + 1, dtype=np.int32)
        # group & sort the df by session_key and get the offset values
        offsets[1:] = self.df.groupby(self.session_key).size().cumsum()

        return offsets
    

    def order_session_idx(self):
        """ Order the session indices """
        if self.time_sort:
            # starting time for each sessions, sorted by session IDs
            sessions_start_time = self.df.groupby(self.session_key)[self.time_key].min().values
            # order the session indices by session starting times
            session_idx_arr = np.argsort(sessions_start_time)
        else:
            session_idx_arr = np.arange(self.df[self.session_key].nunique())

        return session_idx_arr
    
    
    def add_item_indices(self, itemmap=None):
        """ 
        Add item index column named "item_idx" to the df
        Args:
            itemmap (pd.DataFrame): mapping between the item Ids and indices
        """
        if itemmap is None:
            item_ids = self.df[self.item_key].unique()  # unique item ids
            item2idx = pd.Series(data=np.arange(len(item_ids)),
                                 index=item_ids)
            itemmap = pd.DataFrame({self.item_key:item_ids,
                                   'item_idx':item2idx[item_ids].values})
        
        self.itemmap = itemmap
        self.df = pd.merge(self.df, self.itemmap, on=self.item_key, how='inner')
        
    
    @property    
    def items(self):
        return self.itemmap.ItemId.unique()
        

class SessionDataLoader:
    def __init__(self, dataset, batch_size=50):
        """
        A class for creating session-parallel mini-batches.
        Args:
             dataset (SessionDataset): the session dataset to generate the batches from
             batch_size (int): size of the batch
        """
        self.dataset = dataset
        self.batch_size = batch_size
        
        
    def __iter__(self):
        """ Returns the iterator for producing session-parallel training mini-batches.
        Yields:
            input (B,): torch.FloatTensor. Item indices that will be encoded as one-hot vectors later.
            target (B,): a Variable that stores the target item indices
            masks: Numpy array indicating the positions of the sessions to be terminated
        """

        # initializations
        df = self.dataset.df
        
        print(df.head(n=100))
        
        session_key='SessionId'
        item_key='ItemId'
        time_key='TimeStamp'
        self.n_items = df[item_key].nunique()+1
        click_offsets = self.dataset.click_offsets
        session_idx_arr = self.dataset.session_idx_arr

        iters = np.arange(self.batch_size)
        maxiter = iters.max()
        start = click_offsets[session_idx_arr[iters]]
        end = click_offsets[session_idx_arr[iters] + 1]
        mask = [] # indicator for the sessions to be terminated
        finished = False        

        while not finished:
            minlen = (end - start).min()
            # Item indices(for embedding) for clicks where the first sessions start
            idx_target = df.item_idx.values[start]
            for i in range(minlen - 1):
                # Build inputs & targets
                idx_input = idx_target
                idx_target = df.item_idx.values[start + i + 1]
                input = idx_input
                target = idx_target
                yield input, target, mask
                
            # click indices where a particular session meets second-to-last element
            start = start + (minlen - 1)
            # see if how many sessions should terminate
            mask = np.arange(len(iters))[(end - start) <= 1]
            for idx in mask:
                maxiter += 1
                if maxiter >= len(click_offsets) - 1:
                    finished = True
                    break
                # update the next starting/ending point
                iters[idx] = maxiter
                start[idx] = click_offsets[session_idx_arr[maxiter]]
                end[idx] = click_offsets[session_idx_arr[maxiter] + 1]

In [5]:
batch_size = 512 #como en el paper
session_max_len = 100
embeddingp=False

n_items = len(train_data['ItemId'].unique())+1
print("Items unicos training:", n_items)

dev_n_items = len(dev_data['ItemId'].unique())+1
print("Items unicos dev:", dev_n_items)

test_n_items = len(test_data['ItemId'].unique())+1
print("Items unicos testing:", test_n_items)

train_samples_qty = len(train_data['SessionId'].unique()) # cantidad sesiones no augmentadas de train
print("Sesiones training:", train_samples_qty)

dev_samples_qty = len(dev_data['SessionId'].unique()) # cantidad sesiones no augmentadas de dev
print("Sesiones validation:",dev_samples_qty)

test_samples_qty = len(test_data['SessionId'].unique()) # cantidad sesiones no augmentadas de test
print("Sesiones testing:", test_samples_qty)

Items unicos training: 11619
Items unicos dev: 10103
Items unicos testing: 10365
Sesiones training: 19850
Sesiones validation: 5747
Sesiones testing: 5270


In [6]:
train_fraction = 1#256 # 1/fraction es la cantidad de sesiones mas recientes a considerar
dev_fraction = 1#2

train_offset_step=train_samples_qty//batch_size
dev_offset_step=dev_samples_qty//batch_size
test_offset_step=test_samples_qty//batch_size


aux = [0]
aux.extend(list(train_data['ItemId'].unique()))
itemids = np.array(aux)
itemidmap = pd.Series(data=np.arange(n_items), index=itemids) 

In [7]:
# Modelo

# ToDo: self-attention

def attention_3d_block(inputs, TIME_STEPS, SINGLE_ATTENTION_VECTOR=True):
    # inputs.shape = (batch_size, time_steps, input_dim)
    input_dim = int(inputs.shape[2])
    a = Permute((2, 1))(inputs)
    #a = Reshape((input_dim, TIME_STEPS))(a) # this line is not useful. It's just to know which dimension is what.
    a = Dense(TIME_STEPS, activation='softmax')(a)
    if SINGLE_ATTENTION_VECTOR:
        a = Lambda(lambda x: K.mean(x, axis=1), name='dim_reduction')(a)
        a = RepeatVector(input_dim)(a)
    a_probs = Permute((2, 1), name='attention_vec')(a)
    output_attention_mul = merge([inputs, a_probs], name='attention_mul', mode='mul')
    return output_attention_mul
    
emb_size = 50
hidden_units = 100
size = emb_size
#size = emb_size if embeddingp else n_items

inputs = Input(batch_shape=(batch_size, 1, n_items))
#emb = Embedding(n_items, emb_size, embeddings_initializer='uniform', input_length=session_max_len)(inputs)
#drop1 = Dropout(0.25)(emb)
gru, gru_states = CuDNNGRU(hidden_units, stateful=True, return_state=True)(inputs)# drop1) #
drop2 = Dropout(0.25)(gru)
#attention_mul = attention_3d_block(drop2, session_max_len)
#attention_mul = Flatten()(attention_mul)
predictions = Dense(n_items, activation='softmax')(drop2)#(attention_mul)#
model = Model(input=inputs, output=[predictions])
#custom_loss = custom_cosine_loss(itemidmap, n_items)
# lr original es 0.0001
opt = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
# Try Nadam, too
model.compile(loss=categorical_crossentropy, optimizer=opt)
model.summary()

#filepath='./bast/model_checkpoint'
#checkpoint = ModelCheckpoint(filepath, monitor='loss', verbose=2, save_best_only=True, mode='min')
callbacks_list = []#[checkpoint]

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (512, 1, 11619)           0         
_________________________________________________________________
cu_dnngru_1 (CuDNNGRU)       [(512, 100), (512, 100)]  3516300   
_________________________________________________________________
dropout_1 (Dropout)          (512, 100)                0         
_________________________________________________________________
dense_1 (Dense)              (512, 11619)              1173519   
Total params: 4,689,819
Trainable params: 4,689,819
Non-trainable params: 0
_________________________________________________________________




In [8]:
def get_states(model):
    return [K.get_value(s) for s,_ in model.state_updates]

def set_states(model, states):
    for (d,_), s in zip(model.state_updates, states):
        K.set_value(d, s)

In [14]:
def get_recall(generator, model, recall_k=20):

    n = 0
    suma = 0
    suma_baseline = 0

    for feat, label, mask in generator:
        input_oh = to_categorical(feat, num_classes=loader.n_items) 
        input_oh = np.expand_dims(input_oh, axis=1)

        target_oh = to_categorical(label, num_classes=loader.n_items)

        pred = model.predict(input_oh, batch_size=batch_size)

        if n%100 == 0:
            try:
                print("{}:{}".format(n, suma/n))
            except:
                pass

        for row_idx in range(feat.shape[0]):
            #baseline_pred = obj.recommend( str(test_batch[0][row_idx][-1]), 20 )
            pred_row = pred[row_idx] # 37484, #.reshape(1, -1) # 50,
            label_row = target_oh[row_idx]        #.reshape(1, -1) # 50,

            idx1 = pred_row.argsort()[-recall_k:][::-1]
            idx2 = label_row.argsort()[-1:][::-1]

            n += 1
            #print(idx1)
            #print(idx2)
            if idx2[0] in idx1:
                suma += 1

            #if idx2[0] in baseline_pred:
            #  suma_baseline += 1

    print("Recall@{} epoch {}: {}".format(recall_k, epoch, suma/n))

In [10]:
train_dataset = SessionDataset(train_data)

#print(dataset.df.head())

for epoch in range(3):
    print("Starting epoch {}...".format(epoch))
    loader = SessionDataLoader(train_dataset, batch_size=batch_size)
    for feat, target, mask in loader:
                
        input_oh = to_categorical(feat, num_classes=loader.n_items) 
        input_oh = np.expand_dims(input_oh, axis=1)
        
        target_oh = to_categorical(target, num_classes=loader.n_items)
        
        tr_loss = model.train_on_batch(input_oh, target_oh)
        
        real_mask = np.ones((batch_size, 1))
        for elt in mask:
            real_mask[elt, :] = 0
        
        hidden_states = get_states(model)[0]#512,100    #get_states(model)[0]
               
        hidden_states = np.multiply(real_mask, hidden_states)
        hidden_states = np.array(hidden_states, dtype=np.float32)
        #hidden_states = np.expand_dims(hidden_states, axis=0)
        
        #set_states(model, hidden_states)\):
        #print(hidden_states.shape)
        #K.set_value(model.layers[1].states, hidden_states)
        model.layers[1].reset_states(hidden_states)

    
    print("Epoch {} last loss: {}".format(epoch, tr_loss))
    
    test_dataset = SessionDataset(test_data)
    test_generator = SessionDataLoader(test_dataset, batch_size=batch_size)
    
    get_recall(test_generator, model)

Starting epoch 0...
Epoch 0 last loss: 9.117029190063477
12800:0.001171875
25600:0.0005859375
38400:0.000390625
51200:0.00029296875
64000:0.000234375
76800:0.0001953125
89600:0.0007366071428571429
102400:0.00166015625
115200:0.002560763888888889
128000:0.0031328125
140800:0.0029119318181818183
153600:0.0026692708333333334
166400:0.003894230769230769
179200:0.005
192000:0.00590625
Recall@20 epoch 0: 0.006038326243455497
Starting epoch 1...
Epoch 1 last loss: 9.143878936767578
12800:0.138125
25600:0.113828125
38400:0.09427083333333333
51200:0.08814453125
64000:0.090515625
76800:0.08895833333333333
89600:0.08435267857142857
102400:0.083828125
115200:0.08611111111111111
128000:0.0853671875
140800:0.08402698863636364
153600:0.08501302083333333
166400:0.0841826923076923
179200:0.08396763392857143
192000:0.08227083333333333
Recall@20 epoch 1: 0.08143304155759162
Starting epoch 2...
Epoch 2 last loss: 8.178423881530762
12800:0.408125
25600:0.361484375
38400:0.3076041666666667
51200:0.308457031

In [24]:
dev_dataset = SessionDataset(dev_data)
dev_generator = SessionDataLoader(dev_dataset, batch_size=batch_size)
    
get_recall(dev_generator, model)

    SessionId  ItemId        Time  item_idx
0          49    2302  1367549005         0
1        1268    2302  1386899188         0
2        1597    2302  1377895328         0
3        2161    2302  1390601637         0
4        3990    2302  1392030145         0
5        4967    2302  1371530281         0
6        4983    2302  1369601471         0
7        5039    2302  1387623912         0
8        6092    2302  1366831892         0
9        6615    2302  1378716484         0
10       7098    2302  1367237291         0
11       7209    2302  1369090925         0
12       7922    2302  1371954565         0
13       8753    2302  1388952359         0
14       9353    2302  1381790220         0
15      10903    2302  1370880281         0
16      16899    2302  1377453013         0
17      17281    2302  1395356543         0
18      21481    2302  1385223173         0
19      21636    2302  1366332013         0
20      27171    2302  1386896988         0
21      28368    2302  138765121

KeyboardInterrupt: 

# Resultados Movie Lens con Batcher paralelo
Recall@20 epoch 3: 0.252019592604712


# All train set
Recall@10 epoch 29: 0.08087340943113773
Recall@20: 0.10473194236526946

vs Hidasi

Recall @ 20 0.2177499329156604
MRR@20: 0.06513681594077811

Pruebas atencion
Baseline
Recall@20 epoch 49: 0.09473825785928144
MultAttn



# Train Set
Recall@10 epoch ..100?: 0.09546921781437126

Recall@10 epoch 14: 0.06404879908501715

Recall @20 epoch 99: 0.08705440681137724

Con session_max_len = 100:

Recall @20 epoch 9: 0.12195335890718563

Con dwell_time NO FUNCIONA BIEN. Hacer ese supuesto en este dataset no tiene sentido.
