# Rank Model

This model will rank the items generated by the candidates model and sort recommendations due to the ranked output. Trys to predict the quantity of times the customer is going to purchase that item.

In [1]:
import pandas as pd
import os, sys
import numpy as np
import seaborn as sns
import gc
import warnings

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding
from gensim.models import Word2Vec

from tensorflow.keras.regularizers import l1, l2
from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.layers import Input, Flatten, Dense, Conv1D, MaxPooling1D, GlobalMaxPool1D, SpatialDropout1D, \
                          LSTM, GRU, concatenate, Bidirectional, \
                          Reshape, Dropout, GlobalAveragePooling1D

from sklearn.preprocessing import StandardScaler, MinMaxScaler

from tensorflow.keras.utils import to_categorical

from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt

from custom_functions import norm_text, norm_brands, norm_images

pd.set_option('max_colwidth', 250)
warnings.filterwarnings(action='once')

Using TensorFlow backend.


## Load Data

In [2]:
path = os.path.join('../Data/')
path_models = os.path.join('../Models/')

data = pd.read_csv(path + 'FinalItems/data_filtered.csv', sep = ';')
data_processed = pd.read_csv(path + 'FinalItems/data_final.csv')
data_processed['text'] = data_processed['text'].astype(str)

In [3]:
data_processed.head(3)

Unnamed: 0,date,item_id,brand,PRICE,customer_id,text,item_age,customer_id_int,item_id_int,brand_id,score,score_original,power_price,power_score,power_item_age,sqrt_price,sqrt_score,sqrt_item_age
0,20190101,A26036172,tintoretto,0.002036,0,vestido mujer flor lazada,0.0,0,0,0,0.004648,13,4e-06,2.2e-05,0.0,0.045126,0.068173,0.0
1,20190115,A26036172,tintoretto,0.002036,0,vestido mujer flor lazada,0.0,0,0,0,0.004648,13,4e-06,2.2e-05,0.0,0.045126,0.068173,0.0
2,20190220,A26036172,tintoretto,0.002036,0,vestido mujer flor lazada,0.0,0,0,0,0.004648,13,4e-06,2.2e-05,0.0,0.045126,0.068173,0.0


In [4]:
items_unique = data_processed.item_id.unique()
items_map = {i:val for i,val in enumerate(items_unique)}
items_map_inv = {val:i for i,val in enumerate(items_unique)}
items_map_text = data_processed.set_index('item_id_int').text.to_dict()

customers_unique = data_processed.customer_id.unique()
items_unique = data_processed.item_id.unique()
brand_unique = data_processed.brand.unique()

In [5]:
MAX_NB_WORDS = 30_000 #decided by cumsum wordcount plot (Script 01)
MAX_SEQUENCE_LENGTH = 24 #decided by max words in a product (Script 00)
EMBEDDING_DIM = 100 #Same dim as our W2V embedding

all_text = data_processed['text']
all_text = all_text.drop_duplicates (keep = False)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, )
tokenizer.fit_on_texts(all_text)

data_sequences = tokenizer.texts_to_sequences(data_processed['text'])
data_vec = pad_sequences(data_sequences, maxlen=MAX_SEQUENCE_LENGTH)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 15982 unique tokens.


### Load Embedding

In [8]:
modelWV = Word2Vec.load(path_models + 'word2vec_model')

word_vectors = modelWV.wv
vocabulary_size = len(word_index) + 1
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    if word in modelWV:
        embedding_matrix[i] = modelWV[word]
    else:
        embedding_matrix[i] = np.random.rand(1, EMBEDDING_DIM)[0]
            

del(word_vectors)

  
  if __name__ == '__main__':


## Build rank model

In [9]:
scaler = MinMaxScaler()
data_to_scale = data_processed[['PRICE','score', 'item_age']]
scaled_data = data_to_scale.values

In [13]:
def get_rank_model():
    user_id_input = Input(shape=[1], name='user')
    item_id_input = Input(shape=[1], name='item')
    brand_id_input = Input(shape = [1], name = 'brand')

    price_input = Input(shape = [1], name = 'price')
    item_age_input = Input(shape = [1], name = 'item_age')

    price_power_input = Input(shape = [1], name = 'power_price')
    item_age_power_input = Input(shape = [1], name = 'power_item_age')

    price_sqrt_input = Input(shape = [1], name = 'sqrt_price')
    item_age_sqrt_input = Input(shape = [1], name = 'sqrt_item_age')
    
    sequence_input = Input(shape = (MAX_SEQUENCE_LENGTH, ), name = 'text')

    embedding_size = 20
    item_embedding_size = 50

    user_embedding = Embedding(output_dim=embedding_size, input_dim = customers_unique.shape[0]+1,
                               input_length=1, name='user_embedding')(user_id_input)
    item_embedding = Embedding(output_dim=item_embedding_size, input_dim = items_unique.shape[0]+1,
                               input_length=1, name='item_embedding')(item_id_input)
    brand_embedding = Embedding(output_dim=embedding_size, input_dim = brand_unique.shape[0]+1,
                               input_length=1, name='brand_embedding')(brand_id_input)
    
    text_embedding = Embedding(*embedding_matrix.shape, weights = [embedding_matrix], trainable = False)(sequence_input)

    user_vecs = Reshape([embedding_size])(user_embedding)
    item_vecs = Reshape([item_embedding_size])(item_embedding)
    brand_vecs = Reshape([embedding_size])(brand_embedding)
    text_flat = Flatten()(text_embedding)

    x1 = concatenate([user_vecs, item_vecs, brand_vecs, price_input, item_age_input,
                     price_power_input, item_age_power_input,
                     price_sqrt_input, item_age_sqrt_input], name = 'VECTOR_PRODUCTO')

    x2 = Dropout(0.5)(x1)
    x3 = Dense (512, activation = 'relu', activity_regularizer=l1(0.0001))(x2)
    x4 = Dropout(0.4)(x3)
    x5 = Dense(256, activation = 'relu', activity_regularizer=l1(0.0001))(x4)
    x6 = Dropout(0.3)(x5)
    x7 = Dense(128, activation = 'relu',activity_regularizer=l1(0.0001))(x6)
    x8 = Dropout(0.2)(x7)

    output = Dense(1, activation = 'linear' )(x8)

    model = Model(inputs=[user_id_input, item_id_input, brand_id_input, price_input, item_age_input,
                         price_power_input, item_age_power_input,
                         price_sqrt_input, item_age_sqrt_input], 
                  outputs = output)

    model.compile(loss = 'mse',
                  optimizer = 'rmsprop',
                  metrics = ['acc'])

    model.summary()
    
    return model

In [14]:
rank_model = get_rank_model()

history = rank_model.fit([data_processed['customer_id_int'], data_processed['item_id_int'], 
                     data_processed['brand_id'], data_processed['PRICE'],
                     data_processed['item_age'],
                     data_processed['power_price'], data_processed['power_item_age'],
                     data_processed['sqrt_price'], data_processed['sqrt_item_age'], data_vec],
                     data_processed['score_original'],
                     epochs = 15,
                     batch_size = 256, 
                     shuffle = True)

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
user (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
item (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
brand (InputLayer)              (None, 1)            0                                            
__________________________________________________________________________________________________
user_embedding (Embedding)      (None, 1, 20)        89080       user[0][0]                       
__________________________________________________________________________________________________
item_embed

Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [15]:
rank_model.save(path_models + 'rank_model')

### Lets test our rankings

In [16]:
preds = rank_model.predict([data_processed['customer_id_int'], data_processed['item_id_int'], 
                     data_processed['brand_id'], data_processed['PRICE'],
                     data_processed['item_age'],
                     data_processed['power_price'], data_processed['power_item_age'],
                     data_processed['sqrt_price'], data_processed['sqrt_item_age'], data_vec], verbose = 1)



In [17]:
preds.shape

(186397, 1)

In [18]:
preds[:10]

array([[2.0089362],
       [2.0089362],
       [2.0089362],
       [2.0089362],
       [1.8771589],
       [1.8415782],
       [1.8415782],
       [1.7933329],
       [1.7933329],
       [1.8692055]], dtype=float32)

In [19]:
data_processed['score_original'][:10]

0    13
1    13
2    13
3    13
4     4
5     5
6     5
7     2
8     2
9     2
Name: score_original, dtype: int64

#### Query ranking

We will take customer 128 as query to give an example of our rankings. This customer is a Harry Potter freak as I show below.

In [22]:
c_idx = data_processed.index[data_processed['customer_id_int'] == 128] 
data_processed[data_processed['customer_id_int'] == 128]

Unnamed: 0,date,item_id,brand,PRICE,customer_id,text,item_age,customer_id_int,item_id_int,brand_id,score,score_original,power_price,power_score,power_item_age,sqrt_price,sqrt_score,sqrt_item_age
1468,20190120,A16759315,dustin,0.002185,128,chaqueta punto hombre cuello alto,0.0,128,898,229,0.0,1,4.772151e-06,0.0,0.0,0.046739,0.0,0.0
1469,20190120,A24965927,dustin,0.001888,128,jersey hombre cuello redondo,0.0,128,899,229,0.0,1,3.565368e-06,0.0,0.0,0.043454,0.0,0.0
2561,20190120,A27344180,esprit,0.001811,128,jersey hombre gris oscuro cuello caja,0.0,128,1580,257,0.0,1,3.280366e-06,0.0,0.0,0.042558,0.0,0.0
3232,20190120,A4690016,warner bros entertainment,0.003518,128,pack harry_potter coleccion completa bluray,0.0,128,2002,325,0.0,1,1.237568e-05,0.0,0.0,0.059312,0.0,0.0
3233,20190120,A10317195,warner bros entertainment,0.002259,128,harry_potter reliquia muerte parte 3d bluray copia digital,0.0,128,2003,325,0.0,1,5.101284e-06,0.0,0.0,0.047525,0.0,0.0
3234,20190120,A12070559,warner bros entertainment,0.012482,128,harry_potter coleccion hogwarts dvd bluray,0.0,128,2004,325,0.0,1,0.0001557989,0.0,0.0,0.111723,0.0,0.0
3235,20190120,A13453366,warner bros entertainment,0.000999,128,harry_potter piedra filosofal dvd,0.0,128,2005,325,0.0,1,9.98593e-07,0.0,0.0,0.031612,0.0,0.0
3236,20190120,A13453371,warner bros entertainment,0.000999,128,harry_potter camara secreta dvd,0.0,128,2006,325,0.0,1,9.98593e-07,0.0,0.0,0.031612,0.0,0.0
3237,20190120,A13453375,warner bros entertainment,0.000999,128,harry_potter prisionero azkaban dvd,0.0,128,2007,325,0.0,1,9.98593e-07,0.0,0.0,0.031612,0.0,0.0
3238,20190120,A13453380,warner bros entertainment,0.000999,128,harry_potter caliz fuego dvd,0.0,128,2008,325,0.0,1,9.98593e-07,0.0,0.0,0.031612,0.0,0.0


In [23]:
preds[c_idx]

array([[1.5226746],
       [1.4778094],
       [1.8714023],
       [1.7846951],
       [1.7682991],
       [1.757714 ],
       [1.7694252],
       [1.760143 ],
       [1.7680278],
       [1.7580414],
       [1.7599478]], dtype=float32)

### Conclussion

- We are going to reject this model because is performing a huge bias with the items that are most usually bought as clothes, so it will trend to rank with higher predictions that kind of items even though our customer is a harry potter freak, as we saw in the example above.

- In order to perform a good ranking we will need the info of when a customer is going to buy again?

---