# Let's recommend!!

We will load the model that generates candidates and create a function that receives a customer as input and returns the top N products to recommend. We will evaluate the results afterwards.

In [1]:
import pandas as pd
import os, sys
import numpy as np
import seaborn as sns
import gc
import warnings 

from keras.models import Model, Sequential, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding
from gensim.models import Word2Vec

Using TensorFlow backend.


In [2]:
# Data directory. NOTE(review): `path` is not used by the reads below, which
# resolve the CSV names against the current working directory.
path = os.path.join('../../Data/')

# Raw catalogue export (semicolon-separated).
data = pd.read_csv('data_filtered_20190422.csv', sep=';')

# Processed interaction table; the product text column is forced to str.
data_processed = pd.read_csv('data_final_20190522.csv')
text_as_str = data_processed['text'].astype(str)
data_processed['text'] = text_as_str

In [123]:
# Distinct item ids, in order of first appearance in the processed table.
items_unique = data_processed['item_id'].unique()

# position -> item_id, and the reverse lookup item_id -> position.
items_map = dict(enumerate(items_unique))
items_map_inv = {item: pos for pos, item in items_map.items()}

# item_id_int -> product text (for repeated ids the last row wins in to_dict()).
items_map_text = data_processed.set_index('item_id_int')['text'].to_dict()

In [124]:
# Preview the first three rows of the raw catalogue frame.
data.head(3)

Unnamed: 0,item_id,availability_date,brand,category,name,price
0,A28233506,,Woman Limited El Corte Inglés,"['Moda', 'Mujer', 'Abrigos']",Abrigo masculino con textura de mujer,"{'final': 199, 'currency': 'EUR'}"
1,A29054782,,Woman Limited El Corte Inglés,"['Moda', 'Mujer', 'Abrigos']",Abrigo doble faz de mujer con cinturón a tono,"{'final': 149, 'currency': 'EUR'}"
2,A27354432,,Woman El Corte Inglés,"['Moda', 'Mujer', 'Abrigos']",Abrigo largo de antelina de mujer Woman El Cor...,"{'final': 89.99, 'currency': 'EUR'}"


In [125]:
# Preview the first three rows of the processed interaction frame.
data_processed.head(3)

Unnamed: 0,date,item_id,brand,PRICE,customer_id,text,item_age,customer_id_int,item_id_int,brand_id,score,power_price,power_score,power_item_age,sqrt_price,sqrt_score,sqrt_item_age
0,20190101,A26036172,tintoretto,0.002036,0,vestido mujer flor lazada,0.0,0,0,0,0.003465,4e-06,1.2e-05,0.0,0.045126,0.058867,0.0
1,20190115,A26036172,tintoretto,0.002036,0,vestido mujer flor lazada,0.0,0,0,0,0.003465,4e-06,1.2e-05,0.0,0.045126,0.058867,0.0
2,20190101,A26870590,fórmula joven,0.001444,0,vestido laminado mujer formula joven escote pico,0.0,0,1,1,0.001485,2e-06,2e-06,0.0,0.037997,0.038538,0.0


In [126]:
MAX_NB_WORDS = 30_000  # vocabulary cap, decided by cumsum wordcount plot (Script 01)
MAX_SEQUENCE_LENGTH = 24  # decided by max words in a product (Script 00)
EMBEDDING_DIM = 100  # same dim as our W2V embedding

# Corpus used to fit the tokenizer. keep=False discards EVERY text that occurs
# more than once (not only the extra copies).
# NOTE(review): confirm this matches how the tokenizer was fit at training time;
# the word indices must agree with the ones the loaded model was trained on.
all_text = data_processed['text'].drop_duplicates(keep=False)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(all_text)

# Encode every row's text (duplicates included) and pad to a fixed length.
data_sequences = tokenizer.texts_to_sequences(data_processed['text'])
data_vec = pad_sequences(data_sequences, maxlen=MAX_SEQUENCE_LENGTH)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 12056 unique tokens.


In [127]:
# Load the previously trained candidate-generation Keras model from disk.
model = load_model('candidate_generation_20190522')

## Predictions

In [128]:
# Score every row of data_processed. The inputs are passed as a positional
# list, so their order matters.
# NOTE(review): presumably this order mirrors the model's input layers as
# defined in the training script - verify before reordering anything.
test_pred = model.predict([data_processed['customer_id_int'], data_processed['item_id_int'], 
                     data_processed['brand_id'], data_processed['PRICE'],
                     data_vec, data_processed['item_age'], data_processed['score'],
                     data_processed['power_price'], data_processed['power_score'], data_processed['power_item_age'],
                     data_processed['sqrt_price'], data_processed['sqrt_score'], data_processed['sqrt_item_age']],
                     verbose = 1)



In [129]:
# Customer x item matrix of summed scores; pairs with no interaction get 0.
table = data_processed.pivot_table(
    values='score',
    index=['customer_id_int'],
    columns=['item_id_int'],
    aggfunc=np.sum,
    fill_value=0,
)

print(table.shape)

# Item labels of the matrix columns, reused to label the prediction table.
columnas = table.columns
table.head()

(1112, 34849)


item_id_int,0,1,2,3,4,5,6,7,8,9,...,34839,34840,34841,34842,34843,34844,34845,34846,34847,34848
customer_id_int,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.006931,0.001485,0,0.0,0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0.0
1,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0.0
2,0.0,0.0,0,0.0,0,0.008911,0.0,0.0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0.0
3,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0.0
4,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0.0


In [None]:
def get_tabla_predicciones(predictions):
    """Collapse row-level predictions into one row per customer.

    Each prediction row is tagged with the ``customer_id_int`` found at the
    same index in the notebook-level ``data_processed`` frame. For every
    customer the column-wise maximum is kept, and the resulting columns are
    relabelled with the notebook-level ``columnas`` values.

    Parameters
    ----------
    predictions : array-like accepted by ``pd.DataFrame``.
        Assumes one row per row of ``data_processed`` and one column per
        entry of ``columnas`` - TODO confirm against the model output.

    Returns
    -------
    pandas.DataFrame indexed by ``customer_id_int``.
    """
    scores = pd.DataFrame(predictions)
    # Index-aligned assignment: row i gets the customer id of data_processed row i.
    scores['customer_id_int'] = data_processed['customer_id_int']
    best_per_customer = scores.groupby('customer_id_int').max()
    best_per_customer.columns = columnas.values
    return best_per_customer
        
# Build the per-customer prediction table from the raw model output.
data_preds = get_tabla_predicciones(test_pred)
print(data_preds.shape)
# Release the raw predictions once the aggregated table exists.
# gc.enable() switches automatic collection on (already the CPython default).
# NOTE: after `del test_pred`, re-running this cell alone raises NameError
# until the prediction cell is run again.
gc.enable()
del test_pred
gc.collect()
data_preds.head()

In [None]:
def get_afines(data_pred, cliente, items_unique, N = 5):
    """Print the texts of the N best-scored items for one customer.

    Parameters
    ----------
    data_pred : pandas.DataFrame
        Score table; ``data_pred.loc[cliente]`` must select a single row, and
        the column labels must be keys of the notebook-level
        ``items_map_text`` dict.
    cliente : row label of the customer in ``data_pred``.
    items_unique : unused; kept so existing calls keep working.
    N : int, default 5
        Number of items to show, best first. ``N <= 0`` shows an empty list.

    Raises
    ------
    KeyError
        If ``cliente`` is not in the index of ``data_pred`` or a selected
        column label is missing from ``items_map_text``.
    """
    scores = data_pred.loc[cliente].values
    # Clamp N: with N == 0 the slice [-0:] would select every item.
    n_top = max(int(N), 0)
    positions = scores.argsort()[::-1][:n_top]  # best score first
    # argsort yields column POSITIONS; translate them to the column labels
    # (item ids) before the lookup, so this also works when the labels are
    # not exactly 0..n-1.
    top = data_pred.columns[positions]
    print ("===================== PRODUCTOS MAS AFINES =====================")
    print([items_map_text[x] for x in top])
    print ("=============================================================")

In [None]:
# Example customer (customer_id_int) to inspect.
CLIENTE = 128
# Print the 25 best-scored items for that customer...
get_afines(data_pred = data_preds, cliente = CLIENTE, items_unique = items_unique, N = 25)
# ...and display that customer's rows in data_processed for comparison.
data_processed[data_processed['customer_id_int'] == CLIENTE]