# Let's recommend!

We will load the model that generates candidates and create a function that receives a customer as input and returns the top N products to recommend. We will evaluate the results afterwards.

In [34]:
import pandas as pd
import os, sys
import numpy as np
import seaborn as sns
import gc
import warnings 

from keras.models import Model, Sequential, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding
from gensim.models import Word2Vec

In [87]:
# Raw catalogue export (semicolon-separated) lives under the shared data folder.
path = os.path.join('../../Data/')
data = pd.read_csv(os.path.join(path, 'data_filtered_20190422.csv'), sep=';')

# Processed interactions are read from the current working directory.
# NOTE(review): this file is not loaded from `path` -- confirm that is intended.
# The text column is cast to str so every value is a string for the tokenizer.
data_processed = pd.read_csv('data_final_20190522.csv').astype({'text': str})

In [88]:
# Distinct item ids, in order of first appearance in data_processed.
items_unique = data_processed.item_id.unique()

# position -> item_id and its inverse, item_id -> position.
# NOTE(review): presumably these positions coincide with item_id_int -- confirm.
items_map = dict(enumerate(items_unique))
items_map_inv = {item: position for position, item in items_map.items()}

# item_id_int -> product description text.
items_map_text = data_processed.set_index('item_id_int')['text'].to_dict()

In [89]:
# Preview the first three rows of the raw catalogue loaded from data_filtered_20190422.csv.
data.head(3)

Unnamed: 0,item_id,availability_date,brand,category,name,price
0,A28233506,,Woman Limited El Corte Inglés,"['Moda', 'Mujer', 'Abrigos']",Abrigo masculino con textura de mujer,"{'final': 199, 'currency': 'EUR'}"
1,A29054782,,Woman Limited El Corte Inglés,"['Moda', 'Mujer', 'Abrigos']",Abrigo doble faz de mujer con cinturón a tono,"{'final': 149, 'currency': 'EUR'}"
2,A27354432,,Woman El Corte Inglés,"['Moda', 'Mujer', 'Abrigos']",Abrigo largo de antelina de mujer Woman El Cor...,"{'final': 89.99, 'currency': 'EUR'}"


In [90]:
# Preview the first three rows of the processed table that is fed to the model.
data_processed.head(3)

Unnamed: 0,date,item_id,brand,PRICE,customer_id,text,item_age,customer_id_int,item_id_int,brand_id,score,power_price,power_score,power_item_age,sqrt_price,sqrt_score,sqrt_item_age
0,20190101,A26036172,tintoretto,0.002036,0,vestido mujer flor lazada,0.0,0,0,0,0.003465,4e-06,1.2e-05,0.0,0.045126,0.058867,0.0
1,20190115,A26036172,tintoretto,0.002036,0,vestido mujer flor lazada,0.0,0,0,0,0.003465,4e-06,1.2e-05,0.0,0.045126,0.058867,0.0
2,20190101,A26870590,fórmula joven,0.001444,0,vestido laminado mujer formula joven escote pico,0.0,0,1,1,0.001485,2e-06,2e-06,0.0,0.037997,0.038538,0.0


In [91]:
# Text-vectorisation settings.
MAX_NB_WORDS = 30_000  # decided by cumsum wordcount plot (Script 01)
MAX_SEQUENCE_LENGTH = 24  # decided by max words in a product (Script 00)
EMBEDDING_DIM = 100  # Same dim as our W2V embedding

# Vocabulary is fitted only on texts that occur exactly once in data_processed:
# keep=False discards *every* copy of a repeated text, not just the extras.
# NOTE(review): this looks unintended, but the resulting word index has to match
# the one the saved model was trained with -- presumably the training notebook
# fitted its tokenizer the same way. Confirm before changing it.
all_text = data_processed['text'].drop_duplicates(keep=False)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(all_text)

# Encode every row's text as word indices and pad to a fixed length.
data_sequences = tokenizer.texts_to_sequences(data_processed['text'])
data_vec = pad_sequences(data_sequences, maxlen=MAX_SEQUENCE_LENGTH)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 11327 unique tokens.


In [92]:
# Load the saved Keras candidate-generation model from the working directory.
model = load_model('candidate_generation_20190522')

## Predictions

In [49]:
# Score every row of data_processed with the loaded model. The inputs are
# passed as a list, in this order: id/price columns, the padded text
# sequences, then the age/score columns and their power/sqrt transforms.
test_pred = model.predict(
    [data_processed[column] for column in (
        'customer_id_int', 'item_id_int', 'brand_id', 'PRICE',
    )]
    + [data_vec]
    + [data_processed[column] for column in (
        'item_age', 'score',
        'power_price', 'power_score', 'power_item_age',
        'sqrt_price', 'sqrt_score', 'sqrt_item_age',
    )],
    verbose=1,
)



In [51]:
# Customer x item matrix holding the summed score of each (customer, item)
# pair; pairs that never occur are filled with 0.
# aggfunc is given as the string 'sum' rather than np.sum: recent pandas
# versions warn when a NumPy callable is passed and recommend the string to
# keep the current behaviour.
table = pd.pivot_table(
    data_processed,
    values='score',
    index=['customer_id_int'],
    columns=['item_id_int'],
    aggfunc='sum',
    fill_value=0,
)

print(table.shape)
# item_id_int labels in the pivot's column order; get_tabla_predicciones uses
# them to label the prediction columns.
columnas = table.columns
table.head()

(970, 30848)


item_id_int,0,1,2,3,4,5,6,7,8,9,...,30838,30839,30840,30841,30842,30843,30844,30845,30846,30847
customer_id_int,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.006931,0.001485,0,0.0,0,0.0,0,0,0,0,...,0,0,0,0,0.0,0.0,0,0,0,0
1,0.0,0.0,0,0.0,0,0.0,0,0,0,0,...,0,0,0,0,0.0,0.0,0,0,0,0
2,0.0,0.0,0,0.0,0,0.008911,0,0,0,0,...,0,0,0,0,0.0,0.0,0,0,0,0
3,0.0,0.0,0,0.0,0,0.0,0,0,0,0,...,0,0,0,0,0.0,0.0,0,0,0,0
4,0.0,0.0,0,0.0,0,0.0,0,0,0,0,...,0,0,0,0,0.0,0.0,0,0,0,0


In [53]:
def get_tabla_predicciones(predictions, customers=None, columns=None):
    """Collapse row-level predictions into one row per customer.

    Each row of ``predictions`` is attributed to the customer at the same
    position in ``customers``; for every customer the column-wise maximum
    over its rows is kept.

    Parameters
    ----------
    predictions : 2-D array-like
        One row of item scores per row of the scored table.
    customers : sequence, optional
        Customer id of each prediction row, matched by position. Defaults to
        the notebook-level ``data_processed['customer_id_int']``.
    columns : sequence, optional
        Labels for the item columns of the result. Defaults to the
        notebook-level ``columnas.values``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``customer_id_int``, one column per entry of ``columns``.

    Raises
    ------
    ValueError
        Raised by pandas when ``customers`` or ``columns`` does not have the
        length required by ``predictions``.
    """
    if customers is None:
        customers = data_processed['customer_id_int']
    if columns is None:
        columns = columnas.values

    preds_by_row = pd.DataFrame(predictions)
    # Assign positionally: a Series would be aligned on its index instead, which
    # silently produces NaN customer ids when that index is not 0..n-1.
    preds_by_row['customer_id_int'] = np.asarray(customers)
    preds_by_customer = preds_by_row.groupby(['customer_id_int']).max()
    preds_by_customer.columns = columns
    return preds_by_customer
        
# Reduce the row-level predictions to one row of item scores per customer.
data_preds = get_tabla_predicciones(test_pred)
print(data_preds.shape)
# Release the raw prediction array now that it has been aggregated.
# NOTE: because test_pred is deleted here, this cell cannot be re-run without
# re-running the prediction cell first.
gc.enable()
del test_pred
gc.collect()
data_preds.head()

(970, 30848)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,30838,30839,30840,30841,30842,30843,30844,30845,30846,30847
customer_id_int,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.171533,0.005733,5.9e-05,0.014871,0.001094,0.000358,0.000293,0.000179,5.5e-05,0.000374,...,3e-05,6.3e-05,5.4e-05,3.1e-05,3.4e-05,3.8e-05,2.8e-05,2.9e-05,2.7e-05,3.3e-05
1,0.001183,0.000548,5.5e-05,0.089445,0.003917,0.000515,0.000484,0.00024,5.4e-05,0.000434,...,3.3e-05,5.1e-05,2.8e-05,2.9e-05,3.5e-05,3e-05,2.8e-05,2.8e-05,2.7e-05,3.3e-05
2,0.001363,0.000224,5.8e-05,0.000389,0.000174,0.000827,0.000706,0.000257,5.1e-05,0.000618,...,3.8e-05,4.9e-05,3.4e-05,2.8e-05,3.2e-05,3.7e-05,2.5e-05,3.1e-05,3.1e-05,3.3e-05
3,0.055017,0.002447,6.3e-05,0.011304,0.000481,0.000943,0.000797,0.000283,4.8e-05,0.000781,...,3.4e-05,5.5e-05,3.3e-05,2.9e-05,4.6e-05,3.7e-05,2.6e-05,3.4e-05,3.4e-05,3.3e-05
4,0.023861,0.000496,2.2e-05,0.007685,0.000813,5e-05,3.7e-05,3.6e-05,2.6e-05,8.3e-05,...,3.1e-05,3e-05,2.7e-05,3.1e-05,2.9e-05,2.7e-05,2.8e-05,2.7e-05,2.5e-05,2.9e-05


In [114]:
def get_afines(data_pred, cliente, items_unique, N = 5, text_map = None):
    """Print the descriptions of the N highest-scored items for one customer.

    Parameters
    ----------
    data_pred : pandas.DataFrame
        Scores with one row per customer and one column per item.
    cliente : hashable
        Row label of the customer in ``data_pred``.
    items_unique : any
        Not used; kept so existing calls keep working.
    N : int, default 5
        Number of items to list, highest score first. A non-positive value
        lists nothing.
    text_map : mapping, optional
        Maps a column label of ``data_pred`` to the text to print. Defaults to
        the notebook-level ``items_map_text``.

    Returns
    -------
    None
        The result is only printed.
    """
    if text_map is None:
        text_map = items_map_text

    scores = data_pred.loc[cliente].values
    if N > 0:
        # Positions of the N largest scores, best first.
        positions = scores.argsort()[-N:][::-1]
        # Translate positions into column labels so the lookup stays correct
        # even when the labels are not exactly 0..n-1.
        top = list(data_pred.columns[positions])
    else:
        # scores.argsort()[-0:] would select every item, so handle this explicitly.
        top = []

    print ("===================== PRODUCTOS MAS AFINES =====================")
    print([text_map[x] for x in top])
    print ("=============================================================")

In [120]:
# customer_id_int of the customer to inspect.
CLIENTE = 128
# Print this customer's 25 highest-scored items, then display the customer's own
# rows from data_processed so the two can be compared by eye.
get_afines(data_pred = data_preds, cliente = CLIENTE, items_unique = items_unique, N = 25)
data_processed[data_processed['customer_id_int'] == CLIENTE]

['camiseta basica mujer cuello barco', 'jersey hombre cuello redondo', 'chaqueta husky mujer cuello caja', 'chaqueta husky mujer cuello alto', 'jersey oversize mujer cuello caja', 'vestido largo escote barco', 'camiseta mujer algodon supima cuello pico', 'chaqueta hombre cuello alto', 'blusa mujer bordado volante', 'vestido mujer corto manga', 'chaqueta cuadro mujer cuello chimenea', 'vestido tricolor manga corta', 'pantalon pijama mujer largo raya', 'jersey mujer woman corte_ingles cuello caja', 'vestido camisero estampado raya', 'vestido mujer flor lazada', 'plumifero corto mujer woman corte_ingles cuello alto', 'juego toalla bano diamond corte_ingles', 'vestido manga punto calado', 'camiseta basica mujer punto canale', 'vestido mujer verde jareta', 'juego funda nordica algodon facil planchado selva corte_ingles', 'jersey mujer green coast estampado navideno cuello redondo', 'sudadera mujer logo capucha', 'pantalon pitillo mujer cierre delantero']


Unnamed: 0,date,item_id,brand,PRICE,customer_id,text,item_age,customer_id_int,item_id_int,brand_id,score,power_price,power_score,power_item_age,sqrt_price,sqrt_score,sqrt_item_age
1012,20190120,A16759315,dustin,0.002185,128,chaqueta punto hombre cuello alto,0.0,128,898,229,0.0,4.772151e-06,0.0,0.0,0.046739,0.0,0.0
1013,20190120,A24965927,dustin,0.001888,128,jersey hombre cuello redondo,0.0,128,899,229,0.0,3.565368e-06,0.0,0.0,0.043454,0.0,0.0
1803,20190120,A27344180,esprit,0.001811,128,jersey hombre gris oscuro cuello caja,0.0,128,1580,257,0.0,3.280366e-06,0.0,0.0,0.042558,0.0,0.0
2313,20190120,A4690016,warner bros entertainment,0.003518,128,pack harry potter coleccion completa bluray,0.0,128,2002,325,0.0,1.237568e-05,0.0,0.0,0.059312,0.0,0.0
2314,20190120,A10317195,warner bros entertainment,0.002259,128,harry potter reliquia muerte parte 3d bluray c...,0.0,128,2003,325,0.0,5.101284e-06,0.0,0.0,0.047525,0.0,0.0
2315,20190120,A12070559,warner bros entertainment,0.012482,128,harry potter coleccion hogwarts dvd bluray,0.0,128,2004,325,0.0,0.0001557989,0.0,0.0,0.111723,0.0,0.0
2316,20190120,A13453366,warner bros entertainment,0.000999,128,harry potter piedra filosofal dvd,0.0,128,2005,325,0.0,9.98593e-07,0.0,0.0,0.031612,0.0,0.0
2317,20190120,A13453371,warner bros entertainment,0.000999,128,harry potter camara secreta dvd,0.0,128,2006,325,0.0,9.98593e-07,0.0,0.0,0.031612,0.0,0.0
2318,20190120,A13453375,warner bros entertainment,0.000999,128,harry potter prisionero azkaban dvd,0.0,128,2007,325,0.0,9.98593e-07,0.0,0.0,0.031612,0.0,0.0
2319,20190120,A13453380,warner bros entertainment,0.000999,128,harry potter caliz fuego dvd,0.0,128,2008,325,0.0,9.98593e-07,0.0,0.0,0.031612,0.0,0.0
