# Let´s recommend!!

We will load the model that generate candidates and create a function that receives a customer as input and returns a top of N products to be recommended. We will evaluate the results afterwords.

In [142]:
import pandas as pd
import os, sys
import numpy as np
import seaborn as sns
import gc
import warnings
import matplotlib.pyplot as plt

from keras.models import Model, Sequential, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding
from gensim.models import Word2Vec

In [122]:
path = os.path.join('../../Data/')
data = pd.read_csv(path + 'data_filtered_20190422.csv', sep = ';')
data_processed = pd.read_csv('data_final_20190522.csv')
data_processed['text'] = data_processed['text'].astype(str)

In [123]:
items_unique = data_processed.item_id.unique()
items_map = {i:val for i,val in enumerate(items_unique)}
items_map_inv = {val:i for i,val in enumerate(items_unique)}
items_map_text = data_processed.set_index('item_id_int').text.to_dict()

In [124]:
data.head(3)

Unnamed: 0,item_id,availability_date,brand,category,name,price
0,A28233506,,Woman Limited El Corte Inglés,"['Moda', 'Mujer', 'Abrigos']",Abrigo masculino con textura de mujer,"{'final': 199, 'currency': 'EUR'}"
1,A29054782,,Woman Limited El Corte Inglés,"['Moda', 'Mujer', 'Abrigos']",Abrigo doble faz de mujer con cinturón a tono,"{'final': 149, 'currency': 'EUR'}"
2,A27354432,,Woman El Corte Inglés,"['Moda', 'Mujer', 'Abrigos']",Abrigo largo de antelina de mujer Woman El Cor...,"{'final': 89.99, 'currency': 'EUR'}"


In [125]:
data_processed.head(3)

Unnamed: 0,date,item_id,brand,PRICE,customer_id,text,item_age,customer_id_int,item_id_int,brand_id,score,power_price,power_score,power_item_age,sqrt_price,sqrt_score,sqrt_item_age
0,20190101,A26036172,tintoretto,0.002036,0,vestido mujer flor lazada,0.0,0,0,0,0.003465,4e-06,1.2e-05,0.0,0.045126,0.058867,0.0
1,20190115,A26036172,tintoretto,0.002036,0,vestido mujer flor lazada,0.0,0,0,0,0.003465,4e-06,1.2e-05,0.0,0.045126,0.058867,0.0
2,20190101,A26870590,fórmula joven,0.001444,0,vestido laminado mujer formula joven escote pico,0.0,0,1,1,0.001485,2e-06,2e-06,0.0,0.037997,0.038538,0.0


In [126]:
MAX_NB_WORDS = 30_000 #decided by cumsum wordcount plot (Script 01)
MAX_SEQUENCE_LENGTH = 24 #decided by max words in a product (Script 00)
EMBEDDING_DIM = 100 #Same dim as our W2V embedding

all_text = data_processed['text']
all_text = all_text.drop_duplicates (keep = False)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, )
tokenizer.fit_on_texts(all_text)

data_sequences = tokenizer.texts_to_sequences(data_processed['text'])
data_vec = pad_sequences(data_sequences, maxlen=MAX_SEQUENCE_LENGTH)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 12056 unique tokens.


In [127]:
model = load_model('candidate_generation_20190522')

## Predictions

In [128]:
test_pred = model.predict([data_processed['customer_id_int'], data_processed['item_id_int'], 
                     data_processed['brand_id'], data_processed['PRICE'],
                     data_vec, data_processed['item_age'], data_processed['score'],
                     data_processed['power_price'], data_processed['power_score'], data_processed['power_item_age'],
                     data_processed['sqrt_price'], data_processed['sqrt_score'], data_processed['sqrt_item_age']],
                     verbose = 1)



In [129]:
table = pd.pivot_table(data_processed, values='score', index=['customer_id_int'],
                      columns=['item_id_int'], aggfunc=np.sum, fill_value=0)

print(table.shape)
columnas = table.columns
table.head()

(1112, 34849)


item_id_int,0,1,2,3,4,5,6,7,8,9,...,34839,34840,34841,34842,34843,34844,34845,34846,34847,34848
customer_id_int,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.006931,0.001485,0,0.0,0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0.0
1,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0.0
2,0.0,0.0,0,0.0,0,0.008911,0.0,0.0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0.0
3,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0.0
4,0.0,0.0,0,0.0,0,0.0,0.0,0.0,0,0,...,0.0,0.0,0.0,0,0,0,0,0,0,0.0


In [130]:
def get_tabla_predicciones(predictions):
    data_preds = pd.DataFrame(predictions)
    data_preds['customer_id_int'] = data_processed['customer_id_int']
    data_preds = data_preds.groupby(['customer_id_int']).max()
    data_preds.columns = columnas.values
    return data_preds
        
data_preds = get_tabla_predicciones(test_pred)
print(data_preds.shape)
gc.enable()
del test_pred
gc.collect()
data_preds.head()

(1112, 34849)


Unnamed: 0_level_0,0,1,2,3,4,5,6,7,8,9,...,34839,34840,34841,34842,34843,34844,34845,34846,34847,34848
customer_id_int,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.337613,0.022734,8.7e-05,0.009169,0.003799,0.0011,0.000503,0.002297,7.2e-05,0.000303,...,5.5e-05,0.000107,2.7e-05,2.9e-05,2.3e-05,3.1e-05,2.7e-05,5.3e-05,4.2e-05,2.4e-05
1,0.002988,0.001911,8.7e-05,0.964418,0.045386,0.000697,0.000713,0.001994,4e-05,0.000243,...,3.1e-05,5.8e-05,3.3e-05,3.6e-05,3.1e-05,4.3e-05,2.4e-05,3.1e-05,2.2e-05,2.8e-05
2,0.001847,0.000361,5.3e-05,0.001603,0.001132,0.011029,0.002276,0.006028,8.4e-05,0.000609,...,4.3e-05,5.9e-05,3e-05,3.7e-05,2.9e-05,4.3e-05,3.6e-05,8.8e-05,7.3e-05,2.6e-05
3,0.006196,0.000634,8.2e-05,0.004045,0.000377,0.004397,0.003208,0.018453,0.000101,0.000741,...,3.8e-05,5.3e-05,3e-05,3.6e-05,2.5e-05,3.7e-05,2.7e-05,7.9e-05,7.6e-05,2.5e-05
4,0.002654,0.000147,1.5e-05,0.003008,0.00031,4.6e-05,6.7e-05,0.000191,2.5e-05,4.3e-05,...,1.8e-05,2e-05,2.8e-05,3.2e-05,2.5e-05,3.2e-05,2.3e-05,1.6e-05,1.6e-05,2.3e-05


In [250]:
def diversify(arr, diversity, plot = False):
    div = np.log(arr) / diversity
    exp_preds = np.exp(div)
    preds = exp_preds / np.sum(exp_preds)
    if plot:
        plt.figure(figsize = (10, 8));
        plt.subplot(2, 1, 1);
        sns.distplot(arr); plt.title('Original Distribution');
        plt.subplot(2, 1, 2);
        sns.distplot(preds); plt.title(f'Distribution with {diversity} diversity')
    probas = np.random.multinomial(1, preds, 1)
    return probas

#diversify(data_preds.loc[128], diversity = 100, plot = True)

In [251]:
len(items_unique)

34849

In [271]:
def get_afines(data_pred, cliente, items_unique, N = 5):
    data_pred = diversify(data_pred.loc[cliente] , diversity = 0.01)
    data_pred = data_pred.reshape(len(items_unique))
    #print(data_pred)
    #top = data_pred.argsort()[-N:][::-1] #items positions
    top = data_pred.argsort()[-N:][::-1]
    print(top)
    print ("===================== PRODUCTOS MAS AFINES =====================")
    print([items_map_text[x] for x in (top)])
    print ("=============================================================")

In [272]:
CLIENTE = 128
get_afines(data_pred = data_preds, cliente = CLIENTE, items_unique = items_unique, N = 10)
data_processed[data_processed['customer_id_int'] == CLIENTE]

[14521 34848 11618 11617 11616 11615 11614 11613 11612 11434]
['chaqueta hombre cuello alto', 'calcetin compresion pro racing v30 bike', 'falda ajustada mujer green coast lurex', 'falda ajustada mujer green coast color negro', 'culotte tirante hombre race', 'culotte hombre race', 'vestido corto mujer estampado delantero', 'camiseta mujer estampado manga francesa', 'camiseta mujer efecto camisa interior', 'estuche regalo contorno ojo labio future solution eyelip']


Unnamed: 0,date,item_id,brand,PRICE,customer_id,text,item_age,customer_id_int,item_id_int,brand_id,score,power_price,power_score,power_item_age,sqrt_price,sqrt_score,sqrt_item_age
1024,20190120,A16759315,dustin,0.002185,128,chaqueta punto hombre cuello alto,0.0,128,898,229,0.0,4.772151e-06,0.0,0.0,0.046739,0.0,0.0
1025,20190120,A24965927,dustin,0.001888,128,jersey hombre cuello redondo,0.0,128,899,229,0.0,3.565368e-06,0.0,0.0,0.043454,0.0,0.0
1831,20190120,A27344180,esprit,0.001811,128,jersey hombre gris oscuro cuello caja,0.0,128,1580,257,0.0,3.280366e-06,0.0,0.0,0.042558,0.0,0.0
2349,20190120,A4690016,warner bros entertainment,0.003518,128,pack harry_potter coleccion completa bluray,0.0,128,2002,325,0.0,1.237568e-05,0.0,0.0,0.059312,0.0,0.0
2350,20190120,A10317195,warner bros entertainment,0.002259,128,harry_potter reliquia muerte parte 3d bluray c...,0.0,128,2003,325,0.0,5.101284e-06,0.0,0.0,0.047525,0.0,0.0
2351,20190120,A12070559,warner bros entertainment,0.012482,128,harry_potter coleccion hogwarts dvd bluray,0.0,128,2004,325,0.0,0.0001557989,0.0,0.0,0.111723,0.0,0.0
2352,20190120,A13453366,warner bros entertainment,0.000999,128,harry_potter piedra filosofal dvd,0.0,128,2005,325,0.0,9.98593e-07,0.0,0.0,0.031612,0.0,0.0
2353,20190120,A13453371,warner bros entertainment,0.000999,128,harry_potter camara secreta dvd,0.0,128,2006,325,0.0,9.98593e-07,0.0,0.0,0.031612,0.0,0.0
2354,20190120,A13453375,warner bros entertainment,0.000999,128,harry_potter prisionero azkaban dvd,0.0,128,2007,325,0.0,9.98593e-07,0.0,0.0,0.031612,0.0,0.0
2355,20190120,A13453380,warner bros entertainment,0.000999,128,harry_potter caliz fuego dvd,0.0,128,2008,325,0.0,9.98593e-07,0.0,0.0,0.031612,0.0,0.0
