In [1]:
import pandas as pd
import os, sys
import numpy as np
import seaborn as sns
import gc
import warnings 

from keras.models import Model, Sequential, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding
from gensim.models import Word2Vec

Using TensorFlow backend.


In [2]:
path = os.path.join('../../Data/')
data = pd.read_csv('data_filtered_20190422.csv', sep = ';')
data_processed = pd.read_csv('data_final_20190523.csv')
data_processed['text'] = data_processed['text'].astype(str)

In [3]:
print(data_processed.shape)
data_processed.head(3)

(14976, 18)


Unnamed: 0,date,item_id,brand,PRICE,customer_id,text,item_age,customer_id_int,item_id_int,brand_id,score,score_original,power_price,power_score,power_item_age,sqrt_price,sqrt_score,sqrt_item_age
0,20190101,A26036172,tintoretto,0.003198,0,vestido mujer flor lazada,0.0,0,0,0,0.00208,4,1e-05,4e-06,0.0,0.056548,0.045612,0.0
1,20190101,A26870590,fórmula joven,0.00226,0,vestido laminado mujer formula joven escote pico,0.0,0,1,1,0.00208,4,5e-06,4e-06,0.0,0.047539,0.045612,0.0
2,20190103,MP_0659870_3014,gabrielle,0.008837,1,abrigo mujer negro avalorios,0.0,1,2,2,0.0,1,7.8e-05,0.0,0.0,0.094005,0.0,0.0


In [4]:
idx_customers_map = {i:val for i,val in enumerate(data_processed['customer_id_int'])}
idx_customers_df = pd.DataFrame({'idx': data_processed.index.values, 'customer_id_int': data_processed.customer_id_int})
idx_customers_df.head()

Unnamed: 0,idx,customer_id_int
0,0,0
1,1,0
2,2,1
3,3,1
4,4,1


In [5]:
items_unique = data_processed.item_id.unique() #20781
customers_unique = data_processed.customer_id_int.unique()
items_map = {i:val for i,val in enumerate(items_unique)}
items_map_inv = {val:i for i,val in enumerate(items_unique)}
items_map_text = data_processed.set_index('item_id_int').text.to_dict()

items_map_brand = data_processed.set_index('item_id_int').brand_id.to_dict()
items_map_price = data_processed.set_index('item_id_int').PRICE.to_dict()
items_map_power_price = data_processed.set_index('item_id_int').power_price.to_dict()
items_map_sqrt_price = data_processed.set_index('item_id_int').sqrt_price.to_dict()

items_map_item_age = data_processed.set_index('item_id_int').item_age.to_dict()
items_map_power_item_age = data_processed.set_index('item_id_int').power_item_age.to_dict()
items_map_sqrt_item_age = data_processed.set_index('item_id_int').sqrt_item_age.to_dict()

customers_unique = data_processed.customer_id.unique()
items_unique = data_processed.item_id.unique()
brand_unique = data_processed.brand.unique()

In [6]:
MAX_NB_WORDS = 30_000 #decided by cumsum wordcount plot (Script 01)
MAX_SEQUENCE_LENGTH = 24 #decided by max words in a product (Script 00)
EMBEDDING_DIM = 100 #Same dim as our W2V embedding

all_text = data_processed['text']
all_text = all_text.drop_duplicates (keep = False)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, )
tokenizer.fit_on_texts(all_text)

data_sequences = tokenizer.texts_to_sequences(data_processed['text'])
data_vec = pad_sequences(data_sequences, maxlen=MAX_SEQUENCE_LENGTH)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 6568 unique tokens.


In [7]:
items_map_text_vec = data_processed.set_index('item_id_int').text.to_dict()
data_sequences_vec = tokenizer.texts_to_sequences(items_map_text_vec.values())
data_seq_vec = pad_sequences(data_sequences_vec, maxlen=MAX_SEQUENCE_LENGTH)

#### We need to create a dictionary to map the items_id to theis corresponding text vector

In [8]:
for i in items_map_text_vec.keys():
    items_map_text_vec[i] = data_seq_vec[i]

## Models

In [44]:
model_candidates = load_model('candidate_generation_20190523')
model_rank = load_model('rank_model_20190523')

In [10]:
input_length = len(model_rank.input_shape)

### Generate Candidates 

In [11]:
test_pred = model_candidates.predict([data_processed['customer_id_int'], data_processed['item_id_int'], 
                             data_processed['brand_id'], data_processed['PRICE'],
                             data_vec, data_processed['item_age'], data_processed['score'],
                             data_processed['power_price'], data_processed['power_score'], data_processed['power_item_age'],
                             data_processed['sqrt_price'], data_processed['sqrt_score'], data_processed['sqrt_item_age']],
                             verbose = 1)



### Prepare Data for Rank Model

Now we need to feed our rank model predictions with our candidates generated with the candidate generator. We need to prepare the data so the rank model can make predictions with it.

In [12]:
#1min
ITEMS_RANKED = 20
candidates = np.zeros((data_processed.shape[0], ITEMS_RANKED))

for i, preds in enumerate(test_pred):
    candidates[i] = preds.argsort()[-ITEMS_RANKED:][::-1]
candidates = candidates.astype(int) 

In [13]:
gc.enable()
del (test_pred, data, model_candidates)
gc.collect()

109

In [14]:
print(data_processed.shape)
print(candidates.shape)

(14976, 18)
(14976, 20)


In [15]:
%%time
X = np.zeros((input_length, data_processed.shape[0]*ITEMS_RANKED), dtype = object)
X_vec = np.zeros((data_processed.shape[0]*ITEMS_RANKED, MAX_SEQUENCE_LENGTH), dtype = object)
for row in range(candidates.shape[0]):
    for i, item in enumerate(candidates[row]):
        X[0, row+i] = data_processed['customer_id_int'][row] #Customer_Id
        X[1, row+i] = item #Item_Id
        X[2, row+i] = items_map_brand[item] #Brand_Id
        X[3, row+i] = items_map_price[item] #Price
        X[4, row+i] = -999 #items_map_text_vec[item] #Text_Vec
        X[5, row+i] = items_map_item_age[item] #item_age
        X[6, row+i] = items_map_power_price[item] #Power_Price
        X[7, row+i] = items_map_power_item_age[item] #Power item age
        X[8, row+i] = items_map_sqrt_price[item]
        X[9, row+i] = items_map_sqrt_item_age[item]
        X_vec[row+i] = items_map_text_vec[item] #items_map_text_vec[item] #Text_Vec

Wall time: 5.87 s


- ITEMS_RANKED = 10: 5min
- ITEMS_RANKED = 100: 20min

### Rank Predictions

In [45]:
preds = model_rank.predict([X[0], X[1],  X[2],
                            X[3], X_vec, X[5],
                            X[6], X[7], X[8], X[9]], verbose = 1)



In [46]:
ranked_preds = preds.reshape(data_processed.shape[0], ITEMS_RANKED)

**Ranked preds** is the DataFrame when we have as rows the observations and in columns the ITEMS_RANKED candidates for each observation. Note that a customer can appear in several observations

### Pivot table to index Customers

In [47]:
print(data_processed.shape) #Dataframe de observaciones
print(candidates.shape) #Dataframe en el que tengo 10 candidatos por cada observacion
print(ranked_preds.shape) #Dataframe en el que tengo los 10 candidatos rankeados(prediccion cuanto mas alta mejor)

#¿Objetivo? Tener una tabla/matriz a nivel de cliente con todos los candidatos rankeados, MAX 100

(14976, 19)
(14976, 20)
(14976, 20)


In [48]:
#data_processed['aux'] = np.random.randint(0, MAX_ITEMS, data_processed.shape[0])
data_processed['zero'] = 0
table = pd.pivot_table(data_processed, values = ['zero'] , index = ['customer_id_int'],
                       columns = ['item_id_int'], fill_value=0)

print(table.shape)
#columnas = table.columns
table.columns = [col[1] for col in table.columns]
del table.index.name
table = table.astype(float)
table.head()

(377, 11735)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11725,11726,11727,11728,11729,11730,11731,11732,11733,11734
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


### Fill table with ranked predictions

Now we need to fill the table we just created with the ranked predictions of our candidates, if an item is not a candidate will take the default value that is 0

In [49]:
%%time
for obs in range(ranked_preds.shape[0]):
    for i, cand in enumerate(candidates[obs]):
        table.loc[idx_customers_map[obs]][cand] = ranked_preds[obs][i]

Wall time: 40.9 s


In [54]:
table.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,11725,11726,11727,11728,11729,11730,11731,11732,11733,11734
0,0.001197,0.001197,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.000356,0.0,0.0,0.001197,0.001197,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.00026,0.0,0.0,0.001197,0.001197,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.001197,0.0,0.0,0.0,0.0,0.000371,0.001197,0.001197,0.000245,0.001197,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.001197,0.001197,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


---

In [55]:
#for cust in range(table.shape[0]):
#    cust_candidates_idx = idx_customers_df['idx'][idx_customers_df['customer_id_int'] == cust].values
#    list_cust_candidates_items = []
#    for elem in cust_candidates_idx:
#        list_cust_candidates_items.append(candidates[elem])
#    list_cust_candidates_items = np.unique(np.concatenate(lista, axis = 0)) #array that gets the item_id candidates generated for all the interactions of the customer
    

## Recommend

In [60]:
#Arreglar que matchee los clientes, esta matcheando indices cuando digo data_pred[cliente]
def get_afines(data_pred, customer, items_unique, N = 5):
    try:
        print(f'==> Top {N} Recommended items to Customer {customer}: ')
        print(f'\nThe customer {customer} has bought this items: ')
        print('\n' + '=='*30 + '\n')
        interacted_items = data_processed[['text', 'score_original']][data_processed['customer_id_int'] == customer].groupby('text')\
                            .sum().reset_index().sort_values(['score_original'], ascending = False)
        print('\n'.join([str(i+1) + str(' - ') + str(x) for i, x in enumerate(interacted_items['text'].values[0:30])]))
        top = data_pred[customer].values.argsort()[-N:][::-1] #items positions
        print('\n====================== IDs DE PRODUCTOS RECOMENDADOS ==============')
        print([items_map[item] for item in top])
        print ("\n===================== PRODUCTOS RECOMENDADOS =====================")
        print('\n'.join([str(i+1) + str(' - ') + str(items_map_text[x]) for i, x in enumerate(top)]))
        print ("==================================================================")
    except:
        print(f'The customer {customer} does not exist')

In [69]:
CUSTOMER = 204 #128
get_afines(data_pred = table, customer = CUSTOMER, items_unique = items_unique, N = 10)
#data_processed[data_processed['customer_id_int'] == CUSTOMER].sort_values(['score_original'], ascending=False)

==> Top 10 Recommended items to Customer 204: 

The customer 204 has bought this items: 


1 - sofa tapizado plaza rodano corte_ingles
2 - sofa cama tapizado plaza rodano corte_ingles
3 - pantalon deportivo nina azul
4 - sofa tapizado plaza asiento relax electrico rodano corte_ingles
5 - sofa cama tapizado plaza xl rodano corte_ingles
6 - butaca tapizada moma corte_ingles
7 - sandalia plana mujer piel marron
8 - sandalia plana mujer piel negro
9 - sandalia plana mujer piel vacuna color_azul
10 - pantalon deportivo nina junior coral
11 - parka acolchada nino azul indigo
12 - parka acolchada nino pepe jeans negro
13 - plancha vapor gc suela steamglide
14 - plancha vapor gc454330 suela steamglide plus
15 - sandalia plana mujer piel amarillo
16 - sofa tapizado plaza chaise longue derecha azul selfoss room corte_ingles
17 - sofa tapizado plaza chaise longue derecha antracita selfoss room corte_ingles
18 - sofa tapizado plaza chaise longue derecha gris selfoss room corte_ingles
19 - sofa tap