In [1]:
import pandas as pd
import os, sys
import numpy as np
import seaborn as sns
import gc
import warnings 

from keras.models import Model, Sequential, load_model
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.layers import Embedding
from gensim.models import Word2Vec

Using TensorFlow backend.


In [2]:
# Load the filtered raw export and the processed feature table.
# NOTE(review): `path` is built but never used below -- both CSVs are read
# relative to the working directory; confirm which location is intended.
path = os.path.join('../../Data/')
data = pd.read_csv('data_filtered_20190422.csv', sep=';')
# Cast the product description to str as part of the load so later text
# processing always receives strings.
data_processed = pd.read_csv('data_final_20190523.csv').astype({'text': str})

In [3]:
# Sanity check: (rows, columns) of the processed table.
data_processed.shape

(14976, 18)

In [4]:
# Peek at the first three rows to confirm the columns that were loaded.
data_processed.head(3)

Unnamed: 0,date,item_id,brand,PRICE,customer_id,text,item_age,customer_id_int,item_id_int,brand_id,score,score_original,power_price,power_score,power_item_age,sqrt_price,sqrt_score,sqrt_item_age
0,20190101,A26036172,tintoretto,0.003198,0,vestido mujer flor lazada,0.0,0,0,0,0.00208,4,1e-05,4e-06,0.0,0.056548,0.045612,0.0
1,20190101,A26870590,fórmula joven,0.00226,0,vestido laminado mujer formula joven escote pico,0.0,0,1,1,0.00208,4,5e-06,4e-06,0.0,0.047539,0.045612,0.0
2,20190103,MP_0659870_3014,gabrielle,0.008837,1,abrigo mujer negro avalorios,0.0,1,2,2,0.0,1,7.8e-05,0.0,0.0,0.094005,0.0,0.0


In [5]:
# Row position -> integer customer id, once as a dict and once as a frame
# that pairs the frame's index values with the customer id of each row.
idx_customers_map = dict(enumerate(data_processed['customer_id_int']))
idx_customers_df = pd.DataFrame({
    'idx': data_processed.index.values,
    'customer_id_int': data_processed['customer_id_int'],
})

In [6]:
# Preview the row-index / customer-id lookup frame.
idx_customers_df.head()

Unnamed: 0,idx,customer_id_int
0,0,0
1,1,0
2,2,1
3,3,1
4,4,1


In [7]:
# Unique raw identifiers, in order of first appearance in data_processed.
# (The earlier duplicate/overwritten assignments of items_unique and
# customers_unique were dropped; the final values are unchanged.)
items_unique = data_processed.item_id.unique()
customers_unique = data_processed.customer_id.unique()
brand_unique = data_processed.brand.unique()

# Position in items_unique <-> raw item id.
items_map = dict(enumerate(items_unique))
items_map_inv = {val: i for i, val in items_map.items()}

# Index the frame by the integer item id once and reuse it for every
# per-item lookup table below (previously re-indexed for each map).
# NOTE(review): an item that appears in several rows contributes several
# entries with the same key; presumably the last row's value is the one kept
# by to_dict() -- confirm this is acceptable for price / item_age.
_items_indexed = data_processed.set_index('item_id_int')

items_map_text = _items_indexed['text'].to_dict()

items_map_brand = _items_indexed['brand_id'].to_dict()
items_map_price = _items_indexed['PRICE'].to_dict()
items_map_power_price = _items_indexed['power_price'].to_dict()
items_map_sqrt_price = _items_indexed['sqrt_price'].to_dict()

items_map_item_age = _items_indexed['item_age'].to_dict()
items_map_power_item_age = _items_indexed['power_item_age'].to_dict()
items_map_sqrt_item_age = _items_indexed['sqrt_item_age'].to_dict()

In [8]:
# Text-vectorisation settings.
MAX_NB_WORDS = 30_000  # decided by cumsum wordcount plot (Script 01)
MAX_SEQUENCE_LENGTH = 24  # decided by max words in a product (Script 00)
EMBEDDING_DIM = 100  # Same dim as our W2V embedding

# Corpus used to fit the tokenizer.
# NOTE(review): keep=False removes EVERY description that occurs more than
# once (not just the repeats), so words that only appear in repeated
# descriptions never enter the vocabulary. Left as-is because the token ids
# presumably have to match the ones the loaded models were trained with --
# confirm against the training notebook before changing.
all_text = data_processed['text'].drop_duplicates(keep=False)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS)
tokenizer.fit_on_texts(all_text)

# Encode the description of every row and pad to a fixed length.
data_sequences = tokenizer.texts_to_sequences(data_processed['text'])
data_vec = pad_sequences(data_sequences, maxlen=MAX_SEQUENCE_LENGTH)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 6568 unique tokens.


In [9]:
# Integer item id -> description text; the descriptions are then encoded and
# padded in the dict's iteration order (one row of data_seq_vec per key).
items_map_text_vec = data_processed.set_index('item_id_int')['text'].to_dict()
data_sequences_vec = tokenizer.texts_to_sequences(list(items_map_text_vec.values()))
data_seq_vec = pad_sequences(data_sequences_vec, maxlen=MAX_SEQUENCE_LENGTH)

In [10]:
# Replace each item's raw text with its padded token sequence.
# data_seq_vec was built from items_map_text_vec.values(), so its rows follow
# the dict's iteration order. Pair keys and rows by position instead of using
# the item id itself as the row number: the two only coincide when the ids
# happen to be 0..n-1 in insertion order.
for pos, item_key in enumerate(list(items_map_text_vec)):
    items_map_text_vec[item_key] = data_seq_vec[pos]

## Models

In [11]:
# Load the two saved Keras models (candidate generator first, then ranker).
model_candidates, model_rank = (
    load_model(saved_model)
    for saved_model in ('candidate_generation_20190523', 'rank_model_20190523')
)

In [12]:
# Number of entries in the rank model's input_shape; used below as the number
# of feature rows allocated in X.
# NOTE(review): presumably one entry per model input (multi-input model) -- confirm.
input_length = len(model_rank.input_shape)

### Generate Candidates 

In [13]:
# Score every row of data_processed with the candidate generator.
# The inputs are passed positionally (scalar features, then the padded text,
# then the remaining scalar features); presumably this order has to match the
# order of the model's inputs -- confirm against the training notebook.
_features_before_text = ['customer_id_int', 'item_id_int', 'brand_id', 'PRICE']
_features_after_text = ['item_age', 'score',
                        'power_price', 'power_score', 'power_item_age',
                        'sqrt_price', 'sqrt_score', 'sqrt_item_age']
candidate_inputs = (
    [data_processed[name] for name in _features_before_text]
    + [data_vec]
    + [data_processed[name] for name in _features_after_text]
)
test_pred = model_candidates.predict(candidate_inputs, verbose=1)



### Prepare Data for Rank Model

Next, we feed the candidates produced by the candidate-generation model into the rank model. To do that, we first have to build, for every candidate, the feature inputs the rank model expects.

In [14]:
# For every row of test_pred keep the positions of the ITEMS_RANKED highest
# scores, best first (sort ascending, take the tail, reverse it).
ITEMS_RANKED = 10
_ascending_order = np.argsort(test_pred, axis=1)
candidates = _ascending_order[:, -ITEMS_RANKED:][:, ::-1].astype(int)

In [15]:
# Drop the large objects that are no longer referenced below and ask the
# collector to reclaim them; the last line displays gc.collect()'s return value.
gc.enable()
del test_pred, data, model_candidates
gc.collect()

109

In [16]:
# Both shapes should report the same number of rows (one candidate row per observation).
print(data_processed.shape, candidates.shape, sep='\n')

(14976, 18)
(14976, 10)


In [17]:
%%time
# Build the rank-model inputs: one column of X (and one row of X_vec) per
# (observation, candidate) pair.
#
# Pairs are laid out observation-major: pair (row, i) lives at flat position
# row * ITEMS_RANKED + i. This is the layout the later
# preds.reshape(n_observations, ITEMS_RANKED) relies on. The previous index
# `row + i` made consecutive observations overwrite each other and left most
# of X and X_vec at zero.
n_pairs = data_processed.shape[0] * ITEMS_RANKED
X = np.zeros((input_length, n_pairs), dtype = object)
X_vec = np.zeros((n_pairs, MAX_SEQUENCE_LENGTH), dtype = object)

# Positional view of the customer ids, aligned with the rows of `candidates`.
customer_ids = data_processed['customer_id_int'].values

for row in range(candidates.shape[0]):
    for i, item in enumerate(candidates[row]):
        pos = row * ITEMS_RANKED + i
        X[0, pos] = customer_ids[row] #Customer_Id
        X[1, pos] = item #Item_Id
        X[2, pos] = items_map_brand[item] #Brand_Id
        X[3, pos] = items_map_price[item] #Price
        X[4, pos] = -999 #Placeholder: the text input is carried by X_vec instead
        X[5, pos] = items_map_item_age[item] #item_age
        X[6, pos] = items_map_power_price[item] #Power_Price
        X[7, pos] = items_map_power_item_age[item] #Power item age
        X[8, pos] = items_map_sqrt_price[item] #Sqrt price
        X[9, pos] = items_map_sqrt_item_age[item] #Sqrt item age
        X_vec[pos] = items_map_text_vec[item] #Text_Vec

Wall time: 2.89 s


In [20]:
# Feed the ten feature rows of X to the ranker, substituting the padded text
# matrix X_vec for row 4 (X[4] only holds a placeholder).
TEXT_INPUT_POSITION = 4
rank_inputs = [X_vec if k == TEXT_INPUT_POSITION else X[k] for k in range(10)]
preds = model_rank.predict(rank_inputs, verbose = 1)



In [21]:
# One row per observation, one column per candidate. This assumes the flat
# predictions are ordered observation-major (row * ITEMS_RANKED + candidate).
ranked_preds = np.reshape(preds, (data_processed.shape[0], ITEMS_RANKED))

**Ranked preds** is the array whose rows are the observations and whose columns hold the rank-model scores of the ITEMS_RANKED candidates for each observation. Note that a customer can appear in several observations.

### Pivot table to index Customers

In [40]:
# Assign every observation to one of ITEMS_RANKED buckets; the bucket is used
# as the column key of the pivot table built in the next cell.
# A fixed seed keeps the assignment (and therefore the pivot table)
# reproducible across kernel restarts.
# NOTE(review): the buckets are random, so the pivot columns carry no item
# meaning -- confirm this is the intended way to get one row per customer.
AUX_SEED = 0
_aux_rng = np.random.RandomState(AUX_SEED)
data_processed['aux'] = _aux_rng.randint(0, ITEMS_RANKED, data_processed.shape[0])

In [41]:
# One row per customer, one column per aux bucket; each cell is the sum of
# score_original over the customer's observations in that bucket (0 if none).
table = data_processed.pivot_table(
    index=['customer_id_int'],
    columns=['aux'],
    values='score_original',
    aggfunc=np.sum,
    fill_value=0,
)
columnas = table.columns

print(table.shape)
table.head()

(377, 10)


aux,0,1,2,3,4,5,6,7,8,9
customer_id_int,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,20,16,32,28,36,20,20,32,36,20
1,9,7,1,6,6,5,5,8,4,3
2,22,44,34,19,38,11,45,17,16,21
3,12,12,45,58,10,40,16,50,39,20
4,1,1,1,1,0,2,2,1,1,1


### Rank Candidates

In [48]:
# Compare the shapes of the objects used for the final ranking.
# NOTE(review): data_preds is not defined in any cell shown above; it
# presumably comes from a deleted cell (its printed shape matches `table`) -- confirm.
print(ranked_preds.shape)
print(data_processed.shape) #With this matrix we know which product each prediction corresponds to
print(candidates.shape)
print(data_preds.shape)

(14976, 10)
(14976, 19)
(14976, 10)
(377, 10)


In [None]:
#Customer matching: a DataFrame data_pred is now looked up by customer label
#(row), not by column; array-like inputs are still indexed positionally.
def get_afines(data_pred, customer, items_unique, N = 5):
    """Print the N best-scoring products for one customer.

    Parameters
    ----------
    data_pred : pandas.DataFrame or array-like
        Per-customer scores. A DataFrame is looked up by row label
        (``data_pred.loc[customer]``); anything else by ``data_pred[customer]``.
    customer : int
        Customer to look up.
    items_unique : unused
        Kept for backward compatibility with existing callers.
    N : int, default 5
        Number of top-scoring positions to show.

    Returns
    -------
    None
        Results are printed. If the customer cannot be found in ``data_pred``
        a message is printed instead.

    Notes
    -----
    Reads the notebook-level ``candidates`` and ``items_map_text``.
    Only a failed customer lookup is reported as "does not exist"; any other
    error now propagates instead of being hidden by a bare ``except``.
    """
    try:
        if isinstance(data_pred, pd.DataFrame):
            scores = data_pred.loc[customer]
        else:
            scores = data_pred[customer]
    except (IndexError, KeyError):
        print(f'The customer {customer} does not exist')
        return

    top = np.asarray(scores).argsort()[-N:][::-1] #items positions
    print(top)
    print ("===================== PRODUCTOS MAS AFINES =====================")
    # NOTE(review): in this notebook `candidates` has one row per observation,
    # while `customer` is a customer id, so candidates[customer, x] reads the
    # observation whose row number equals the customer id -- confirm/fix the
    # customer -> observation mapping.
    print([items_map_text[candidates[customer, x]] for x in top])
    print ("=============================================================")

In [None]:
# Show the top products for one customer, followed by that customer's rows.
# The keyword must be `customer` (the parameter name of get_afines); the
# previous `cliente=` raised a TypeError.
# NOTE(review): data_preds is not defined in any cell shown above -- confirm its origin.
CUSTOMER = 0 #128
get_afines(data_pred = data_preds, customer = CUSTOMER, items_unique = items_unique, N = 5)
data_processed[data_processed['customer_id_int'] == CUSTOMER]