In [1]:
import pandas as pd
import os, sys
import numpy as np
import seaborn as sns
import gc
import warnings 

from tensorflow.keras.models import Model, Sequential, load_model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding
from gensim.models import Word2Vec

In [2]:
path = os.path.join('../../Data/')
data = pd.read_csv(path + 'data_filtered_20190422.csv', sep = ';')
data_processed = pd.read_csv('data_final_20190524.csv')
data_processed['text'] = data_processed['text'].astype(str)

In [3]:
print(data_processed.shape)
data_processed.head(3)

(25749, 18)


Unnamed: 0,date,item_id,brand,PRICE,customer_id,text,item_age,customer_id_int,item_id_int,brand_id,score,score_original,power_price,power_score,power_item_age,sqrt_price,sqrt_score,sqrt_item_age
0,20190101,A26036172,tintoretto,0.003198,0,vestido mujer flor lazada,0.0,0,0,0,0.001485,4,1e-05,2e-06,0.0,0.056548,0.038538,0.0
1,20190101,A26870590,fórmula joven,0.00226,0,vestido laminado mujer formula joven escote pico,0.0,0,1,1,0.001485,4,5e-06,2e-06,0.0,0.047539,0.038538,0.0
2,20190103,MP_0659870_3014,gabrielle,0.008837,1,abrigo mujer negro avalorios,0.0,1,2,2,0.0,1,7.8e-05,0.0,0.0,0.094005,0.0,0.0


In [4]:
idx_customers_map = {i:val for i,val in enumerate(data_processed['customer_id_int'])}
idx_customers_df = pd.DataFrame({'idx': data_processed.index.values, 'customer_id_int': data_processed.customer_id_int})
idx_customers_df.head()

Unnamed: 0,idx,customer_id_int
0,0,0
1,1,0
2,2,1
3,3,1
4,4,1


In [5]:
items_unique = data_processed.item_id.unique() #20781
customers_unique = data_processed.customer_id_int.unique()
items_map = {i:val for i,val in enumerate(items_unique)}
items_map_inv = {val:i for i,val in enumerate(items_unique)}
items_map_text = data_processed.set_index('item_id_int').text.to_dict()

items_map_brand = data_processed.set_index('item_id_int').brand_id.to_dict()
items_map_price = data_processed.set_index('item_id_int').PRICE.to_dict()
items_map_power_price = data_processed.set_index('item_id_int').power_price.to_dict()
items_map_sqrt_price = data_processed.set_index('item_id_int').sqrt_price.to_dict()

items_map_item_age = data_processed.set_index('item_id_int').item_age.to_dict()
items_map_power_item_age = data_processed.set_index('item_id_int').power_item_age.to_dict()
items_map_sqrt_item_age = data_processed.set_index('item_id_int').sqrt_item_age.to_dict()

customers_unique = data_processed.customer_id.unique()
items_unique = data_processed.item_id.unique()
brand_unique = data_processed.brand.unique()

In [6]:
MAX_NB_WORDS = 30_000 #decided by cumsum wordcount plot (Script 01)
MAX_SEQUENCE_LENGTH = 24 #decided by max words in a product (Script 00)
EMBEDDING_DIM = 100 #Same dim as our W2V embedding

all_text = data_processed['text']
all_text = all_text.drop_duplicates (keep = False)

tokenizer = Tokenizer(num_words=MAX_NB_WORDS, )
tokenizer.fit_on_texts(all_text)

data_sequences = tokenizer.texts_to_sequences(data_processed['text'])
data_vec = pad_sequences(data_sequences, maxlen=MAX_SEQUENCE_LENGTH)

word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 8487 unique tokens.


In [7]:
items_map_text_vec = data_processed.set_index('item_id_int').text.to_dict()
data_sequences_vec = tokenizer.texts_to_sequences(items_map_text_vec.values())
data_seq_vec = pad_sequences(data_sequences_vec, maxlen=MAX_SEQUENCE_LENGTH)

#### We need to create a dictionary to map the items_id to their corresponding text vector

In [8]:
for i in items_map_text_vec.keys():
    items_map_text_vec[i] = data_seq_vec[i]

## Models

In [69]:
model_candidates = load_model('candidate_generation_20190525')
model_rank = load_model('rank_model_20190525')

In [70]:
input_length = len(model_rank.input_shape)

In [89]:
def recommend(customer, N = 5):
    try:
        _data = data_processed[data_processed['customer_id_int'] == customer]
        _data_vec = data_vec[_data.index]
        _candidates = model_candidates.predict([_data['customer_id_int'], _data['item_id_int'], 
                                 _data['brand_id'], _data['PRICE'],
                                 _data_vec, _data['item_age'], _data['score'],
                                 _data['power_price'], _data['power_score'], _data['power_item_age'],
                                 _data['sqrt_price'], _data['sqrt_score'], _data['sqrt_item_age']],
                                 verbose = 1)
        _candidates = pd.DataFrame(_candidates)
        _candidates['customer_id_int'] = customer
        _candidates = _candidates.groupby(['customer_id_int']).max()
        ITEMS_RANKED = 50
        candidates = np.zeros((ITEMS_RANKED))
        del _candidates.index.name

        candidates = _candidates.values.reshape(_candidates.shape[1]).argsort()[-ITEMS_RANKED:][::-1] 
        candidates = candidates.astype(int) 

        X = np.zeros((input_length, ITEMS_RANKED), dtype = object)
        X_vec = np.zeros((ITEMS_RANKED, MAX_SEQUENCE_LENGTH), dtype = object)
        
        for row in range(candidates.shape[0]):
            X[0, row] = data_processed['customer_id_int'][row] #Customer_Id
            X[1, row] = candidates[row] #Item_Id
            X[2, row] = items_map_brand[candidates[row]] #Brand_Id
            X[3, row] = items_map_price[candidates[row]] #Price
            #X[4, row] = -999 #items_map_text_vec[item] #Text_Vec
            X[4, row] = items_map_item_age[candidates[row]] #item_age
            X[5, row] = items_map_power_price[candidates[row]] #Power_Price
            X[6, row] = items_map_power_item_age[candidates[row]] #Power item age
            X[7, row] = items_map_sqrt_price[candidates[row]]
            X[8, row] = items_map_sqrt_item_age[candidates[row]]
            X_vec[row] = items_map_text_vec[candidates[row]] #items_map_text_vec[item] #Text_V
            
        preds = model_rank.predict([X[0], X[1],  X[2],
                                    X[3], X[4],
                                    X[5], X[6], X[7], X[8]], verbose = 1)

        ranked_preds = preds.reshape(ITEMS_RANKED)
    #########################################################
        #_pred = diversify(_pred.values.reshape(_pred.shape[1]), diversity = 0.01, plot = False)
        #print(_pred.)
        print('\n' + '=='*30 + '\n')
        print(f'==> Top {N} Recommended items to Customer {customer}: ')
        print(f'\nThe customer {customer} has bought this items: ')
        print('\n' + '=='*30 + '\n')
        interacted_items = data_processed[['text', 'score_original']][data_processed['customer_id_int'] == customer].groupby('text')\
                            .sum().reset_index().sort_values(['score_original'], ascending = False)
        print('\n'.join([str(i+1) + str(' - ') + str(x) for i, x in enumerate(interacted_items['text'].values[0:20])]))
        top = ranked_preds.argsort()[-N:][::-1] #items positions
        print('\n====================== IDs DE PRODUCTOS RECOMENDADOS ==============')
        print([items_map[item] for item in top])
        print ("\n===================== PRODUCTOS RECOMENDADOS =====================")
        print('\n'.join([str(i+1) + str(' - ') + str(items_map_text[x]) for i, x in enumerate(top)]))
        print ("==================================================================")
    except:
        print(f'\nThe customer {customer} does not exist')

In [90]:
CUSTOMER = 128
recommend(customer = CUSTOMER, N = 20)



==> Top 20 Recommended items to Customer 128: 

The customer 128 has bought this items: 


1 - chaqueta punto hombre cuello alto
2 - harry_potter caliz fuego dvd
3 - harry_potter camara secreta dvd
4 - harry_potter coleccion hogwarts dvd bluray
5 - harry_potter orden fenix dvd
6 - harry_potter piedra filosofal dvd
7 - harry_potter prisionero azkaban dvd
8 - harry_potter reliquia muerte parte 3d bluray copia digital
9 - jersey hombre cuello redondo
10 - jersey hombre gris oscuro cuello caja
11 - pack harry_potter coleccion completa bluray

['A25199985', 'A26911452', 'A28102000', 'A25199959', 'A28101558', 'A26377572', 'A26796604', 'A28101607', 'A26036172', 'A24190567', 'A28689395', 'A26704190', 'A27793006', 'A9643823', 'A26075711', 'A25200048', 'A28689342', 'A27566346', 'A28013828', 'A29539372']

1 - pantalon vaquero pitillo nina azul
2 - pantalon deportivo nino gris
3 - vestido encaje mujer lauren color beige
4 - pantalon vaquero pitillo nina azul
5 - vestido largo mujer lauren efecto

---

### Generate Candidates 

In [None]:
test_pred = model_candidates.predict([data_processed['customer_id_int'], data_processed['item_id_int'], 
                             data_processed['brand_id'], data_processed['PRICE'],
                             data_vec, data_processed['item_age'], data_processed['score'],
                             data_processed['power_price'], data_processed['power_score'], data_processed['power_item_age'],
                             data_processed['sqrt_price'], data_processed['sqrt_score'], data_processed['sqrt_item_age']],
                             verbose = 1)

### Prepare Data for Rank Model

Now we need to feed our rank model predictions with our candidates generated with the candidate generator. We need to prepare the data so the rank model can make predictions with it.

In [None]:
#1min
ITEMS_RANKED = 20
candidates = np.zeros((data_processed.shape[0], ITEMS_RANKED))

for i, preds in enumerate(test_pred):
    candidates[i] = preds.argsort()[-ITEMS_RANKED:][::-1]
candidates = candidates.astype(int) 

In [None]:
gc.enable()
del (test_pred, data, model_candidates)
gc.collect()

In [None]:
print(data_processed.shape)
print(candidates.shape)

In [None]:
%%time
X = np.zeros((input_length, data_processed.shape[0]*ITEMS_RANKED), dtype = object)
X_vec = np.zeros((data_processed.shape[0]*ITEMS_RANKED, MAX_SEQUENCE_LENGTH), dtype = object)
for row in range(candidates.shape[0]):
    for i, item in enumerate(candidates[row]):
        X[0, row+i] = data_processed['customer_id_int'][row] #Customer_Id
        X[1, row+i] = item #Item_Id
        X[2, row+i] = items_map_brand[item] #Brand_Id
        X[3, row+i] = items_map_price[item] #Price
        X[4, row+i] = -999 #items_map_text_vec[item] #Text_Vec
        X[5, row+i] = items_map_item_age[item] #item_age
        X[6, row+i] = items_map_power_price[item] #Power_Price
        X[7, row+i] = items_map_power_item_age[item] #Power item age
        X[8, row+i] = items_map_sqrt_price[item]
        X[9, row+i] = items_map_sqrt_item_age[item]
        X_vec[row+i] = items_map_text_vec[item] #items_map_text_vec[item] #Text_Vec

- ITEMS_RANKED = 10: 5min
- ITEMS_RANKED = 100: 20min

### Rank Predictions

In [None]:
preds = model_rank.predict([X[0], X[1],  X[2],
                            X[3], X_vec, X[5],
                            X[6], X[7], X[8], X[9]], verbose = 1)

In [None]:
ranked_preds = preds.reshape(data_processed.shape[0], ITEMS_RANKED)

**Ranked preds** is the DataFrame when we have as rows the observations and in columns the ITEMS_RANKED candidates for each observation. Note that a customer can appear in several observations

### Pivot table to index Customers

In [None]:
print(data_processed.shape) #Dataframe de observaciones
print(candidates.shape) #Dataframe en el que tengo 10 candidatos por cada observacion
print(ranked_preds.shape) #Dataframe en el que tengo los 10 candidatos rankeados(prediccion cuanto mas alta mejor)

#¿Objetivo? Tener una tabla/matriz a nivel de cliente con todos los candidatos rankeados, MAX 100

In [None]:
#data_processed['aux'] = np.random.randint(0, MAX_ITEMS, data_processed.shape[0])
data_processed['zero'] = 0
table = pd.pivot_table(data_processed, values = ['zero'] , index = ['customer_id_int'],
                       columns = ['item_id_int'], fill_value=0)

print(table.shape)
#columnas = table.columns
table.columns = [col[1] for col in table.columns]
del table.index.name
table = table.astype(float)
table.head()

### Fill table with ranked predictions

Now we need to fill the table we just created with the ranked predictions of our candidates, if an item is not a candidate will take the default value that is 0

In [None]:
%%time
for obs in range(ranked_preds.shape[0]):
    for i, cand in enumerate(candidates[obs]):
        table.loc[idx_customers_map[obs]][cand] = ranked_preds[obs][i]

In [None]:
table.head()

---

In [None]:
#for cust in range(table.shape[0]):
#    cust_candidates_idx = idx_customers_df['idx'][idx_customers_df['customer_id_int'] == cust].values
#    list_cust_candidates_items = []
#    for elem in cust_candidates_idx:
#        list_cust_candidates_items.append(candidates[elem])
#    list_cust_candidates_items = np.unique(np.concatenate(lista, axis = 0)) #array that gets the item_id candidates generated for all the interactions of the customer
    

## Recommend

In [None]:
#Arreglar que matchee los clientes, esta matcheando indices cuando digo data_pred[cliente]
def get_afines(data_pred, customer, items_unique, N = 5):
    try:
        print(f'==> Top {N} Recommended items to Customer {customer}: ')
        print(f'\nThe customer {customer} has bought this items: ')
        print('\n' + '=='*30 + '\n')
        interacted_items = data_processed[['text', 'score_original']][data_processed['customer_id_int'] == customer].groupby('text')\
                            .sum().reset_index().sort_values(['score_original'], ascending = False)
        print('\n'.join([str(i+1) + str(' - ') + str(x) for i, x in enumerate(interacted_items['text'].values[0:30])]))
        top = data_pred[customer].values.argsort()[-N:][::-1] #items positions
        print('\n====================== IDs DE PRODUCTOS RECOMENDADOS ==============')
        print([items_map[item] for item in top])
        print ("\n===================== PRODUCTOS RECOMENDADOS =====================")
        print('\n'.join([str(i+1) + str(' - ') + str(items_map_text[x]) for i, x in enumerate(top)]))
        print ("==================================================================")
    except:
        print(f'The customer {customer} does not exist')

In [None]:
CUSTOMER = 204 #128
get_afines(data_pred = table, customer = CUSTOMER, items_unique = items_unique, N = 10)
#data_processed[data_processed['customer_id_int'] == CUSTOMER].sort_values(['score_original'], ascending=False)