In [1]:
import pandas as pd
from pandas.api.types import CategoricalDtype
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import lightgbm as lgb

import pickle
from tqdm import tqdm
import gc
from pathlib import Path

In [2]:
import warnings
import sys
from IPython.core.interactiveshell import InteractiveShell

# Silence every Python warning for the rest of the session.
warnings.filterwarnings("ignore")
# Make the project's `src` package importable.
# NOTE(review): absolute, machine-specific path — adjust before running on another machine.
sys.path.append("/Users/tuanthanh/Documents/KLTN/SourceClone/H-M-Fashion-RecSys/") # path to the `src` folder
# Display the value of every top-level expression in a cell, not only the last one
# (the metric cells further down rely on this to show two numbers each).
InteractiveShell.ast_node_interactivity = "all"
# Register tqdm's `progress_apply` on pandas objects.
tqdm.pandas()

In [3]:
from src.data import DataHelper
from src.data.metrics import map_at_k, hr_at_k, recall_at_k, ndcg_at_k, mrr_at_k

from src.utils import (
    calc_valid_date,
    merge_week_data,
    reduce_mem_usage,
    calc_embd_similarity,
)

In [4]:
# Registers tqdm's `progress_apply` on pandas objects.
# NOTE(review): the setup cell above already makes this call, so this cell looks redundant.
tqdm.pandas()

In [5]:
# Root folders for input data and saved models.
# NOTE(review): absolute, machine-specific paths — point these at your own checkout before running.
data_dir = Path("/Users/tuanthanh/Documents/KLTN/SourceClone/H-M-Fashion-RecSys/data/HM")
model_dir = Path("/Users/tuanthanh/Documents/KLTN/SourceClone/H-M-Fashion-RecSys/models/models_HM")

Prepare data: encode IDs and preprocess

In [6]:
# Project data helper bound to data_dir (implementation lives in src.data, not shown here).
dh = DataHelper(data_dir)

In [7]:
# data = dh.preprocess_data(save=True, name="encoded_full") # * run only once, processed data will be saved

In [8]:
# Load the dataset stored under the name "encoded_full"; presumably the output of the
# commented-out `preprocess_data(save=True, name="encoded_full")` call — TODO confirm.
# NOTE(review): `data` is not referenced again in the visible cells.
data = dh.load_data(name="encoded_full")

### Blend

In [9]:
# "Large" validation predictions (per the file names): LightGBM ranker, LightGBM binary, and NN.
pred1_lgb_rank = pd.read_parquet(data_dir / "processed" / "large_rank_valid.pqt")
pred1_lgb_binary = pd.read_parquet(data_dir / "processed" / "large_binary_valid.pqt")
# The NN file names its item column `article_id`; align it with the LightGBM frames.
pred1_nn = (
    pd.read_parquet(data_dir / "external" / "large_nn_valid.pqt")
    .rename(columns={'article_id': 'prediction'})
)

In [10]:
# "Small" validation predictions (per the file names): LightGBM ranker, LightGBM binary, and NN.
pred2_lgb_rank = pd.read_parquet(data_dir / "processed" / "small_rank_valid.pqt")
pred2_lgb_binary = pd.read_parquet(data_dir / "processed" / "small_binary_valid.pqt")
# The NN file names its item column `article_id`; align it with the LightGBM frames.
pred2_nn = (
    pd.read_parquet(data_dir / "external" / "small_nn_valid.pqt")
    .rename(columns={'article_id': 'prediction'})
)

In [11]:
# Blend the LightGBM binary score with the NN score of the same (customer, item) pair.
# Both frames carry a `prob` column, so the merge suffixes them: prob_x = LightGBM, prob_y = NN.
# With a left merge, pairs the NN did not score get prob_y = NaN. `fill_value=0` keeps their
# LightGBM score instead of turning the blended score into NaN, which would push those
# candidates to the very end of the descending sort that follows.
pred1_lgb_binary = pred1_lgb_binary.merge(pred1_nn, on=['customer_id','prediction'], how='left')
pred1_lgb_binary['prob'] = pred1_lgb_binary['prob_x'].add(pred1_lgb_binary['prob_y'], fill_value=0)

pred2_lgb_binary = pred2_lgb_binary.merge(pred2_nn, on=['customer_id','prediction'], how='left')
pred2_lgb_binary['prob'] = pred2_lgb_binary['prob_x'].add(pred2_lgb_binary['prob_y'], fill_value=0)

In [12]:
# Order every candidate frame globally by descending score with a fresh 0..n-1 index,
# so that the per-customer lists built next come out best-first.
pred1_lgb_rank = pred1_lgb_rank.sort_values('prob', ascending=False, ignore_index=True)
pred1_lgb_binary = pred1_lgb_binary.sort_values('prob', ascending=False, ignore_index=True)
pred2_lgb_rank = pred2_lgb_rank.sort_values('prob', ascending=False, ignore_index=True)
pred2_lgb_binary = pred2_lgb_binary.sort_values('prob', ascending=False, ignore_index=True)

In [13]:
# Collapse each frame to one row per customer holding the list of predicted items,
# in the row order established by the preceding sort.
pred1_lgb_rank = (
    pred1_lgb_rank
    .groupby('customer_id')['prediction']
    .apply(list)
    .reset_index()
)
pred1_lgb_binary = (
    pred1_lgb_binary
    .groupby('customer_id')['prediction']
    .apply(list)
    .reset_index()
)
pred2_lgb_rank = (
    pred2_lgb_rank
    .groupby('customer_id')['prediction']
    .apply(list)
    .reset_index()
)
pred2_lgb_binary = (
    pred2_lgb_binary
    .groupby('customer_id')['prediction']
    .apply(list)
    .reset_index()
)

In [14]:
# Give each model's list column a distinct name so the four frames can be merged side by side.
pred1_lgb_rank = pred1_lgb_rank.rename(columns={'prediction': 'large_rank'})
pred1_lgb_binary = pred1_lgb_binary.rename(columns={'prediction': 'large_binary'})
pred2_lgb_rank = pred2_lgb_rank.rename(columns={'prediction': 'small_rank'})
pred2_lgb_binary = pred2_lgb_binary.rename(columns={'prediction': 'small_binary'})

In [15]:
# One row per customer with all four ranked lists. Inner joins (the pandas default)
# keep only customers present in every frame.
pred = (
    pred1_lgb_rank
    .merge(pred1_lgb_binary, on=['customer_id'], how='inner')
    .merge(pred2_lgb_rank, on=['customer_id'], how='inner')
    .merge(pred2_lgb_binary, on=['customer_id'], how='inner')
)

In [16]:
def cust_blend(dt, W = [1,1,1,1]):
    """Blend four ranked candidate lists into a single top-12 list.

    Every item earns ``W[m] / rank`` from each list ``m`` that contains it
    (``rank`` is its 1-based position in that list); contributions are summed
    across lists.

    Args:
        dt: Mapping-like row with the keys 'large_rank', 'large_binary',
            'small_rank' and 'small_binary', each an iterable of items in
            ranked order.
        W: One weight per list, in the key order given above.

    Returns:
        Up to 12 items, highest blended score first. Items with equal scores
        keep the order in which they were first encountered.
    """
    ranked_lists = [
        dt[column]
        for column in ('large_rank', 'large_binary', 'small_rank', 'small_binary')
    ]

    # Accumulate the reciprocal-rank score of every item over all lists.
    scores = {}
    for model_idx, ranking in enumerate(ranked_lists):
        for position, item in enumerate(ranking, start=1):
            scores[item] = scores.get(item, 0) + W[model_idx] / position

    # Stable sort on the negated score: best first, ties in first-seen order.
    ordered = sorted(scores, key=lambda item: -scores[item])
    return ordered[:12]

In [17]:
# Blend the four lists of every customer row; the binary-based lists get weight 1.3,
# the ranker-based lists weight 1.0.
pred['prediction'] = pred.progress_apply(
    cust_blend,
    axis=1,
    W=[1.0, 1.3, 1.0, 1.3],
)

100%|██████████| 68984/68984 [00:02<00:00, 29344.78it/s]


In [18]:
# Ground-truth file for the evaluated week, joined with the blended predictions.
# The left join keeps every labelled customer; those without a row in `pred`
# end up with NaN in the prediction columns.
label = pd.read_parquet(data_dir / "processed" / "LargeRecall" / "week1_label.pqt")
label = label.merge(pred, on="customer_id", how="left")

In [19]:
# MAP@k at k=5 and k=20; both values are displayed because the setup cell sets
# ast_node_interactivity to "all".
# NOTE(review): cust_blend truncates every blended list to 12 items, so k=20 never sees
# more than 12 predictions — confirm how map_at_k treats lists shorter than k.
map_at_k(label["article_id"], label["prediction"], k=5)
map_at_k(label["article_id"], label["prediction"], k=20)


0.028300624460422388

0.02943935547687692

In [20]:
# Recall@k at k=5 and k=20.
# NOTE(review): blended lists hold at most 12 items, so k=20 is effectively capped at 12 —
# confirm against recall_at_k's implementation.
recall_at_k(label["article_id"], label["prediction"], k=5)
recall_at_k(label["article_id"], label["prediction"], k=20)

0.04668818648196349

0.06750526919717242

In [21]:
# Hit rate@k at k=5 and k=20.
# NOTE(review): blended lists hold at most 12 items, so k=20 is effectively capped at 12 —
# confirm against hr_at_k's implementation.
hr_at_k(label["article_id"], label["prediction"], k=5)
hr_at_k(label["article_id"], label["prediction"], k=20)

0.09044126174185319

0.1349008465731184

In [22]:
# NDCG@k at k=5 and k=20.
# NOTE(review): blended lists hold at most 12 items, so k=20 is effectively capped at 12 —
# confirm against ndcg_at_k's implementation.
ndcg_at_k(label["article_id"], label["prediction"], k=5)
ndcg_at_k(label["article_id"], label["prediction"], k=20)

0.0387206876164103

0.044658269835736654

In [23]:
# MRR@k at k=5 and k=20.
# NOTE(review): blended lists hold at most 12 items, so k=20 is effectively capped at 12 —
# confirm against mrr_at_k's implementation.
mrr_at_k(label["article_id"], label["prediction"], k=5)
mrr_at_k(label["article_id"], label["prediction"], k=20)

0.05480159650546987

0.06017284797327996

### Test

In [24]:
# "Large" test predictions (per the file names): LightGBM ranker, LightGBM binary, and NN.
# NOTE(review): the validation NN files are read from "external", these from "processed" — confirm.
pred1_lgb_rank = pd.read_parquet(data_dir / "processed" / "large_rank_test.pqt")
pred1_lgb_binary = pd.read_parquet(data_dir / "processed" / "large_binary_test.pqt")
# Alternative inputs left by the author (the "small" files loaded into the pred1_* names):
# pred1_lgb_rank = pd.read_parquet(data_dir/"processed"/"small_rank_test.pqt")
# pred1_lgb_binary = pd.read_parquet(data_dir/"processed"/"small_binary_test.pqt")
# pred1_nn = pd.read_parquet(data_dir/"processed"/"small_nn_test.pqt")
# The NN file names its item column `article_id`; align it with the LightGBM frames.
pred1_nn = (
    pd.read_parquet(data_dir / "processed" / "large_nn_test.pqt")
    .rename(columns={'article_id': 'prediction'})
)

In [25]:
# Sort each frame by descending score, then keep one row per (customer, item) pair.
# drop_duplicates keeps the first occurrence, i.e. the highest-scored row after the sort.
pred1_lgb_rank = (
    pred1_lgb_rank
    .sort_values(by='prob', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(['customer_id', 'prediction'])
)

pred1_lgb_binary = (
    pred1_lgb_binary
    .sort_values(by='prob', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(['customer_id', 'prediction'])
)

pred1_nn = (
    pred1_nn
    .sort_values(by='prob', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(['customer_id', 'prediction'])
)

KeyboardInterrupt: 

In [None]:
# Bring both frames into the same (customer, item) order with a fresh 0..n-1 index so the
# scores can be added row by row.
pred1_lgb_binary = pred1_lgb_binary.sort_values(by=['customer_id','prediction']).reset_index(drop=True)
pred1_nn = pred1_nn.sort_values(by=['customer_id','prediction']).reset_index(drop=True)

# The row-wise addition below is only correct when both frames contain exactly the same
# (customer, item) pairs. Verify that explicitly: a mismatch would otherwise add scores of
# unrelated pairs without any error.
if len(pred1_lgb_binary) != len(pred1_nn) or not all(
    (pred1_lgb_binary[key].to_numpy() == pred1_nn[key].to_numpy()).all()
    for key in ('customer_id', 'prediction')
):
    raise ValueError(
        "pred1_lgb_binary and pred1_nn do not contain the same (customer_id, prediction) "
        "rows; row-wise score addition would be misaligned"
    )

# * not using merge here to avoid memory error
pred1_lgb_binary['prob2'] = pred1_lgb_binary['prob'] + pred1_nn['prob']
# Re-order by the blended score so the per-customer lists built next come out best-first.
pred1_lgb_binary = pred1_lgb_binary.sort_values(by='prob2', ascending=False).reset_index(drop=True)

In [None]:
# One row per customer with the list of predicted items, in the current row order.
pred1_lgb_rank = (
    pred1_lgb_rank
    .groupby('customer_id')['prediction']
    .progress_apply(list)
    .reset_index()
)
pred1_lgb_binary = (
    pred1_lgb_binary
    .groupby('customer_id')['prediction']
    .progress_apply(list)
    .reset_index()
)

100%|██████████| 1371980/1371980 [00:38<00:00, 35712.21it/s]
100%|██████████| 1371980/1371980 [00:35<00:00, 38128.53it/s]


In [None]:
# Persist the per-customer lists of the "large" models.
# NOTE(review): these are written directly under data_dir, not under data_dir/"processed"
# where the inputs were read from — confirm the intended location.
pred1_lgb_rank.to_parquet(data_dir/"large_rank_test_new.pqt")
pred1_lgb_binary.to_parquet(data_dir/"large_binary_test_new.pqt")

In [None]:
# * ------------------------------------------------------------

In [None]:
# "Small" test predictions (per the file names): LightGBM ranker, LightGBM binary, and NN.
pred2_lgb_rank = pd.read_parquet(data_dir / "processed" / "small_rank_test.pqt")
pred2_lgb_binary = pd.read_parquet(data_dir / "processed" / "small_binary_test.pqt")
# The NN file names its item column `article_id`; align it with the LightGBM frames.
pred2_nn = (
    pd.read_parquet(data_dir / "processed" / "small_nn_test.pqt")
    .rename(columns={'article_id': 'prediction'})
)

In [None]:
# Sort each frame by descending score, then keep one row per (customer, item) pair.
# drop_duplicates keeps the first occurrence, i.e. the highest-scored row after the sort.
pred2_lgb_rank = (
    pred2_lgb_rank
    .sort_values(by='prob', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(['customer_id', 'prediction'])
)

pred2_lgb_binary = (
    pred2_lgb_binary
    .sort_values(by='prob', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(['customer_id', 'prediction'])
)

pred2_nn = (
    pred2_nn
    .sort_values(by='prob', ascending=False)
    .reset_index(drop=True)
    .drop_duplicates(['customer_id', 'prediction'])
)

In [None]:
# Bring both frames into the same (customer, item) order with a fresh 0..n-1 index so the
# scores can be added row by row (avoids a memory-hungry merge).
pred2_lgb_binary = pred2_lgb_binary.sort_values(by=['customer_id','prediction']).reset_index(drop=True)
pred2_nn = pred2_nn.sort_values(by=['customer_id','prediction']).reset_index(drop=True)

# The row-wise addition below is only correct when both frames contain exactly the same
# (customer, item) pairs. Verify that explicitly: a mismatch would otherwise add scores of
# unrelated pairs without any error.
if len(pred2_lgb_binary) != len(pred2_nn) or not all(
    (pred2_lgb_binary[key].to_numpy() == pred2_nn[key].to_numpy()).all()
    for key in ('customer_id', 'prediction')
):
    raise ValueError(
        "pred2_lgb_binary and pred2_nn do not contain the same (customer_id, prediction) "
        "rows; row-wise score addition would be misaligned"
    )

pred2_lgb_binary['prob2'] = pred2_lgb_binary['prob'] + pred2_nn['prob']
# Re-order by the blended score so the per-customer lists built next come out best-first.
pred2_lgb_binary = pred2_lgb_binary.sort_values(by='prob2', ascending=False).reset_index(drop=True)

In [None]:
# One row per customer with the list of predicted items, in the current row order.
pred2_lgb_rank = (
    pred2_lgb_rank
    .groupby('customer_id')['prediction']
    .progress_apply(list)
    .reset_index()
)
pred2_lgb_binary = (
    pred2_lgb_binary
    .groupby('customer_id')['prediction']
    .progress_apply(list)
    .reset_index()
)

100%|██████████| 1371980/1371980 [00:39<00:00, 34394.36it/s]
100%|██████████| 1371980/1371980 [00:45<00:00, 30081.73it/s]


In [None]:
# Sanity check: (number of customers, 2 columns) after grouping.
pred2_lgb_rank.shape

(1371980, 2)

In [None]:
# pred2_lgb_rank.to_parquet(data_dir/"small_rank_test.pqt")
# pred2_lgb_binary.to_parquet(data_dir/"small_binary_test.pqt")

In [None]:
# ----------------------------------------------------------

In [None]:
# Give each model's list column a distinct name so the four frames can be merged side by side.
pred1_lgb_rank = pred1_lgb_rank.rename(columns={'prediction': 'large_rank'})
pred1_lgb_binary = pred1_lgb_binary.rename(columns={'prediction': 'large_binary'})
pred2_lgb_rank = pred2_lgb_rank.rename(columns={'prediction': 'small_rank'})
pred2_lgb_binary = pred2_lgb_binary.rename(columns={'prediction': 'small_binary'})

In [None]:
# Start from the "small" ranker frame and left-join the other three lists, so every
# customer of pred2_lgb_rank is kept; lists missing for a customer come out as NaN.
pred = (
    pred2_lgb_rank
    .merge(pred2_lgb_binary, on=['customer_id'], how='left')
    .merge(pred1_lgb_rank, on=['customer_id'], how='left')
    .merge(pred1_lgb_binary, on=['customer_id'], how='left')
)

In [None]:
# Replace missing "large" lists (NaN left behind by the left joins) — and any list that
# itself contains a NaN — with an empty list so the blend can iterate over every cell.
for f in ['large_rank','large_binary']:
    pred[f] = pred[f].progress_apply(
        lambda ranked: [] if pd.isna(np.array(ranked)).any() else ranked
    )

100%|██████████| 1371980/1371980 [00:11<00:00, 119005.52it/s]
100%|██████████| 1371980/1371980 [00:11<00:00, 114337.07it/s]


In [None]:
def cust_blend(dt, W = [1, 1, 1, 1]):
    """Blend four ranked candidate lists into a single top-12 list.

    Every item earns ``W[m] / rank`` from each list ``m`` that contains it
    (``rank`` is its 1-based position in that list); contributions are summed
    across lists.

    The default used to hold only two weights although four lists are blended,
    so calling the function without ``W`` raised ``IndexError`` as soon as a
    third or fourth list was non-empty. The default now has one weight per list.

    NOTE(review): this notebook defines ``cust_blend`` twice; this later
    definition shadows the earlier one.

    Args:
        dt: Mapping-like row with the keys 'large_rank', 'large_binary',
            'small_rank' and 'small_binary', each an iterable of items in
            ranked order.
        W: One weight per list, in the key order given above. It is only read,
            never modified.

    Returns:
        Up to 12 items, highest blended score first. Items with equal scores
        keep the order in which they were first encountered.

    Raises:
        IndexError: If ``W`` has fewer entries than there are non-empty lists
            to weight.
    """
    ranked_lists = [
        dt['large_rank'],
        dt['large_binary'],
        dt['small_rank'],
        dt['small_binary'],
    ]

    # Accumulate the reciprocal-rank score of every item over all lists.
    scores = {}
    for model_idx, ranking in enumerate(ranked_lists):
        for position, item in enumerate(ranking, start=1):
            scores[item] = scores.get(item, 0) + W[model_idx] / position

    # Stable sort on the negated score: best first, ties in first-seen order.
    ordered = sorted(scores, key=lambda item: -scores[item])

    # Return the top 12 items only
    return ordered[:12]

In [None]:
# Blend the four lists of every customer row; the binary-based lists get weight 1.3,
# the ranker-based lists weight 1.0. (Author's leftover note: ", 1.0, 1.2".)
pred['prediction'] = pred.progress_apply(
    cust_blend,
    axis=1,
    W=[1.0, 1.3, 1.0, 1.3],
)

100%|██████████| 1371980/1371980 [00:54<00:00, 25301.19it/s]


In [None]:
# Lookup tables from encoded index back to the original user / item id.
# The files are opened in `with` blocks so the handles are closed right after loading.
# NOTE(security): pickle can execute arbitrary code while loading — only load trusted files.
with open(data_dir/"index_id_map/user_index2id.pkl", "rb") as fh:
    idx2uid = pickle.load(fh)
with open(data_dir/"index_id_map/item_index2id.pkl", "rb") as fh:
    idx2iid = pickle.load(fh)

In [None]:
def parse(x):
    """Turn a list of encoded item indices into a submission string.

    Every index is looked up in the module-level ``idx2iid`` table, converted
    to text and prefixed with '0'; at most the first 12 results are joined
    with single spaces.

    NOTE(review): the '0' prefix presumably restores a leading zero lost when
    the article ids were stored as integers — TODO confirm.

    Args:
        x: Iterable of item indices that are keys of ``idx2iid``.

    Returns:
        Space-separated string of up to 12 article ids.
    """
    article_codes = []
    for item_idx in x:
        article_codes.append('0' + str(idx2iid[item_idx]))
    return ' '.join(article_codes[:12])

In [None]:
# Convert every blended list into the space-separated article-id string.
pred['prediction'] = pred['prediction'].progress_apply(parse)

100%|██████████| 1371980/1371980 [00:03<00:00, 365733.77it/s]


In [None]:
# Original user id -> encoded index. The file is opened in a `with` block so the handle
# is closed right after loading.
# NOTE(security): pickle can execute arbitrary code while loading — only load trusted files.
with open(data_dir/"index_id_map/user_id2index.pkl", "rb") as fh:
    uid2idx = pickle.load(fh)
submission = pd.read_csv(data_dir/"raw"/'sample_submission.csv')
# Encode the customer ids so they can be joined with `pred`, which uses encoded indices.
submission['customer_id'] = submission['customer_id'].map(uid2idx)

In [None]:
# Swap the sample file's placeholder predictions for the blended ones. The left join keeps
# every customer of the sample submission, in its original order.
submission = (
    submission
    .drop(columns='prediction')
    .merge(pred, on='customer_id', how='left')
)
# Decode the customer indices back to the original ids.
submission['customer_id'] = submission['customer_id'].map(idx2uid)

In [None]:
# Keep only the two columns of the submission format (drops the per-model list columns
# brought in by the merge with `pred`).
submission = submission.loc[:, ['customer_id', 'prediction']]

In [None]:
# Write the submission file without the index column.
# NOTE(review): relative path — the file lands in the notebook's current working directory.
submission.to_csv('large_recall.csv', index=False)

In [None]:
# Preview the first rows of the final submission.
submission.head()

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0762846031 0924243002 0762846027 0751471043 07...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0706016001 0915529003 0762846027 0751471001 07...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0706016001 0794321007 0924243002 0448509014 07...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0751471043 0762846027 0924243002 0918522001 04...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0896152002 0915529003 0896152001 0866731001 07...
