In [1]:
import os
import tqdm
import seaborn as sns

import pandas as pd
import numpy as np
from lightfm import LightFM
from lightfm.data import Dataset

# Import LightFM's evaluation metrics
from lightfm.evaluation import precision_at_k

%matplotlib inline
# Fixed seed for NumPy's global random number generator.
SEED = 42
np.random.seed(SEED)
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so setting it
# here presumably only affects child processes, not this running kernel — confirm intent.
os.environ['PYTHONHASHSEED'] = str(SEED)



In [2]:
# Root folder holding the raw CSV inputs.
main_dir = "./dataset"
user = pd.read_csv(main_dir+"/user.csv")
item = pd.read_csv(main_dir+"/item.csv", dtype={'article_id': int})
# Only the three columns needed for recall are loaded. The dtype is keyed on 'item_id'
# because 'article_id' is not one of the columns read from this file.
train = pd.read_csv(
    main_dir+'/transaction.csv',
    usecols=['t_dat', 'customer_id', 'item_id'],
    dtype={'item_id': int},
    parse_dates=['t_dat'],
)

In [4]:
# Expose 'item_id' under the name 'article_id', the key used by the item and feature tables.
train['article_id'] = train['item_id']

In [5]:
train.head(2)

Unnamed: 0,t_dat,customer_id,item_id,article_id
0,2019-09-01,000f7535bdc611ad136a9f04746d6b1431f50a7f60fbbe...,727880001,727880001
1,2019-09-01,000f7535bdc611ad136a9f04746d6b1431f50a7f60fbbe...,767869001,767869001


In [5]:
item.head(2)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.


In [6]:
user.head(2)

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...


In [5]:
# Ensure the transaction date is a datetime column, then report the period it covers.
train['t_dat'] = pd.to_datetime(train['t_dat'])
t_dat = train['t_dat']
first_date, last_date = t_dat.min(), t_dat.max()
print(first_date, last_date)

2019-09-01 00:00:00 2020-09-22 00:00:00


In [None]:
# Week index counted backwards from the most recent transaction date:
# week 0 covers the last 7 days, week 1 the 7 days before that, and so on.
days_before_last = (last_date - pd.to_datetime(train['t_dat'])).dt.days
train['week'] = days_before_last // 7

In [11]:
# Per-article image embeddings; the 'img_fea_reduced' column is the one used for image recall.
img_fea_df = pd.read_parquet('./dataset/image_features.parquet')

In [12]:
img_fea_df.head(2)

Unnamed: 0,article_id,img_fea,img_fea_reduced
0,669091028,"[0.31230295, 0.25357383, 0.2588049, 0.30037212...","[-0.20904341, 0.23869179, -0.16474992, -0.0983..."
1,669091031,"[0.3691902, 0.28182054, 0.35075924, 0.28320932...","[-0.20443004, 0.3717328, -0.122880295, 0.10295..."


In [13]:
train.head(2)

Unnamed: 0,t_dat,customer_id,item_id,article_id,week
0,2019-09-01,000f7535bdc611ad136a9f04746d6b1431f50a7f60fbbe...,727880001,727880001,55
1,2019-09-01,000f7535bdc611ad136a9f04746d6b1431f50a7f60fbbe...,767869001,767869001,55


In [87]:
# Week indices (0 = most recent) for which recall candidates are generated.
recall_weeks = list(range(5))
# How many earlier weeks (higher week index) supply the candidate articles for a recall week.
week_nums = 4

In [77]:

from scipy.spatial.distance import cdist

# Function to calculate cosine similarity
def calculate_cosine_similarity(user_embedding, item_embeddings):
    """Return the cosine similarity between one vector and every row of a matrix.

    Parameters
    ----------
    user_embedding : 1-D array-like
        The query vector.
    item_embeddings : 2-D array-like
        One embedding per row.

    Returns
    -------
    numpy.ndarray
        1-D array holding one similarity per row of ``item_embeddings``.
    """
    # cdist expects 2-D inputs, so the single query vector is wrapped as a one-row batch.
    query_batch = [user_embedding]
    # The 'cosine' metric is a distance (1 - similarity); convert it back to a similarity.
    cosine_distances = cdist(query_batch, item_embeddings, 'cosine')
    return (1 - cosine_distances)[0]

# Function to recall items based on cosine similarity
def recall_items_based_on_cosine_similarity(label_set, img_fea_df, train_set, fea_column, top_k=50):
    """Recall, for every user, the candidate articles closest to the user's mean embedding.

    A user embedding is the mean of the embeddings of the articles in the user's
    ``label`` list. Candidates are the articles of ``train_set`` that have an
    embedding, ranked by cosine similarity to that user embedding.

    Parameters
    ----------
    label_set : pandas.DataFrame
        One row per user with a ``label`` column holding an iterable of article ids.
        Modified in place: the ``user_embedding`` and ``prediction`` columns are
        added (or overwritten).
    img_fea_df : pandas.DataFrame
        Feature table with an ``article_id`` column and the embedding column named
        by ``fea_column``. It is not modified.
    train_set : pandas.DataFrame
        Candidate pool; only its ``article_id`` column is read.
    fea_column : str
        Name of the embedding column in ``img_fea_df``.
    top_k : int, default 50
        Maximum number of articles recalled per user.

    Returns
    -------
    pandas.DataFrame
        ``label_set`` itself. ``prediction`` holds up to ``top_k`` article ids in
        ascending order of similarity (best match last). It is an empty array for
        users whose embedding is all zeros, which includes users none of whose
        labels has an embedding, and for every user when no candidate has one.

    Raises
    ------
    ValueError
        If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError(f"top_k must be non-negative, got {top_k}")

    # article_id -> embedding lookup, built without writing the converted vectors
    # back into the caller's feature frame.
    img_fea_dict = {
        article_id: np.asarray(embedding)
        for article_id, embedding in zip(img_fea_df['article_id'], img_fea_df[fea_column])
    }

    # Keep candidate ids and embedding rows aligned: only candidates that have an
    # embedding are retained, and row i of the matrix belongs to candidate_ids[i].
    pool_ids = train_set['article_id'].to_numpy()
    has_embedding = np.array([aid in img_fea_dict for aid in pool_ids], dtype=bool)
    candidate_ids = pool_ids[has_embedding]
    candidate_embeddings = np.array([img_fea_dict[aid] for aid in candidate_ids])
    # Empty result that shares the dtype of real predictions.
    no_prediction = candidate_ids[:0]

    # Length of one embedding vector, used for the all-zero fallback below.
    embedding_dim = len(next(iter(img_fea_dict.values()), ()))

    # Generate user embeddings by averaging the embeddings of the articles they ordered
    def generate_user_embedding(labels):
        embeddings = [img_fea_dict[label] for label in labels if label in img_fea_dict]
        if embeddings:
            return np.mean(embeddings, axis=0)
        # No labelled article has an embedding: mark the user with an all-zero vector.
        return np.zeros(embedding_dim)

    label_set['user_embedding'] = label_set['label'].apply(generate_user_embedding)

    # Find the top_k most similar candidates for each user embedding
    def find_top_similar_items(user_embedding):
        if len(candidate_ids) == 0 or np.count_nonzero(user_embedding) == 0:
            return no_prediction.copy()
        # cdist returns cosine distances; 1 - distance is the cosine similarity.
        similarities = 1 - cdist([user_embedding], candidate_embeddings, 'cosine')[0]
        ranked = np.argsort(similarities)
        # Slice from an explicit start so that top_k == 0 yields nothing rather than everything.
        start = max(len(ranked) - top_k, 0)
        return candidate_ids[ranked[start:]]

    label_set['prediction'] = label_set['user_embedding'].apply(find_top_similar_items)

    # The 'user_embedding' column is intentionally kept in the returned frame.
    return label_set



# Image embedding recall: candidate articles ranked by image-feature cosine similarity

In [78]:
#image_model recall
# Create the output folder up front so the recall work is not lost when saving.
os.makedirs("./fea_img_result", exist_ok=True)
for week in recall_weeks:
    # Candidate pool: distinct articles bought in the `week_nums` weeks before `week`
    # (a larger week index lies further back in time).
    train_set = train[(train['week']>week) & (train['week']<=(week+week_nums))]
    train_set = train_set[['article_id']].drop_duplicates()

    # One row per customer active in `week`; `label` = distinct articles bought that week.
    label_set = train[train['week']==week]
    label_set_grouped = label_set.groupby('customer_id')['article_id'].agg(list).reset_index()
    label_set = label_set.drop('article_id', axis=1).merge(label_set_grouped, on='customer_id')
    label_set['label'] = label_set['article_id'].apply(lambda x:list(set(x)))
    label_set = label_set.drop(['t_dat','item_id','article_id'],axis=1).drop_duplicates(subset=['customer_id', 'week'])
    # NOTE(review): the user embedding is averaged from `label`, i.e. from the purchases of
    # the week being predicted — confirm this is intended and not target leakage.
    label_set = recall_items_based_on_cosine_similarity(label_set, img_fea_df, train_set, 'img_fea_reduced')
    # Written in Parquet format despite the ".pt" extension.
    label_set.to_parquet(f"./fea_img_result/img_recall_week_{week}.pt")
    
    

In [3]:
# Reload the saved image-based recall result for week 4 for inspection.
df = pd.read_parquet('./fea_img_result/img_recall_week_4.pt')


In [4]:
df

Unnamed: 0,customer_id,week,label,user_embedding,prediction
0,002156b708c7c6dd8afe31a743131d13b1e5dcbf2ce8c4...,4,"[896152002, 897146002]","[-0.2907887, 0.30698544, -0.14043331, 0.202867...","[783384008, 691177002, 862105001, 919786001, 8..."
2,00ad9e5d82fc8ad18e1fac84f515ab735bd516df32b8ca...,4,"[572998009, 920752001, 902161006]","[0.110653035, 0.021000704, -0.20993477, 0.1180...","[562245062, 903218001, 824430001, 724977002, 8..."
5,00d4f8759e569b9e63a6fecb0c2ed5802174c2c2ad64ce...,4,"[915526001, 557994014]","[-0.19154519, 0.28013027, -0.09179828, -0.1202...","[783129002, 911108002, 640882003, 874054002, 5..."
7,0119826e13f3ef7fb3fb84c778a883710cc859de4b1886...,4,"[808659001, 832307003, 750424017]","[0.3998337, -0.11472506, -0.029288024, -0.3064...","[696734002, 880000001, 481347003, 629881006, 7..."
10,011fc4c3387f8c6eba0e7062aa47750b65d4dc2d5d6148...,4,"[762600009, 817086002, 535455003]","[-0.27753997, 0.21480298, 0.09964185, -0.09723...","[747045002, 811358001, 767925005, 873679005, 5..."
...,...,...,...,...,...
206536,fb04e98ab39aff596c8dee1f20c5863a12f68d2064ae72...,4,"[873217001, 873217004, 868034001, 791033010, 7...","[-0.10269102, -0.01726059, 0.21156302, -0.0745...","[820100001, 812970001, 659460002, 818378001, 8..."
206541,fd344f39be798bc456fd3c041b6cef4933ab0e5875189a...,4,"[799365028, 886737001, 906305002, 891898001, 7...","[-0.019961959, 0.29275155, -0.15852852, -0.055...","[882925002, 637348002, 355072002, 449570030, 7..."
206546,fe5648cc03e5337ce28d4ba24cebdf57247c093a937ee7...,4,"[715624001, 448509001, 910601001, 826646001, 9...","[-0.19505, 0.16666381, 0.044731274, 0.07460298...","[614607002, 684021066, 858145001, 821993001, 5..."
206554,fec9fcd8d529ecd32485518de4ea12f196b9f7126a5c1e...,4,"[866714016, 918516001, 896152002, 866714017, 8...","[-0.1758853, 0.08639591, 0.016021423, 0.050921...","[744111003, 564358023, 888295001, 539217025, 6..."


In [25]:
img_fea_df.head(2)

Unnamed: 0,article_id,img_fea,img_fea_reduced
0,669091028,"[0.31230295, 0.25357383, 0.2588049, 0.30037212...","[-0.20904341, 0.23869179, -0.16474992, -0.0983..."
1,669091031,"[0.3691902, 0.28182054, 0.35075924, 0.28320932...","[-0.20443004, 0.3717328, -0.122880295, 0.10295..."


# Word embedding recall: candidate articles ranked by text-feature cosine similarity

In [79]:
# Per-article text embeddings (a Parquet file despite the ".pt" extension).
# Presumably holds 'article_id' and 'word_fea_reduced', the columns the word recall reads — verify.
word_fea_df = pd.read_parquet('./dataset/word_fea_data.pt')

In [88]:
#word_model recall
# Create the output folder up front so the recall work is not lost when saving.
os.makedirs("./fea_word_result", exist_ok=True)
for week in recall_weeks:
    # Candidate pool: distinct articles bought in the `week_nums` weeks before `week`
    # (a larger week index lies further back in time).
    train_set = train[(train['week']>week) & (train['week']<=(week+week_nums))]
    train_set = train_set[['article_id']].drop_duplicates()

    # One row per customer active in `week`; `label` = distinct articles bought that week.
    label_set = train[train['week']==week]
    label_set_grouped = label_set.groupby('customer_id')['article_id'].agg(list).reset_index()
    label_set = label_set.drop('article_id', axis=1).merge(label_set_grouped, on='customer_id')
    label_set['label'] = label_set['article_id'].apply(lambda x:list(set(x)))
    label_set = label_set.drop(['t_dat','item_id','article_id'],axis=1).drop_duplicates(subset=['customer_id', 'week'])
    # NOTE(review): the user embedding is averaged from `label`, i.e. from the purchases of
    # the week being predicted — confirm this is intended and not target leakage.
    label_set = recall_items_based_on_cosine_similarity(label_set, word_fea_df, train_set, 'word_fea_reduced')
    # Written in Parquet format despite the ".pt" extension.
    label_set.to_parquet(f"./fea_word_result/word_recall_week_{week}.pt")
    
    

In [83]:
# NOTE(review): appears to be a scratch re-run. It relies on the `label_set` and `train_set`
# left over from the last iteration of a recall loop and recomputes the word-based recall
# for that single week only.
label_set = recall_items_based_on_cosine_similarity(label_set, word_fea_df, train_set, 'word_fea_reduced')

In [84]:
label_set

Unnamed: 0,customer_id,week,label,user_embedding,prediction
0,002156b708c7c6dd8afe31a743131d13b1e5dcbf2ce8c4...,4,"[896152002, 897146002]","[-0.04093735, 0.004139971, 0.08616687, 0.03055...","[642105003, 853316001, 853316003, 876101001, 6..."
2,00ad9e5d82fc8ad18e1fac84f515ab735bd516df32b8ca...,4,"[572998009, 920752001, 902161006]","[-0.07547997, 0.06957034, -0.016282974, -0.006...","[615130001, 889087001, 840011001, 882269001, 8..."
5,00d4f8759e569b9e63a6fecb0c2ed5802174c2c2ad64ce...,4,"[915526001, 557994014]","[-0.07180841, -0.106583826, 0.15241666, -0.013...","[825771007, 825771005, 825771004, 825771001, 8..."
7,0119826e13f3ef7fb3fb84c778a883710cc859de4b1886...,4,"[808659001, 832307003, 750424017]","[-0.16604681, 0.018134506, 0.0857052, -0.07887...","[839915011, 733569002, 692846012, 692846011, 8..."
10,011fc4c3387f8c6eba0e7062aa47750b65d4dc2d5d6148...,4,"[762600009, 817086002, 535455003]","[0.03254881, -0.20929354, -0.072348244, 0.0020...","[793704002, 793704001, 877273001, 877273002, 8..."
...,...,...,...,...,...
206536,fb04e98ab39aff596c8dee1f20c5863a12f68d2064ae72...,4,"[873217001, 873217004, 868034001, 791033010, 7...","[-0.05591513, -0.003985235, -0.065697275, 0.03...","[874244002, 820431001, 755609003, 788489001, 7..."
206541,fd344f39be798bc456fd3c041b6cef4933ab0e5875189a...,4,"[799365028, 886737001, 906305002, 891898001, 7...","[-0.013069162, -0.048925105, -0.027484179, -0....","[469562013, 469562036, 469562058, 469562065, 4..."
206546,fe5648cc03e5337ce28d4ba24cebdf57247c093a937ee7...,4,"[715624001, 448509001, 910601001, 826646001, 9...","[0.14729409, 0.12882619, -0.03484679, -0.00527...","[892421002, 892421009, 854154001, 806388010, 8..."
206554,fec9fcd8d529ecd32485518de4ea12f196b9f7126a5c1e...,4,"[866714016, 918516001, 896152002, 866714017, 8...","[-0.011867408, -0.05254487, 0.05048839, 0.0129...","[708495003, 893635001, 912759001, 880538001, 8..."


In [3]:
# Reload the saved word-based recall result for week 4 for inspection
# (reuses the name `df` that earlier held the image-based result).
df = pd.read_parquet("./fea_word_result/word_recall_week_4.pt")

In [8]:
# Pick the fifth row by position as a one-row frame for a spot check.
df1 = df.iloc[4:5]

In [9]:
df1

Unnamed: 0,customer_id,week,label,user_embedding,prediction
10,011fc4c3387f8c6eba0e7062aa47750b65d4dc2d5d6148...,4,"[762600009, 817086002, 535455003]","[0.03254881, -0.20929354, -0.072348244, 0.0020...","[793704002, 793704001, 877273001, 877273002, 8..."


In [12]:
item.head(2)

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.


In [10]:
# One row per labelled article, so each can be joined to its item metadata.
df2 = df1.explode('label')

In [13]:
# Attach the text description of every labelled article from the item table.
df2 = df2.merge(item[['article_id','detail_desc']],left_on='label',right_on='article_id')

In [15]:
print(df2['detail_desc'].to_list())

['T-shirt in soft jersey with a lightly brushed finish. Straight, relaxed fit with dropped shoulders.', 'Blouse in a linen and cotton weave with a collar, V-neck opening at the front and yoke with a pleat at the back. Dropped shoulders, 3/4-length sleeves with buttoned cuffs, and a rounded hem.', 'Blouse in airy, patterned chiffon with an opening and concealed hook and eye at the back of the neck, a seam at the waist with a drawstring that can be pulled to tighten from the sides.']


# Build dataset: merge image- and word-based recall results into per-customer candidates

In [7]:
def concat_ids(x):
    """Merge a row's true labels and both recall lists into one list of unique ids.

    ``x`` is a mapping-like row (for example a DataFrame row) whose ``'label'``,
    ``'img_prediction'`` and ``'word_prediction'`` entries are iterables of ids.
    The inputs are not modified; the order of the returned list is whatever the
    intermediate set yields.
    """
    source_columns = ('label', 'img_prediction', 'word_prediction')
    # Gather the ids column by column, in the order listed above.
    gathered = [item_id for column in source_columns for item_id in x[column]]
    # De-duplicate through a set before handing back a plain list.
    return list(set(gathered))


In [8]:
# Build the dataset for week 4 only (rebinds the `recall_weeks` name used by the recall loops).
recall_weeks = [4]

In [47]:
# Combine the word- and image-based recall results of every recall week.
weekly_frames = []
for week in recall_weeks:
    #word
    word_set = pd.read_parquet(f"./fea_word_result/word_recall_week_{week}.pt")
    word_set = word_set.rename(columns={'user_embedding': 'user_word_embedding',
                                        'prediction': 'word_prediction'})
    #image
    image_set = pd.read_parquet(f"./fea_img_result/img_recall_week_{week}.pt")
    image_set = image_set.rename(columns={'user_embedding': 'user_img_embedding',
                                          'prediction': 'img_prediction'})
    # Inner join: only customers present in both recall outputs are kept.
    weekly_frames.append(
        image_set.merge(word_set[['customer_id','user_word_embedding','word_prediction']],on='customer_id')
    )

# Collect all weeks instead of keeping only the frame of the last loop iteration.
dataset = pd.concat(weekly_frames, ignore_index=True)
    
    

In [48]:
# Candidate list per customer: unique ids from the true labels plus both recall lists.
dataset['candidates'] = dataset[['label','img_prediction','word_prediction']].apply(concat_ids,axis=1)

In [49]:
# The separate recall lists are no longer needed once `candidates` holds their union.
dataset = dataset.drop(columns=['img_prediction','word_prediction'])

In [50]:
# Number of candidate articles per customer.
dataset['len'] = dataset['candidates'].apply(len)

In [51]:
dataset

Unnamed: 0,customer_id,week,label,user_img_embedding,user_word_embedding,candidates,len
0,002156b708c7c6dd8afe31a743131d13b1e5dcbf2ce8c4...,4,"[896152002, 897146002]","[-0.2907887, 0.30698544, -0.14043331, 0.202867...","[-0.04093735, 0.004139971, 0.08616687, 0.03055...","[828928001, 579541001, 919786001, 865663003, 8...",98
1,00ad9e5d82fc8ad18e1fac84f515ab735bd516df32b8ca...,4,"[572998009, 920752001, 902161006]","[0.110653035, 0.021000704, -0.20993477, 0.1180...","[-0.07547997, 0.06957034, -0.016282974, -0.006...","[913365001, 860949002, 640021005, 797866001, 9...",96
2,00d4f8759e569b9e63a6fecb0c2ed5802174c2c2ad64ce...,4,"[915526001, 557994014]","[-0.19154519, 0.28013027, -0.09179828, -0.1202...","[-0.07180841, -0.106583826, 0.15241666, -0.013...","[851264001, 631744007, 408875021, 557994014, 6...",100
3,0119826e13f3ef7fb3fb84c778a883710cc859de4b1886...,4,"[808659001, 832307003, 750424017]","[0.3998337, -0.11472506, -0.029288024, -0.3064...","[-0.16604681, 0.018134506, 0.0857052, -0.07887...","[880000001, 839915011, 757333001, 772031001, 8...",100
4,011fc4c3387f8c6eba0e7062aa47750b65d4dc2d5d6148...,4,"[762600009, 817086002, 535455003]","[-0.27753997, 0.21480298, 0.09964185, -0.09723...","[0.03254881, -0.20929354, -0.072348244, 0.0020...","[718549002, 782890001, 916394002, 865386001, 7...",101
...,...,...,...,...,...,...,...
58099,fb04e98ab39aff596c8dee1f20c5863a12f68d2064ae72...,4,"[873217001, 873217004, 868034001, 791033010, 7...","[-0.10269102, -0.01726059, 0.21156302, -0.0745...","[-0.05591513, -0.003985235, -0.065697275, 0.03...","[833408001, 636096002, 817280001, 817280007, 8...",103
58100,fd344f39be798bc456fd3c041b6cef4933ab0e5875189a...,4,"[799365028, 886737001, 906305002, 891898001, 7...","[-0.019961959, 0.29275155, -0.15852852, -0.055...","[-0.013069162, -0.048925105, -0.027484179, -0....","[820032001, 355072002, 835712003, 792490001, 8...",103
58101,fe5648cc03e5337ce28d4ba24cebdf57247c093a937ee7...,4,"[715624001, 448509001, 910601001, 826646001, 9...","[-0.19505, 0.16666381, 0.044731274, 0.07460298...","[0.14729409, 0.12882619, -0.03484679, -0.00527...","[835605001, 835605002, 835605004, 835605009, 8...",105
58102,fec9fcd8d529ecd32485518de4ea12f196b9f7126a5c1e...,4,"[866714016, 918516001, 896152002, 866714017, 8...","[-0.1758853, 0.08639591, 0.016021423, 0.050921...","[-0.011867408, -0.05254487, 0.05048839, 0.0129...","[676352001, 727808008, 682261004, 796010002, 8...",116
