## Import

In [1]:
import sys
import os

sys.path.append(os.path.abspath(os.path.join('..')))


import pandas as pd
import numpy as np

import papermill as pm

from lightfm import LightFM
from lightfm.data import Dataset
from lightfm.evaluation import precision_at_k, recall_at_k

## Constant

In [2]:
RAW_PATH = '../data/raw'
TRANSACTION_PATH = os.path.join(RAW_PATH, 'transactions_train.csv')
CUSTOMER_PATH = os.path.join(RAW_PATH, 'customers.csv')
ARTICLE_PATH = os.path.join(RAW_PATH, 'articles.csv')

SAMPLE_ROW = 500000

# top k items to recommend
TOP_K = 10

# Model parameters
NUM_FACTORS = 200
NUM_EPOCHS = 100

## Load data

In [3]:
transaction_data = pd.read_csv(TRANSACTION_PATH)

print(len(transaction_data))

transaction_data.head()

31788324


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,663713001,0.050831,2
1,2018-09-20,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,541518023,0.030492,2
2,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,505221004,0.015237,2
3,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687003,0.016932,2
4,2018-09-20,00007d2de826758b65a93dd24ce629ed66842531df6699...,685687004,0.016932,2


In [4]:
article_data = pd.read_csv(ARTICLE_PATH)

print(len(article_data))

article_data.head()

105542


Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
2,108775051,108775,Strap top (1),253,Vest top,Garment Upper body,1010017,Stripe,11,Off White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [5]:
customer_data = pd.read_csv(CUSTOMER_PATH)

print(len(customer_data))

customer_data.head()

1371980


Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,,,ACTIVE,NONE,49.0,52043ee2162cf5aa7ee79974281641c6f11a68d276429a...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,,,ACTIVE,NONE,25.0,2973abc54daa8a5f8ccfe9362140c63247c5eee03f1d93...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,,,ACTIVE,NONE,24.0,64f17e6a330a85798e4998f62d0930d14db8db1c054af6...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,,,ACTIVE,NONE,54.0,5d36574f52495e81f019b680c843c443bd343d5ca5b1c2...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,1.0,1.0,ACTIVE,Regularly,52.0,25fa5ddee9aac01b35208d01736e57942317d756b32ddd...


In [6]:
article_column = [
    'product_type_no',
    'graphical_appearance_no',
    'colour_group_code',
    'perceived_colour_value_id',
    'perceived_colour_master_id',
    'department_no',
    'index_code',
    'index_group_no',
    'section_no',
    'garment_group_no',
]

article_num_data = article_data[['article_id'] + article_column]
article_num_data = article_num_data.fillna(0)
for column in article_column:
    article_num_data[column] = column + '-' + article_num_data[column].astype(str)
    
article_num_data.head()

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,colour_group_code,perceived_colour_value_id,perceived_colour_master_id,department_no,index_code,index_group_no,section_no,garment_group_no
0,108775015,product_type_no-253,graphical_appearance_no-1010016,colour_group_code-9,perceived_colour_value_id-4,perceived_colour_master_id-5,department_no-1676,index_code-A,index_group_no-1,section_no-16,garment_group_no-1002
1,108775044,product_type_no-253,graphical_appearance_no-1010016,colour_group_code-10,perceived_colour_value_id-3,perceived_colour_master_id-9,department_no-1676,index_code-A,index_group_no-1,section_no-16,garment_group_no-1002
2,108775051,product_type_no-253,graphical_appearance_no-1010017,colour_group_code-11,perceived_colour_value_id-1,perceived_colour_master_id-9,department_no-1676,index_code-A,index_group_no-1,section_no-16,garment_group_no-1002
3,110065001,product_type_no-306,graphical_appearance_no-1010016,colour_group_code-9,perceived_colour_value_id-4,perceived_colour_master_id-5,department_no-1339,index_code-B,index_group_no-1,section_no-61,garment_group_no-1017
4,110065002,product_type_no-306,graphical_appearance_no-1010016,colour_group_code-10,perceived_colour_value_id-3,perceived_colour_master_id-9,department_no-1339,index_code-B,index_group_no-1,section_no-61,garment_group_no-1017


In [7]:
customer_column = [
    'FN', 
    'Active', 
    'club_member_status',
    'fashion_news_frequency', 
    'age'
]


customer_num_data = customer_data[['customer_id'] + customer_column]

customer_num_data = customer_num_data.fillna(0)

customer_num_data['fashion_news_frequency'] = customer_num_data['fashion_news_frequency'].replace('None', 'NONE')
customer_num_data['fashion_news_frequency'] = customer_num_data['fashion_news_frequency'].replace(0, 'NONE')

for column in customer_column:
    customer_num_data[column] = column + '-' + customer_num_data[column].astype(str)
    
customer_num_data.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,FN-0.0,Active-0.0,club_member_status-ACTIVE,fashion_news_frequency-NONE,age-49.0
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,FN-0.0,Active-0.0,club_member_status-ACTIVE,fashion_news_frequency-NONE,age-25.0
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,FN-0.0,Active-0.0,club_member_status-ACTIVE,fashion_news_frequency-NONE,age-24.0
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,FN-0.0,Active-0.0,club_member_status-ACTIVE,fashion_news_frequency-NONE,age-54.0
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,FN-1.0,Active-1.0,club_member_status-ACTIVE,fashion_news_frequency-Regularly,age-52.0


In [8]:
data = transaction_data.sample(SAMPLE_ROW)

data['t_dat'] = pd.to_datetime(data['t_dat'], format='%Y-%m-%d')

data = data.sort_values(by = 't_dat')

print(len(data))

data.head()

500000


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
4628,2018-09-20,1a551908db2b1adddbcc66eccd3d65ec930f8258e32776...,624257002,0.06778,2
24981,2018-09-20,83844c87b4966e10d06ebd411fc3ea3b25bcca136bdd06...,636323001,0.015237,2
33895,2018-09-20,b22b7bf29121038b7988466d39c9e4b3a0919725a52d99...,469137011,0.030492,2
42383,2018-09-20,e01bb1250aaf16155f7740c22b9b7a84f9b816a69a79f2...,679494004,0.016932,2
1663,2018-09-20,08b803b33953910b012dcd5589a432b8d9c241c118da36...,468480025,0.033881,2


In [9]:
data = data.rename(
    columns = {
        'customer_id': 'userID',
        'article_id': 'itemID',
    }
)

### Data preprocessing

In [10]:
data = data[
    (data['itemID'].isin(list(data['itemID'].value_counts()[data['itemID'].value_counts()>5].index))) &
    (data['userID'].isin(list(data['userID'].value_counts()[data['userID'].value_counts()>5].index)))
]

print(len(data))

data.head()

27159


Unnamed: 0,t_dat,userID,itemID,price,sales_channel_id
22122,2018-09-20,73f5ec63e4565b1be3ad72122e3d3ff73e2a9da339cc44...,399136023,0.033881,2
35746,2018-09-20,bcdccbf54055b2ca6e964f79f37e8a56407501aa354c36...,509091033,0.016932,1
30295,2018-09-20,9e523863a8229a063e5eb453a7913325338954164c3b1a...,669786002,0.013542,1
34235,2018-09-20,b4cff519346f94672edc0934ab0c86f890d3c65b706dde...,399136061,0.050831,2
16900,2018-09-20,58da40c2d2d450073b2c8fbe1abb7b324c8825f5a377a3...,637549013,0.033881,2


In [11]:
len_data = len(data)

train = data[:int(len_data*0.7)]
test = data[int(len_data*0.7):]

test = test[
    (test['userID'].isin(train['userID'].values)) & 
    (test['itemID'].isin(train['itemID'].values))
]

print(f'train len: {len(train)}')
print(f'test len: {len(test)}')

train len: 19011
test len: 1859


In [12]:
train = train.groupby(['userID', 'itemID']).count()['t_dat'].to_frame().reset_index().rename(columns = {'t_dat': 'rating'})

test = test.groupby(['userID', 'itemID']).count()['t_dat'].to_frame().reset_index().rename(columns = {'t_dat': 'rating'})

train.head()

Unnamed: 0,userID,itemID,rating
0,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,628642005,1
1,0001d44dbe7f6c4b35200abdb052c77a87596fe1bdcc37...,637262001,1
2,0024dea548c64fb75a563e0b300c0b16210decee446f1a...,573085044,1
3,0024dea548c64fb75a563e0b300c0b16210decee446f1a...,704126002,1
4,0024dea548c64fb75a563e0b300c0b16210decee446f1a...,710900001,1


## LightFM

In [13]:
class LightFMDataset():
    def __init__(self, interaction_data, user_data, item_data, user_column, item_column):
        self.dataset = Dataset()
        self.interaction_data = interaction_data
        self.user_data = user_data.set_index('customer_id').loc[list(self.interaction_data['userID'].unique())].reset_index()
#         user_data[user_data['customer_id'].isin(self.interaction_data['userID'].values)]
        self.item_data = item_data.set_index('article_id').loc[list(self.interaction_data['itemID'].unique())].reset_index()
#     item_data[item_data['article_id'].isin(self.interaction_data['itemID'].values)]
        self.user_column = user_column
        self.item_column = item_column
        self.calculate_user_features()
        self.calculate_item_features()

    def calculate_user_features(self):
        self.user_features = []
        for column in self.user_column:
            self.user_features.extend(self.user_data[column].unique())
        self.user_features = list(set(self.user_features))
        
    def calculate_item_features(self):
        self.item_features = []
        for column in self.item_column:
            self.item_features.extend(self.item_data[column].unique())
        self.item_features = list(set(self.item_features))
        
    def fit(self):
        return self.dataset.fit(
            users = (i for i in self.interaction_data['userID'].unique()), 
            items = (i for i in self.interaction_data['itemID'].unique()),
            user_features = (i for i in self.user_features),
            item_features = (i for i in self.item_features)
        )
    
    def interactions_shape(self):
        return self.dataset.interactions_shape()
    
    def build_interactions(self):
        return self.dataset.build_interactions((i for i in self.interaction_data.values))
     
    def build_item_features(self):
        print(self.item_data.values[0][0], self.item_data.values[0][1:])
#         print(self.item_data.head())
        assert len(self.item_data) == len(list(self.interaction_data['itemID'].unique()))
        return self.dataset.build_item_features(((i[0], i[1:3]) for index, i in self.item_data.iterrows()))
    
    def build_user_features(self):
        assert len(self.user_data) == len(list(self.interaction_data['userID'].unique()))
        return self.dataset.build_user_features(((i[0], i[1:3]) for index, i in self.user_data.iterrows()))
    


In [14]:
train_dataset = LightFMDataset(
    train,
    customer_num_data,
    article_num_data,
    customer_column,
    article_column
    
)

train_dataset.fit()

train_num_users, train_num_items = train_dataset.interactions_shape()
print('Num users: {}, num items {}.'.format(train_num_users, train_num_items))

(train_interactions, train_weights) = train_dataset.build_interactions()

print(repr(train_interactions))

train_item_features = train_dataset.build_item_features()
print(repr(train_item_features))

train_user_features = train_dataset.build_user_features()
print(repr(train_user_features))

Num users: 4548, num items 10053.
<4548x10053 sparse matrix of type '<class 'numpy.int32'>'
	with 18828 stored elements in COOrdinate format>
628642005 ['product_type_no-272' 'graphical_appearance_no-1010010'
 'colour_group_code-72' 'perceived_colour_value_id-7'
 'perceived_colour_master_id-2' 'department_no-8616' 'index_code-F'
 'index_group_no-3' 'section_no-23' 'garment_group_no-1009']
<10053x10530 sparse matrix of type '<class 'numpy.float32'>'
	with 30159 stored elements in Compressed Sparse Row format>
<4548x4617 sparse matrix of type '<class 'numpy.float32'>'
	with 13644 stored elements in Compressed Sparse Row format>


In [15]:
test_dataset = LightFMDataset(
    test,
    customer_num_data,
    article_num_data,
    customer_column,
    article_column
    
)

test_dataset.fit()

test_num_users, test_num_items = test_dataset.interactions_shape()
print('Num users: {}, num items {}.'.format(test_num_users, test_num_items))

(test_interactions, test_weights) = test_dataset.build_interactions()

print(repr(test_interactions))

test_item_features = test_dataset.build_item_features()
print(repr(test_item_features))

test_user_features = test_dataset.build_user_features()
print(repr(test_user_features))

Num users: 1452, num items 1119.
<1452x1119 sparse matrix of type '<class 'numpy.int32'>'
	with 1847 stored elements in COOrdinate format>
797058001 ['product_type_no-273' 'graphical_appearance_no-1010016'
 'colour_group_code-9' 'perceived_colour_value_id-4'
 'perceived_colour_master_id-5' 'department_no-1643' 'index_code-D'
 'index_group_no-2' 'section_no-51' 'garment_group_no-1002']
<1119x1438 sparse matrix of type '<class 'numpy.float32'>'
	with 3357 stored elements in Compressed Sparse Row format>
<1452x1513 sparse matrix of type '<class 'numpy.float32'>'
	with 4356 stored elements in Compressed Sparse Row format>


In [16]:
model = LightFM(loss='bpr')
model.fit(train_interactions, item_features=train_item_features, user_features = train_user_features)


<lightfm.lightfm.LightFM at 0x7f00d39e87c0>

## Evaluation

In [17]:
test_precision = precision_at_k(
    model, 
    test_interactions,
#     train_interactions,
    k=5,
    user_features = train_user_features,
    item_features = train_item_features,
    ).mean()

test_recall = recall_at_k(
    model, 
    test_interactions,
#     train_interactions,
    k=5,
    user_features = train_user_features,
    item_features = train_item_features,
    ).mean()

print(f'Precision: {test_precision}')
print(f'Recall: {test_recall}')

Precision: 0.0015151515835896134
Recall: 0.00539485766758494


In [18]:
# def merge_ranking_true_pred(
#     rating_true,
#     rating_pred,
#     col_user,
#     col_item,
#     col_rating,
#     col_prediction,
#     relevancy_method,
#     k=DEFAULT_K,
#     threshold=DEFAULT_THRESHOLD,
# ):
#     """Filter truth and prediction data frames on common users
#     Args:
#         rating_true (pandas.DataFrame): True DataFrame
#         rating_pred (pandas.DataFrame): Predicted DataFrame
#         col_user (str): column name for user
#         col_item (str): column name for item
#         col_rating (str): column name for rating
#         col_prediction (str): column name for prediction
#         relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
#             top k items are directly provided, so there is no need to compute the relevancy operation.
#         k (int): number of top k items per user (optional)
#         threshold (float): threshold of top items per user (optional)
#     Returns:
#         pandas.DataFrame, pandas.DataFrame, int: DataFrame of recommendation hits, sorted by `col_user` and `rank`
#         DataFrame of hit counts vs actual relevant items per user number of unique user ids
#     """

#     # Make sure the prediction and true data frames have the same set of users
#     common_users = set(rating_true[col_user]).intersection(set(rating_pred[col_user]))
#     rating_true_common = rating_true[rating_true[col_user].isin(common_users)]
#     rating_pred_common = rating_pred[rating_pred[col_user].isin(common_users)]
#     n_users = len(common_users)

#     # Return hit items in prediction data frame with ranking information. This is used for calculating NDCG and MAP.
#     # Use first to generate unique ranking values for each item. This is to align with the implementation in
#     # Spark evaluation metrics, where index of each recommended items (the indices are unique to items) is used
#     # to calculate penalized precision of the ordered items.
#     if relevancy_method == "top_k":
#         top_k = k
#     elif relevancy_method == "by_threshold":
#         top_k = threshold
#     elif relevancy_method is None:
#         top_k = None
#     else:
#         raise NotImplementedError("Invalid relevancy_method")
#     df_hit = get_top_k_items(
#         dataframe=rating_pred_common,
#         col_user=col_user,
#         col_rating=col_prediction,
#         k=top_k,
#     )
#     df_hit = pd.merge(df_hit, rating_true_common, on=[col_user, col_item])[
#         [col_user, col_item, "rank"]
#     ]

#     # count the number of hits vs actual relevant items per user
#     df_hit_count = pd.merge(
#         df_hit.groupby(col_user, as_index=False)[col_user].agg({"hit": "count"}),
#         rating_true_common.groupby(col_user, as_index=False)[col_user].agg(
#             {"actual": "count"}
#         ),
#         on=col_user,
#     )

#     return df_hit, df_hit_count, n_users

# def map_at_k(
#     rating_true,
#     rating_pred,
#     col_user=DEFAULT_USER_COL,
#     col_item=DEFAULT_ITEM_COL,
#     col_rating=DEFAULT_RATING_COL,
#     col_prediction=DEFAULT_PREDICTION_COL,
#     relevancy_method="top_k",
#     k=DEFAULT_K,
#     threshold=DEFAULT_THRESHOLD,
# ):
#     """Mean Average Precision at k
#     The implementation of MAP is referenced from Spark MLlib evaluation metrics.
#     https://spark.apache.org/docs/2.3.0/mllib-evaluation-metrics.html#ranking-systems
#     A good reference can be found at:
#     http://web.stanford.edu/class/cs276/handouts/EvaluationNew-handout-6-per.pdf
#     Note:
#         1. The evaluation function is named as 'MAP is at k' because the evaluation class takes top k items for
#         the prediction items. The naming is different from Spark.
#         2. The MAP is meant to calculate Avg. Precision for the relevant items, so it is normalized by the number of
#         relevant items in the ground truth data, instead of k.
#     Args:
#         rating_true (pandas.DataFrame): True DataFrame
#         rating_pred (pandas.DataFrame): Predicted DataFrame
#         col_user (str): column name for user
#         col_item (str): column name for item
#         col_rating (str): column name for rating
#         col_prediction (str): column name for prediction
#         relevancy_method (str): method for determining relevancy ['top_k', 'by_threshold', None]. None means that the
#             top k items are directly provided, so there is no need to compute the relevancy operation.
#         k (int): number of top k items per user
#         threshold (float): threshold of top items per user (optional)
#     Returns:
#         float: MAP at k (min=0, max=1).
#     """

#     df_hit, df_hit_count, n_users = merge_ranking_true_pred(
#         rating_true=rating_true,
#         rating_pred=rating_pred,
#         col_user=col_user,
#         col_item=col_item,
#         col_rating=col_rating,
#         col_prediction=col_prediction,
#         relevancy_method=relevancy_method,
#         k=k,
#         threshold=threshold,
#     )

#     if df_hit.shape[0] == 0:
#         return 0.0

#     # calculate reciprocal rank of items for each user and sum them up
#     df_hit_sorted = df_hit.copy()
#     df_hit_sorted["rr"] = (
#         df_hit_sorted.groupby(col_user).cumcount() + 1
#     ) / df_hit_sorted["rank"]
#     df_hit_sorted = df_hit_sorted.groupby(col_user).agg({"rr": "sum"}).reset_index()

#     df_merge = pd.merge(df_hit_sorted, df_hit_count, on=col_user)
#     return (df_merge["rr"] / df_merge["actual"]).sum() / n_users

## Models
- BPR
- NCF
- LightGCN
- RBM
- SAR
- FastAI
- Standard VAE