This notebook is based on the following H&M Kaggle Challenge notebook:
https://www.kaggle.com/code/julian3833/h-m-implicit-als-model-0-014/notebook
Data: H&M purchase transactions.

In [1]:
import numpy as np
import pandas as pd
import recometrics
import implicit
from scipy.sparse import coo_matrix
from implicit.evaluation import train_test_split, mean_average_precision_at_k,  precision_at_k, AUC_at_k, ndcg_at_k, ranking_metrics_at_k
from sklearn.model_selection import train_test_split

In [2]:
# `%cd` with no argument switches the working directory to the user's home directory;
# the relative CSV paths used when loading the data are resolved against it.
%cd

C:\Users\chris


In [4]:
# Load the three raw CSV tables (paths are relative to the current working directory).
# `article_id` is read as a string in both tables that carry it
# (presumably so the IDs keep any leading zeros -- confirm against the data).
df = pd.read_csv(
    'transactions_train.csv',
    dtype={'article_id': str},
    parse_dates=['t_dat'],
)
dfu = pd.read_csv('customers.csv')
dfi = pd.read_csv(
    'articles.csv',
    dtype={'article_id': str},
)

In [5]:
# Trying with less data:
# https://www.kaggle.com/tomooinubushi/folk-of-time-is-our-best-friend/notebook
# Keep only transactions dated after 2020-08-21. The explicit .copy() makes the
# result an independent frame, so adding columns to `df` in later cells does not
# trigger pandas' SettingWithCopyWarning.
df = df[df['t_dat'] > '2020-08-21'].copy()
df.shape

(1190911, 5)

In [None]:
# For validation this means 3 weeks of training and 1 week for validation
# For submission, it means 4 weeks of training
# NOTE(review): the week counts above assume the filtered data spans about four
# weeks -- confirm against the date displayed below. Also, `get_val_matrices`
# in this notebook splits rows at random rather than by week, so the
# "3 weeks / 1 week" description may be outdated.
# Display the most recent transaction date remaining in `df`.
df['t_dat'].max()

In [None]:
# Every distinct customer / article ID, in order of first appearance in the
# customers and articles tables.
ALL_USERS = dfu['customer_id'].unique().tolist()
ALL_ITEMS = dfi['article_id'].unique().tolist()

# Integer index -> original ID.
user_ids = dict(enumerate(ALL_USERS))
item_ids = dict(enumerate(ALL_ITEMS))

# Original ID -> integer index (the inverse of the dictionaries above).
user_map = {raw_id: index for index, raw_id in enumerate(ALL_USERS)}
item_map = {raw_id: index for index, raw_id in enumerate(ALL_ITEMS)}

# Attach the integer indices to every transaction; IDs missing from a map become NaN.
df['user_id'] = df['customer_id'].map(user_map)
df['item_id'] = df['article_id'].map(item_map)

# Optional memory clean-up, left disabled:
#del dfu, dfi

In [None]:
# One row per (user, item) pair; after the count, the 't_dat' column holds the
# number of transactions for that pair rather than a date.
df_agg = (
    df.groupby(['user_id', 'item_id'])['t_dat']
      .count()
      .reset_index()
)

In [None]:
# Pull the three aligned arrays (row index, column index, value) out of the
# aggregated frame and assemble the full sparse matrix, sized to cover every
# known user and item.
row, col, data = (df_agg[name].values for name in ('user_id', 'item_id', 't_dat'))
coo_agg = coo_matrix(
    (data, (row, col)),
    shape=(len(ALL_USERS), len(ALL_ITEMS)),
)

In [2]:
# Definition of functions
def to_user_item_coo(df, shape=None):
    """Turn a dataframe with transactions into a COO sparse users x items matrix.

    Rows of the matrix are indexed by 'user_id' and columns by 'item_id'
    (the previous docstring had the orientation reversed).

    Args:
        df: DataFrame with 'user_id' and 'item_id' columns (assumed to hold
            integer indices) and a 't_dat' column holding the value to store
            for each pair. In this notebook that column carries the
            per-pair purchase count produced by the groupby.
        shape: optional (n_users, n_items) tuple. When omitted, the shape is
            taken from the notebook-level ALL_USERS / ALL_ITEMS lists, which
            must then already be defined.

    Returns:
        scipy.sparse.coo_matrix of the requested shape.
    """
    if shape is None:
        # Default: cover every known user and item, not just those present in df.
        shape = (len(ALL_USERS), len(ALL_ITEMS))
    row = df['user_id'].values
    col = df['item_id'].values
    data = df['t_dat'].values
    return coo_matrix((data, (row, col)), shape=shape)

#custom func
def get_val_matrices_fromCSR(csr, training_share=0.8):
    """Randomly split a sparse interaction matrix into training and validation parts.

    The split is done with implicit's own sparse-matrix splitter, referenced
    by its full path. The bare name `train_test_split` cannot be used here:
    this notebook imports scikit-learn's function of the same name after
    implicit's, so the bare name resolves to scikit-learn's version, which
    does not accept `train_percentage`.

    Args:
        csr: sparse interaction matrix (anything exposing `.tocoo()`).
        training_share: fraction of the interactions assigned to training.

    Returns:
        dict with the keys
            coo_train: training data in COO format
            csr_train: training data as returned by the splitter
            csr_val:   validation data as returned by the splitter
    """
    coo = csr.tocoo()
    csr_train, csr_val = implicit.evaluation.train_test_split(
        coo, train_percentage=training_share, random_state=42
    )
    coo_train = csr_train.tocoo()

    return {'coo_train': coo_train,
            'csr_train': csr_train,
            'csr_val': csr_val
          }


def get_val_matrices(df, validation_share=0.2):
    """Randomly split the rows of `df` and build the sparse matrices for both parts.

    `validation_share` is the fraction of rows held out for validation; the
    split uses a fixed random_state of 42. The `test_size` keyword used below
    matches scikit-learn's `train_test_split`, which is the version bound to
    that name in this notebook (it is imported after implicit's).

    Returns a dictionary with the following keys:
        coo_train: training data in COO sparse format and as (users x items)
        csr_train: training data in CSR sparse format and as (users x items)
        csr_val:  validation data in CSR sparse format and as (users x items)
    """
    train_rows, val_rows = train_test_split(df, test_size=validation_share, random_state=42)

    train_coo = to_user_item_coo(train_rows)

    return dict(
        coo_train=train_coo,
        csr_train=train_coo.tocsr(),
        csr_val=to_user_item_coo(val_rows).tocsr(),
    )

#custom func
def apply_alpha(coo, alpha):
    """Linear confidence scaling: return the product `coo * alpha`."""
    return coo * alpha

#custom func
def apply_alpha_log(coo, alpha, epsilon=0.0000001):
    """Logarithmic confidence scaling: return alpha * log(1 + coo / epsilon).

    Args:
        coo: interaction values (array-like or sparse matrix accepted by
            `np.log1p`).
        alpha: multiplicative weight applied after the logarithm.
        epsilon: divisor applied to the values before the logarithm. The
            default reproduces the constant that was previously hard-coded.
            Zero entries stay zero because log1p(0) == 0.

    Returns:
        The scaled values, in whatever container `np.log1p` returns for `coo`.
    """
    return alpha * np.log1p(coo / epsilon)

def validate(matrices, factors=200, iterations=20, regularization=0.01, alpha=10, show_progress=True):
    """ Train an ALS model with <<factors>> (embeddings dimension)
    for <<iterations>> over matrices and validate with MAP@12

    Args:
        matrices: dict with the keys 'coo_train', 'csr_train' and 'csr_val'
            (as produced by the get_val_matrices* helpers).
        factors: embedding dimension of the ALS model.
        iterations: number of ALS iterations.
        regularization: regularization strength passed to the model.
        alpha: confidence weight; the training matrix is multiplied by it
            before fitting. (This argument used to be accepted but ignored.)
        show_progress: forwarded to both fitting and evaluation.

    Returns:
        The MAP@12 score computed by implicit on the validation matrix.
    """
    coo_train, csr_train, csr_val = matrices['coo_train'], matrices['csr_train'], matrices['csr_val']

    model = implicit.als.AlternatingLeastSquares(factors=factors,
                                                 iterations=iterations,
                                                 regularization=regularization,
                                                 random_state=42)
    # Only the matrix used for fitting is scaled by alpha; the evaluation call
    # below still receives the original, unscaled csr_train.
    model.fit(apply_alpha(coo_train, alpha), show_progress=show_progress)

    # The MAPK by implicit doesn't allow to calculate allowing repeated items, which is the case.
    # TODO: change MAP@12 to a library that allows repeated items in prediction
    map12 = mean_average_precision_at_k(model, csr_train, csr_val, K=12, show_progress=show_progress, num_threads=4)
    print(f"Factors: {factors:>3} - Iterations: {iterations:>2} - Regularization: {regularization:4.3f} ==> MAP@12: {map12:6.5f}")
    return map12


In [None]:
# End-to-end run: split the aggregated interactions, then let validate()
# define the model, fit it and score it with MAP@12 in a single call.
matrices = get_val_matrices(df_agg)  # data splitting and matrix generation
map12 = validate(  # modeling and evaluation
    matrices,
    factors=200,
    iterations=10,
    regularization=0.01,
    alpha=10,
    show_progress=False,
)

In [None]:
#separate modeling and evaluation
matrices = get_val_matrices(df_agg)
coo_train, csr_train, csr_val = matrices['coo_train'], matrices['csr_train'], matrices['csr_val']
# `iterations` was misspelled as `terations`, a keyword the constructor does not
# define, so the call raised a TypeError instead of building the model.
model = implicit.als.AlternatingLeastSquares(factors=200, iterations=10, regularization=0.01, random_state=42)
model.fit(coo_train, show_progress=True)
# Rank-based metrics at K=12 on the held-out interactions.
map12 = mean_average_precision_at_k(model, csr_train, csr_val, K=12, show_progress=True, num_threads=4)
ndcg12 = ndcg_at_k(model, csr_train, csr_val, K=12, show_progress=True, num_threads=4)
#more metrics are available (precision_at_k, AUC_at_k and ranking_metrics_at_k are imported at the top)