# **<a id="Content">HnM RecSys Notebook 9417</a>**

## **<a id="Content">Table of Contents</a>**
* [**<span>1. Imports</span>**](#Imports)  
* [**<span>2. Pre-Processing</span>**](#Pre-Processing)
* [**<span>3. Exploratory Data Analysis</span>**](#Exploratory-Data-Analysis)  
    * [**<span>3.1 Articles</span>**](#EDA::Articles)  
    * [**<span>3.2 Customers</span>**](#EDA::Customers)
    * [**<span>3.3 Transactions</span>**](#EDA::Transactions)
* [**<span>4. Helper FunctionsDecorators</span>**](#Helper-Functions)
* [**<span>5. Models</span>**](#Models) 
    * [**<span>5.1 Popularity</span>**](#Popularity-Model)   
    * [**<span>5.2 ALS</span>**](#Alternating-Least-Squares)  
    * [**<span>5.2 GBDT</span>**](#GBDT)  
    * [**<span>5.3 SGD/similar</span>**](#SGD)  
    * [**<span>5.4 NN</span>**](#NN)

## Imports

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import os
import re
import warnings
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
# import cudf # switch on P100 GPU for this to work in Kaggle
# import cupy as cp

# Importing data
articles = pd.read_csv('articles.csv')
print(articles.head())
print("--")
customers = pd.read_csv('customers.csv')
print(customers.head())
print("--")
transactions = pd.read_csv("transactions_train.csv")
print(transactions.head())
print("--")

## Pre-Processing

In [None]:
# ----- empty value stats -------------
print("Missing values: ")
print(customers.isnull().sum())
print("--\n")

print("FN Newsletter vals: ", customers['FN'].unique())
print("Active communication vals: ",customers['Active'].unique())
print("Club member status vals: ", customers['club_member_status'].unique())
print("Fashion News frequency vals: ", customers['fashion_news_frequency'].unique())
print("--\n")

# ---- data cleaning -------------

customers['FN'] = customers['FN'].fillna(0)
customers['Active'] = customers['Active'].fillna(0)

# replace club_member_status missing values with 'LEFT CLUB' --> no members with LEFT CLUB status in data
customers['club_member_status'] = customers['club_member_status'].fillna('LEFT CLUB')
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].fillna('None')
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].replace('NONE', 'None')
customers['age'] = customers['age'].fillna(customers['age'].mean())
customers['age'] = customers['age'].astype(int)
articles['detail_desc'] = articles['detail_desc'].fillna('None')


print("Customers' Missing values: ")
print(customers.isnull().sum())
print("--\n")

In [None]:
# ---- memory optimizations -------------

# reference: https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65

# iterate through all the columns of a dataframe and reduce the int and float data types to the smallest possible size, ex. customer_id should not be reduced from int64 to a samller value as it would have collisions
import numpy as np
import pandas as pd

def reduce_mem_usage(df):
    """Iterate over all the columns of a DataFrame and modify the data type
    to reduce memory usage, handling ordered Categoricals"""
    
    # check the memory usage of the DataFrame
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type == 'category':
            if df[col].cat.ordered:
                # Convert ordered Categorical to an integer
                df[col] = df[col].cat.codes.astype('int16')
            else:
                # Convert unordered Categorical to a string
                df[col] = df[col].astype('str')
        
        elif col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min >= np.iinfo(np.int64).min and c_max <= np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    
    # check the memory usage after optimization
    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))

    # calculate the percentage of the memory usage reduction
    mem_reduction = 100 * (start_mem - end_mem) / start_mem
    print("Memory usage decreased by {:.1f}%".format(mem_reduction))
    
    return df

   

In [None]:
print("Articles Info: ")
print(articles.info())
print("Customer Info: ")
print(customers.info())
print("Transactions Info: ")
print(transactions.info())

In [None]:
# print unique values of customer columns
print("FN Newsletter vals: ", customers['FN'].unique())
print("Active communication vals: ",customers['Active'].unique())
print("Club member status vals: ", customers['club_member_status'].unique())
print("Fashion News frequency vals: ", customers['fashion_news_frequency'].unique())
print("--\n")

In [None]:
# explicitly convert club_member_status to ordinal values before mem optimization to avoid errors

customers['club_member_status'].replace({'LEFT CLUB': 0, 'PRE-CREATE': 1, 'ACTIVE': 2}, inplace=True)
customers['club_member_status'] = customers['club_member_status'].astype('int8')
print(customers['club_member_status'].unique())


In [None]:
# ---- memory optimizations -------------

# uses 8 bytes instead of given 64 byte string, reduces mem by 8x, 
# !!!! have to convert back before merging w/ sample_submissions.csv
# convert transactions['customer_id'] to 8 bytes int
# transactions['customer_id'] = transactions['customer_id'].astype('int64')
transactions['customer_id'] = transactions['customer_id'].apply(lambda x: int(x[-16:], 16)).astype('int64')
customers['customer_id'] = customers['customer_id'].apply(lambda x: int(x[-16:], 16)).astype('int64')

articles = reduce_mem_usage(articles)
customers = reduce_mem_usage(customers)
transactions = reduce_mem_usage(transactions)

# articles['article_id'] = articles['article_id'].astype('int32')
# transactions['article_id'] = transactions['article_id'].astype('int32') 
# # !!!! ADD LEADING ZERO BACK BEFORE SUBMISSION OF PREDICTIONS TO KAGGLE: 
# # Ex.: transactions['article_id'] = '0' + transactions.article_id.astype('str')

print("Articles Info: ")
print(articles.info())
print("Customer Info: ")
print(customers.info())
print("Transactions Info: ")
print(transactions.info())

In [None]:
# print unique values of customer columns
print("FN Newsletter vals: ", customers['FN'].unique())
print("Active communication vals: ",customers['Active'].unique())
print("Club member status vals: ", customers['club_member_status'].unique())
print("Fashion News frequency vals: ", customers['fashion_news_frequency'].unique())
print("--\n")

In [None]:
# time-based splitting strategy

def split_train_val_data_and_drop_duplicates(transactions, days=7):
    """
    Splits the transaction training data into a training set and a validation set of 7 days to prevent data leakage.
    """
    
    transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])
    transactions = transactions.sort_values(by=['t_dat'])
    latest_transaction_date = transactions['t_dat'].max()
    
    training_set = transactions[transactions['t_dat'] < latest_transaction_date - pd.Timedelta(days=days)]
    validation_set = transactions[transactions['t_dat'] >= latest_transaction_date - pd.Timedelta(days=days)]
    
    print("Training set size:", len(training_set))
    print("Validation set size:", len(validation_set))
    print("Last date in training set:", training_set['t_dat'].max())
    print("Last date in validation set:", validation_set['t_dat'].max())

    # drop duplicate rows
    training_set = training_set.drop_duplicates().copy()
    validation_set = validation_set.drop_duplicates().copy()
    
    return training_set, validation_set

In [None]:
def preprocess_data(transactions_df, customers_df, articles_df, customers_col='customer_id', articles_col='article_id'):
    """
    Preprocesses customer and article IDs for use in a sparse matrix.
    
    Returns:
    - transactions_df: the input transaction DataFrame with two additional columns, 'user_index' and 'item_index',
                       that map customer and article IDs to their corresponding indices in a sparse matrix
    - customer_id_indices_map: a dictionary that maps customer IDs to their corresponding indices
    - article_id_indices_map: a dictionary that maps article IDs to their corresponding indices
    """
    # Create a list of unique customer IDs and product IDs
    all_customers = customers_df[customers_col].unique().tolist()
    all_articles = articles_df[articles_col].unique().tolist()

    # Create dicts mapping IDs to their corresponding indices
    customer_id_indices_map = {customer_id: i for i, customer_id in enumerate(all_customers)}
    article_id_indices_map = {article_id: i for i, article_id in enumerate(all_articles)}

    # Map customer and article IDs to their resp. indices in the transaction DataFrame
    transactions_df['user_index'] = transactions_df[customers_col].map(customer_id_indices_map)
    transactions_df['item_index'] = transactions_df[articles_col].map(article_id_indices_map)

    return transactions_df, all_customers, all_articles, customer_id_indices_map, article_id_indices_map

In [None]:
transactions.head()

In [None]:
import numpy as np
from scipy.sparse import csr_matrix

# binary purchase interaction user-item matrix

def create_user_item_matrix(transactions):

    # Get unique user and item indices in asc. order
    user_indices = np.arange(transactions['user_index'].nunique())
    item_indices = np.arange(transactions['item_index'].nunique())

    # Create a dictionary mapping user and item indices to matrix indices
    user_index_dict = dict(zip(sorted(transactions['user_index'].unique()), user_indices))
    item_index_dict = dict(zip(sorted(transactions['item_index'].unique()), item_indices))

    # Create arrays of row indices, column indices, and data for the sparse matrix
    rows = []
    cols = []
    data = [] # purchased 1 or 0

    # Iterate over all possible combinations of user and item indices
    for user_index in user_indices:
        for item_index in item_indices:
            # Get the corresponding matrix indices for the user and item indices
            matrix_user_index = user_index_dict.get(user_index)
            matrix_item_index = item_index_dict.get(item_index)
            # Get the corresponding interaction value from the transactions dataframe
            interaction = transactions.loc[(transactions['user_index'] == user_index) & 
                                            (transactions['item_index'] == item_index), 'quantity'].values
            # Append the row index, column index, and interaction value to the corresponding arrays
            rows.append(matrix_user_index)
            cols.append(matrix_item_index)
            data.append(1 if len(interaction) > 0 else 0)

    # Create the sparse matrix using the row, column, and data arrays
    user_item_matrix = csr_matrix((data, (rows, cols)), shape=(len(user_indices), len(item_indices)))

    return user_item_matrix


In [None]:
from math import ceil

# calculate total number of transaction weeks in tranactions data
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

# Compute the minimum and maximum date values
min_date = transactions['t_dat'].min()
max_date = transactions['t_dat'].max()

# Compute the number of weeks between the minimum and maximum date values
num_weeks = ceil((max_date - min_date).days / 7)

print(f"Total number of transaction weeks: {num_weeks}")


In [None]:
from datetime import datetime, timedelta

# only use last x weeks of transactions data since data is too large
def filter_transactions_last_x_weeks(transactions, x = 10):
    # Convert date strings to datetime objects
    transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

    # Calculate the date x weeks ago from the latest transaction date
    latest_date = transactions['t_dat'].max()
    cutoff_date = latest_date - timedelta(weeks=x)

    # Filter transactions to only include those in the last x weeks
    filtered_transactions = transactions.loc[transactions['t_dat'] >= cutoff_date].copy()

    return filtered_transactions

In [None]:
def filter_customers_and_articles(customers, articles, filtered_transactions):
    # Get unique customer and article IDs from filtered transactions
    customer_ids = filtered_transactions['customer_id'].unique()
    article_ids = filtered_transactions['article_id'].unique()

    # Filter customers and articles to only include those in filtered transactions
    customers_filtered = customers.loc[customers['customer_id'].isin(customer_ids)].copy()
    articles_filtered = articles.loc[articles['article_id'].isin(article_ids)].copy()

    return customers_filtered, articles_filtered

## LightGBM

|Feature|LightGBM|XGBoost|CatBoost|
|:----|:----|:----|:----|
|Categoricals|Supports categorical features via one-hot encoding|Supports categorical features via one-hot encoding|Automatically handles categorical features using embeddings|
|Speed|Very fast training and prediction|Fast training and prediction|Slower than LightGBM and XGBoost|
|Handling Bias|Handles unbalanced classes via 'is_unbalance'|Handles unbalanced classes via 'scale_pos_weight'|Automatically handles unbalanced classes|
|Handling NaNs|Handles NaN values natively|Requires manual handling of NaNs|Automatically handles NaN values using special category|
|Custom Loss|Supports custom loss functions|Supports custom loss functions|Supports custom loss functions|


To use LightGBM for a ranking problem, we could also treat this as a binary classification problem where the target variable is whether an item is relevant or not to the user.

OR, use LightGBM's ranking API, which is designed for ranking problems. Instead of optimizing for accuracy, the ranking API optimizes for ranking metric MAP (deprecated). 

### Feature Engineering

In [None]:
# LightGBM imports

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import make_scorer
import lightgbm as lgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [None]:
# get top 200 customers by number of transactions
top_customers = transactions['customer_id'].value_counts().head(200).index.tolist()

# print num of transactions for the 200th customer
print(transactions['customer_id'].value_counts().sort_values(ascending=False).iloc[199])

# only get articles that were purchased by top 200 customers at least once in articles df
articles_top_200 = articles[articles['article_id'].isin(transactions[transactions['customer_id'].isin(top_customers)]['article_id'].unique())]

# only get 200 customers in customers df
customers_top_200 = customers[customers['customer_id'].isin(top_customers)]

articles = articles_top_200.copy()
customers = customers_top_200.copy()
transactions = transactions[transactions['customer_id'].isin(top_customers)].copy()
transactions = transactions.drop_duplicates().copy()

In [None]:
print(transactions.isnull().sum())
print(customers.isnull().sum())
print(articles.isnull().sum())

In [None]:
print(len(transactions))
print(len(customers))
print(len(articles))


In [None]:
articles.head()

In [None]:
customers.head()

In [None]:
transactions.head()

In [None]:
# Dropping columns with uninformative article data

articles = articles.drop(columns=['product_code', 'prod_name', 'product_type_name', 'product_group_name', 'graphical_appearance_name', 'department_name', 'index_name', 'index_group_name', 'section_name', 'garment_group_name', 'detail_desc'])
articles = articles.drop(columns=[col for col in articles.columns if 'colour_' in col or 'perceived_' in col])

In [None]:
articles.head()

These columns are left to capture any potential patterns in the other columns, such as how certain index codes or sections might be associated with higher or lower sales.

In [None]:
# Feature engineering
from sklearn.preprocessing import LabelEncoder

# Define mapping for fashion_news_frequency feature
fashion_news_freq_mapping = {'None': 0, 'Monthly': 1, 'Regularly': 2}

# label encode fashion_news_frequency feature
le = LabelEncoder()
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].map(fashion_news_freq_mapping)
customers['fashion_news_frequency'] = le.fit_transform(customers['fashion_news_frequency'])

In [None]:
customers = customers.drop(['postal_code'], axis=1)
customers.head()

In [None]:
# Feature engineering: encode nominal categorical features
ohe = OneHotEncoder()
# One-hot encode sales_channel_id feature
sales_channel_ohe = pd.get_dummies(transactions['sales_channel_id'], prefix='sales_channel')
transactions = pd.concat([transactions, sales_channel_ohe], axis=1)

# Drop the original sales_channel_id feature
transactions.drop('sales_channel_id', axis=1, inplace=True)

In [None]:
# boolean unique values of sales_channel_id after encoding
print(transactions['sales_channel_1'].unique())
print(transactions['sales_channel_2'].unique())

In [None]:
transactions.head()

In [None]:
# Convert 't_dat' column to datetime format
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

# Group by customer ID and find the first and last transaction dates
first_trans_dates = transactions.groupby('customer_id')['t_dat'].min().reset_index()
last_trans_dates = transactions.groupby('customer_id')['t_dat'].max().reset_index()

customer_purchase_engagement = pd.merge(first_trans_dates, last_trans_dates, on='customer_id', suffixes=('_first', '_last'))
# Create a new feature by calculating the time difference in days between first and last transactions
customer_purchase_engagement['time_diff_days'] = (customer_purchase_engagement['t_dat_last'] - customer_purchase_engagement['t_dat_first']).dt.days
# Drop the original first and last transaction date columns
customer_purchase_engagement.drop(['t_dat_first', 't_dat_last'], axis=1, inplace=True)
customer_purchase_engagement.head()

# Merge the customer_purchase_engagement dataframe with the customers dataframe
customers = pd.merge(customers, customer_purchase_engagement, on='customer_id', how='left')
customers.head()

The above `time_diff_days` feature can potentially provide insights into a customer's engagement by looking at the gap in the number of days between the last purchase and the current date. <br> The assumption is, The larger the gap, the less engaged the customer is. 

In [None]:
# Join the transaction dataframe with the customers dataframe
merged = pd.merge(transactions, customers, on='customer_id', how='inner')

# Calculate the mean age for each article
item_mean_age = merged.groupby('article_id')['age'].mean()

# Calculate the difference between every user's age and the mean age of users who have purchased a particular item
merged['age_diff'] = merged['age'] - merged['article_id'].map(item_mean_age)

# Group by article and take the mean of age_diff
article_age_diff = merged.groupby('article_id')['age_diff'].mean()

# Append the age difference feature to the articles dataframe
articles['age_diff'] = articles['article_id'].map(article_age_diff)

articles.head()

Mean age_diff for every article. It can be useful for predicting whether a user will buy an item based on their age and the age of other users who have already bought the same item. 

Intuituion behind the `age_diff` feature:

Let's say we have a dataset of customers who made transactions for a particular item with article_id = 123. Here is an example of how we can calculate the age_diff feature: <br>
Assume that the mean age of all customers who bought the item with article_id = 123 is 40 years old <br>
Customer A made a transaction for item with article_id = 123 and their age is 35. The age_diff feature for this transaction would be -5. (35 - 40). <br>
Customer B made a transaction for item with article_id = 123 and their age is 50. The age_diff feature for this transaction would be 10. (50 - 40). <br>
Customer C made a transaction for item with article_id = 123 and their age is 40. The age_diff feature for this transaction would be 0. (40 - 40). <br>
So, the age_diff feature measures the difference between the age of each customer who bought a specific item and the average age of all customers who bought that item. <br>

Therefore, the age_diff is the mean of all these individual age_diff values for each customer who bought the item with article_id = 123. age_diff = -1.66 for this example<br>


In [None]:
# Calculate mean, max, and min age for each item
item_mean_age = merged.groupby('article_id')['age'].mean()
item_max_age = merged.groupby('article_id')['age'].max()
item_min_age = merged.groupby('article_id')['age'].min()

# Merge the features back into the articles dataframe
articles = articles.merge(item_mean_age, on='article_id', how='left')
articles = articles.merge(item_max_age, on='article_id', how='left')
articles = articles.merge(item_min_age, on='article_id', how='left')

# Rename the columns to make them more descriptive
articles = articles.rename(columns={'age_x': 'mean_purchase_age', 'age_y': 'max_purchase_age', 'age': 'min_purchase_age'})

articles.head()

Intuituion behind the `*_purchase_age` feature:

Additional age features to capture more information about the age of the customers who bought the respective articles. The gbdt might be able to learn more complex patterns from these features. <br>

In [None]:
# Calculate purchased item count for each user
transactions['quantity'] = 1
user_item_count = transactions.groupby(['customer_id', 'article_id'])['quantity'].sum().reset_index()

# Calculate total item count for each article
total_item_count = transactions.groupby('article_id')['quantity'].sum().reset_index()
total_item_count.columns = ['article_id', 'total_items']

user_item_count = pd.merge(user_item_count, total_item_count, on='article_id', how='left')

# Calculate ratio of purchased item count and total item count
user_item_count['article_engagement_ratio'] = user_item_count['quantity'] / user_item_count['total_items']


transactions = pd.merge(transactions, user_item_count[['customer_id', 'article_id', 'article_engagement_ratio']], on=['customer_id', 'article_id'], how='left')
transactions['quantity'] = user_item_count['quantity']

# fill missing values with 0
transactions['quantity'] = transactions['quantity'].fillna(0)
transactions.head()

Intuition behind feature: <br>

`article_engagement_ratio`: The feature is ratio of one user's purchased item count and the item's total purchase count. This serves to measure how engaged a user is with a particular item, which can be useful for predicting whether a user will buy similar items. <br>
Can also be used to measure how popular an item is, and can be used to potentially diversify recommendations.

In [None]:
transactions, all_customers, all_articles, customer_id_indices_map, article_id_indices_map = preprocess_data(transactions, customers, articles)

print("Total num of customers: ", len(all_customers))
print("Total num of articles: ", len(all_articles))
print("Customer ID mapping: ", list(customer_id_indices_map.items())[:5])
print("Article ID mapping: ", list(article_id_indices_map.items())[:5])
transactions.head()

In [None]:
# user item matrix -- rows are users, columns are items, doesnt need article and customer data

user_item_matrix = create_user_item_matrix(transactions)
user_item_matrix

In [None]:
print(user_item_matrix[:10, :10].toarray())

In [None]:
from implicit.als import AlternatingLeastSquares
from sklearn.metrics.pairwise import cosine_similarity

# from the als_strat1_hyperparam_log 
# Create ALS model with default parameters
alpha = 25
als_model = AlternatingLeastSquares(factors=55, iterations=20, regularization=0.18)

# Fit model to user-item matrix
als_model.fit(user_item_matrix*alpha)

# Latent factors matrices
item_factors = als_model.item_factors
user_factors = als_model.user_factors

# item-item cosine similarity 
item_similarities = cosine_similarity(item_factors, dense_output=False)
# user-user cosine similarity
user_similarities = cosine_similarity(user_factors, dense_output=False)

k = 5
# Get top-k most similar items for each item
top_k_similar_items = item_similarities.argsort()[:, -k-1:-1]
# Get top-k most similar user for each user
top_k_similar_users = user_similarities.argsort()[:, -k-1:-1]

In [None]:
print(item_similarities.shape)
print(user_similarities.shape)

In [None]:
# important: add user_index and item_index to customers and articles respectively
customers['user_index'] = customers['customer_id'].map(customer_id_indices_map)
articles['item_index'] = articles['article_id'].map(article_id_indices_map)


In [None]:
user_indices = np.arange(transactions['user_index'].nunique())
item_indices = np.arange(transactions['item_index'].nunique())
user_index_dict = dict(zip(sorted(transactions['user_index'].unique()), user_indices))
item_index_dict = dict(zip(sorted(transactions['item_index'].unique()), item_indices))

# print first 5 key and values in the dictionary
print(list(user_index_dict.items())[:5])
print(list(item_index_dict.items())[:5])


In [None]:
# Create a mapping from user IDs to matrix indices
user_indices = np.arange(transactions['user_index'].nunique())
item_indices = np.arange(transactions['item_index'].nunique())
user_index_dict = dict(zip(sorted(transactions['user_index'].unique()), user_indices))
item_index_dict = dict(zip(sorted(transactions['item_index'].unique()), item_indices))


# Create features for each user based on the average quantity purchased by similar customers
for i in range(len(customers)):
    customer_id = customers.loc[i, 'user_index']
    # Get the matrix index for the current customer
    customer_idx = user_index_dict.get(customer_id, -1)
    
    if customer_idx != -1:
        similar_user_indexxs = top_k_similar_users[customer_idx]
        # Compute the mean of the user-item matrix for the similar users
        user_feature = user_item_matrix[similar_user_indexxs, :].mean(axis=0).A1
        
        # Store the mean in the customers DataFrame
        customers.loc[i, 'user_purchase_quant'] = user_feature.mean()
    else:
        # If the current customer is not in the user-item matrix, set the feature to NaN
        customers.loc[i, 'user_purchase_quant'] = np.nan

# Print the head of the customers DataFrame
customers.head()


Intuition behind feature: <br>

`user_purchase_quant`: Gets the average quantity of items purchased by the k most similar customers to that customer. It looks at what other customers who are similar to this customer have bought and calculates the average amount of each item they bought. This feature can be used to predict what items a customer is likely to buy in the future based on what similar customers have bought in the past. This feature aims to capture purchase behaviours of a customer.<br>

For example, if a customer typically buys a lot of bomber jackets, and the top k most similar customers to that customer also tend to buy a lot of bomber jackets, then the average quantity of bomber jackets purchased by those similar customers could be a good predictor of how much the original customer is likely to purchase in the future. This however assumes that the k most similar customers have similar purchase behaviours to the customers in question, and on its own is not a strong feature.<br>


In [None]:
# Create a binary feature for each item that indicates whether or not a customer has bought that item,
# based on whether other customers who bought similar items also tended to buy that item.
for i in range(len(articles)):
    item_id = articles.loc[i, 'item_index']
    # Get the matrix index for the current item
    item_idx = item_index_dict.get(item_id, -1)
    
    if item_idx != -1:
        # List of (column) indices of similar items
        similar_items = top_k_similar_items[item_idx]
        
        # Find the customers who have purchased the current item
        customer_indices = np.where(user_item_matrix[:, item_idx].toarray()[:, 0] == 1)[0]
        
        # Binary vector representing the customer's purchases for the similar items
        customer_purchases = user_item_matrix[customer_indices, :][:, similar_items].toarray()
        article_preference = np.any(customer_purchases, axis=0)
        
        # Set article_preference to 1 if any customer has purchased the item, 0 otherwise
        articles.loc[i, 'article_preference'] = int(np.any(article_preference))
    else:
        articles.loc[i, 'article_preference'] = np.nan

In [None]:
articles[articles['article_preference'] == 0].head()

Intuition behind feature: <br>

`article_preference`: Binary feature for each item that indicates whether or not a customer has bought that item, based on whether other customers who bought similar items also tended to buy that item. <br>

This feature can be useful for a fashion-based recommender system because it captures the idea that customers who have similar tastes or preferences tend to buy similar items. For example, if a customer has a history of buying shirts and other customers who bought similar shorts also tended to buy a specific pair of jeans, then the binary feature for those jeans would be set to 1 for that customer, indicating that they are likely to be interested in those shoes. The binary feature can then be used as a predictor for which items to recommend to the customer.

In [None]:
# Create feature for each item based on the total number of times it was purchased by similar customers
for i in range(len(articles)):
    item_id = articles.loc[i, 'item_index']
    item_idx = item_index_dict.get(item_id, -1)
    
    if item_idx != -1:
    
        similar_items = top_k_similar_items[item_idx]
        
        # Find the customers who have purchased the current item
        customer_indices = np.where(user_item_matrix[:, item_idx].toarray()[:, 0] == 1)[0]
        
        # Compute the mean of the user-item matrix for the similar items
        item_feature = user_item_matrix[customer_indices, :][:, similar_items].sum() / len(customer_indices)
        articles.loc[i, 'item_purchase_frequency'] = item_feature.mean()
        
    else:
        articles.loc[i, 'item_purchase_frequency'] = np.nan

In [None]:
articles.head()

Intuition behind feature: <br>

`item_purchase_frequency`: It gives an estimate of how frequently an item is being bought by customers who have similar purchase histories. This feature is useful because it can provide insights into purchasing patterns and identify popular items that are often bought together. <br> It can potentially help identify popular items among customer groups, and the lightgbm model can potentially use this feature.

In [None]:
merged_df = pd.merge(transactions, articles[['item_index']], on='item_index', how='left')

merged_df.head()

In [None]:
# Merge transactions with articles on item_index to access the price of each article
merged_df = pd.merge(transactions, articles[['item_index']], on='item_index', how='left')

# Compute the average price levels of all articles purchased by similar customers who have purchased this particular article in the past
for i in range(len(articles)):
    item_id = articles.loc[i, 'item_index']
    item_idx = item_index_dict.get(item_id, -1)
    
    if item_idx != -1:
        similar_items = top_k_similar_items[item_idx]
        
        # Find the customers who have purchased the current item
        customer_indices = np.where(user_item_matrix[:, item_idx].toarray()[:, 0] == 1)[0]
        
        # Compute the average price levels of all articles purchased by similar customers who have purchased this particular article in the past
        item_feature = merged_df.loc[(merged_df['item_index'] == item_id) & (merged_df['user_index'].isin(customer_indices)), 'price'].mean()
        articles.loc[i, 'item_avg_price_level'] = item_feature
    else:
        articles.loc[i, 'item_avg_price_level'] = np.nan

Intuition behind feature: <br>

`item_avg_price_level`: Calculates the average price levels of all articles purchased by similar customers who have purchased this particular article in the past.  <br> It provides information on the typical price level of articles that are purchased together with a given article, as indicated by the purchasing patterns of similar customers. 

For example, if customers who frequently purchase articles A also tend to purchase higher-priced article, then the "item_avg_price_level" feature for article A would be relatively high. 

In [None]:
articles.head()

In [None]:
# # print num of unique articles['graphical_appearance_no']
print(articles['graphical_appearance_no'].nunique())
print(articles['product_type_no'].nunique())
print(articles['department_no'].nunique())
print(articles['index_code'].nunique())
print(articles['index_group_no'].nunique()) 
print(articles['section_no'].nunique())
print(articles['garment_group_no'].nunique())

In [None]:
articles.head()

In [None]:
# already dropped for articles
# articles_final_df = articles.drop(['article_id'], axis=1).copy()
customers_final_df = customers.drop(['customer_id'], axis=1).copy()
transactions_final_df = transactions.drop(['article_id', 'customer_id'], axis=1).copy()

# Merge transactions with customers
df = pd.merge(transactions_final_df, customers_final_df, on='user_index', how='left')

# Merge resulting dataframe with articles_final_df usually
df = pd.merge(df, articles, on='item_index', how='left')

In [None]:
# exttracting time-based features

df['t_dat'] = pd.to_datetime(df['t_dat'])
df['year'] = df['t_dat'].dt.year
df['month'] = df['t_dat'].dt.month
df['day'] = df['t_dat'].dt.day

In [None]:
# Calculate RFM features, reference: https://www.geeksforgeeks.org/rfm-analysis-analysis-using-python/

from datetime import datetime, timedelta

# Calculate recency
last_purchase_date = transactions.groupby('user_index')['t_dat'].max().reset_index()
last_purchase_date.columns = ['user_index', 'last_purchase_date']
last_purchase_date['recency'] = (last_purchase_date['last_purchase_date'].max() - last_purchase_date['last_purchase_date']).dt.days
last_purchase_date.drop('last_purchase_date', axis=1, inplace=True)


# Calculate Frequency
frequency = transactions.groupby('user_index')['t_dat'].count().reset_index()
frequency.columns = ['user_index', 'frequency']

# Calculate Monetary Value
monetary_value = transactions.groupby('user_index')['price'].sum().reset_index()
monetary_value.columns = ['user_index', 'monetary_value']

# Merge all RFM features into a single DataFrame
rfm = last_purchase_date[['user_index', 'recency']].merge(frequency, on='user_index').merge(monetary_value, on='user_index')

# Calculate RFM Scores
quantiles = rfm.quantile(q=[0.25, 0.5, 0.75])
quantiles = quantiles.to_dict()

def rfm_segmenter(x, quantiles):
    if x <= quantiles['recency'][0.25]:
        return 4
    elif x <= quantiles['recency'][0.50]:
        return 3
    elif x <= quantiles['recency'][0.75]: 
        return 2
    else:
        return 1
    
rfm['R'] = rfm['recency'].apply(rfm_segmenter, args=(quantiles,))

def f_segmenter(x, quantiles):
    if x <= quantiles['frequency'][0.25]:
        return 1
    elif x <= quantiles['frequency'][0.50]:
        return 2
    elif x <= quantiles['frequency'][0.75]: 
        return 3
    else:
        return 4

rfm['F'] = rfm['frequency'].apply(f_segmenter, args=(quantiles,))

def m_segmenter(x, quantiles):
    if x <= quantiles['monetary_value'][0.25]:
        return 1
    elif x <= quantiles['monetary_value'][0.50]:
        return 2
    elif x <= quantiles['monetary_value'][0.75]: 
        return 3
    else:
        return 4

rfm['M'] = rfm['monetary_value'].apply(m_segmenter, args=(quantiles,))

# Calculate RFM Score
rfm['RFM Score'] = rfm['R'].map(str) + rfm['F'].map(str) + rfm['M'].map(str)
rfm = rfm.drop(['R', 'F', 'M'], axis=1)

# Display sample of RFM DataFrame
print(rfm.head())

In [None]:
rfm.head()

In [None]:
# merge rfm with df on user_index

df = pd.merge(df, rfm, on='user_index', how='left')
df.head()

In [None]:
final_df = df.drop(['t_dat'], axis=1).copy()
# print all column names in final_df
print(final_df.columns)
final_df.head()

In [None]:
final_df['RFM Score'] = pd.to_numeric(final_df['RFM Score'], errors='coerce')

In [None]:
# save the DataFrame as a pickle file
df.to_pickle('lightgbm/df.pkl')
final_df.to_pickle('lightgbm/final_df.pkl')

In [None]:
# print final_df shape
print(final_df.shape)

In [None]:
user_indices, item_indices = user_item_matrix.get_shape()

print('Number of users: %d' % user_indices)
print('Number of items: %d' % item_indices)

In [None]:
final_df.head()
print(final_df.shape)

In [None]:
import pickle

# Assume you have a CSR matrix called user_item_matrix
# Save the matrix as a pickle file
with open('user_item_matrix_200.pkl', 'wb') as f:
    pickle.dump(user_item_matrix, f)

In [None]:
# load user_item_matrix from pickle file

with open('user_item_matrix_200.pkl', 'rb') as f:
    user_item_matrix = pickle.load(f)

user_item_matrix = user_item_matrix.toarray()

# extract indices of non-zero elements
user_purchased_indices, item_purchased_indices = user_item_matrix.nonzero()

print('user_purchased_indices: ', user_indices)
print('item_purchased_indices: ', item_indices)

In [None]:
# create a list to hold the dummy data

# load final_df from pickle file for clean processing
with open('lightgbm/final_df.pkl', 'rb') as f:
    final_df = pickle.load(f)

dummy_data = []

# get the unique user and item indices from final_df
users = final_df['user_index'].unique()
items = final_df['item_index'].unique()

# final_df.shape[1]
# loop through all possible user-item pairs
for user in users:
    for item in items:
        # check if the user-item pair has an interaction in the sparse matrix
        if user_item_matrix[user, item] == 1:
            # if it does, set the target of the corresponding row in final_df to 1
            final_df.loc[(final_df['user_index'] == user) & (final_df['item_index'] == item), 'target'] = 1
        else:
            # if it doesn't, add a row to the dummy data with target = 0 and stub values for other columns
            dummy_data.append([user, item] + [np.nan] * (final_df.shape[1] - 2))


dummy_df = pd.DataFrame(dummy_data, columns=['user_index', 'item_index'] + list(final_df.columns.drop(['user_index', 'item_index'])))
dummy_df['target'] = 0  # set target to 0 for the dummy data
final_df = pd.concat([final_df, dummy_df], ignore_index=True)

final_df.shape

In [None]:
# drop recency	frequency	monetary_value columns
final_df = final_df.drop(['recency', 'frequency', 'monetary_value'], axis=1)

In [None]:
# one-hot encode garment_group_no and index_group_no columns
one_hot_cols = ['garment_group_no', 'index_group_no']
final_df = pd.get_dummies(final_df, columns=one_hot_cols)

final_df.head()

In [None]:
# save the DataFrame as a pickle file
final_df.to_pickle('lightgbm/final_df_with_binary_targets.pkl')

In [None]:
# save the article_id_indices_map and user_id_indices_map as pickle files
with open('lightgbm/article_id_indices_map.pkl', 'wb') as f:
    pickle.dump(article_id_indices_map, f)
with open('lightgbm/customer_id_indices_map.pkl', 'wb') as f:
    pickle.dump(customer_id_indices_map, f)

In [None]:
# rename RFM_Score to RFM_Score
final_df.rename(columns={'RFM Score': 'RFM_Score'}, inplace=True)
# convert sales_channel_ 1 to to boolean
final_df['sales_channel_1'] = final_df['sales_channel_1'].astype('bool')
final_df['sales_channel_2'] = final_df['sales_channel_2'].astype('bool')
final_df.to_pickle('lightgbm/final_df_with_binary_targets.pkl')
# final_df.dtypes

### Model Training

In [None]:
import pickle

# load final_df from pickle file for clean processing
with open('lightgbm/final_df_with_binary_targets.pkl', 'rb') as f:
    final_df = pickle.load(f)

In [None]:
final_df.columns

In [None]:
final_df.head()

In [None]:
grouped_data = final_df.groupby('user_index')
grouped_data.head()

In [None]:
# load df from pickle file for clean processing
with open('lightgbm/df.pkl', 'rb') as f:
    df = pickle.load(f)

df.head()

In [1]:
def time_based_train_test_split(final_df, test_size=0.2):

    # Convert days, months, and years columns to datetime object
    final_df['date'] = pd.to_datetime(final_df[['day', 'month', 'year']])

    # Sort dataframe by date in ascending order
    final_df = final_df.sort_values(by='date')

    # Calculate cutoff index
    cutoff_index = int(len(final_df) * (1-test_size))

    # Create train and test dataframes
    train_df = final_df[:cutoff_index]
    test_df = final_df[cutoff_index:]

    # Drop date column from train and test dataframes
    train_df = train_df.drop('date', axis=1)
    test_df = test_df.drop('date', axis=1)

    # split train_df into X_train and y_train
    X_train = train_df.drop('target', axis=1)
    y_train = train_df['target']

    # split test_df into X_test and y_test
    X_test = test_df.drop('target', axis=1)
    y_test = test_df['target']

    return X_train, X_test, y_train, y_test

In [None]:
X_train, X_test, y_train, y_test = time_based_train_test_split(final_df, test_size=0.2)

# drop date column from final_df
final_df = final_df.drop('date', axis=1)

#print the shape of X_train, X_test, y_train, y_test
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

In [None]:
final_df.to_pickle('lightgbm/final_df_with_binary_targets.pkl')

In [2]:
import pickle
import pandas as pd
import numpy as np
with open('lightgbm/final_df_with_binary_targets.pkl', 'rb') as f:
    final_df = pickle.load(f)

In [3]:
# 80/20 time-based split to curb data leakage
X_train, X_test, y_train, y_test = time_based_train_test_split(final_df, test_size=0.2)
final_df = final_df.drop('date', axis=1)

# print(len(X_train))
# print(len(y_train))
# print(len(X_test))
# print(len(y_test))

grouped_data_train = X_train.groupby('user_index')
grouped_data_test = X_test.groupby('user_index')
groups = [grouped_data_train.groups[user] for user in grouped_data_train.groups.keys()]
groups_flat = np.concatenate(groups)
print(X_train['user_index'].nunique())
print(len(X_train))
print(len(y_train))
print(len(groups_flat))
print(len(grouped_data_train.groups.values()))

200
6242440
6242440
6242440
200


In [5]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import ndcg_score
from category_encoders import TargetEncoder
from sklearn.feature_selection import RFECV
import joblib
from sklearn.model_selection import KFold
from sklearn.metrics import get_scorer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.pipeline import Pipeline

# Define columns to target encode
cols_to_encode = ['department_no', 'product_type_no', 'section_no', 'graphical_appearance_no']

# Define number of folds for cross-validation
n_splits = 5

# Create KFold object for cross-validation
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# Perform target encoding with cross-validation
for col in cols_to_encode:
    final_df[f'{col}_te'] = 0
    te = TargetEncoder(cols=[col])
    for train_idx, val_idx in kf.split(final_df):
        te.fit(final_df.iloc[train_idx][[col]], final_df.iloc[train_idx]['target'])
        final_df.loc[val_idx, f'{col}_te'] = te.transform(final_df.iloc[val_idx][[col]]).values.flatten()

# Define features and target
features = final_df.columns.tolist()
features.remove('target')
target = 'target'

# # 80/20 time-based split to curb data leakage
# X_train, X_test, y_train, y_test = time_based_train_test_split(final_df, test_size=0.2)
# final_df = final_df.drop('date', axis=1)

# Group data by user -- so that LightGBM knows which data points belong to each user and can compute the metrics correctly
grouped_data_train = X_train.groupby('user_index')
grouped_data_test = X_test.groupby('user_index')

# Create LightGBM datasets with group query information
train_data = lgb.Dataset(X_train, label=y_train, group=grouped_data_train.groups.values())
test_data = lgb.Dataset(X_test, label=y_test, group=grouped_data_test.groups.values())

# Define hyperparameters
params = {'objective': 'binary',
          'boosting_type': 'gbdt',
          'metric': 'map',
          'num_leaves': 31,
          'learning_rate': 0.05,
          'feature_fraction': 0.9,
          'bagging_fraction': 0.8,
          'bagging_freq': 5,
          'early_stopping_rounds': 10,
          'verbose': 1}
          
grid_params = {
    'selector__k': [10, 25, 40], 
    'lgbm__learning_rate': [0.01, 0.05, 0.1],
    'lgbm__num_leaves': [15, 31, 63],
    'lgbm__bagging_fraction': [0.6, 0.8, 1.0],
    'lgbm__feature_fraction': [0.6, 0.8, 1.0],
}

# Create a pipeline that includes SelectKBest
selector = SelectKBest(score_func=chi2)
pipeline = Pipeline(steps=[('selector', selector), ('lgbm', lgb.LGBMClassifier(**params))])

# Add num_boost_round parameter to lgbm estimator in pipeline
pipeline.named_steps['lgbm'].set_params(num_boost_round=100)

# Perform grid search on the pipeline
scoring = get_scorer('average_precision')
clf = GridSearchCV(estimator=pipeline, param_grid=grid_params, cv=2, scoring=scoring, n_jobs=-1, verbose=2)
groups = [grouped_data_train.groups[user] for user in grouped_data_train.groups.keys()]
groups_flat = np.concatenate(groups)
clf.fit(X_train, y_train, groups=groups_flat)

# Save the best model
joblib.dump(clf.best_estimator_, 'best_model.pkl')
print(f'Best hyperparameters: {clf.best_params_}')
print(f'Best map score: {clf.best_score_}')

# Save the feature importances to a file
feat_importances = pd.Series(clf.best_estimator_.feature_importances_, index=X_train_sel.columns)
feat_importances.nlargest(20).plot(kind='barh')
plt.savefig('lightgbm/feature_importances.png')

# Save the selected features
selected_features = X_train_sel.columns.tolist()
joblib.dump(selected_features, 'lightgbm/selected_features.pkl')

# Evaluate the best model on the test set
y_pred = clf.best_estimator_.predict(X_test_sel, num_iteration=clf.best_estimator_.best_iteration_)
ndcg = ndcg_score(y_test, y_pred, group_scores=True, verbose=1)
print(f'NDCG score on test set: {ndcg}')

Fitting 2 folds for each of 405 candidates, totalling 810 fits


MemoryError: Unable to allocate 47.6 MiB for an array with shape (2, 3121220) and data type float64

Once the model is trained, it can be used to predict the probability of purchase for new user-product pairs, which can be used to generate recommendations for users.

In [None]:
# Assume X is the input data for the LightGBM model
# X has a row for each user-product pair and a binary target indicating whether the user purchased the product or not

# Train the LightGBM model on X
# lgb_model = lgb.LGBMClassifier(**best_params)
# lgb_model.fit(X, y)

# Generate candidate products for each user
# This can be done using a combination of popular products and user purchase history
# Let's assume we have a dictionary 'user_products' that maps each user ID to a list of products they've purchased
user_candidates = {}
for user_id in user_products:
    # Select the 600 most popular products
    popular_products = select_popular_products(600)
    
    # Add user purchase history to candidate list
    user_history = user_products[user_id]
    candidate_products = list(set(popular_products + user_history))
    
    # Store candidate products for this user
    user_candidates[user_id] = candidate_products

# Predict probabilities of purchase for each candidate product for each user
user_scores = {}
for user_id, candidates in user_candidates.items():
    # Create input data for this user
    user_data = create_user_data(user_id, candidates)
    
    # Predict probabilities using the LightGBM model
    scores = lgb_model.predict_proba(user_data)[:, 1]
    
    # Store scores for this user
    user_scores[user_id] = scores

# Rank candidate products for each user and return top 12 as recommendations
recommendations = {}
for user_id, scores in user_scores.items():
    # Sort candidate products by descending score
    candidate_products = user_candidates[user_id]
    sorted_indices = np.argsort(scores)[::-1]
    sorted_products = [candidate_products[i] for i in sorted_indices]
    
    # Select top 12 products
    top_products = sorted_products[:12]
    
    # Add user purchase history to top products
    top_products += user_products[user_id]
    
    # Remove duplicates and return as recommendations
    recommendations[user_id] = list(set(top_products))

If we treat this as a binary classification problem, we would be ignoring the importance of the ranking of the recommended items and the MAP metric would not be appropriate. Since we are using MAP as the evaluation metric, we should use the LightGBM ranking API instead of the binary classification API.

In [None]:
# import lightgbm as lgb
# from sklearn.model_selection import train_test_split, GridSearchCV
# from sklearn.metrics import make_scorer
# from sklearn.metrics import average_precision_score
# from sklearn.model_selection import ParameterGrid
# import numpy as np
# import pickle
# import os

# target = 'item_index'
# features = final_df.columns.tolist()
# features.remove(target)

# # split the data into training and test sets -- can also do time-based split
# X_train, X_test, y_train, y_test = train_test_split(final_df[features], final_df[target], test_size=0.2, random_state=42)

# # for number of items to rank for each user (group param for ordered ranking)
# num_items_per_user = 12
# user_indices = X_test.index.unique()
# query = [num_items_per_user] * len(user_indices)
# query_ids = []
# for user_index in user_indices:
#     user_indices_repeated = [user_index] * num_items_per_user
#     query_ids.extend(user_indices_repeated)

# train_data = lgb.Dataset(X_train, label=y_train, group=query_ids)

# # MAP@12 metric
# def mean_average_precision(y_true, y_score, k=12):
#     # get the indices of the top k scores
#     top_k_indices = np.argsort(y_score)[::-1][:k]

#     # calculate average precision at k
#     return average_precision_score(y_true[top_k_indices], y_score[top_k_indices])

# # define hyperparameters for tuning
# params = {
# 'objective': 'lambdarank', #using lightgbm ranking API
# 'metric': 'MAP',
# 'learning_rate': 0.05,
# 'num_leaves': 31,
# 'max_depth': 5,
# 'min_data_in_leaf': 50,
# 'feature_fraction': 0.8,
# 'bagging_fraction': 0.8,
# 'bagging_freq': 5
# }

# # create LightGBM model
# model = lgb.LGBMRanker()

# # perform grid search with cross-validation
# param_grid = {
# 'num_leaves': [31, 50, 75],
# 'max_depth': [5, 7, -1],
# 'min_data_in_leaf': [20, 50, 100],
# 'feature_fraction': [0.6, 0.8, 1],
# 'bagging_fraction': [0.6, 0.8, 1],
# 'bagging_freq': [1, 3, 5],
# 'lambda_l1': [0, 1, 2],
# 'lambda_l2': [0, 1, 2]
# }

# best_map_score = 0.0
# best_model = None

# for params_dict in ParameterGrid(param_grid):
#     params.update(params_dict)
#     model = lgb.train(params, train_data)
#     y_pred = model.predict(X_test, group=query)
#     map_score = mean_average_precision(y_test, y_pred, k=12)
#     if map_score > best_map_score:
#         best_map_score = map_score
#         best_model = model
#         with open(f"lightgbm/grid_search_model_{map_score:.4f}.pickle", 'wb') as f:
#             pickle.dump(model, f)

# # save the best model
# if not os.path.exists('lightgbm'):
#     os.makedirs('lightgbm')
# with open('lightgbm/best_model.pickle', 'wb') as f:
#     pickle.dump(best_model, f)

# # print the best MAP score
# print(f"Best mean average precision: {best_map_score}")

# todo



- Lightgbm training with training with MAP as eval metric, grid search for hyperparams (ref. kaggle for starting params) (in built train test split? or by dates?)
- Lightgbm recommendation example
  

- Baseline model evalutaion for top 200 (same train test split as Lightgbm)

- Redo ALS for top 200 (same train test split as Lightgbm, import user_item_matrix)


- comparison of ALS and Lightgbm and baseline model