# **<a id="Content">HnM RecSys Notebook 9417</a>**

## **<a id="Content">Table of Contents</a>**
* [**<span>1. Imports</span>**](#Imports)  
* [**<span>2. Pre-Processing</span>**](#Pre-Processing)
* [**<span>3. Exploratory Data Analysis</span>**](#Exploratory-Data-Analysis)  
    * [**<span>3.1 Articles</span>**](#EDA::Articles)  
    * [**<span>3.2 Customers</span>**](#EDA::Customers)
    * [**<span>3.3 Transactions</span>**](#EDA::Transactions)
* [**<span>4. Helper FunctionsDecorators</span>**](#Helper-Functions)
* [**<span>5. Models</span>**](#Models) 
    * [**<span>5.1 Popularity</span>**](#Popularity-Model)   
    * [**<span>5.2 ALS</span>**](#Alternating-Least-Squares)  
    * [**<span>5.2 GBDT</span>**](#GBDT)  
    * [**<span>5.3 SGD/similar</span>**](#SGD)  
    * [**<span>5.4 NN</span>**](#NN)

## Imports

In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import os
import re
import warnings
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
# import cudf # switch on P100 GPU for this to work in Kaggle
# import cupy as cp

# Importing data
articles = pd.read_csv('articles.csv')
print(articles.head())
print("--")
customers = pd.read_csv('customers.csv')
print(customers.head())
print("--")
transactions = pd.read_csv("transactions_train.csv")
print(transactions.head())
print("--")

   article_id  product_code          prod_name  product_type_no   
0   108775015        108775          Strap top              253  \
1   108775044        108775          Strap top              253   
2   108775051        108775      Strap top (1)              253   
3   110065001        110065  OP T-shirt (Idro)              306   
4   110065002        110065  OP T-shirt (Idro)              306   

  product_type_name  product_group_name  graphical_appearance_no   
0          Vest top  Garment Upper body                  1010016  \
1          Vest top  Garment Upper body                  1010016   
2          Vest top  Garment Upper body                  1010017   
3               Bra           Underwear                  1010016   
4               Bra           Underwear                  1010016   

  graphical_appearance_name  colour_group_code colour_group_name  ...   
0                     Solid                  9             Black  ...  \
1                     Solid               

## Pre-Processing

In [87]:
# ----- empty value stats -------------
print("Missing values: ")
print(customers.isnull().sum())
print("--\n")

print("FN Newsletter vals: ", customers['FN'].unique())
print("Active communication vals: ",customers['Active'].unique())
print("Club member status vals: ", customers['club_member_status'].unique())
print("Fashion News frequency vals: ", customers['fashion_news_frequency'].unique())
print("--\n")

# ---- data cleaning -------------

customers['FN'] = customers['FN'].fillna(0)
customers['Active'] = customers['Active'].fillna(0)

# replace club_member_status missing values with 'LEFT CLUB' --> no members with LEFT CLUB status in data
customers['club_member_status'] = customers['club_member_status'].fillna('LEFT CLUB')
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].fillna('None')
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].replace('NONE', 'None')
customers['age'] = customers['age'].fillna(customers['age'].mean())
customers['age'] = customers['age'].astype(int)
articles['detail_desc'] = articles['detail_desc'].fillna('None')


print("Customers' Missing values: ")
print(customers.isnull().sum())
print("--\n")

Missing values: 
customer_id                    0
FN                        895050
Active                    907576
club_member_status          6062
fashion_news_frequency     16011
age                        15861
postal_code                    0
dtype: int64
--

FN Newsletter vals:  [nan  1.]
Active communication vals:  [nan  1.]
Club member status vals:  ['ACTIVE' nan 'PRE-CREATE' 'LEFT CLUB']
Fashion News frequency vals:  ['NONE' 'Regularly' nan 'Monthly']
--

Customers' Missing values: 
customer_id               0
FN                        0
Active                    0
club_member_status        0
fashion_news_frequency    0
age                       0
postal_code               0
dtype: int64
--



In [88]:
# ---- memory optimizations -------------

# reference: https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65

# iterate through all the columns of a dataframe and reduce the int and float data types to the smallest possible size, ex. customer_id should not be reduced from int64 to a samller value as it would have collisions
import numpy as np
import pandas as pd

def reduce_mem_usage(df):
    """Iterate over all the columns of a DataFrame and modify the data type
    to reduce memory usage, handling ordered Categoricals"""
    
    # check the memory usage of the DataFrame
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type == 'category':
            if df[col].cat.ordered:
                # Convert ordered Categorical to an integer
                df[col] = df[col].cat.codes.astype('int16')
            else:
                # Convert unordered Categorical to a string
                df[col] = df[col].astype('str')
        
        elif col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min >= np.iinfo(np.int64).min and c_max <= np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    
    # check the memory usage after optimization
    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))

    # calculate the percentage of the memory usage reduction
    mem_reduction = 100 * (start_mem - end_mem) / start_mem
    print("Memory usage decreased by {:.1f}%".format(mem_reduction))
    
    return df

   

In [89]:
print("Articles Info: ")
print(articles.info())
print("Customer Info: ")
print(customers.info())
print("Transactions Info: ")
print(transactions.info())

Articles Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   article_id                    105542 non-null  int64 
 1   product_code                  105542 non-null  int64 
 2   prod_name                     105542 non-null  object
 3   product_type_no               105542 non-null  int64 
 4   product_type_name             105542 non-null  object
 5   product_group_name            105542 non-null  object
 6   graphical_appearance_no       105542 non-null  int64 
 7   graphical_appearance_name     105542 non-null  object
 8   colour_group_code             105542 non-null  int64 
 9   colour_group_name             105542 non-null  object
 10  perceived_colour_value_id     105542 non-null  int64 
 11  perceived_colour_value_name   105542 non-null  object
 12  perceived_colour_master_id    105542 non-n

In [90]:
# print unique values of customer columns
print("FN Newsletter vals: ", customers['FN'].unique())
print("Active communication vals: ",customers['Active'].unique())
print("Club member status vals: ", customers['club_member_status'].unique())
print("Fashion News frequency vals: ", customers['fashion_news_frequency'].unique())
print("--\n")

FN Newsletter vals:  [0. 1.]
Active communication vals:  [0. 1.]
Club member status vals:  ['ACTIVE' 'LEFT CLUB' 'PRE-CREATE']
Fashion News frequency vals:  ['None' 'Regularly' 'Monthly']
--



In [91]:
# explicitly convert club_member_status to ordinal values before mem optimization to avoid errors

customers['club_member_status'].replace({'LEFT CLUB': 0, 'PRE-CREATE': 1, 'ACTIVE': 2}, inplace=True)
customers['club_member_status'] = customers['club_member_status'].astype('int8')
print(customers['club_member_status'].unique())


[2 0 1]


In [92]:
# ---- memory optimizations -------------

# uses 8 bytes instead of given 64 byte string, reduces mem by 8x, 
# !!!! have to convert back before merging w/ sample_submissions.csv
# convert transactions['customer_id'] to 8 bytes int
# transactions['customer_id'] = transactions['customer_id'].astype('int64')
transactions['customer_id'] = transactions['customer_id'].apply(lambda x: int(x[-16:], 16)).astype('int64')
customers['customer_id'] = customers['customer_id'].apply(lambda x: int(x[-16:], 16)).astype('int64')

articles = reduce_mem_usage(articles)
customers = reduce_mem_usage(customers)
transactions = reduce_mem_usage(transactions)

# articles['article_id'] = articles['article_id'].astype('int32')
# transactions['article_id'] = transactions['article_id'].astype('int32') 
# # !!!! ADD LEADING ZERO BACK BEFORE SUBMISSION OF PREDICTIONS TO KAGGLE: 
# # Ex.: transactions['article_id'] = '0' + transactions.article_id.astype('str')

print("Articles Info: ")
print(articles.info())
print("Customer Info: ")
print(customers.info())
print("Transactions Info: ")
print(transactions.info())

Memory usage of dataframe is 20.13 MB
Memory usage after optimization is: 13.59 MB
Memory usage decreased by 32.5%
Memory usage of dataframe is 58.88 MB
Memory usage after optimization is: 39.25 MB
Memory usage decreased by 33.3%
Memory usage of dataframe is 1212.63 MB
Memory usage after optimization is: 697.26 MB
Memory usage decreased by 42.5%
Articles Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   article_id                    105542 non-null  int32 
 1   product_code                  105542 non-null  int32 
 2   prod_name                     105542 non-null  object
 3   product_type_no               105542 non-null  int16 
 4   product_type_name             105542 non-null  object
 5   product_group_name            105542 non-null  object
 6   graphical_appearance_no       105542 non-null  i

In [93]:
# print unique values of customer columns
print("FN Newsletter vals: ", customers['FN'].unique())
print("Active communication vals: ",customers['Active'].unique())
print("Club member status vals: ", customers['club_member_status'].unique())
print("Fashion News frequency vals: ", customers['fashion_news_frequency'].unique())
print("--\n")

FN Newsletter vals:  [0. 1.]
Active communication vals:  [0. 1.]
Club member status vals:  [2 0 1]
Fashion News frequency vals:  ['None' 'Regularly' 'Monthly']
--



In [94]:
# time-based splitting strategy

def split_train_val_data_and_drop_duplicates(transactions, days=7):
    """
    Splits the transaction training data into a training set and a validation set of 7 days to prevent data leakage.
    """
    
    transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])
    transactions = transactions.sort_values(by=['t_dat'])
    latest_transaction_date = transactions['t_dat'].max()
    
    training_set = transactions[transactions['t_dat'] < latest_transaction_date - pd.Timedelta(days=days)]
    validation_set = transactions[transactions['t_dat'] >= latest_transaction_date - pd.Timedelta(days=days)]
    
    print("Training set size:", len(training_set))
    print("Validation set size:", len(validation_set))
    print("Last date in training set:", training_set['t_dat'].max())
    print("Last date in validation set:", validation_set['t_dat'].max())

    # drop duplicate rows
    training_set = training_set.drop_duplicates().copy()
    validation_set = validation_set.drop_duplicates().copy()
    
    return training_set, validation_set

In [95]:
def preprocess_data(transactions_df, customers_df, articles_df, customers_col='customer_id', articles_col='article_id'):
    """
    Preprocesses customer and article IDs for use in a sparse matrix.
    
    Returns:
    - transactions_df: the input transaction DataFrame with two additional columns, 'user_index' and 'item_index',
                       that map customer and article IDs to their corresponding indices in a sparse matrix
    - customer_id_indices_map: a dictionary that maps customer IDs to their corresponding indices
    - article_id_indices_map: a dictionary that maps article IDs to their corresponding indices
    """
    # Create a list of unique customer IDs and product IDs
    all_customers = customers_df[customers_col].unique().tolist()
    all_articles = articles_df[articles_col].unique().tolist()

    # Create dicts mapping IDs to their corresponding indices
    customer_id_indices_map = {customer_id: i for i, customer_id in enumerate(all_customers)}
    article_id_indices_map = {article_id: i for i, article_id in enumerate(all_articles)}

    # Map customer and article IDs to their resp. indices in the transaction DataFrame
    transactions_df['user_index'] = transactions_df[customers_col].map(customer_id_indices_map)
    transactions_df['item_index'] = transactions_df[articles_col].map(article_id_indices_map)

    return transactions_df, all_customers, all_articles, customer_id_indices_map, article_id_indices_map

In [96]:
# Create a sparse matrix of all user-item (a.k.a customer_id-article_id) interactions
# Supply training set, val set or entire transactions df

def create_user_item_matrix(transactions_df):
    # all customers and articles in their resp. rows of transaction data indicate that an article was purchased, thus:
    interaction = np.ones(transactions_df.shape[0]) 
    user_rows = transactions_df['user_index'].values
    item_cols = transactions_df['item_index'].values

    user_item_matrix = sparse.csr_matrix((interaction, (user_rows, item_cols)), shape=(len(all_customers), len(all_articles)))
    
    return user_item_matrix

In [97]:
from math import ceil

# calculate total number of transaction weeks in tranactions data
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

# Compute the minimum and maximum date values
min_date = transactions['t_dat'].min()
max_date = transactions['t_dat'].max()

# Compute the number of weeks between the minimum and maximum date values
num_weeks = ceil((max_date - min_date).days / 7)

print(f"Total number of transaction weeks: {num_weeks}")


Total number of transaction weeks: 105


In [98]:
from datetime import datetime, timedelta

# only use last x weeks of transactions data since data is too large
def filter_transactions_last_x_weeks(transactions, x = 10):
    # Convert date strings to datetime objects
    transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

    # Calculate the date x weeks ago from the latest transaction date
    latest_date = transactions['t_dat'].max()
    cutoff_date = latest_date - timedelta(weeks=x)

    # Filter transactions to only include those in the last x weeks
    filtered_transactions = transactions.loc[transactions['t_dat'] >= cutoff_date].copy()

    return filtered_transactions

In [99]:
def filter_customers_and_articles(customers, articles, filtered_transactions):
    # Get unique customer and article IDs from filtered transactions
    customer_ids = filtered_transactions['customer_id'].unique()
    article_ids = filtered_transactions['article_id'].unique()

    # Filter customers and articles to only include those in filtered transactions
    customers_filtered = customers.loc[customers['customer_id'].isin(customer_ids)].copy()
    articles_filtered = articles.loc[articles['article_id'].isin(article_ids)].copy()

    return customers_filtered, articles_filtered

## LightGBM

|Feature|LightGBM|XGBoost|CatBoost|
|:----|:----|:----|:----|
|Categoricals|Supports categorical features via one-hot encoding|Supports categorical features via one-hot encoding|Automatically handles categorical features using embeddings|
|Speed|Very fast training and prediction|Fast training and prediction|Slower than LightGBM and XGBoost|
|Handling Bias|Handles unbalanced classes via 'is_unbalance'|Handles unbalanced classes via 'scale_pos_weight'|Automatically handles unbalanced classes|
|Handling NaNs|Handles NaN values natively|Requires manual handling of NaNs|Automatically handles NaN values using special category|
|Custom Loss|Supports custom loss functions|Supports custom loss functions|Supports custom loss functions|


- Perform feature engineering using one-hot encoding or label encoding to encode the categorical features in the dataset.<br>
- Try different feature selection techniques, such as Recursive Feature Elimination (RFE) or SelectKBest, to select a smaller subset of features for the model.<br>
- Deal with class imbalance by adjusting the weights of the samples in the training set. Use the class_weights function from scikit-learn to calculate the weights based on the class distribution and pass them as - the weight parameter when creating the LightGBM datasets.<br>


- Use more advanced feature selection techniques such as feature importance analysis provided by LightGBM or PCA to reduce the dimensionality of the dataset and remove any multicollinearity. (??) <br> 
- Split the data into train and test sets using a time-based split based on the transaction date to avoid data leakage.<br>
  
- Train a LightGBM model on the training data using the selected features.<br>
  
- Experiment with different evaluation metrics to find the most appropriate one for your specific use case. For example, you could use the area under the ROC curve (AUC) or the F1-score if MAP does not perform well.<br>
- Use a time series cross-validation strategy to find the best hyperparameters for your model. This can be achieved using the TimeSeriesSplit function from scikit-learn instead of the default k-fold cross-validation.<br>
- Try different hyperparameters for the LightGBM model, such as the learning rate, number of estimators, max depth, etc., and use cross-validation to select the best combination of hyperparameters. (OR) <br> 
  - Bayesian optimization/Hyperopt to more efficiently search the hyperparameter space and find the optimal combination of hyperparameters.<br>
  
- Evaluate the model's performance on the val set using mean average precision (MAP) as the evaluation metric. <br>
- Once you have selected the best hyperparameters, train the final LightGBM model on the entire dataset using the selected features and hyperparameters.<br>
- Save the trained model for future use.<br>

To use LightGBM for a ranking problem, treat this as a binary classification problem where the target variable is whether an item is relevant or not to the user.

Then use LightGBM's ranking API, which is designed for ranking problems. Instead of optimizing for accuracy, the ranking API optimizes for ranking metric MAP. 

### Feature Engineering

In [100]:
# LightGBM imports

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import make_scorer
import lightgbm as lgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [101]:
# transactions = filter_transactions_last_x_weeks(transactions)
# customers, articles = filter_customers_and_articles(customers, articles, transactions)

In [102]:
# Dropping columns with uninformative article data

articles = articles.drop(columns=['product_code', 'prod_name', 'product_type_name', 'product_group_name', 'graphical_appearance_name', 'department_name', 'index_name', 'index_group_name', 'section_name', 'garment_group_name', 'detail_desc'])
articles = articles.drop(columns=[col for col in articles.columns if 'colour_' in col or 'perceived_' in col])

In [103]:
articles.head()

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,department_no,index_code,index_group_no,section_no,garment_group_no
0,108775015,253,1010016,1676,A,1,16,1002
1,108775044,253,1010016,1676,A,1,16,1002
2,108775051,253,1010017,1676,A,1,16,1002
3,110065001,306,1010016,1339,B,1,61,1017
4,110065002,306,1010016,1339,B,1,61,1017


These columns are left to capture any potential patterns in the other columns, such as how certain index codes or sections might be associated with higher or lower sales.

In [104]:
# Feature engineering
from sklearn.preprocessing import LabelEncoder

# Define mapping for fashion_news_frequency feature
fashion_news_freq_mapping = {'None': 0, 'Monthly': 1, 'Regularly': 2}

# label encode fashion_news_frequency feature
le = LabelEncoder()
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].map(fashion_news_freq_mapping)
customers['fashion_news_frequency'] = le.fit_transform(customers['fashion_news_frequency'])

In [105]:
customers = customers.drop(['postal_code'], axis=1)
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age
0,6883939031699146327,0.0,0.0,2,0,49
1,-7200416642310594310,0.0,0.0,2,0,25
2,-6846340800584936,0.0,0.0,2,0,24
3,-94071612138601410,0.0,0.0,2,0,54
4,-283965518499174310,1.0,1.0,2,2,52


In [106]:
# Feature engineering: encode nominal categorical features
ohe = OneHotEncoder()
sales_channel_encoded = ohe.fit_transform(transactions['sales_channel_id'].values.reshape(-1,1)).toarray()
sales_channel_encoded = pd.DataFrame(sales_channel_encoded, columns=['sales_channel_'+str(int(i)) for i in range(sales_channel_encoded.shape[1])])
transactions = pd.concat([transactions, sales_channel_encoded], axis=1)
transactions.drop('sales_channel_id', axis=1, inplace=True)

In [107]:
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_0,sales_channel_1
0,2018-09-20,-6846340800584936,663713001,0.050842,0.0,1.0
1,2018-09-20,-6846340800584936,541518023,0.030487,0.0,1.0
2,2018-09-20,-8334631767138808638,505221004,0.015236,0.0,1.0
3,2018-09-20,-8334631767138808638,685687003,0.016937,0.0,1.0
4,2018-09-20,-8334631767138808638,685687004,0.016937,0.0,1.0


In [108]:
# Convert 't_dat' column to datetime format
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

# Group by customer ID and find the first and last transaction dates
first_trans_dates = transactions.groupby('customer_id')['t_dat'].min().reset_index()
last_trans_dates = transactions.groupby('customer_id')['t_dat'].max().reset_index()

customer_purchase_engagement = pd.merge(first_trans_dates, last_trans_dates, on='customer_id', suffixes=('_first', '_last'))
# Create a new feature by calculating the time difference in days between first and last transactions
customer_purchase_engagement['time_diff_days'] = (customer_purchase_engagement['t_dat_last'] - customer_purchase_engagement['t_dat_first']).dt.days
# Drop the original first and last transaction date columns
customer_purchase_engagement.drop(['t_dat_first', 't_dat_last'], axis=1, inplace=True)
customer_purchase_engagement.head()

# Merge the customer_purchase_engagement dataframe with the customers dataframe
customers = pd.merge(customers, customer_purchase_engagement, on='customer_id', how='left')
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,time_diff_days
0,6883939031699146327,0.0,0.0,2,0,49,618.0
1,-7200416642310594310,0.0,0.0,2,0,25,656.0
2,-6846340800584936,0.0,0.0,2,0,24,726.0
3,-94071612138601410,0.0,0.0,2,0,54,0.0
4,-283965518499174310,1.0,1.0,2,2,52,670.0


The above `time_diff_days` feature can potentially provide insights into a customer's engagement by looking at the gap in the number of days between the last purchase and the current date. <br> The assumption is, The larger the gap, the less engaged the customer is. 

In [109]:
# Join the transaction dataframe with the customers dataframe
merged = pd.merge(transactions, customers, on='customer_id', how='inner')

# Calculate the mean age for each article
item_mean_age = merged.groupby('article_id')['age'].mean()

# Calculate the difference between every user's age and the mean age of users who have purchased a particular item
merged['age_diff'] = merged['age'] - merged['article_id'].map(item_mean_age)

# Group by article and take the mean of age_diff
article_age_diff = merged.groupby('article_id')['age_diff'].mean()

# Append the age difference feature to the articles dataframe
articles['age_diff'] = articles['article_id'].map(article_age_diff)

articles.head()

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,department_no,index_code,index_group_no,section_no,garment_group_no,age_diff
0,108775015,253,1010016,1676,A,1,16,1002,-2.032463e-15
1,108775044,253,1010016,1676,A,1,16,1002,-3.057784e-15
2,108775051,253,1010017,1676,A,1,16,1002,-2.478637e-15
3,110065001,306,1010016,1339,B,1,61,1017,1.388417e-15
4,110065002,306,1010016,1339,B,1,61,1017,-2.636522e-17


Mean age_diff for every article. It can be useful for predicting whether a user will buy an item based on their age and the age of other users who have already bought the same item. 

Intuituion behind the `age_diff` feature:

Let's say we have a dataset of customers who made transactions for a particular item with article_id = 123. Here is an example of how we can calculate the age_diff feature: <br>
Assume that the mean age of all customers who bought the item with article_id = 123 is 40 years old <br>
Customer A made a transaction for item with article_id = 123 and their age is 35. The age_diff feature for this transaction would be -5. (35 - 40). <br>
Customer B made a transaction for item with article_id = 123 and their age is 50. The age_diff feature for this transaction would be 10. (50 - 40). <br>
Customer C made a transaction for item with article_id = 123 and their age is 40. The age_diff feature for this transaction would be 0. (40 - 40). <br>
So, the age_diff feature measures the difference between the age of each customer who bought a specific item and the average age of all customers who bought that item. <br>

Therefore, the age_diff is the mean of all these individual age_diff values for each customer who bought the item with article_id = 123. age_diff = -1.66 for this example<br>


In [110]:
# Calculate mean, max, and min age for each item
item_mean_age = merged.groupby('article_id')['age'].mean()
item_max_age = merged.groupby('article_id')['age'].max()
item_min_age = merged.groupby('article_id')['age'].min()

# Merge the features back into the articles dataframe
articles = articles.merge(item_mean_age, on='article_id', how='left')
articles = articles.merge(item_max_age, on='article_id', how='left')
articles = articles.merge(item_min_age, on='article_id', how='left')

# Rename the columns to make them more descriptive
articles = articles.rename(columns={'age_x': 'mean_purchase_age', 'age_y': 'max_purchase_age', 'age': 'min_purchase_age'})

articles.head()

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,department_no,index_code,index_group_no,section_no,garment_group_no,age_diff,mean_purchase_age,max_purchase_age,min_purchase_age
0,108775015,253,1010016,1676,A,1,16,1002,-2.032463e-15,34.477078,77.0,17.0
1,108775044,253,1010016,1676,A,1,16,1002,-3.057784e-15,36.063448,90.0,17.0
2,108775051,253,1010017,1676,A,1,16,1002,-2.478637e-15,35.395349,66.0,18.0
3,110065001,306,1010016,1339,B,1,61,1017,1.388417e-15,38.045977,76.0,18.0
4,110065002,306,1010016,1339,B,1,61,1017,-2.636522e-17,39.460111,75.0,18.0


Additional age features to capture more information about the age of the customers who bought the respective articles.

In [111]:
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,time_diff_days
0,6883939031699146327,0.0,0.0,2,0,49,618.0
1,-7200416642310594310,0.0,0.0,2,0,25,656.0
2,-6846340800584936,0.0,0.0,2,0,24,726.0
3,-94071612138601410,0.0,0.0,2,0,54,0.0
4,-283965518499174310,1.0,1.0,2,2,52,670.0


In [112]:
# Calculate purchased item count for each user
transactions['quantity'] = 1
user_item_count = transactions.groupby(['customer_id', 'article_id'])['quantity'].sum().reset_index()

# Calculate total item count for each article
total_item_count = transactions.groupby('article_id')['quantity'].sum().reset_index()
total_item_count.columns = ['article_id', 'total_items']

user_item_count = pd.merge(user_item_count, total_item_count, on='article_id', how='left')

# Calculate ratio of purchased item count and total item count
user_item_count['article_engagement_ratio'] = user_item_count['quantity'] / user_item_count['total_items']


transactions = pd.merge(transactions, user_item_count[['customer_id', 'article_id', 'article_engagement_ratio']], on=['customer_id', 'article_id'], how='left')
transactions['quantity'] = user_item_count['quantity']

# merge article_engagement_ratio_x and article_engagement_ratio_y into one column
# transactions['article_engagement_ratio'] = transactions['article_engagement_ratio_x'].fillna(transactions['article_engagement_ratio_y'])

transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_0,sales_channel_1,quantity,article_engagement_ratio
0,2018-09-20,-6846340800584936,663713001,0.050842,0.0,1.0,1.0,0.00316
1,2018-09-20,-6846340800584936,541518023,0.030487,0.0,1.0,1.0,0.002304
2,2018-09-20,-8334631767138808638,505221004,0.015236,0.0,1.0,1.0,0.02381
3,2018-09-20,-8334631767138808638,685687003,0.016937,0.0,1.0,1.0,0.000545
4,2018-09-20,-8334631767138808638,685687004,0.016937,0.0,1.0,3.0,0.000558


Intuition behind feature: <br>

`article_engagement_ratio`: The feature is ratio of one user's purchased item count and the item's total purchase count. This serves to measure how engaged a user is with a particular item, which can be useful for predicting whether a user will buy similar items. <br>
Can also be used to measure how popular an item is, and can be used to potentially diversify recommendations.

In [113]:
transactions, all_customers, all_articles, customer_id_indices_map, article_id_indices_map = preprocess_data(transactions, customers, articles)

print("Total num of customers: ", len(all_customers))
print("Total num of articles: ", len(all_articles))
print("Customer ID mapping: ", list(customer_id_indices_map.items())[:5])
print("Article ID mapping: ", list(article_id_indices_map.items())[:5])
transactions.head()

Total num of customers:  1371980
Total num of articles:  105542
Customer ID mapping:  [(6883939031699146327, 0), (-7200416642310594310, 1), (-6846340800584936, 2), (-94071612138601410, 3), (-283965518499174310, 4)]
Article ID mapping:  [(108775015, 0), (108775044, 1), (108775051, 2), (110065001, 3), (110065002, 4)]


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_0,sales_channel_1,quantity,article_engagement_ratio,user_index,item_index
0,2018-09-20,-6846340800584936,663713001,0.050842,0.0,1.0,1.0,0.00316,2,40179
1,2018-09-20,-6846340800584936,541518023,0.030487,0.0,1.0,1.0,0.002304,2,10520
2,2018-09-20,-8334631767138808638,505221004,0.015236,0.0,1.0,1.0,0.02381,7,6387
3,2018-09-20,-8334631767138808638,685687003,0.016937,0.0,1.0,1.0,0.000545,7,46304
4,2018-09-20,-8334631767138808638,685687004,0.016937,0.0,1.0,3.0,0.000558,7,46305


In [114]:
# from implicit.als import AlternatingLeastSquares

# # from the als_strat1_hyperparam_log 
# # Create ALS model with default parameters
# alpha = 24
# als_model = AlternatingLeastSquares(factors=55, iterations=20, regularization=0.18)

# # Fit model to user-item matrix
# als_model.fit(user_item_training_matrix*alpha)

# # Get item factors matrix
# item_factors = als_model.item_factors

# # Calculate cosine similarity between item factors
# item_similarities = cosine_similarity(item_factors, dense_output=False)

# # Get top-k most similar items for each item
# k = 5
# top_k_similar_items = item_similarities.argsort()[:, -k-1:-1]

# # Create item-item similarity features
# for i in range(len(articles)):
#     item_id = articles.loc[i, 'article_id']
#     similar_items = top_k_similar_items[item_id].toarray().ravel()
#     item_feature = item_similarities[item_id, similar_items].mean()
#     articles.loc[i, 'item_similarity'] = item_feature

# # Get user factors matrix
# user_factors = als_model.user_factors

# # Calculate cosine similarity between user factors
# user_similarities = cosine_similarity(user_factors, dense_output=False)

# top_k_similar_users = user_similarities.argsort()[:, -k-1:-1]

# # Create user-user similarity features
# for i in range(len(customers)):
#     user_id = customers.loc[i, 'customer_id']
#     similar_users = top_k_similar_users[user_id].toarray().ravel()
#     user_feature = user_similarities[user_id, similar_users].mean()
#     customers.loc[i, 'user_similarity'] = user_feature

In [115]:
# Normal cosine_sim() requires 80-300GB. Batched cosine_sim() crashes PC

# from sklearn.metrics.pairwise import cosine_similarity

# # Define batch size
# batch_size = 500

# # Split user-item matrix into smaller submatrices
# submatrices = []
# for i in range(0, user_item_training_matrix.shape[0], batch_size):
#     submatrix = user_item_training_matrix[i:i+batch_size, :]
#     submatrices.append(submatrix)

# # Compute item-item cosine similarity between submatrices
# similarities = []
# for i in range(len(submatrices)):
#     for j in range(i+1, len(submatrices)):
#         similarity = cosine_similarity(submatrices[i], submatrices[j], dense_output=False)
#         similarities.append(similarity)

# # Concatenate cosine similarity matrices into a single matrix
# item_similarities = np.concatenate(similarities, axis=1)

# # Compute user-user cosine similarity between submatrices
# similarities = []
# for i in range(len(submatrices)):
#     for j in range(i+1, len(submatrices)):
#         similarity = cosine_similarity(submatrices[i].T, submatrices[j].T, dense_output=False)
#         similarities.append(similarity)

# # Concatenate cosine similarity matrices into a single matrix
# user_similarities = np.concatenate(similarities, axis=1)

In [116]:
# print(item_similarities.shape)
# print(user_similarities.shape)

In [117]:
# # Hyperparameter, set to initial low since matrix is very sparse
# k = 5

# # Get top k most similar items for each item (excluding itself)
# most_similar_items = np.argsort(-item_similarities)[:, :k] 

# # Get top k most similar users for each user
# most_similar_users = np.argsort(-user_similarities)[:, :k]

In [118]:
# # Create a binary feature that indicates whether or not a customer has bought a particular item
# # based on whether other customers who bought similar items also tended to buy that item
# for i in range(len(articles)):
#     item_id = articles.loc[i, 'article_id']

#     # list of (column) indices of similar items
#     similar_items = most_similar_items[item_id]

#     # subset of the user-item matrix containing the purchases for the similar items by all customers ==
#             #  user_item_matrix.loc[:, similar_items] -> selects all customers (rows) from the user-item matrix and all items (columns) corresponding to the similar items

#     # .max(axis=1) -> computes the maximum value for each customer in this subset of the user-item matrix. 
#         # The values in this subset of the matrix are binary (either 0 or 1), the maximum value for each row will be 1 if the customer purchased any of the similar items, and 0 otherwise.
#     item_feature = user_item_matrix.loc[:, similar_items].max(axis=1).values
#     articles.loc[i, 'article_preference'] = item_feature

# # Drop the quantity column from the transactions dataframe
# #transactions = transactions.drop('quantity', axis=1)

# articles.head()

Intuition behind feature: <br>

`article_preference`: Binary feature for each item that indicates whether or not a customer has bought that item, based on whether other customers who bought similar items also tended to buy that item. <br>

This feature can be useful for a fashion-based recommender system because it captures the idea that customers who have similar tastes or preferences tend to buy similar items. For example, if a customer has a history of buying shirts and other customers who bought similar shorts also tended to buy a specific pair of jeans, then the binary feature for those jeans would be set to 1 for that customer, indicating that they are likely to be interested in those shoes. The binary feature can then be used as a predictor for which items to recommend to the customer.

In [119]:
# # Create features for each user based on the average quantity purchased by similar customers
# for i in range(len(customers)):
#     customer_id = customers.loc[i, 'customer_id']
#     similar_users = most_similar_users[customer_id]
#     user_feature = user_item_matrix.loc[similar_users, :].mean(axis=0).values
#     customers.loc[i, 'user_purchase_quant'] = user_feature

Intuition behind feature: <br>

`user_purchase_quant`: Gets the average quantity of items purchased by the k most similar customers to that customer. It looks at what other customers who are similar to this customer have bought and calculates the average amount of each item they bought. This feature can be used to predict what items a customer is likely to buy in the future based on what similar customers have bought in the past. This feature aims to capture purchase behaviours of a customer.<br>

For example, if a customer typically buys a lot of bomber jackets, and the top k most similar customers to that customer also tend to buy a lot of bomber jackets, then the average quantity of bomber jackets purchased by those similar customers could be a good predictor of how much the original customer is likely to purchase in the future. This however assumes that the k most similar customers have similar purchase behaviours to the customers in question, and on its own is not a strong feature.<br>


In [120]:
# # Create features for each item based on the total number of times it was purchased by similar customers
# for i in range(len(articles)):
#     item_id = articles.loc[i, 'article_id']
#     similar_users = most_similar_users[user_item_matrix.loc[:, item_id].values > 0]
#     item_feature = user_item_matrix.loc[similar_users, item_id].sum() / k
#     articles.loc[i, 'item_total_purchases'] = item_feature

Intuition behind feature: <br>

`item_total_purchases`: It gives an estimate of how frequently an item is being bought by customers who have similar purchase histories. This feature is useful because it can provide insights into purchasing patterns and identify popular items that are often bought together. <br> It can potentially help identify popular items among customer groups.

In [121]:
# # Create features for each item based on the average price of items purchased by similar customers
# for i in range(len(articles)):
#     item_id = articles.loc[i, 'article_id']
#     similar_users = most_similar_users[user_item_matrix.loc[:, item_id].values > 0]
#     item_prices = pd.merge(transactions, customers[['customer_id', 'price_level']], on='customer_id', how='left')['price_level']
#     item_feature = item_prices[user_item_matrix.loc[similar_users, item_id].values > 0].mean()
#     articles.loc[i, 'item_avg_price'] = item_feature

Intuition behind feature: <br>

`item_avg_price`: Calculates the average price level of articles purchased by customers similar to those who have purchased a particular article in the past.  <br> It provides information on the typical price level of articles that are purchased together with a given article, as indicated by the purchasing patterns of similar customers. 

For example, if customers who frequently purchase articles A also tend to purchase higher-priced article, then the "item_avg_price" feature for article A would be relatively high. 

In [122]:
articles.head()

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,department_no,index_code,index_group_no,section_no,garment_group_no,age_diff,mean_purchase_age,max_purchase_age,min_purchase_age
0,108775015,253,1010016,1676,A,1,16,1002,-2.032463e-15,34.477078,77.0,17.0
1,108775044,253,1010016,1676,A,1,16,1002,-3.057784e-15,36.063448,90.0,17.0
2,108775051,253,1010017,1676,A,1,16,1002,-2.478637e-15,35.395349,66.0,18.0
3,110065001,306,1010016,1339,B,1,61,1017,1.388417e-15,38.045977,76.0,18.0
4,110065002,306,1010016,1339,B,1,61,1017,-2.636522e-17,39.460111,75.0,18.0


In [123]:
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,time_diff_days
0,6883939031699146327,0.0,0.0,2,0,49,618.0
1,-7200416642310594310,0.0,0.0,2,0,25,656.0
2,-6846340800584936,0.0,0.0,2,0,24,726.0
3,-94071612138601410,0.0,0.0,2,0,54,0.0
4,-283965518499174310,1.0,1.0,2,2,52,670.0


### Model Training

In [60]:
# Prepare the data
transactions['purchased'] = 1
customers_articles = pd.merge(customers, articles, on=None, how='outer')

# left join with transaction data to get purchase history
df = pd.merge(customers_articles, transactions, on=['customer_id', 'article_id'], how='left')
df['purchased'] = df['quantity'].fillna(0)

# quantity not really needed anymore
df = df.drop(['quantity'], axis=1)

# fill NaN values with 0 for other columns -- Lightgbm will handle this though
df = df.fillna(0)

df.head()

MergeError: No common columns to perform merge on. Merge options: left_on=None, right_on=None, left_index=False, right_index=False

In [None]:
# Split the data into training and testing sets
training_set, validation_set = split_train_val_data_by_time_and_drop_duplicates(transactions)

X_train = training_set.drop(['purchased', 'article_id', 'customer_id'], axis=1)
y_train = training_set['purchased']

X_test = validation_set.drop(['purchased', 'article_id', 'customer_id'], axis=1)
y_test = validation_set['purchased']

In [57]:
training_set.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_0,sales_channel_1,user_index,item_index,quantity,article_engagement_ratio,purchased
0,2018-09-20,-6846340800584936,663713001,0.050842,0.0,1.0,2,40179,1.0,0.00316,1
32262,2018-09-20,-5586719204546245114,569933006,0.042358,0.0,1.0,904365,16334,1.0,0.002558,1
32263,2018-09-20,-5586719204546245114,621846006,0.025406,0.0,1.0,904365,28076,1.0,0.004843,1
32264,2018-09-20,5741556008941214631,587189003,0.016953,1.0,0.0,904427,20130,1.0,0.011628,1
32265,2018-09-20,5741556008941214631,587189002,0.033905,1.0,0.0,904427,20129,1.0,0.016667,1


In [58]:
validation_set.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_0,sales_channel_1,user_index,item_index,quantity,article_engagement_ratio,purchased
31539333,2020-09-15,3494039427063804431,925779002,0.084717,0.0,1.0,908734,104674,,0.166667,1
31539327,2020-09-15,3494039427063804431,456163087,0.033875,0.0,1.0,908734,3521,,0.001916,1
31539330,2020-09-15,3494039427063804431,896169005,0.050842,0.0,1.0,908734,101375,,0.001066,1
31539329,2020-09-15,3494039427063804431,896152002,0.033875,0.0,1.0,908734,101367,,0.000366,1
31539328,2020-09-15,3494039427063804431,924243001,0.042358,0.0,1.0,908734,104553,,0.000587,1


In [85]:
user_item_training_matrix = create_user_item_matrix(training_set)
user_item_validation_matrix = create_user_item_matrix(validation_set)

user_item_training_matrix 

NameError: name 'training_set' is not defined

In [56]:
# check for duplicates
print(training_set.duplicated().sum())
print(validation_set.duplicated().sum())

NameError: name 'training_set' is not defined

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import average_precision_score

X_train = training_set.drop(['purchased', 'article_id', 'customer_id'], axis=1)
y_train = training_set['purchased']

X_test = validation_set.drop(['purchased', 'article_id', 'customer_id'], axis=1)
y_test = validation_set['purchased']

# Train a LightGBM model
params = {
    'boosting_type': 'gbdt',
    'objective': 'binary',
    'metric': 'binary_logloss',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9
}

clf = lgb.LGBMClassifier(**params)
grid_params = {
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'num_leaves': [15, 31, 63],
    'min_child_samples': [20, 30, 40]
}

grid_search = GridSearchCV(clf, grid_params, cv=5, scoring='average_precision', n_jobs=-1)
grid_search.fit(X_train, y_train)

# Evaluate the model
y_pred = grid_search.predict_proba(X_test)[:, 1]
map_score = average_precision_score(y_test, y_pred, pos_label=1, average='weighted')

# Generate recommendations for a given customer
def get_top_articles(customer_id):
    articles['prob'] = grid_search.predict_proba(X)[:, 1]
    recommendations = articles.sort_values('prob', ascending=False).head(12)
    return recommendations

# Print the MAP@12 score and some sample recommendations for a customer
print('MAP@12 score:', map_score)
print('Sample recommendations for customer 123:', get_top_articles(123))
