# **<a id="Content">HnM RecSys Notebook 9417</a>**

## **<a id="Content">Table of Contents</a>**
* [**<span>1. Imports</span>**](#Imports)  
* [**<span>2. Helper Functions/Decorators</span>**](#Helper-Functions)
* [**<span>5. LightGBM Model</span>**](#LightGBM-Model) 

## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import os
import re
import warnings
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve

## Helper-Functions

## LightGBM

A comparison of the top GBDT models today. LightGBM is the fastest to train.

|Feature|LightGBM|XGBoost|CatBoost|
|:----|:----|:----|:----|
|Categoricals|Supports categorical features via one-hot encoding|Supports categorical features via one-hot encoding|Automatically handles categorical features using embeddings|
|Speed|Very fast training and prediction|Fast training and prediction|Slower than LightGBM and XGBoost|
|Handling Bias|Handles unbalanced classes via 'is_unbalance'|Handles unbalanced classes via 'scale_pos_weight'|Automatically handles unbalanced classes|
|Handling NaNs|Handles NaN values natively|Requires manual handling of NaNs|Automatically handles NaN values using special category|
|Custom Loss|Supports custom loss functions|Supports custom loss functions|Supports custom loss functions|


In [2]:
import pickle

# open user_item_matrix_200
with open('user_item_matrix_200.pkl', 'rb') as f:
    user_item_matrix = pickle.load(f)

# open customer and articels incides map
with open('lightgbm/customer_id_indices_map.pkl', 'rb') as f:
    customer_id_indices_map = pickle.load(f)

with open('lightgbm/article_id_indices_map.pkl', 'rb') as f:
    article_id_indices_map = pickle.load(f)

# load df from pickle file for time-based split
with open('lightgbm/df.pkl', 'rb') as f:
    df = pickle.load(f)

# load final_df from pickle file for clean processing
with open('lightgbm/final_df_with_binary_targets.pkl', 'rb') as f:
    final_df = pickle.load(f)

### Model Training

In [3]:
final_df.head()

Unnamed: 0,price,sales_channel_1,sales_channel_2,quantity,article_engagement_ratio,user_index,item_index,FN,Active,club_member_status,...,garment_group_no_1019.0,garment_group_no_1020.0,garment_group_no_1021.0,garment_group_no_1023.0,garment_group_no_1025.0,index_group_no_1.0,index_group_no_2.0,index_group_no_3.0,index_group_no_4.0,index_group_no_26.0
0,0.042358,False,True,1.0,1.0,5,11563,1.0,1.0,2.0,...,False,False,False,False,False,True,False,False,False,False
1,0.050842,False,True,1.0,1.0,5,9899,1.0,1.0,2.0,...,False,False,False,False,False,True,False,False,False,False
2,0.06781,False,True,1.0,1.0,5,14438,1.0,1.0,2.0,...,False,False,False,False,False,True,False,False,False,False
3,0.016937,False,True,1.0,0.5,10,10307,0.0,0.0,2.0,...,False,False,False,False,False,False,True,False,False,False
4,0.016937,False,True,1.0,0.166667,10,13608,0.0,0.0,2.0,...,False,False,False,True,False,True,False,False,False,False


In [4]:
# # target encoding
# from category_encoders import TargetEncoder
# from sklearn.model_selection import KFold

# # Define columns to target encode
# cols_to_encode = ['department_no', 'product_type_no', 'section_no', 'graphical_appearance_no']

# # Define number of folds for cross-validation
# n_splits = 5

# # Create KFold object for cross-validation
# kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

# # Perform target encoding with cross-validation
# for col in cols_to_encode:
#     final_df[f'{col}_te'] = 0
#     te = TargetEncoder(cols=[col])
#     for train_idx, val_idx in kf.split(final_df):
#         te.fit(final_df.iloc[train_idx][[col]], final_df.iloc[train_idx]['target'])
#         final_df.loc[val_idx, f'{col}_te'] = te.transform(final_df.iloc[val_idx][[col]]).values.flatten()

In [5]:
# ---- memory optimizations -------------

# reference: https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65

# iterate through all the columns of a dataframe and reduce the int and float data types to the smallest possible size, ex. customer_id should not be reduced from int64 to a samller value as it would have collisions
import numpy as np
import pandas as pd

def reduce_mem_usage(df):
    """Iterate over all the columns of a DataFrame and modify the data type
    to reduce memory usage, handling ordered Categoricals"""
    
    # check the memory usage of the DataFrame
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type == 'category':
            if df[col].cat.ordered:
                # Convert ordered Categorical to an integer
                df[col] = df[col].cat.codes.astype('int16')
            else:
                # Convert unordered Categorical to a string
                df[col] = df[col].astype('str')
        
        elif col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min >= np.iinfo(np.int64).min and c_max <= np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    
    # check the memory usage after optimization
    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))

    # calculate the percentage of the memory usage reduction
    mem_reduction = 100 * (start_mem - end_mem) / start_mem
    print("Memory usage decreased by {:.1f}%".format(mem_reduction))
    
    return df

In [6]:
# only get top 50 customers by number of total pruchase quantity from final_df

# Compute the total quantity for each user_index
user_quantity = final_df.groupby('user_index')['quantity'].sum()

# Get the top 50 user_indices by total quantity
top_50_users = user_quantity.nlargest(50).index

# Filter the final_df to include only the data for the top 50 users
final_df_top_50 = final_df[final_df['user_index'].isin(top_50_users)].copy()
# print the shape of final_df_top_50
print(final_df_top_50.shape)

print(final_df_top_50['user_index'].nunique())


(1952211, 56)
50


In [7]:
# Import necessary libraries
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split

# Preprocessing
final_df = final_df.fillna(-1)
X = final_df.drop(['target'], axis=1)
y = final_df['target']
dtrain = lgb.Dataset(X, label=y)

# Train-Test split
if 'date' in final_df.columns:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=X['date'])
else:
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Model Training
params = {
    'objective': 'binary',
    'boosting_type': 'gbdt',
    'num_leaves': 31,
    'learning_rate': 0.05,
    'feature_fraction': 0.9,
}
clf = lgb.train(params, dtrain, num_boost_round=100)

# Model Evaluation
y_pred = clf.predict(X_test)
y_pred_binary = [1 if x >= 0.5 else 0 for x in y_pred]
accuracy = sum(y_pred_binary == y_test) / len(y_test)
print("Accuracy: ", accuracy)

# Prediction
users = final_df['user_index'].unique()
predictions = []
for user in users:
    user_df = final_df[final_df['user_index'] == user]
    item_indices = user_df['item_index'].unique()
    candidate_items = candidate_df[candidate_df['item_index'].isin(item_indices)]
    candidate_X = candidate_items.drop(['item_index'], axis=1)
    candidate_y = clf.predict(candidate_X)
    candidate_items['prob'] = candidate_y
    top_items = candidate_items.sort_values(by=['prob'], ascending=False).iloc[:12]
    predictions.append(top_items['item_index'].values)

# Output
output_df = pd.DataFrame({
    'user_index': users,
    'item_index_1': [x[0] for x in predictions],
    'item_index_2': [x[1] for x in predictions],
    'item_index_3': [x[2] for x in predictions]
})

[LightGBM] [Info] Number of positive: 126622, number of negative: 7676429
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2289
[LightGBM] [Info] Number of data points in the train set: 7803051, number of used features: 55
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.016227 -> initscore=-4.104703
[LightGBM] [Info] Start training from score -4.104703
Accuracy:  1.0


NameError: name 'candidate_df' is not defined

In [12]:
import lightgbm as lgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import ndcg_score, average_precision_score
from sklearn.feature_selection import RFECV
import joblib
from sklearn.metrics import get_scorer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [13]:
# Define features and target
features = final_df.columns.tolist()
features.remove('target')
target = 'target'

# Group data by user -- so that LightGBM knows which data points belong to each user and can compute the metrics correctly
grouped_data_train = X_train.groupby('user_index')
grouped_data_test = X_test.groupby('user_index')
groups_train = [grouped_data_train.groups[user] for user in grouped_data_train.groups.keys()]
groups_train_flat = np.concatenate(groups_train)
groups_test = [grouped_data_test.groups[user] for user in grouped_data_test.groups.keys()]

# Create LightGBM datasets with group query information
train_data = lgb.Dataset(X_train, label=y_train, group=grouped_data_train.groups.values())
test_data = lgb.Dataset(X_test, label=y_test, group=grouped_data_test.groups.values())

In [14]:
from sklearn.metrics import make_scorer, average_precision_score

pr_auc_scorer = make_scorer(average_precision_score, needs_proba=True)

# Preprocess the data
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Feature selection using RFECV
selector = RFECV(
    estimator=lgb.LGBMClassifier(n_jobs=-1,
        num_leaves=31, max_depth=7, learning_rate=0.1
    ),
    cv=5, scoring=pr_auc_scorer,
    verbose=1, step=5
)
selector.fit(X_train_scaled, y_train)

Fitting estimator with 55 features.
Fitting estimator with 50 features.
Fitting estimator with 45 features.
Fitting estimator with 40 features.
Fitting estimator with 35 features.
Fitting estimator with 30 features.
Fitting estimator with 25 features.
Fitting estimator with 20 features.
Fitting estimator with 15 features.
Fitting estimator with 10 features.
Fitting estimator with 5 features.
Fitting estimator with 55 features.
Fitting estimator with 50 features.
Fitting estimator with 45 features.
Fitting estimator with 40 features.
Fitting estimator with 35 features.
Fitting estimator with 30 features.
Fitting estimator with 25 features.
Fitting estimator with 20 features.
Fitting estimator with 15 features.
Fitting estimator with 10 features.
Fitting estimator with 5 features.
Fitting estimator with 55 features.
Fitting estimator with 50 features.
Fitting estimator with 45 features.
Fitting estimator with 40 features.
Fitting estimator with 35 features.
Fitting estimator with 30 feat

In [19]:
# Get selected features
selected_features = X_train.columns[selector.get_support()]
print(selected_features)

ranks = selector.ranking_
feat_ranks = {feat:rank for feat, rank in zip(X_train.columns, ranks)}
sorted_ranks = sorted(feat_ranks.items(), key=lambda x: x[1])
print(sorted_ranks)

Index(['price'], dtype='object')
[('price', 1), ('article_engagement_ratio', 2), ('user_index', 2), ('item_index', 2), ('item_avg_price_level', 2), ('quantity', 3), ('user_purchase_quant', 3), ('department_no', 3), ('age_diff', 3), ('mean_purchase_age', 3), ('age', 4), ('product_type_no', 4), ('max_purchase_age', 4), ('min_purchase_age', 4), ('item_purchase_frequency', 4), ('sales_channel_1', 5), ('sales_channel_2', 5), ('club_member_status', 5), ('time_diff_days', 5), ('garment_group_no_1013.0', 5), ('FN', 6), ('Active', 6), ('garment_group_no_1012.0', 6), ('garment_group_no_1014.0', 6), ('garment_group_no_1020.0', 6), ('fashion_news_frequency', 7), ('graphical_appearance_no', 7), ('garment_group_no_1007.0', 7), ('index_group_no_2.0', 7), ('index_group_no_3.0', 7), ('section_no', 8), ('garment_group_no_1021.0', 8), ('garment_group_no_1023.0', 8), ('garment_group_no_1025.0', 8), ('index_group_no_1.0', 8), ('garment_group_no_1016.0', 9), ('garment_group_no_1017.0', 9), ('garment_group_n

In [20]:
# # Get selected features
# selected_features = X_train.columns[selector.get_support()]

# select only the top  x features
selected_features = [feat for feat, rank in sorted_ranks if rank <= 5]
print(selected_features)

['price', 'article_engagement_ratio', 'user_index', 'item_index', 'item_avg_price_level', 'quantity', 'user_purchase_quant', 'department_no', 'age_diff', 'mean_purchase_age', 'age', 'product_type_no', 'max_purchase_age', 'min_purchase_age', 'item_purchase_frequency', 'sales_channel_1', 'sales_channel_2', 'club_member_status', 'time_diff_days', 'garment_group_no_1013.0']


In [17]:
# Get integer indices of selected features
selected_feature_indices = [X_train.columns.get_loc(col) for col in selected_features]

# Define the parameter grid
param_grid = {
    'num_leaves': [31, 63, 127],
    'max_depth': [3, 5, 7],
    'learning_rate': [0.1, 0.01, 0.001],
}

# Create an instance of the LGBMClassifier
lgbm = lgb.LGBMClassifier()

# Create an instance of GridSearchCV
grid_search = GridSearchCV(estimator=lgbm, param_grid=param_grid, cv=5, scoring=pr_auc_scorer, verbose=1)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train_scaled[:, selected_feature_indices], y_train)

Fitting 5 folds for each of 27 candidates, totalling 135 fits


In [35]:
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc

# Define PR-AUC scorer
def pr_auc_score(y_true, y_pred):
    precision, recall, _ = precision_recall_curve(y_true, y_pred)
    return auc(recall, precision)

# Use PR-AUC scorer to make_scorer object
pr_auc_scorer = make_scorer(pr_auc_score)

# Print the best hyperparameters and score
print("Best Hyperparameters:", grid_search.best_params_)
print("Best Score:", grid_search.best_score_)

# Make predictions on the test set using the best model
y_pred = grid_search.best_estimator_.predict_proba(X_test_scaled[:, selected_feature_indices])[:, 1]

# Evaluate the model using PR-AUC
pr_auc = pr_auc_score(y_test, y_pred)
print("PR-AUC Score:", pr_auc)

Best Hyperparameters: {'learning_rate': 0.1, 'max_depth': 3, 'num_leaves': 31}
Best Score: 1.0
PR-AUC Score: 1.0


Once the model is trained, it can be used to predict the probability of purchase for new user-product pairs, which can be used to generate recommendations for users.

If we treat this as a binary classification problem: After training the model, we can then get the probability that each user is likely to purchase an item from a candidate set of items. We can then sort these by descending probability to get the top 12 products as done below. <br>

A heuristic apparoach that we use to enhance LighGBM predictions here: <br>
1. Get a candidate set of top 500 most popular articles (by total purchase quanitity). <br>
2. Include the customer's predicitons to this set. <br>
3. Use lightGBM to predict the probability of purchases, and get the top 12. <br>

In [None]:
# dictionary 'user_products' that maps each user ID to a list of products they've purchased from the user-item matrix

user_products = {}
for user_idx in range(user_item_matrix.shape[0]):
    purchased_items = list(np.where(user_item_matrix[user_idx, :].toarray()[0] == 1)[0])
    user_products[user_idx] = purchased_items

In [38]:
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted

class PULearner(BaseEstimator, ClassifierMixin):
    def __init__(self, base_estimator, hold_out_ratio=0.1):
        self.base_estimator = base_estimator
        self.hold_out_ratio = hold_out_ratio

    def fit(self, X, y):
        X, y = check_X_y(X, y)
        n_pos = y.sum()
        n_unlabeled = y.size - n_pos
        n_neg_to_select = int(n_pos / (1 - self.hold_out_ratio)) - n_pos

        y_unlabeled = np.zeros(n_unlabeled)
        y_hold_out = np.concatenate([np.ones(n_pos), y_unlabeled])

        # shuffle the data and split it into the hold-out set and the remaining unlabeled set
        idx = np.random.permutation(y.size)
        X_hold_out, X_unlabeled = np.array_split(X[idx], [n_pos])
        _, y_hold_out = np.array_split(y[idx], [n_pos])
        _, y_unlabeled = np.array_split(y_hold_out, [n_pos])
        
        # fit the base estimator on the unlabeled set
        self.base_estimator.fit(X_unlabeled, y_unlabeled)

        # predict on the hold-out set and select the most confident negatives
        y_hold_out_pred = self.base_estimator.predict_proba(X_hold_out)[:, 1]
        y_hold_out_pred_neg = y_hold_out_pred[y_hold_out == 0]
        sorted_idx = np.argsort(y_hold_out_pred_neg)[::-1]
        neg_idx = sorted_idx[:n_neg_to_select]
        neg_mask = np.zeros_like(y_hold_out, dtype=bool)
        neg_mask[y_hold_out == 0][neg_idx] = True

        # set the hold-out negatives as unlabeled
        y_unlabeled[neg_mask] = -1

        # fit the base estimator on the new PU set
        self.base_estimator.fit(X, y_unlabeled)
        return self

    def predict(self, X):
        check_is_fitted(self)
        X = check_array(X)
        return self.base_estimator.predict(X)

    def predict_proba(self, X):
        check_is_fitted(self)
        X = check_array(X)
        return self.base_estimator.predict_proba(X)


In [40]:
# Initialize the PUClassifier with LightGBM as the base estimator
pu_model = PULearner(base_estimator=lgb.LGBMClassifier(**grid_search.best_params_))

# Fit the model on the training set
pu_model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = pu_model.predict_proba(X_test)[:, 1]

# Evaluate the performance of the model using AUC-PRC
auc_prc = pr_auc_score(y_test, y_pred)
print("AUC-PRC Score:", auc_prc)

ValueError: Input X contains NaN.

In [None]:
# Train the LightGBM model on X

#drop target and columns not in selected_features
X = final_df.drop(['target'], axis=1)
X = X[selected_features]
y = final_df['target']
lgb_model = lgb.LGBMClassifier(**grid_search.best_params_)
lgb_model.fit(X, y)

In [None]:
# returns set of most pupular products in the catalog

def select_popular_products(df, n_products=500):
    # Group the dataframe by user and product and sum the quantity for each group
    product_quantities = df.groupby(['user_index', 'item_index'])['quantity'].sum()
    # Sort the products by quantity in descending order and select the top n_products
    popular_products = product_quantities.groupby('item_index').sum().sort_values(ascending=False).index.tolist()[:n_products]
    # return only the unique item_index values
    return list(set(popular_products))

In [None]:
# Generate candidate products for each user
# This can be done using a combination of popular products and user purchase history

popular_products = select_popular_products(final_df, 500)
print(len(popular_products))
# print first 10 popular products
print(popular_products[:20])

for user_id in user_products:
    
    # Add user purchase history to candidate list
    user_history = user_products[user_id]
    candidate_products = list(set(popular_products + user_history))
    
    # Store candidate products for this user
    user_candidates[user_id] = candidate_products

In [None]:
test_final_df = final_df.copy()

test_final_df.groupby(['user_index', 'item_index'])[selected_features].mean().copy()

In [None]:
def create_user_data(user_id, candidates, selected_features):
    user_data = final_df.groupby(['user_index', 'item_index'])[selected_features].mean().copy()
    user_data.fillna(0, inplace=True)
    
    return user_data

In [None]:
# Predict probabilities of purchase for each candidate product for each user
user_scores = {}
for user_id, candidates in user_candidates.items():
    # Create input data for this user
    user_data = create_user_data(user_id, candidates, selected_features)
    
    # Predict probabilities using the LightGBM model -- slicing for prob of positive class, a.k.a. prob. of purchase
    scores = lgb_model.predict_proba(user_data)[:, 1]
    
    # Store scores for this user
    user_scores[user_id] = scores

In [None]:
# print first 2 users and their scores
for user_id, scores in list(user_scores.items())[:2]:
    print(user_id, scores)
    # print number of unique values in scores
    print(len(np.unique(scores)))

In [None]:
# Rank candidate products for each user and return top 12 as recommendations
recommendations = {}
for user_id, scores in user_scores.items():
    # Sort candidate products by descending score
    candidate_products = user_candidates[user_id]
    sorted_indices = np.argsort(scores)[::-1]
    sorted_products = [candidate_products[i] for i in sorted_indices]
    
    # Select top 12 products
    top_products = sorted_products[:12]
    
    # Add user purchase history to top products
    top_products += user_products[user_id]
    
    # Remove duplicates and return as recommendations
    recommendations[user_id] = list(set(top_products))

Since we are using MAP as the evaluation metric, we could also use the LightGBM ranking API instead of the binary classification API. 