# **<a id="Content">HnM RecSys Notebook 9417</a>**

## **<a id="Content">Table of Contents</a>**
* [**<span>1. Imports</span>**](#Imports)  
* [**<span>2. Pre-Processing</span>**](#Pre-Processing)
* [**<span>3. Exploratory Data Analysis</span>**](#Exploratory-Data-Analysis)  
    * [**<span>3.1 Articles</span>**](#EDA::Articles)  
    * [**<span>3.2 Customers</span>**](#EDA::Customers)
    * [**<span>3.3 Transactions</span>**](#EDA::Transactions)
* [**<span>4. Helper Functions / Decorators</span>**](#Helper-Functions)
* [**<span>5. Models</span>**](#Models) 
    * [**<span>5.1 Popularity</span>**](#Popularity-Model)   
    * [**<span>5.2 ALS</span>**](#Alternating-Least-Squares)  
    * [**<span>5.3 GBDT</span>**](#GBDT)  
    * [**<span>5.4 SGD/similar</span>**](#SGD)  
    * [**<span>5.5 NN</span>**](#NN)

## Imports

In [71]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import os
import re
import warnings
# import cudf # switch on P100 GPU for this to work in Kaggle
# import cupy as cp

# Load the three competition tables from the working directory and preview
# the first rows of each one, followed by a "--" separator line.
articles = pd.read_csv('articles.csv')
print(articles.head(), "--", sep="\n")
customers = pd.read_csv('customers.csv')
print(customers.head(), "--", sep="\n")
transactions = pd.read_csv("transactions_train.csv")
print(transactions.head(), "--", sep="\n")

   article_id  product_code          prod_name  product_type_no   
0   108775015        108775          Strap top              253  \
1   108775044        108775          Strap top              253   
2   108775051        108775      Strap top (1)              253   
3   110065001        110065  OP T-shirt (Idro)              306   
4   110065002        110065  OP T-shirt (Idro)              306   

  product_type_name  product_group_name  graphical_appearance_no   
0          Vest top  Garment Upper body                  1010016  \
1          Vest top  Garment Upper body                  1010016   
2          Vest top  Garment Upper body                  1010017   
3               Bra           Underwear                  1010016   
4               Bra           Underwear                  1010016   

  graphical_appearance_name  colour_group_code colour_group_name  ...   
0                     Solid                  9             Black  ...  \
1                     Solid               

## Pre-Processing

In [72]:
# ----- empty value stats -------------
# Per-column count of missing values before any cleaning
print("Missing values: ")
print(customers.isnull().sum())
print("--\n")

# Raw value sets of the low-cardinality customer columns
for _label, _column in (
    ("FN Newsletter vals: ", 'FN'),
    ("Active communication vals: ", 'Active'),
    ("Club member status vals: ", 'club_member_status'),
    ("Fashion News frequency vals: ", 'fashion_news_frequency'),
):
    print(_label, customers[_column].unique())
print("--\n")

# ---- data cleaning -------------

# Missing FN / Active flags become 0; missing categorical values get a default label.
# NOTE(review): missing club_member_status is filled with 'LEFT CLUB'. The recorded
# output of this cell lists 'LEFT CLUB' among the existing statuses, so after this
# step unknown statuses cannot be told apart from genuine 'LEFT CLUB' members.
for _column, _fill in (
    ('FN', 0),
    ('Active', 0),
    ('club_member_status', 'LEFT CLUB'),
    ('fashion_news_frequency', 'None'),
):
    customers[_column] = customers[_column].fillna(_fill)

# Fold the upper-case 'NONE' spelling into the 'None' label used for missing values
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].replace('NONE', 'None')

# Missing ages take the mean age; the column is then converted to int
customers['age'] = customers['age'].fillna(customers['age'].mean()).astype(int)

# Articles without a description get the placeholder text 'None'
articles['detail_desc'] = articles['detail_desc'].fillna('None')


# Re-check: per-column count of missing values after cleaning
print("Customers' Missing values: ")
print(customers.isnull().sum())
print("--\n")

Missing values: 
customer_id                    0
FN                        895050
Active                    907576
club_member_status          6062
fashion_news_frequency     16011
age                        15861
postal_code                    0
dtype: int64
--

FN Newsletter vals:  [nan  1.]
Active communication vals:  [nan  1.]
Club member status vals:  ['ACTIVE' nan 'PRE-CREATE' 'LEFT CLUB']
Fashion News frequency vals:  ['NONE' 'Regularly' nan 'Monthly']
--

Customers' Missing values: 
customer_id               0
FN                        0
Active                    0
club_member_status        0
fashion_news_frequency    0
age                       0
postal_code               0
dtype: int64
--



In [73]:
# ---- memory optimizations -------------

# reference: https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65

# iterate through all the columns of a dataframe and reduce the int and float data types to the smallest size that still holds their values, e.g. customer_id should not be reduced from int64 to a smaller type as its values would collide
import numpy as np
import pandas as pd

def reduce_mem_usage(df, use_float16=True):
    """Downcast the columns of ``df`` in place to reduce its memory usage.

    Signed integers, unsigned integers and floats are each moved to the
    narrowest type of the same family whose range covers the column's min
    and max. Ordered categoricals are replaced by their int16 codes and
    unordered categoricals by strings. Every other dtype (bool, datetime,
    timedelta, object, pandas extension dtypes) is left untouched, and no
    column is ever widened.

    Args:
        df: DataFrame to shrink; its columns are reassigned in place.
        use_float16: when True (the default, matching the historical
            behaviour) floats may be stored as float16. The check is on
            range only, so float16 keeps roughly 3 significant digits;
            pass False to stop at float32.

    Returns:
        The same DataFrame object that was passed in.
    """
    # check the memory usage of the DataFrame
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))

    float_types = (np.float16, np.float32) if use_float16 else (np.float32,)
    # dtype.kind -> (candidate types from narrowest to widest, range lookup)
    families = {
        'i': ((np.int8, np.int16, np.int32, np.int64), np.iinfo),
        'u': ((np.uint8, np.uint16, np.uint32, np.uint64), np.iinfo),
        'f': (float_types, np.finfo),
    }

    for col in df.columns:
        col_type = df[col].dtype

        if isinstance(col_type, pd.CategoricalDtype):
            if df[col].cat.ordered:
                # Convert ordered Categorical to an integer
                df[col] = df[col].cat.codes.astype('int16')
            else:
                # Convert unordered Categorical to a string
                df[col] = df[col].astype('str')
            continue

        # Dispatch on the NumPy kind rather than the dtype's name, so bool,
        # datetime and extension dtypes are skipped instead of being pushed
        # through the float branch.
        if not isinstance(col_type, np.dtype) or col_type.kind not in families:
            continue

        candidates, limits = families[col_type.kind]
        c_min = df[col].min()
        c_max = df[col].max()
        for candidate in candidates:
            bounds = limits(candidate)
            # NaN min/max (empty or all-NaN column) fails every comparison,
            # which leaves the column unchanged.
            if c_min >= bounds.min and c_max <= bounds.max:
                if np.dtype(candidate).itemsize < col_type.itemsize:
                    df[col] = df[col].astype(candidate)
                break

    # check the memory usage after optimization
    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))

    # calculate the percentage of the memory usage reduction
    mem_reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
    print("Memory usage decreased by {:.1f}%".format(mem_reduction))

    return df

   

In [74]:
# Print a titled schema / non-null / dtype summary for each of the three tables
for _title, _frame in (
    ("Articles Info: ", articles),
    ("Customer Info: ", customers),
    ("Transactions Info: ", transactions),
):
    print(_title)
    print(_frame.info())

Articles Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   article_id                    105542 non-null  int64 
 1   product_code                  105542 non-null  int64 
 2   prod_name                     105542 non-null  object
 3   product_type_no               105542 non-null  int64 
 4   product_type_name             105542 non-null  object
 5   product_group_name            105542 non-null  object
 6   graphical_appearance_no       105542 non-null  int64 
 7   graphical_appearance_name     105542 non-null  object
 8   colour_group_code             105542 non-null  int64 
 9   colour_group_name             105542 non-null  object
 10  perceived_colour_value_id     105542 non-null  int64 
 11  perceived_colour_value_name   105542 non-null  object
 12  perceived_colour_master_id    105542 non-n

In [75]:
# Show the distinct values currently held by each low-cardinality customer column
for _label, _column in (
    ("FN Newsletter vals: ", 'FN'),
    ("Active communication vals: ", 'Active'),
    ("Club member status vals: ", 'club_member_status'),
    ("Fashion News frequency vals: ", 'fashion_news_frequency'),
):
    print(_label, customers[_column].unique())
print("--\n")

FN Newsletter vals:  [0. 1.]
Active communication vals:  [0. 1.]
Club member status vals:  ['ACTIVE' 'LEFT CLUB' 'PRE-CREATE']
Fashion News frequency vals:  ['None' 'Regularly' 'Monthly']
--



In [76]:
# explicitly convert club_member_status to ordinal values before mem optimization to avoid errors
# (LEFT CLUB -> 0, PRE-CREATE -> 1, ACTIVE -> 2), stored as int8.
#
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on the column selection: an in-place call on
# `customers[...]` is a chained assignment, which pandas warns about and which
# no longer updates `customers` when Copy-on-Write is enabled.
customers['club_member_status'] = (
    customers['club_member_status']
    .replace({'LEFT CLUB': 0, 'PRE-CREATE': 1, 'ACTIVE': 2})
    .astype('int8')
)
print(customers['club_member_status'].unique())


[2 0 1]


In [77]:
# ---- memory optimizations -------------

# Keep only the last 16 hex digits of each customer_id and store them as an int64:
# 8 bytes per id instead of the given 64 byte string, reduces mem by 8x.
# !!!! have to convert back before merging w/ sample_submissions.csv
for _frame in (transactions, customers):
    _frame['customer_id'] = _frame['customer_id'].apply(lambda x: int(x[-16:], 16)).astype('int64')

# Downcast every table; the helper prints before/after memory figures for each one
articles, customers, transactions = [
    reduce_mem_usage(_table) for _table in (articles, customers, transactions)
]

# article_id is kept numeric here (an explicit int32 cast was tried and left disabled).
# !!!! ADD LEADING ZERO BACK BEFORE SUBMISSION OF PREDICTIONS TO KAGGLE:
# Ex.: transactions['article_id'] = '0' + transactions.article_id.astype('str')

# Print the resulting schema of each table
for _title, _table in (
    ("Articles Info: ", articles),
    ("Customer Info: ", customers),
    ("Transactions Info: ", transactions),
):
    print(_title)
    print(_table.info())

Memory usage of dataframe is 20.13 MB
Memory usage after optimization is: 13.59 MB
Memory usage decreased by 32.5%
Memory usage of dataframe is 58.88 MB
Memory usage after optimization is: 39.25 MB
Memory usage decreased by 33.3%
Memory usage of dataframe is 1212.63 MB
Memory usage after optimization is: 697.26 MB
Memory usage decreased by 42.5%
Articles Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   article_id                    105542 non-null  int32 
 1   product_code                  105542 non-null  int32 
 2   prod_name                     105542 non-null  object
 3   product_type_no               105542 non-null  int16 
 4   product_type_name             105542 non-null  object
 5   product_group_name            105542 non-null  object
 6   graphical_appearance_no       105542 non-null  i

In [56]:
# List the distinct values of each low-cardinality customer column
_customer_value_checks = {
    "FN Newsletter vals: ": 'FN',
    "Active communication vals: ": 'Active',
    "Club member status vals: ": 'club_member_status',
    "Fashion News frequency vals: ": 'fashion_news_frequency',
}
for _label, _column in _customer_value_checks.items():
    print(_label, customers[_column].unique())
print("--\n")

FN Newsletter vals:  [0. 1.]
Active communication vals:  [0. 1.]
Club member status vals:  [0 1 2]
Fashion News frequency vals:  ['None' 'Regularly' 'Monthly']
--



## LightGBM

|Feature|LightGBM|XGBoost|CatBoost|
|:----|:----|:----|:----|
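|Categoricals|Handles categorical features natively (`categorical_feature` / pandas `category` dtype); one-hot encoding is not required|Traditionally needs encoded input (e.g. one-hot); newer versions offer native support via `enable_categorical`|Automatically handles categorical features using ordered target statistics|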
|Speed|Very fast training and prediction|Fast training and prediction|Slower than LightGBM and XGBoost|
|Handling Bias|Handles unbalanced classes via 'is_unbalance'|Handles unbalanced classes via 'scale_pos_weight'|Automatically handles unbalanced classes|
|Handling NaNs|Handles NaN values natively|Handles NaN values natively (learns a default split direction for missing values)|Automatically handles NaN values using special category|
|Custom Loss|Supports custom loss functions|Supports custom loss functions|Supports custom loss functions|


- Perform feature engineering using one-hot encoding or label encoding to encode the categorical features in the dataset.<br>
- Try different feature selection techniques, such as Recursive Feature Elimination (RFE) or SelectKBest, to select a smaller subset of features for the model.<br>
- Deal with class imbalance by adjusting the weights of the samples in the training set. Use the `compute_class_weight` function from scikit-learn to calculate the weights based on the class distribution and pass them as the `weight` parameter when creating the LightGBM datasets.<br>


- Use more advanced feature selection techniques such as feature importance analysis provided by LightGBM or PCA to reduce the dimensionality of the dataset and remove any multicollinearity. (??) <br> 
- Split the data into train and test sets using a time-based split based on the transaction date to avoid data leakage.<br>
  
- Train a LightGBM model on the training data using the selected features.<br>
  
- Experiment with different evaluation metrics to find the most appropriate one for your specific use case. For example, you could use the area under the ROC curve (AUC) or the F1-score if MAP does not perform well.<br>
- Use a time series cross-validation strategy to find the best hyperparameters for your model. This can be achieved using the TimeSeriesSplit function from scikit-learn instead of the default k-fold cross-validation.<br>
- Try different hyperparameters for the LightGBM model, such as the learning rate, number of estimators, max depth, etc., and use cross-validation to select the best combination of hyperparameters. (OR) <br> 
  - Bayesian optimization/Hyperopt to more efficiently search the hyperparameter space and find the optimal combination of hyperparameters.<br>
  
- Evaluate the model's performance on the validation set using mean average precision (MAP) as the evaluation metric (or AUC / F1?).<br>
- Once you have selected the best hyperparameters, train the final LightGBM model on the entire dataset using the selected features and hyperparameters.<br>
- Save the trained model for future use.<br>

In [54]:
# LightGBM imports

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import make_scorer
import lightgbm as lgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [91]:
# Dropping columns with uninformative data: the product code, the product name,
# the listed *_name label columns and the free-text description.
_uninformative_article_columns = [
    'product_code',
    'prod_name',
    'product_type_name',
    'product_group_name',
    'graphical_appearance_name',
    'department_name',
    'index_name',
    'index_group_name',
    'section_name',
    'garment_group_name',
    'detail_desc',
]
articles = articles.drop(columns=_uninformative_article_columns)

In [94]:
# Display the first rows of the article table to check which columns remain
articles.head()

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,department_no,index_code,index_group_no,section_no,garment_group_no
0,108775015,253,1010016,1676,A,1,16,1002
1,108775044,253,1010016,1676,A,1,16,1002
2,108775051,253,1010017,1676,A,1,16,1002
3,110065001,306,1010016,1339,B,1,61,1017
4,110065002,306,1010016,1339,B,1,61,1017


These columns are kept to capture any potential patterns in the remaining features, such as how certain index codes or sections might be associated with higher or lower sales.

In [107]:
# Remove the postal_code column from the customer table, then preview the result
customers = customers.drop(columns=['postal_code'])
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age
0,6883939031699146327,0.0,0.0,2,0,49
1,-7200416642310594310,0.0,0.0,2,0,25
2,-6846340800584936,0.0,0.0,2,0,24
3,-94071612138601410,0.0,0.0,2,0,54
4,-283965518499174310,1.0,1.0,2,0,52


In [108]:
# Display the first rows of the transaction table
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_0,sales_channel_1
0,2018-09-20,-6846340800584936,663713001,0.050842,0.0,1.0
1,2018-09-20,-6846340800584936,541518023,0.030487,0.0,1.0
2,2018-09-20,-8334631767138808638,505221004,0.015236,0.0,1.0
3,2018-09-20,-8334631767138808638,685687003,0.016937,0.0,1.0
4,2018-09-20,-8334631767138808638,685687004,0.016937,0.0,1.0


In [83]:
# Feature engineering: turn fashion_news_frequency into integer codes
from sklearn.preprocessing import LabelEncoder

# Explicit integer code for each frequency level
fashion_news_freq_mapping = {'None': 0, 'Monthly': 1, 'Regularly': 2}

# Apply the explicit mapping, then pass the mapped values through a LabelEncoder.
# NOTE(review): the LabelEncoder step looks redundant once the values are already
# integer codes, and it could renumber them if a level is absent -- confirm it is needed.
le = LabelEncoder()
customers['fashion_news_frequency'] = le.fit_transform(
    customers['fashion_news_frequency'].map(fashion_news_freq_mapping)
)

In [86]:
# Feature engineering: one-hot encode sales_channel_id into indicator columns
ohe = OneHotEncoder()
_channel_ids = transactions['sales_channel_id'].values.reshape(-1, 1)
_channel_matrix = ohe.fit_transform(_channel_ids).toarray()

# Indicator columns are named by their position in the encoder output
# (sales_channel_0, sales_channel_1, ...), not by the original channel id.
sales_channel_encoded = pd.DataFrame(
    _channel_matrix,
    columns=['sales_channel_' + str(i) for i in range(_channel_matrix.shape[1])],
)

# Append the indicators next to the existing columns and drop the raw id column
transactions = pd.concat([transactions, sales_channel_encoded], axis=1)
transactions = transactions.drop(columns='sales_channel_id')

In [112]:
# Count: We can create features based on the count of user-item and user-category interactions in the last week/month/season/same week of last year/all. We can also use time-weighted counts to give more weight to recent interactions.

# Time: We can create features based on the first and last days of transactions for each user.

# Mean/Max/Min: We can create features by aggregating age, price, and sales channel ID information for each user. For example, we can calculate the mean, max, and min age of users who have purchased a particular item.

# Difference/Ratio: We can create features by calculating the difference between a user's age and the mean age of users who have purchased a particular item. We can also calculate the ratio of a user's purchased item count and the item's count.

# Similarity: We can create features based on collaborative filtering scores of item-to-item and user-to-item interactions. We can also use cosine similarity scores of item-to-item based on word2vec embeddings, and cosine similarity scores of user-to-item based on ProNE embeddings.

In [None]:
# encode categorical features in transactions
# Left-join article attributes onto every transaction row via article_id.
# NOTE(review): `color_columns` is not defined in the visible cells -- confirm it
# exists before running this cell.
transactions = pd.merge(transactions, articles[['article_id'] + color_columns], on='article_id', how='left')
# Left-join customer attributes onto every transaction row via customer_id.
# NOTE(review): `fashion_news_freq_encoded`, `club_member_status_encoded` and a
# 'department' column on `customers` are not created in the visible cells -- verify.
transactions = pd.merge(transactions, customers[['customer_id', 'department', 'club_member_status', 'age'] + list(fashion_news_freq_encoded.columns) + list(club_member_status_encoded.columns)], on='customer_id', how='left')

# Define feature selection and evaluation functions for hyperparameter tuning
def objective(params):
    """Select features with RFE, fit a LightGBM model and return a result dict.

    Reads the module-level ``transactions`` and ``articles`` frames.

    Args:
        params: dict of settings. It is unpacked into ``lgb.LGBMRegressor`` and
            passed to ``lgb.train``; its ``'n_features'`` and ``'step'`` entries
            are also read to configure ``RFE``.

    Returns:
        A dict with the keys ``'loss'`` and ``'status'`` (``STATUS_OK``).

    NOTE(review): this looks like an unfinished draft; see the inline review
    notes before relying on it as a tuning objective.
    """
    # Feature selection: use RFE to select top N features
    # Every column except t_dat and article_id is a candidate feature; the raw
    # article_id is the regression target -- NOTE(review): confirm this is intended.
    X = transactions.drop(['t_dat', 'article_id'], axis=1)
    y = transactions['article_id']
    model = lgb.LGBMRegressor(**params)
    selector = RFE(model, n_features_to_select=params['n_features'], step=params['step'])
    selector = selector.fit(X, y)
    # Keep only the columns RFE marked as selected
    X = X[X.columns[selector.support_]]
    
    # Train/test split: use time-based split
    # The last week before the latest t_dat becomes the validation window.
    # NOTE(review): assumes t_dat is datetime-typed; the visible cells load it with
    # read_csv without date parsing -- verify.
    train_end_date = transactions['t_dat'].max() - pd.DateOffset(weeks=1)
    X_train = X[transactions['t_dat'] <= train_end_date]
    y_train = y[transactions['t_dat'] <= train_end_date]
    X_val = X[transactions['t_dat'] > train_end_date]
    
    # Create class weights
    # NOTE(review): compute_class_weight is not imported in the visible cells, and
    # recent scikit-learn versions may require classes= and y= as keywords -- verify.
    class_weights = compute_class_weight('balanced', np.unique(y_train), y_train)
    # dict(enumerate(...)) keys the weights by position 0..n-1, while the lookup
    # below is by y_train value -- NOTE(review): presumably should be keyed by class label.
    class_weights = dict(enumerate(class_weights))
    
    # Train a LightGBM model
    dtrain = lgb.Dataset(X_train, label=y_train, weight=y_train.map(class_weights))
    model = lgb.train(params, dtrain)
    
    # Predict recommended articles
    y_pred = model.predict(X_val)
    # The positions of the 10 largest predictions are used to pick rows of
    # `articles` -- NOTE(review): these look like positions in X_val, not in `articles`; confirm.
    top_articles = articles.iloc[y_pred.argsort()[-10:][::-1]]['article_id'].tolist()
    # NOTE(review): top_articles is a list, and -1 * list evaluates to an empty
    # list, so 'loss' is [] here rather than a numeric score.
    return {'loss': -1 * top_articles, 'status': STATUS_OK}
