# **<a id="Content">HnM RecSys Notebook 9417</a>**

## **<a id="Content">Table of Contents</a>**
* [**<span>1. Imports</span>**](#Imports)  
* [**<span>2. Pre-Processing</span>**](#Pre-Processing)
* [**<span>3. Popularity Model</span>**](#Popularity-model) 
* [**<span>4. Model Evaluation</span>**](#Model-Evaluation) 

## Imports

In [18]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import os
import re
import warnings
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
# import cudf # switch on P100 GPU for this to work in Kaggle
# import cupy as cp

# Importing data
articles = pd.read_csv('articles.csv')
print(articles.head())
print("--")
customers = pd.read_csv('customers.csv')
print(customers.head())
print("--")
transactions = pd.read_csv("transactions_train.csv")
print(transactions.head())
print("--")

   article_id  product_code          prod_name  product_type_no  \
0   108775015        108775          Strap top              253   
1   108775044        108775          Strap top              253   
2   108775051        108775      Strap top (1)              253   
3   110065001        110065  OP T-shirt (Idro)              306   
4   110065002        110065  OP T-shirt (Idro)              306   

  product_type_name  product_group_name  graphical_appearance_no  \
0          Vest top  Garment Upper body                  1010016   
1          Vest top  Garment Upper body                  1010016   
2          Vest top  Garment Upper body                  1010017   
3               Bra           Underwear                  1010016   
4               Bra           Underwear                  1010016   

  graphical_appearance_name  colour_group_code colour_group_name  ...  \
0                     Solid                  9             Black  ...   
1                     Solid               

## Pre-Processing

In [19]:
# ----- empty value stats -------------
print("Missing values: ")
print(customers.isnull().sum())
print("--\n")

print("FN Newsletter vals: ", customers['FN'].unique())
print("Active communication vals: ",customers['Active'].unique())
print("Club member status vals: ", customers['club_member_status'].unique())
print("Fashion News frequency vals: ", customers['fashion_news_frequency'].unique())
print("--\n")

# ---- data cleaning -------------

customers['FN'] = customers['FN'].fillna(0)
customers['Active'] = customers['Active'].fillna(0)

# replace club_member_status missing values with 'LEFT CLUB' --> no members with LEFT CLUB status in data
customers['club_member_status'] = customers['club_member_status'].fillna('LEFT CLUB')
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].fillna('None')
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].replace('NONE', 'None')
customers['age'] = customers['age'].fillna(customers['age'].mean())
customers['age'] = customers['age'].astype(int)
articles['detail_desc'] = articles['detail_desc'].fillna('None')


print("Customers' Missing values: ")
print(customers.isnull().sum())
print("--\n")

Missing values: 
customer_id                    0
FN                        895050
Active                    907576
club_member_status          6062
fashion_news_frequency     16009
age                        15861
postal_code                    0
dtype: int64
--

FN Newsletter vals:  [nan  1.]
Active communication vals:  [nan  1.]
Club member status vals:  ['ACTIVE' nan 'PRE-CREATE' 'LEFT CLUB']
Fashion News frequency vals:  ['NONE' 'Regularly' nan 'Monthly' 'None']
--

Customers' Missing values: 
customer_id               0
FN                        0
Active                    0
club_member_status        0
fashion_news_frequency    0
age                       0
postal_code               0
dtype: int64
--



In [20]:
# ---- memory optimizations -------------

# reference: https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65

# iterate through all the columns of a dataframe and reduce the int and float data types to the smallest possible size, ex. customer_id should not be reduced from int64 to a samller value as it would have collisions
import numpy as np
import pandas as pd

def reduce_mem_usage(df):
    """Iterate over all the columns of a DataFrame and modify the data type
    to reduce memory usage, handling ordered Categoricals"""
    
    # check the memory usage of the DataFrame
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type == 'category':
            if df[col].cat.ordered:
                # Convert ordered Categorical to an integer
                df[col] = df[col].cat.codes.astype('int16')
            else:
                # Convert unordered Categorical to a string
                df[col] = df[col].astype('str')
        
        elif col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min >= np.iinfo(np.int64).min and c_max <= np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    
    # check the memory usage after optimization
    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))

    # calculate the percentage of the memory usage reduction
    mem_reduction = 100 * (start_mem - end_mem) / start_mem
    print("Memory usage decreased by {:.1f}%".format(mem_reduction))
    
    return df

   

In [21]:
print("Articles Info: ")
print(articles.info())
print("Customer Info: ")
print(customers.info())
print("Transactions Info: ")
print(transactions.info())

Articles Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   article_id                    105542 non-null  int64 
 1   product_code                  105542 non-null  int64 
 2   prod_name                     105542 non-null  object
 3   product_type_no               105542 non-null  int64 
 4   product_type_name             105542 non-null  object
 5   product_group_name            105542 non-null  object
 6   graphical_appearance_no       105542 non-null  int64 
 7   graphical_appearance_name     105542 non-null  object
 8   colour_group_code             105542 non-null  int64 
 9   colour_group_name             105542 non-null  object
 10  perceived_colour_value_id     105542 non-null  int64 
 11  perceived_colour_value_name   105542 non-null  object
 12  perceived_colour_master_id    105542 non-n

In [22]:
# print unique values of customer columns
print("FN Newsletter vals: ", customers['FN'].unique())
print("Active communication vals: ",customers['Active'].unique())
print("Club member status vals: ", customers['club_member_status'].unique())
print("Fashion News frequency vals: ", customers['fashion_news_frequency'].unique())
print("--\n")

FN Newsletter vals:  [0. 1.]
Active communication vals:  [0. 1.]
Club member status vals:  ['ACTIVE' 'LEFT CLUB' 'PRE-CREATE']
Fashion News frequency vals:  ['None' 'Regularly' 'Monthly']
--



In [23]:
# explicitly convert club_member_status to ordinal values before mem optimization to avoid errors

customers['club_member_status'].replace({'LEFT CLUB': 0, 'PRE-CREATE': 1, 'ACTIVE': 2}, inplace=True)
customers['club_member_status'] = customers['club_member_status'].astype('int8')
print(customers['club_member_status'].unique())


[2 0 1]


In [24]:
# ---- memory optimizations -------------

# uses 8 bytes instead of given 64 byte string, reduces mem by 8x, 
# !!!! have to convert back before merging w/ sample_submissions.csv
# convert transactions['customer_id'] to 8 bytes int
# transactions['customer_id'] = transactions['customer_id'].astype('int64')
transactions['customer_id'] = transactions['customer_id'].apply(lambda x: int(x[-16:], 16)).astype('int64')
customers['customer_id'] = customers['customer_id'].apply(lambda x: int(x[-16:], 16)).astype('int64')

articles = reduce_mem_usage(articles)
customers = reduce_mem_usage(customers)
transactions = reduce_mem_usage(transactions)

# articles['article_id'] = articles['article_id'].astype('int32')
# transactions['article_id'] = transactions['article_id'].astype('int32') 
# # !!!! ADD LEADING ZERO BACK BEFORE SUBMISSION OF PREDICTIONS TO KAGGLE: 
# # Ex.: transactions['article_id'] = '0' + transactions.article_id.astype('str')

print("Articles Info: ")
print(articles.info())
print("Customer Info: ")
print(customers.info())
print("Transactions Info: ")
print(transactions.info())

Memory usage of dataframe is 20.13 MB
Memory usage after optimization is: 13.59 MB
Memory usage decreased by 32.5%
Memory usage of dataframe is 64.11 MB
Memory usage after optimization is: 39.25 MB
Memory usage decreased by 38.8%
Memory usage of dataframe is 1212.63 MB
Memory usage after optimization is: 697.26 MB
Memory usage decreased by 42.5%
Articles Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   article_id                    105542 non-null  int32 
 1   product_code                  105542 non-null  int32 
 2   prod_name                     105542 non-null  object
 3   product_type_no               105542 non-null  int16 
 4   product_type_name             105542 non-null  object
 5   product_group_name            105542 non-null  object
 6   graphical_appearance_no       105542 non-null  i

In [25]:
# print unique values of customer columns
print("FN Newsletter vals: ", customers['FN'].unique())
print("Active communication vals: ",customers['Active'].unique())
print("Club member status vals: ", customers['club_member_status'].unique())
print("Fashion News frequency vals: ", customers['fashion_news_frequency'].unique())
print("--\n")

FN Newsletter vals:  [0. 1.]
Active communication vals:  [0. 1.]
Club member status vals:  [2 0 1]
Fashion News frequency vals:  ['None' 'Regularly' 'Monthly']
--



In [26]:
# time-based splitting strategy

def split_train_val_data_and_drop_duplicates(transactions, days=7):
    """
    Splits the transaction training data into a training set and a validation set of 7 days to prevent data leakage.
    """
    
    transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])
    transactions = transactions.sort_values(by=['t_dat'])
    latest_transaction_date = transactions['t_dat'].max()
    
    training_set = transactions[transactions['t_dat'] < latest_transaction_date - pd.Timedelta(days=days)]
    validation_set = transactions[transactions['t_dat'] >= latest_transaction_date - pd.Timedelta(days=days)]
    
    print("Training set size:", len(training_set))
    print("Validation set size:", len(validation_set))
    print("Last date in training set:", training_set['t_dat'].max())
    print("Last date in validation set:", validation_set['t_dat'].max())

    # drop duplicate rows
    training_set = training_set.drop_duplicates().copy()
    validation_set = validation_set.drop_duplicates().copy()
    
    return training_set, validation_set

In [27]:
def preprocess_data(transactions_df, customers_df, articles_df, customers_col='customer_id', articles_col='article_id'):
    """
    Preprocesses customer and article IDs for use in a sparse matrix.
    
    Returns:
    - transactions_df: the input transaction DataFrame with two additional columns, 'user_index' and 'item_index',
                       that map customer and article IDs to their corresponding indices in a sparse matrix
    - customer_id_indices_map: a dictionary that maps customer IDs to their corresponding indices
    - article_id_indices_map: a dictionary that maps article IDs to their corresponding indices
    """
    # Create a list of unique customer IDs and product IDs
    all_customers = customers_df[customers_col].unique().tolist()
    all_articles = articles_df[articles_col].unique().tolist()

    # Create dicts mapping IDs to their corresponding indices
    customer_id_indices_map = {customer_id: i for i, customer_id in enumerate(all_customers)}
    article_id_indices_map = {article_id: i for i, article_id in enumerate(all_articles)}

    # Map customer and article IDs to their resp. indices in the transaction DataFrame
    transactions_df['user_index'] = transactions_df[customers_col].map(customer_id_indices_map)
    transactions_df['item_index'] = transactions_df[articles_col].map(article_id_indices_map)

    return transactions_df, all_customers, all_articles, customer_id_indices_map, article_id_indices_map

In [28]:
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,-6846340800584936,663713001,0.050842,2
1,2018-09-20,-6846340800584936,541518023,0.030487,2
2,2018-09-20,-8334631767138808638,505221004,0.015236,2
3,2018-09-20,-8334631767138808638,685687003,0.016937,2
4,2018-09-20,-8334631767138808638,685687004,0.016937,2


In [29]:
# get top 200 customers by number of transactions
top_customers = transactions['customer_id'].value_counts().head(200).index.tolist()

# print num of transactions for the 200th customer
print(transactions['customer_id'].value_counts().sort_values(ascending=False).iloc[199])

# only get articles that were purchased by top 200 customers at least once in articles df
articles_top_200 = articles[articles['article_id'].isin(transactions[transactions['customer_id'].isin(top_customers)]['article_id'].unique())]

# only get 200 customers in customers df
customers_top_200 = customers[customers['customer_id'].isin(top_customers)]

articles = articles_top_200.copy()
customers = customers_top_200.copy()
transactions = transactions[transactions['customer_id'].isin(top_customers)].copy()
transactions = transactions.drop_duplicates().copy()

622


In [30]:
print(transactions.isnull().sum())
print(customers.isnull().sum())
print(articles.isnull().sum())

t_dat               0
customer_id         0
article_id          0
price               0
sales_channel_id    0
dtype: int64
customer_id               0
FN                        0
Active                    0
club_member_status        0
fashion_news_frequency    0
age                       0
postal_code               0
dtype: int64
article_id                      0
product_code                    0
prod_name                       0
product_type_no                 0
product_type_name               0
product_group_name              0
graphical_appearance_no         0
graphical_appearance_name       0
colour_group_code               0
colour_group_name               0
perceived_colour_value_id       0
perceived_colour_value_name     0
perceived_colour_master_id      0
perceived_colour_master_name    0
department_no                   0
department_name                 0
index_code                      0
index_name                      0
index_group_no                  0
index_group_name      

In [31]:
print(len(transactions))
print(len(customers))
print(len(articles))


126622
200
38919


In [32]:
# Dropping columns with uninformative article data

articles = articles.drop(columns=['product_code', 'product_type_name', 'product_group_name', 'graphical_appearance_name', 'department_name', 'index_name', 'index_group_name', 'section_name', 'garment_group_name', 'detail_desc'])
articles = articles.drop(columns=[col for col in articles.columns if 'colour_' in col or 'perceived_' in col])

In [33]:
transactions, all_customers, all_articles, customer_id_indices_map, article_id_indices_map = preprocess_data(transactions, customers, articles)

print("Total num of customers: ", len(all_customers))
print("Total num of articles: ", len(all_articles))
print("Customer ID mapping: ", list(customer_id_indices_map.items())[:5])
print("Article ID mapping: ", list(article_id_indices_map.items())[:5])
transactions.head()

Total num of customers:  200
Total num of articles:  38919
Customer ID mapping:  [(1249760199313500820, 0), (3862718111684591643, 1), (-8098965676522405228, 2), (8346339317755757908, 3), (-7779445982753353194, 4)]
Article ID mapping:  [(108775015, 0), (108775044, 1), (110065001, 2), (110065002, 3), (110065011, 4)]


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,user_index,item_index
740,2018-09-20,1135991499650384534,668766002,0.042358,2,5,11563
741,2018-09-20,1135991499650384534,652946001,0.050842,2,5,9899
742,2018-09-20,1135991499650384534,691275008,0.06781,2,5,14438
1260,2018-09-20,5085370976430926408,657476001,0.016937,2,10,10307
1261,2018-09-20,5085370976430926408,685687003,0.016937,2,10,13608


## Popularity-Model


To validate that the other 4 models are all performing better than random chance, we will use the `popularity model` as a `baseline` model, which simply recommends `last week's top 12 items` to each customer. 

This will also be the model used to solved the `cold start` problem, where a customer who hasn't made any purchases yet is simply recommended the top 12 items. This may not be the most performant approach, but it is a practical approach given that fashion is seasonal and clothes/articles tend to go 'out of style' fairly quickly. This will also be used to fill up the 12 articles if other models cannot return 12 recommendations for a customer.

Other recommendation strategies suitable for solving the <br>
1.) cold start problem <br>
2.) situation where the customer has made less than 12 purchases <br>
3.) situation where the other models recommend less than 12 articles,<br> include: <br>

- recommend items previously purchased by the customer
- recommend items that are frequently bought together with the customer's previous purchase/s

In [61]:
# get latest transaction date
latest_transaction_date = transactions['t_dat'].max()
latest_transaction_date = pd.to_datetime(latest_transaction_date)
latest_transaction_date

Timestamp('2020-09-22 00:00:00')

In [38]:
# Getting the latest week's top 12 products

# Merge transactions with articles dataframe to get product names
transactions_with_names = pd.merge(transactions, articles[['article_id', 'prod_name']], on='article_id')

# Filter transactions to only include the latest week
transactions_with_names['t_dat'] = pd.to_datetime(transactions_with_names['t_dat'])
latest_week_transactions = transactions_with_names[transactions_with_names['t_dat'] >= latest_transaction_date - pd.Timedelta(days=7)]

# Get the top 12 products purchased during the latest week
latest_top_12_products = latest_week_transactions['article_id'].value_counts().head(12)

print("Last week's 12 most popular articles: \n")
print('{:<15}{:<30}{:<25}'.format('Article ID', 'Product Name', 'Purchases'))
print('-'*55)
for article_id, count in latest_top_12_products.items():
    prod_name = latest_week_transactions.loc[latest_week_transactions['article_id']==article_id, 'prod_name'].iloc[0]
    print('{:<15}{:<30}{:<25}'.format(article_id, prod_name, count))

Last week's 12 most popular articles: 

Article ID     Product Name                  Purchases                
-------------------------------------------------------
919273002      Lucien lace                   6                        
903840002      Lafayette                     5                        
906744001      Tartt wedge                   4                        
889550002      AUSSIE SHEFFIELD LONG PUFFER  4                        
915529005      Liliana                       4                        
871710012      SPEED BRUNO  SHIRT            4                        
918525001      Frill cable                   4                        
929165002      Danila                        3                        
903516001      PETAR SWEATSHIRT 2PK          3                        
870999004      Throne thin puffer            3                        
893059004      Stacey                        3                        
903487001      Millo PQ Loafer               3      

### Model Training

In [41]:
# Split the data into training and testing sets
training_set, validation_set = split_train_val_data_and_drop_duplicates(transactions)

Training set size: 125543
Validation set size: 1079
Last date in training set: 2020-09-14 00:00:00
Last date in validation set: 2020-09-22 00:00:00


In [42]:
training_set.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,user_index,item_index
740,2018-09-20,1135991499650384534,668766002,0.042358,2,5,11563
33518,2018-09-20,-4929536485466463532,636900001,0.06781,2,139,8488
33519,2018-09-20,-4929536485466463532,609727003,0.030487,2,139,6231
33520,2018-09-20,-4929536485466463532,692454002,0.025406,2,139,14568
34770,2018-09-20,-8298629768350782975,667378001,0.031052,2,147,11460


In [43]:
validation_set.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,user_index,item_index
31538998,2020-09-15,3407358910964148684,912208002,0.06781,2,128,38298
31538357,2020-09-15,-6453664902838328333,909357001,0.06781,2,121,38187
31538750,2020-09-15,-8267779422716946985,924243002,0.042358,2,124,38667
31537622,2020-09-15,3976750382053506539,862271001,0.030487,2,113,34640
31538997,2020-09-15,3407358910964148684,928040001,0.06781,2,128,38750


### Model Evaluation

In [62]:
def evaluate_map12(transactions, customers, articles, latest_top_12_products, validation_set):
    # Merge transactions with articles dataframe to get product names
    transactions_with_names = pd.merge(transactions, articles[['article_id', 'prod_name']], on='article_id')

    # Filter transactions to only include the latest week
    latest_week_transactions = transactions_with_names[transactions_with_names['t_dat'] >= transactions_with_names['t_dat'].max() - pd.Timedelta(days=7)]

    # Get the top 12 products purchased during the latest week
    latest_top_12_products = latest_week_transactions['article_id'].value_counts().head(12)

    # Filter validation set to only include customers who have purchased articles
    customer_ids = transactions['customer_id'].unique()
    validation_set = validation_set[validation_set['customer_id'].isin(customer_ids)]

    # Create ground truth dictionary to store purchased articles for each customer in the validation set
    purchased_articles = {}
    for customer_id in validation_set['customer_id'].unique():
        articles = transactions[transactions['customer_id']==customer_id]['article_id'].unique()
        purchased_articles[customer_id] = set(articles)

    # Create recommended articles dictionary to store latest top 12 products for each customer in the validation set
    recommended_articles = {}
    for customer_id in validation_set['customer_id'].unique():
        recommended_articles[customer_id] = latest_top_12_products.index.tolist()

    # Calculate average precision @ 12 for each customer in the validation set
    ap_list = []
    for customer_id in validation_set['customer_id'].unique():
        purchased = purchased_articles[customer_id]
        recommended = set(recommended_articles[customer_id])
        relevant_recommended = purchased.intersection(recommended)
        if len(relevant_recommended) > 0:
            ap = sum([(i+1)/(j+1) for i,j in enumerate(range(12)) if i in relevant_recommended])/min(len(relevant_recommended), 12)
            ap_list.append(ap)
        else:
            ap_list.append(0)

    # Calculate mean average precision @ 12 and return
    map12 = np.mean(ap_list)
    return map12

In [63]:
map12 = evaluate_map12(transactions, customers, articles, latest_top_12_products, validation_set)

In [64]:
map12

0.0