# **<a id="Content">HnM RecSys Notebook 9417</a>**

## **<a id="Content">Table of Contents</a>**
* [**<span>1. Imports</span>**](#Imports)  
* [**<span>2. Pre-Processing</span>**](#Pre-Processing)  
* [**<span>3. Feature Engineering</span>**](#Feature-Engineering)


## Imports

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import os
import re
import warnings
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
# import cudf # switch on P100 GPU for this to work in Kaggle
# import cupy as cp

# Importing data
articles = pd.read_csv('articles.csv')
print(articles.head())
print("--")
customers = pd.read_csv('customers.csv')
print(customers.head())
print("--")
transactions = pd.read_csv("transactions_train.csv")
print(transactions.head())
print("--")

   article_id  product_code          prod_name  product_type_no   
0   108775015        108775          Strap top              253  \
1   108775044        108775          Strap top              253   
2   108775051        108775      Strap top (1)              253   
3   110065001        110065  OP T-shirt (Idro)              306   
4   110065002        110065  OP T-shirt (Idro)              306   

  product_type_name  product_group_name  graphical_appearance_no   
0          Vest top  Garment Upper body                  1010016  \
1          Vest top  Garment Upper body                  1010016   
2          Vest top  Garment Upper body                  1010017   
3               Bra           Underwear                  1010016   
4               Bra           Underwear                  1010016   

  graphical_appearance_name  colour_group_code colour_group_name  ...   
0                     Solid                  9             Black  ...  \
1                     Solid               

## Pre-Processing

In [2]:
# ----- empty value stats -------------
print("Missing values: ")
print(customers.isnull().sum())
print("--\n")

print("FN Newsletter vals: ", customers['FN'].unique())
print("Active communication vals: ",customers['Active'].unique())
print("Club member status vals: ", customers['club_member_status'].unique())
print("Fashion News frequency vals: ", customers['fashion_news_frequency'].unique())
print("--\n")

# ---- data cleaning -------------

customers['FN'] = customers['FN'].fillna(0)
customers['Active'] = customers['Active'].fillna(0)

# replace club_member_status missing values with 'LEFT CLUB' --> no members with LEFT CLUB status in data
customers['club_member_status'] = customers['club_member_status'].fillna('LEFT CLUB')
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].fillna('None')
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].replace('NONE', 'None')
customers['age'] = customers['age'].fillna(customers['age'].mean())
customers['age'] = customers['age'].astype(int)
articles['detail_desc'] = articles['detail_desc'].fillna('None')


print("Customers' Missing values: ")
print(customers.isnull().sum())
print("--\n")

Missing values: 
customer_id                    0
FN                        895050
Active                    907576
club_member_status          6062
fashion_news_frequency     16011
age                        15861
postal_code                    0
dtype: int64
--

FN Newsletter vals:  [nan  1.]
Active communication vals:  [nan  1.]
Club member status vals:  ['ACTIVE' nan 'PRE-CREATE' 'LEFT CLUB']
Fashion News frequency vals:  ['NONE' 'Regularly' nan 'Monthly']
--

Customers' Missing values: 
customer_id               0
FN                        0
Active                    0
club_member_status        0
fashion_news_frequency    0
age                       0
postal_code               0
dtype: int64
--



In [3]:
# ---- memory optimizations -------------

# reference: https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65

# iterate through all the columns of a dataframe and reduce the int and float data types to the smallest possible size, ex. customer_id should not be reduced from int64 to a samller value as it would have collisions
import numpy as np
import pandas as pd

def reduce_mem_usage(df):
    """Iterate over all the columns of a DataFrame and modify the data type
    to reduce memory usage, handling ordered Categoricals"""
    
    # check the memory usage of the DataFrame
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type == 'category':
            if df[col].cat.ordered:
                # Convert ordered Categorical to an integer
                df[col] = df[col].cat.codes.astype('int16')
            else:
                # Convert unordered Categorical to a string
                df[col] = df[col].astype('str')
        
        elif col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min >= np.iinfo(np.int64).min and c_max <= np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    
    # check the memory usage after optimization
    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))

    # calculate the percentage of the memory usage reduction
    mem_reduction = 100 * (start_mem - end_mem) / start_mem
    print("Memory usage decreased by {:.1f}%".format(mem_reduction))
    
    return df

   

In [4]:
print("Articles Info: ")
print(articles.info())
print("Customer Info: ")
print(customers.info())
print("Transactions Info: ")
print(transactions.info())

Articles Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   article_id                    105542 non-null  int64 
 1   product_code                  105542 non-null  int64 
 2   prod_name                     105542 non-null  object
 3   product_type_no               105542 non-null  int64 
 4   product_type_name             105542 non-null  object
 5   product_group_name            105542 non-null  object
 6   graphical_appearance_no       105542 non-null  int64 
 7   graphical_appearance_name     105542 non-null  object
 8   colour_group_code             105542 non-null  int64 
 9   colour_group_name             105542 non-null  object
 10  perceived_colour_value_id     105542 non-null  int64 
 11  perceived_colour_value_name   105542 non-null  object
 12  perceived_colour_master_id    105542 non-n

In [5]:
# print unique values of customer columns
print("FN Newsletter vals: ", customers['FN'].unique())
print("Active communication vals: ",customers['Active'].unique())
print("Club member status vals: ", customers['club_member_status'].unique())
print("Fashion News frequency vals: ", customers['fashion_news_frequency'].unique())
print("--\n")

FN Newsletter vals:  [0. 1.]
Active communication vals:  [0. 1.]
Club member status vals:  ['ACTIVE' 'LEFT CLUB' 'PRE-CREATE']
Fashion News frequency vals:  ['None' 'Regularly' 'Monthly']
--



In [6]:
# explicitly convert club_member_status to ordinal values before mem optimization to avoid errors

customers['club_member_status'].replace({'LEFT CLUB': 0, 'PRE-CREATE': 1, 'ACTIVE': 2}, inplace=True)
customers['club_member_status'] = customers['club_member_status'].astype('int8')
print(customers['club_member_status'].unique())


[2 0 1]


In [7]:
# ---- memory optimizations -------------

# uses 8 bytes instead of given 64 byte string, reduces mem by 8x, 
# !!!! have to convert back before merging w/ sample_submissions.csv
# convert transactions['customer_id'] to 8 bytes int
# transactions['customer_id'] = transactions['customer_id'].astype('int64')
transactions['customer_id'] = transactions['customer_id'].apply(lambda x: int(x[-16:], 16)).astype('int64')
customers['customer_id'] = customers['customer_id'].apply(lambda x: int(x[-16:], 16)).astype('int64')

articles = reduce_mem_usage(articles)
customers = reduce_mem_usage(customers)
transactions = reduce_mem_usage(transactions)

# articles['article_id'] = articles['article_id'].astype('int32')
# transactions['article_id'] = transactions['article_id'].astype('int32') 
# # !!!! ADD LEADING ZERO BACK BEFORE SUBMISSION OF PREDICTIONS TO KAGGLE: 
# # Ex.: transactions['article_id'] = '0' + transactions.article_id.astype('str')

print("Articles Info: ")
print(articles.info())
print("Customer Info: ")
print(customers.info())
print("Transactions Info: ")
print(transactions.info())

Memory usage of dataframe is 20.13 MB
Memory usage after optimization is: 13.59 MB
Memory usage decreased by 32.5%
Memory usage of dataframe is 58.88 MB
Memory usage after optimization is: 39.25 MB
Memory usage decreased by 33.3%
Memory usage of dataframe is 1212.63 MB
Memory usage after optimization is: 697.26 MB
Memory usage decreased by 42.5%
Articles Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   article_id                    105542 non-null  int32 
 1   product_code                  105542 non-null  int32 
 2   prod_name                     105542 non-null  object
 3   product_type_no               105542 non-null  int16 
 4   product_type_name             105542 non-null  object
 5   product_group_name            105542 non-null  object
 6   graphical_appearance_no       105542 non-null  i

In [8]:
# print unique values of customer columns
print("FN Newsletter vals: ", customers['FN'].unique())
print("Active communication vals: ",customers['Active'].unique())
print("Club member status vals: ", customers['club_member_status'].unique())
print("Fashion News frequency vals: ", customers['fashion_news_frequency'].unique())
print("--\n")

FN Newsletter vals:  [0. 1.]
Active communication vals:  [0. 1.]
Club member status vals:  [2 0 1]
Fashion News frequency vals:  ['None' 'Regularly' 'Monthly']
--



In [9]:
# time-based splitting strategy

def split_train_val_data_and_drop_duplicates(transactions, days=7):
    """
    Splits the transaction training data into a training set and a validation set of 7 days to prevent data leakage.
    """
    
    transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])
    transactions = transactions.sort_values(by=['t_dat'])
    latest_transaction_date = transactions['t_dat'].max()
    
    training_set = transactions[transactions['t_dat'] < latest_transaction_date - pd.Timedelta(days=days)]
    validation_set = transactions[transactions['t_dat'] >= latest_transaction_date - pd.Timedelta(days=days)]
    
    print("Training set size:", len(training_set))
    print("Validation set size:", len(validation_set))
    print("Last date in training set:", training_set['t_dat'].max())
    print("Last date in validation set:", validation_set['t_dat'].max())

    # drop duplicate rows
    training_set = training_set.drop_duplicates().copy()
    validation_set = validation_set.drop_duplicates().copy()
    
    return training_set, validation_set

In [10]:
def preprocess_data(transactions_df, customers_df, articles_df, customers_col='customer_id', articles_col='article_id'):
    """
    Preprocesses customer and article IDs for use in a sparse matrix.
    
    Returns:
    - transactions_df: the input transaction DataFrame with two additional columns, 'user_index' and 'item_index',
                       that map customer and article IDs to their corresponding indices in a sparse matrix
    - customer_id_indices_map: a dictionary that maps customer IDs to their corresponding indices
    - article_id_indices_map: a dictionary that maps article IDs to their corresponding indices
    """
    # Create a list of unique customer IDs and product IDs
    all_customers = customers_df[customers_col].unique().tolist()
    all_articles = articles_df[articles_col].unique().tolist()

    # Create dicts mapping IDs to their corresponding indices
    customer_id_indices_map = {customer_id: i for i, customer_id in enumerate(all_customers)}
    article_id_indices_map = {article_id: i for i, article_id in enumerate(all_articles)}

    # Map customer and article IDs to their resp. indices in the transaction DataFrame
    transactions_df['user_index'] = transactions_df[customers_col].map(customer_id_indices_map)
    transactions_df['item_index'] = transactions_df[articles_col].map(article_id_indices_map)

    return transactions_df, all_customers, all_articles, customer_id_indices_map, article_id_indices_map

In [11]:
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
0,2018-09-20,-6846340800584936,663713001,0.050842,2
1,2018-09-20,-6846340800584936,541518023,0.030487,2
2,2018-09-20,-8334631767138808638,505221004,0.015236,2
3,2018-09-20,-8334631767138808638,685687003,0.016937,2
4,2018-09-20,-8334631767138808638,685687004,0.016937,2


In [12]:
import numpy as np
from scipy.sparse import csr_matrix

# binary purchase interaction user-item matrix

def create_user_item_matrix(transactions):

    # Get unique user and item indices in asc. order
    user_indices = np.arange(transactions['user_index'].nunique())
    item_indices = np.arange(transactions['item_index'].nunique())

    # Create a dictionary mapping user and item indices to matrix indices
    user_index_dict = dict(zip(sorted(transactions['user_index'].unique()), user_indices))
    item_index_dict = dict(zip(sorted(transactions['item_index'].unique()), item_indices))

    # Create arrays of row indices, column indices, and data for the sparse matrix
    rows = []
    cols = []
    data = [] # purchased 1 or 0

    # Iterate over all possible combinations of user and item indices
    for user_index in user_indices:
        for item_index in item_indices:
            # Get the corresponding matrix indices for the user and item indices
            matrix_user_index = user_index_dict.get(user_index)
            matrix_item_index = item_index_dict.get(item_index)
            # Get the corresponding interaction value from the transactions dataframe
            interaction = transactions.loc[(transactions['user_index'] == user_index) & 
                                            (transactions['item_index'] == item_index), 'quantity'].values
            # Append the row index, column index, and interaction value to the corresponding arrays
            rows.append(matrix_user_index)
            cols.append(matrix_item_index)
            data.append(1 if len(interaction) > 0 else 0)

    # Create the sparse matrix using the row, column, and data arrays
    user_item_matrix = csr_matrix((data, (rows, cols)), shape=(len(user_indices), len(item_indices)))

    return user_item_matrix


In [13]:
from math import ceil

# calculate total number of transaction weeks in tranactions data
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

# Compute the minimum and maximum date values
min_date = transactions['t_dat'].min()
max_date = transactions['t_dat'].max()

# Compute the number of weeks between the minimum and maximum date values
num_weeks = ceil((max_date - min_date).days / 7)

print(f"Total number of transaction weeks: {num_weeks}")


Total number of transaction weeks: 105


In [14]:
from datetime import datetime, timedelta

# only use last x weeks of transactions data since data is too large
def filter_transactions_last_x_weeks(transactions, x = 10):
    # Convert date strings to datetime objects
    transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

    # Calculate the date x weeks ago from the latest transaction date
    latest_date = transactions['t_dat'].max()
    cutoff_date = latest_date - timedelta(weeks=x)

    # Filter transactions to only include those in the last x weeks
    filtered_transactions = transactions.loc[transactions['t_dat'] >= cutoff_date].copy()

    return filtered_transactions

In [15]:
def filter_customers_and_articles(customers, articles, filtered_transactions):
    # Get unique customer and article IDs from filtered transactions
    customer_ids = filtered_transactions['customer_id'].unique()
    article_ids = filtered_transactions['article_id'].unique()

    # Filter customers and articles to only include those in filtered transactions
    customers_filtered = customers.loc[customers['customer_id'].isin(customer_ids)].copy()
    articles_filtered = articles.loc[articles['article_id'].isin(article_ids)].copy()

    return customers_filtered, articles_filtered

## Feature-Engineering

In [16]:
# LightGBM imports

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.feature_selection import RFE
from sklearn.model_selection import TimeSeriesSplit
from sklearn.metrics import make_scorer
import lightgbm as lgb
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

In [17]:
# get top 200 customers by number of transactions
top_customers = transactions['customer_id'].value_counts().head(200).index.tolist()

# print num of transactions for the 200th customer
print(transactions['customer_id'].value_counts().sort_values(ascending=False).iloc[199])

# only get articles that were purchased by top 200 customers at least once in articles df
articles_top_200 = articles[articles['article_id'].isin(transactions[transactions['customer_id'].isin(top_customers)]['article_id'].unique())]

# only get 200 customers in customers df
customers_top_200 = customers[customers['customer_id'].isin(top_customers)]

articles = articles_top_200.copy()
customers = customers_top_200.copy()
transactions = transactions[transactions['customer_id'].isin(top_customers)].copy()
transactions = transactions.drop_duplicates().copy()

622


In [18]:
print(transactions.isnull().sum())
print(customers.isnull().sum())
print(articles.isnull().sum())

t_dat               0
customer_id         0
article_id          0
price               0
sales_channel_id    0
dtype: int64
customer_id               0
FN                        0
Active                    0
club_member_status        0
fashion_news_frequency    0
age                       0
postal_code               0
dtype: int64
article_id                      0
product_code                    0
prod_name                       0
product_type_no                 0
product_type_name               0
product_group_name              0
graphical_appearance_no         0
graphical_appearance_name       0
colour_group_code               0
colour_group_name               0
perceived_colour_value_id       0
perceived_colour_value_name     0
perceived_colour_master_id      0
perceived_colour_master_name    0
department_no                   0
department_name                 0
index_code                      0
index_name                      0
index_group_no                  0
index_group_name      

In [19]:
print(len(transactions))
print(len(customers))
print(len(articles))


126622
200
38919


In [20]:
articles.head()

Unnamed: 0,article_id,product_code,prod_name,product_type_no,product_type_name,product_group_name,graphical_appearance_no,graphical_appearance_name,colour_group_code,colour_group_name,...,department_name,index_code,index_name,index_group_no,index_group_name,section_no,section_name,garment_group_no,garment_group_name,detail_desc
0,108775015,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,9,Black,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
1,108775044,108775,Strap top,253,Vest top,Garment Upper body,1010016,Solid,10,White,...,Jersey Basic,A,Ladieswear,1,Ladieswear,16,Womens Everyday Basics,1002,Jersey Basic,Jersey top with narrow shoulder straps.
3,110065001,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,9,Black,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
4,110065002,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,10,White,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."
5,110065011,110065,OP T-shirt (Idro),306,Bra,Underwear,1010016,Solid,12,Light Beige,...,Clean Lingerie,B,Lingeries/Tights,1,Ladieswear,61,Womens Lingerie,1017,"Under-, Nightwear","Microfibre T-shirt bra with underwired, moulde..."


In [21]:
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,postal_code
38,1249760199313500820,1.0,1.0,2,Regularly,44,930b19ae7db8abb5a27f4da10217755a7305b4c452f5e0...
7066,3862718111684591643,0.0,0.0,2,,30,7d81fea7e0b9c27deb7bd9c46304e24d176a6aa2ced422...
8859,-8098965676522405228,1.0,1.0,2,Regularly,23,84341fc1a4fff70f3effe1dc4de74167024562e00065b4...
9481,8346339317755757908,0.0,0.0,2,,26,716a141c5fe7405d6ceab1ea281917fffca3c7a0e89314...
10095,-7779445982753353194,1.0,1.0,2,Regularly,26,d9953eb7a7a24998cec6deecc7ac24091ccc15e395ba12...


In [22]:
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id
740,2018-09-20,1135991499650384534,668766002,0.042358,2
741,2018-09-20,1135991499650384534,652946001,0.050842,2
742,2018-09-20,1135991499650384534,691275008,0.06781,2
1260,2018-09-20,5085370976430926408,657476001,0.016937,2
1261,2018-09-20,5085370976430926408,685687003,0.016937,2


In [23]:
# Dropping columns with uninformative article data

articles = articles.drop(columns=['product_code', 'prod_name', 'product_type_name', 'product_group_name', 'graphical_appearance_name', 'department_name', 'index_name', 'index_group_name', 'section_name', 'garment_group_name', 'detail_desc'])
articles = articles.drop(columns=[col for col in articles.columns if 'colour_' in col or 'perceived_' in col])

In [24]:
articles.head()

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,department_no,index_code,index_group_no,section_no,garment_group_no
0,108775015,253,1010016,1676,A,1,16,1002
1,108775044,253,1010016,1676,A,1,16,1002
3,110065001,306,1010016,1339,B,1,61,1017
4,110065002,306,1010016,1339,B,1,61,1017
5,110065011,306,1010016,1339,B,1,61,1017


These columns are left to capture any potential patterns in the other columns, such as how certain index codes or sections might be associated with higher or lower sales.

In [25]:
# Feature engineering
from sklearn.preprocessing import LabelEncoder

# Define mapping for fashion_news_frequency feature
fashion_news_freq_mapping = {'None': 0, 'Monthly': 1, 'Regularly': 2}

# label encode fashion_news_frequency feature
le = LabelEncoder()
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].map(fashion_news_freq_mapping)
customers['fashion_news_frequency'] = le.fit_transform(customers['fashion_news_frequency'])

In [26]:
customers = customers.drop(['postal_code'], axis=1)
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age
38,1249760199313500820,1.0,1.0,2,1,44
7066,3862718111684591643,0.0,0.0,2,0,30
8859,-8098965676522405228,1.0,1.0,2,1,23
9481,8346339317755757908,0.0,0.0,2,0,26
10095,-7779445982753353194,1.0,1.0,2,1,26


In [27]:
# Feature engineering: encode nominal categorical features
ohe = OneHotEncoder()
# One-hot encode sales_channel_id feature
sales_channel_ohe = pd.get_dummies(transactions['sales_channel_id'], prefix='sales_channel')
transactions = pd.concat([transactions, sales_channel_ohe], axis=1)

# Drop the original sales_channel_id feature
transactions.drop('sales_channel_id', axis=1, inplace=True)

In [28]:
# boolean unique values of sales_channel_id after encoding
print(transactions['sales_channel_1'].unique())
print(transactions['sales_channel_2'].unique())

[False  True]
[ True False]


In [29]:
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_1,sales_channel_2
740,2018-09-20,1135991499650384534,668766002,0.042358,False,True
741,2018-09-20,1135991499650384534,652946001,0.050842,False,True
742,2018-09-20,1135991499650384534,691275008,0.06781,False,True
1260,2018-09-20,5085370976430926408,657476001,0.016937,False,True
1261,2018-09-20,5085370976430926408,685687003,0.016937,False,True


In [30]:
# Convert 't_dat' column to datetime format
transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])

# Group by customer ID and find the first and last transaction dates
first_trans_dates = transactions.groupby('customer_id')['t_dat'].min().reset_index()
last_trans_dates = transactions.groupby('customer_id')['t_dat'].max().reset_index()

customer_purchase_engagement = pd.merge(first_trans_dates, last_trans_dates, on='customer_id', suffixes=('_first', '_last'))
# Create a new feature by calculating the time difference in days between first and last transactions
customer_purchase_engagement['time_diff_days'] = (customer_purchase_engagement['t_dat_last'] - customer_purchase_engagement['t_dat_first']).dt.days
# Drop the original first and last transaction date columns
customer_purchase_engagement.drop(['t_dat_first', 't_dat_last'], axis=1, inplace=True)
customer_purchase_engagement.head()

# Merge the customer_purchase_engagement dataframe with the customers dataframe
customers = pd.merge(customers, customer_purchase_engagement, on='customer_id', how='left')
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,time_diff_days
0,1249760199313500820,1.0,1.0,2,1,44,714
1,3862718111684591643,0.0,0.0,2,0,30,714
2,-8098965676522405228,1.0,1.0,2,1,23,727
3,8346339317755757908,0.0,0.0,2,0,26,724
4,-7779445982753353194,1.0,1.0,2,1,26,520


The above `time_diff_days` feature can potentially provide insights into a customer's engagement by looking at the gap in the number of days between the last purchase and the current date. <br> The assumption is, The larger the gap, the less engaged the customer is. 

In [31]:
# Join the transaction dataframe with the customers dataframe
merged = pd.merge(transactions, customers, on='customer_id', how='inner')

# Calculate the mean age for each article
item_mean_age = merged.groupby('article_id')['age'].mean()

# Calculate the difference between every user's age and the mean age of users who have purchased a particular item
merged['age_diff'] = merged['age'] - merged['article_id'].map(item_mean_age)

# Group by article and take the mean of age_diff
article_age_diff = merged.groupby('article_id')['age_diff'].mean()

# Append the age difference feature to the articles dataframe
articles['age_diff'] = articles['article_id'].map(article_age_diff)

articles.head()

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,department_no,index_code,index_group_no,section_no,garment_group_no,age_diff
0,108775015,253,1010016,1676,A,1,16,1002,2.583792e-15
1,108775044,253,1010016,1676,A,1,16,1002,0.0
3,110065001,306,1010016,1339,B,1,61,1017,-1.421085e-15
4,110065002,306,1010016,1339,B,1,61,1017,0.0
5,110065011,306,1010016,1339,B,1,61,1017,1.937844e-15


Mean age_diff for every article. It can be useful for predicting whether a user will buy an item based on their age and the age of other users who have already bought the same item. 

Intuituion behind the `age_diff` feature:

Let's say we have a dataset of customers who made transactions for a particular item with article_id = 123. Here is an example of how we can calculate the age_diff feature: <br>
Assume that the mean age of all customers who bought the item with article_id = 123 is 40 years old <br>
Customer A made a transaction for item with article_id = 123 and their age is 35. The age_diff feature for this transaction would be -5. (35 - 40). <br>
Customer B made a transaction for item with article_id = 123 and their age is 50. The age_diff feature for this transaction would be 10. (50 - 40). <br>
Customer C made a transaction for item with article_id = 123 and their age is 40. The age_diff feature for this transaction would be 0. (40 - 40). <br>
So, the age_diff feature measures the difference between the age of each customer who bought a specific item and the average age of all customers who bought that item. <br>

Therefore, the age_diff is the mean of all these individual age_diff values for each customer who bought the item with article_id = 123. age_diff = -1.66 for this example<br>


In [32]:
# Calculate mean, max, and min age for each item
item_mean_age = merged.groupby('article_id')['age'].mean()
item_max_age = merged.groupby('article_id')['age'].max()
item_min_age = merged.groupby('article_id')['age'].min()

# Merge the features back into the articles dataframe
articles = articles.merge(item_mean_age, on='article_id', how='left')
articles = articles.merge(item_max_age, on='article_id', how='left')
articles = articles.merge(item_min_age, on='article_id', how='left')

# Rename the columns to make them more descriptive
articles = articles.rename(columns={'age_x': 'mean_purchase_age', 'age_y': 'max_purchase_age', 'age': 'min_purchase_age'})

articles.head()

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,department_no,index_code,index_group_no,section_no,garment_group_no,age_diff,mean_purchase_age,max_purchase_age,min_purchase_age
0,108775015,253,1010016,1676,A,1,16,1002,2.583792e-15,37.409091,61,24
1,108775044,253,1010016,1676,A,1,16,1002,0.0,37.0,51,22
2,110065001,306,1010016,1339,B,1,61,1017,-1.421085e-15,43.6,55,23
3,110065002,306,1010016,1339,B,1,61,1017,0.0,50.0,50,50
4,110065011,306,1010016,1339,B,1,61,1017,1.937844e-15,46.181818,59,29


Intuituion behind the `*_purchase_age` feature:

Additional age features to capture more information about the age of the customers who bought the respective articles. The gbdt might be able to learn more complex patterns from these features. <br>

In [33]:
# Calculate purchased item count for each user
transactions['quantity'] = 1
user_item_count = transactions.groupby(['customer_id', 'article_id'])['quantity'].sum().reset_index()

# Calculate total item count for each article
total_item_count = transactions.groupby('article_id')['quantity'].sum().reset_index()
total_item_count.columns = ['article_id', 'total_items']

user_item_count = pd.merge(user_item_count, total_item_count, on='article_id', how='left')

# Calculate ratio of purchased item count and total item count
user_item_count['article_engagement_ratio'] = user_item_count['quantity'] / user_item_count['total_items']


transactions = pd.merge(transactions, user_item_count[['customer_id', 'article_id', 'article_engagement_ratio']], on=['customer_id', 'article_id'], how='left')
transactions['quantity'] = user_item_count['quantity']

# fill missing values with 0
transactions['quantity'] = transactions['quantity'].fillna(0)
transactions.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_1,sales_channel_2,quantity,article_engagement_ratio
0,2018-09-20,1135991499650384534,668766002,0.042358,False,True,1.0,1.0
1,2018-09-20,1135991499650384534,652946001,0.050842,False,True,1.0,1.0
2,2018-09-20,1135991499650384534,691275008,0.06781,False,True,1.0,1.0
3,2018-09-20,5085370976430926408,657476001,0.016937,False,True,1.0,0.5
4,2018-09-20,5085370976430926408,685687003,0.016937,False,True,1.0,0.166667


Intuition behind feature: <br>

`article_engagement_ratio`: The feature is ratio of one user's purchased item count and the item's total purchase count. This serves to measure how engaged a user is with a particular item, which can be useful for predicting whether a user will buy similar items. <br>
Can also be used to measure how popular an item is, and can be used to potentially diversify recommendations.

In [34]:
transactions, all_customers, all_articles, customer_id_indices_map, article_id_indices_map = preprocess_data(transactions, customers, articles)

print("Total num of customers: ", len(all_customers))
print("Total num of articles: ", len(all_articles))
print("Customer ID mapping: ", list(customer_id_indices_map.items())[:5])
print("Article ID mapping: ", list(article_id_indices_map.items())[:5])
transactions.head()

Total num of customers:  200
Total num of articles:  38919
Customer ID mapping:  [(1249760199313500820, 0), (3862718111684591643, 1), (-8098965676522405228, 2), (8346339317755757908, 3), (-7779445982753353194, 4)]
Article ID mapping:  [(108775015, 0), (108775044, 1), (110065001, 2), (110065002, 3), (110065011, 4)]


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_1,sales_channel_2,quantity,article_engagement_ratio,user_index,item_index
0,2018-09-20,1135991499650384534,668766002,0.042358,False,True,1.0,1.0,5,11563
1,2018-09-20,1135991499650384534,652946001,0.050842,False,True,1.0,1.0,5,9899
2,2018-09-20,1135991499650384534,691275008,0.06781,False,True,1.0,1.0,5,14438
3,2018-09-20,5085370976430926408,657476001,0.016937,False,True,1.0,0.5,10,10307
4,2018-09-20,5085370976430926408,685687003,0.016937,False,True,1.0,0.166667,10,13608


In [35]:
# # user item matrix -- rows are users, columns are items, doesnt need article and customer data

# user_item_matrix = create_user_item_matrix(transactions)
# user_item_matrix

In [36]:
import pickle

# open user_item_matrix file
with open('user_item_matrix_200.pkl', 'rb') as f:
    user_item_matrix = pickle.load(f)

print(user_item_matrix[:10, :10].toarray())

[[0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [1 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 1 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0 0 0 0]]


In [37]:
from implicit.als import AlternatingLeastSquares
from sklearn.metrics.pairwise import cosine_similarity

# from the als_strat1_hyperparam_log 
# Create ALS model with default parameters
alpha = 25
als_model = AlternatingLeastSquares(factors=55, iterations=20, regularization=0.18)

# Fit model to user-item matrix
als_model.fit(user_item_matrix*alpha)

# Latent factors matrices
item_factors = als_model.item_factors
user_factors = als_model.user_factors

# item-item cosine similarity 
item_similarities = cosine_similarity(item_factors, dense_output=False)
# user-user cosine similarity
user_similarities = cosine_similarity(user_factors, dense_output=False)

k = 5
# Get top-k most similar items for each item
top_k_similar_items = item_similarities.argsort()[:, -k-1:-1]
# Get top-k most similar user for each user
top_k_similar_users = user_similarities.argsort()[:, -k-1:-1]

  from .autonotebook import tqdm as notebook_tqdm
100%|██████████| 20/20 [00:24<00:00,  1.21s/it]


In [38]:
print(item_similarities.shape)
print(user_similarities.shape)

(38919, 38919)
(200, 200)


In [39]:
# important: add user_index and item_index to customers and articles respectively
customers['user_index'] = customers['customer_id'].map(customer_id_indices_map)
articles['item_index'] = articles['article_id'].map(article_id_indices_map)

In [40]:
user_indices = np.arange(transactions['user_index'].nunique())
item_indices = np.arange(transactions['item_index'].nunique())
user_index_dict = dict(zip(sorted(transactions['user_index'].unique()), user_indices))
item_index_dict = dict(zip(sorted(transactions['item_index'].unique()), item_indices))

# print first 5 key and values in the dictionary
print(list(user_index_dict.items())[:5])
print(list(item_index_dict.items())[:5])

[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)]
[(0, 0), (1, 1), (2, 2), (3, 3), (4, 4)]


In [41]:
# Create a mapping from user IDs to matrix indices
user_indices = np.arange(transactions['user_index'].nunique())
item_indices = np.arange(transactions['item_index'].nunique())
user_index_dict = dict(zip(sorted(transactions['user_index'].unique()), user_indices))
item_index_dict = dict(zip(sorted(transactions['item_index'].unique()), item_indices))


# Create features for each user based on the average quantity purchased by similar customers
for i in range(len(customers)):
    customer_id = customers.loc[i, 'user_index']
    # Get the matrix index for the current customer
    customer_idx = user_index_dict.get(customer_id, -1)
    
    if customer_idx != -1:
        similar_user_indexxs = top_k_similar_users[customer_idx]
        # Compute the mean of the user-item matrix for the similar users
        user_feature = user_item_matrix[similar_user_indexxs, :].mean(axis=0).A1
        
        # Store the mean in the customers DataFrame
        customers.loc[i, 'user_purchase_quant'] = user_feature.mean()
    else:
        # If the current customer is not in the user-item matrix, set the feature to NaN
        customers.loc[i, 'user_purchase_quant'] = np.nan

# Print the head of the customers DataFrame
customers.head()

Unnamed: 0,customer_id,FN,Active,club_member_status,fashion_news_frequency,age,time_diff_days,user_index,user_purchase_quant
0,1249760199313500820,1.0,1.0,2,1,44,714,0,0.020607
1,3862718111684591643,0.0,0.0,2,0,30,714,1,0.017297
2,-8098965676522405228,1.0,1.0,2,1,23,727,2,0.016722
3,8346339317755757908,0.0,0.0,2,0,26,724,3,0.016424
4,-7779445982753353194,1.0,1.0,2,1,26,520,4,0.016403


Intuition behind feature: <br>

`user_purchase_quant`: Gets the average quantity of items purchased by the k most similar customers to that customer. It looks at what other customers who are similar to this customer have bought and calculates the average amount of each item they bought. This feature can be used to predict what items a customer is likely to buy in the future based on what similar customers have bought in the past. This feature aims to capture purchase behaviours of a customer.<br>

For example, if a customer typically buys a lot of bomber jackets, and the top k most similar customers to that customer also tend to buy a lot of bomber jackets, then the average quantity of bomber jackets purchased by those similar customers could be a good predictor of how much the original customer is likely to purchase in the future. This however assumes that the k most similar customers have similar purchase behaviours to the customers in question, and on its own is not a strong feature.<br>


In [42]:
# Create a binary feature for each item that indicates whether or not a customer has bought that item,
# based on whether other customers who bought similar items also tended to buy that item.
for i in range(len(articles)):
    item_id = articles.loc[i, 'item_index']
    # Get the matrix index for the current item
    item_idx = item_index_dict.get(item_id, -1)
    
    if item_idx != -1:
        # List of (column) indices of similar items
        similar_items = top_k_similar_items[item_idx]
        
        # Find the customers who have purchased the current item
        customer_indices = np.where(user_item_matrix[:, item_idx].toarray()[:, 0] == 1)[0]
        
        # Binary vector representing the customer's purchases for the similar items
        customer_purchases = user_item_matrix[customer_indices, :][:, similar_items].toarray()
        article_preference = np.any(customer_purchases, axis=0)
        
        # Set article_preference to 1 if any customer has purchased the item, 0 otherwise
        articles.loc[i, 'article_preference'] = int(np.any(article_preference))
    else:
        articles.loc[i, 'article_preference'] = np.nan

In [43]:
articles[articles['article_preference'] == 0].head()

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,department_no,index_code,index_group_no,section_no,garment_group_no,age_diff,mean_purchase_age,max_purchase_age,min_purchase_age,item_index,article_preference
57,181448103,302,1010017,3937,D,2,51,1017,0.0,28.0,28,28,57,0.0
66,186372042,272,1010001,1676,A,1,16,1002,0.0,36.0,36,36,66,0.0
89,189654045,275,1010010,1643,D,2,51,1002,0.0,30.0,32,28,89,0.0
109,212042070,94,1010016,3929,D,2,52,1020,0.0,32.0,32,32,109,0.0
113,212629040,265,1010016,1643,D,2,51,1002,0.0,40.5,52,29,113,0.0


Intuition behind feature: <br>

`article_preference`: Binary feature for each item that indicates whether or not a customer has bought that item, based on whether other customers who bought similar items also tended to buy that item. <br>

This feature can be useful for a fashion-based recommender system because it captures the idea that customers who have similar tastes or preferences tend to buy similar items. For example, if a customer has a history of buying shirts and other customers who bought similar shorts also tended to buy a specific pair of jeans, then the binary feature for those jeans would be set to 1 for that customer, indicating that they are likely to be interested in those shoes. The binary feature can then be used as a predictor for which items to recommend to the customer.

In [44]:
# Create feature for each item based on the total number of times it was purchased by similar customers
for i in range(len(articles)):
    item_id = articles.loc[i, 'item_index']
    item_idx = item_index_dict.get(item_id, -1)
    
    if item_idx != -1:
    
        similar_items = top_k_similar_items[item_idx]
        
        # Find the customers who have purchased the current item
        customer_indices = np.where(user_item_matrix[:, item_idx].toarray()[:, 0] == 1)[0]
        
        # Compute the mean of the user-item matrix for the similar items
        item_feature = user_item_matrix[customer_indices, :][:, similar_items].sum() / len(customer_indices)
        articles.loc[i, 'item_purchase_frequency'] = item_feature.mean()
        
    else:
        articles.loc[i, 'item_purchase_frequency'] = np.nan

In [45]:
articles.head()

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,department_no,index_code,index_group_no,section_no,garment_group_no,age_diff,mean_purchase_age,max_purchase_age,min_purchase_age,item_index,article_preference,item_purchase_frequency
0,108775015,253,1010016,1676,A,1,16,1002,2.583792e-15,37.409091,61,24,0,1.0,0.857143
1,108775044,253,1010016,1676,A,1,16,1002,0.0,37.0,51,22,1,1.0,0.538462
2,110065001,306,1010016,1339,B,1,61,1017,-1.421085e-15,43.6,55,23,2,1.0,1.25
3,110065002,306,1010016,1339,B,1,61,1017,0.0,50.0,50,50,3,1.0,5.0
4,110065011,306,1010016,1339,B,1,61,1017,1.937844e-15,46.181818,59,29,4,1.0,0.666667


Intuition behind feature: <br>

`item_purchase_frequency`: It gives an estimate of how frequently an item is being bought by customers who have similar purchase histories. This feature is useful because it can provide insights into purchasing patterns and identify popular items that are often bought together. <br> It can potentially help identify popular items among customer groups, and the lightgbm model can potentially use this feature.

In [46]:
merged_df = pd.merge(transactions, articles[['item_index']], on='item_index', how='left')

merged_df.head()

Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_1,sales_channel_2,quantity,article_engagement_ratio,user_index,item_index
0,2018-09-20,1135991499650384534,668766002,0.042358,False,True,1.0,1.0,5,11563
1,2018-09-20,1135991499650384534,652946001,0.050842,False,True,1.0,1.0,5,9899
2,2018-09-20,1135991499650384534,691275008,0.06781,False,True,1.0,1.0,5,14438
3,2018-09-20,5085370976430926408,657476001,0.016937,False,True,1.0,0.5,10,10307
4,2018-09-20,5085370976430926408,685687003,0.016937,False,True,1.0,0.166667,10,13608


In [47]:
# Merge transactions with articles on item_index to access the price of each article
merged_df = pd.merge(transactions, articles[['item_index']], on='item_index', how='left')

# Compute the average price levels of all articles purchased by similar customers who have purchased this particular article in the past
for i in range(len(articles)):
    item_id = articles.loc[i, 'item_index']
    item_idx = item_index_dict.get(item_id, -1)
    
    if item_idx != -1:
        similar_items = top_k_similar_items[item_idx]
        
        # Find the customers who have purchased the current item
        customer_indices = np.where(user_item_matrix[:, item_idx].toarray()[:, 0] == 1)[0]
        
        # Compute the average price levels of all articles purchased by similar customers who have purchased this particular article in the past
        item_feature = merged_df.loc[(merged_df['item_index'] == item_id) & (merged_df['user_index'].isin(customer_indices)), 'price'].mean()
        articles.loc[i, 'item_avg_price_level'] = item_feature
    else:
        articles.loc[i, 'item_avg_price_level'] = np.nan

Intuition behind feature: <br>

`item_avg_price_level`: Calculates the average price levels of all articles purchased by similar customers who have purchased this particular article in the past.  <br> It provides information on the typical price level of articles that are purchased together with a given article, as indicated by the purchasing patterns of similar customers. 

For example, if customers who frequently purchase articles A also tend to purchase higher-priced article, then the "item_avg_price_level" feature for article A would be relatively high. 

In [48]:
articles.head()

Unnamed: 0,article_id,product_type_no,graphical_appearance_no,department_no,index_code,index_group_no,section_no,garment_group_no,age_diff,mean_purchase_age,max_purchase_age,min_purchase_age,item_index,article_preference,item_purchase_frequency,item_avg_price_level
0,108775015,253,1010016,1676,A,1,16,1002,2.583792e-15,37.409091,61,24,0,1.0,0.857143,0.007713
1,108775044,253,1010016,1676,A,1,16,1002,0.0,37.0,51,22,1,1.0,0.538462,0.007809
2,110065001,306,1010016,1339,B,1,61,1017,-1.421085e-15,43.6,55,23,2,1.0,1.25,0.025421
3,110065002,306,1010016,1339,B,1,61,1017,0.0,50.0,50,50,3,1.0,5.0,0.024124
4,110065011,306,1010016,1339,B,1,61,1017,1.937844e-15,46.181818,59,29,4,1.0,0.666667,0.016663


In [49]:
# # print num of unique articles['graphical_appearance_no']
print(articles['graphical_appearance_no'].nunique())
print(articles['product_type_no'].nunique())
print(articles['department_no'].nunique())
print(articles['index_code'].nunique())
print(articles['index_group_no'].nunique()) 
print(articles['section_no'].nunique())
print(articles['garment_group_no'].nunique())

30
114
288
10
5
57
21


In [50]:
# Calculate RFM features, reference: https://www.geeksforgeeks.org/rfm-analysis-analysis-using-python/

from datetime import datetime, timedelta

# Calculate recency
last_purchase_date = transactions.groupby('user_index')['t_dat'].max().reset_index()
last_purchase_date.columns = ['user_index', 'last_purchase_date']
last_purchase_date['recency'] = (last_purchase_date['last_purchase_date'].max() - last_purchase_date['last_purchase_date']).dt.days
last_purchase_date.drop('last_purchase_date', axis=1, inplace=True)


# Calculate Frequency
frequency = transactions.groupby('user_index')['t_dat'].count().reset_index()
frequency.columns = ['user_index', 'frequency']

# Calculate Monetary Value
monetary_value = transactions.groupby('user_index')['price'].sum().reset_index()
monetary_value.columns = ['user_index', 'monetary_value']

# Merge all RFM features into a single DataFrame
rfm = last_purchase_date[['user_index', 'recency']].merge(frequency, on='user_index').merge(monetary_value, on='user_index')

# Calculate RFM Scores
quantiles = rfm.quantile(q=[0.25, 0.5, 0.75])
quantiles = quantiles.to_dict()

def rfm_segmenter(x, quantiles):
    if x <= quantiles['recency'][0.25]:
        return 4
    elif x <= quantiles['recency'][0.50]:
        return 3
    elif x <= quantiles['recency'][0.75]: 
        return 2
    else:
        return 1
    
rfm['R'] = rfm['recency'].apply(rfm_segmenter, args=(quantiles,))

def f_segmenter(x, quantiles):
    if x <= quantiles['frequency'][0.25]:
        return 1
    elif x <= quantiles['frequency'][0.50]:
        return 2
    elif x <= quantiles['frequency'][0.75]: 
        return 3
    else:
        return 4

rfm['F'] = rfm['frequency'].apply(f_segmenter, args=(quantiles,))

def m_segmenter(x, quantiles):
    if x <= quantiles['monetary_value'][0.25]:
        return 1
    elif x <= quantiles['monetary_value'][0.50]:
        return 2
    elif x <= quantiles['monetary_value'][0.75]: 
        return 3
    else:
        return 4

rfm['M'] = rfm['monetary_value'].apply(m_segmenter, args=(quantiles,))

# Calculate RFM Score
rfm['RFM_Score'] = rfm['R'].map(str) + rfm['F'].map(str) + rfm['M'].map(str)
rfm['RFM_Score'] = rfm['RFM_Score'].astype(int)
rfm = rfm.drop(['R', 'F', 'M'], axis=1)

# drop the recency frequency and monetary value columns
rfm = rfm.drop(['recency', 'frequency', 'monetary_value'], axis=1)

# Display sample of RFM DataFrame
print(rfm.head())

   user_index  RFM_Score
0           0        234
1           1        121
2           2        412
3           3        321
4           4        132


In [51]:
# merge customer features with rfm features
customers = pd.merge(customers, rfm, on='user_index', how='left')

In [52]:
articles.columns

Index(['article_id', 'product_type_no', 'graphical_appearance_no',
       'department_no', 'index_code', 'index_group_no', 'section_no',
       'garment_group_no', 'age_diff', 'mean_purchase_age', 'max_purchase_age',
       'min_purchase_age', 'item_index', 'article_preference',
       'item_purchase_frequency', 'item_avg_price_level'],
      dtype='object')

In [53]:
customers.columns

Index(['customer_id', 'FN', 'Active', 'club_member_status',
       'fashion_news_frequency', 'age', 'time_diff_days', 'user_index',
       'user_purchase_quant', 'RFM_Score'],
      dtype='object')

In [54]:
transactions.columns

Index(['t_dat', 'customer_id', 'article_id', 'price', 'sales_channel_1',
       'sales_channel_2', 'quantity', 'article_engagement_ratio', 'user_index',
       'item_index'],
      dtype='object')

In [56]:
# save the dataframes to pkl files
# customers.to_pickle('customers_features.pkl')
# articles.to_pickle('articles_features.pkl')
# transactions.to_pickle('transactions_features.pkl')

In [186]:
# load the dataframes from pkl files
customers = pd.read_pickle('customers_features.pkl')
articles = pd.read_pickle('articles_features.pkl')
transactions = pd.read_pickle('transactions_features.pkl')

In [187]:
# already dropped for articles
articles = articles.drop(['article_id'], axis=1).copy()
customers = customers.drop(['customer_id'], axis=1).copy()
transactions = transactions.drop(['article_id', 'customer_id'], axis=1).copy()

# Merge transactions with customers
pos_df = pd.merge(transactions_final_df, customers_final_df, on='user_index', how='left')

# Merge resulting dataframe with articles_final_df usually
pos_df = pd.merge(pos_df, articles, on='item_index', how='left')

In [188]:
pos_df['target'] = 1
pos_df.dtypes

t_dat                       datetime64[ns]
price                              float16
sales_channel_1                       bool
sales_channel_2                       bool
quantity                           float64
article_engagement_ratio           float64
user_index                           int64
item_index                           int64
FN                                 float16
Active                             float16
club_member_status                    int8
fashion_news_frequency               int64
age                                   int8
time_diff_days                       int32
user_purchase_quant                float64
RFM_Score                            int32
product_type_no                      int16
graphical_appearance_no              int32
department_no                        int16
index_code                          object
index_group_no                        int8
section_no                            int8
garment_group_no                     int16
age_diff   

In [189]:
pos_df.head()

Unnamed: 0,t_dat,price,sales_channel_1,sales_channel_2,quantity,article_engagement_ratio,user_index,item_index,FN,Active,...,section_no,garment_group_no,age_diff,mean_purchase_age,max_purchase_age,min_purchase_age,article_preference,item_purchase_frequency,item_avg_price_level,target
0,2018-09-20,0.042358,False,True,1.0,1.0,5,11563,1.0,1.0,...,6,1010,0.0,51.0,51,51,0.0,0.0,0.042358,1
1,2018-09-20,0.050842,False,True,1.0,1.0,5,9899,1.0,1.0,...,57,1016,0.0,51.0,51,51,1.0,1.0,0.050842,1
2,2018-09-20,0.06781,False,True,1.0,1.0,5,14438,1.0,1.0,...,18,1010,0.0,51.0,51,51,0.0,0.0,0.06781,1
3,2018-09-20,0.016937,False,True,1.0,0.5,10,10307,0.0,0.0,...,53,1005,0.0,58.0,67,49,1.0,3.0,0.016937,1
4,2018-09-20,0.016937,False,True,1.0,0.166667,10,13608,0.0,0.0,...,15,1023,0.0,42.0,67,29,1.0,0.833333,0.01976,1


In [190]:
transactions.dtypes

t_dat                       datetime64[ns]
price                              float16
sales_channel_1                       bool
sales_channel_2                       bool
quantity                           float64
article_engagement_ratio           float64
user_index                           int64
item_index                           int64
dtype: object

In [191]:
def prepare_negative_samples(df, user_col, item_col):
    # Get a list of unique users and items
    users = df[user_col].unique()
    items = df[item_col].unique()
    
    # Create an array to store negative samples
    negatives = []

    # Loop through each user and generate negative samples
    for i, user in enumerate(users):
        # Get the items that the user has interacted with
        user_items = df[df[user_col] == user][item_col].unique()
        
        # Calculate the number of purchases made by the user
        n_purchases = len(user_items)
        
        # Cap the number of negatives at n_purchases or 200
        n_negatives = min(n_purchases, 200)
        
        # Get a list of candidate items that the user has not interacted with
        not_interacted_items = np.setdiff1d(items, user_items)
        
        # Sample n_negatives candidate items randomly
        negative_items = np.random.choice(not_interacted_items, size=n_negatives, replace=False)
        
        # Store the user and negative item pairs
        user_negatives = np.zeros((n_negatives, 2))
        user_negatives[:, 0] = user
        user_negatives[:, 1] = negative_items
        negatives.append(user_negatives)
    
    negatives = np.concatenate(negatives, axis=0)
    negatives_df = pd.DataFrame(negatives, columns=[user_col, item_col])
    
    # neg sample label
    negatives_df['target'] = 0
    
    return negatives_df

In [331]:
neg_df = prepare_negative_samples(transactions, 'user_index', 'item_index')
neg_df['user_index'] = neg_df['user_index'].astype('int')
neg_df['item_index'] = neg_df['item_index'].astype('int')
neg_df.head()

Unnamed: 0,user_index,item_index,target
0,5,36378,0
1,5,19136,0
2,5,10125,0
3,5,22815,0
4,5,10139,0


In [332]:
last_transaction_dates = (transactions.groupby('user_index')['t_dat'].max().to_dict())
neg_df['t_dat'] = neg_df['user_index'].map(last_transaction_dates)
neg_df.shape

(39700, 4)

In [333]:
# print customer_id article id in pos_df
pos_df[['user_index', 'item_index']].head()

Unnamed: 0,user_index,item_index
0,5,11563
1,5,9899
2,5,14438
3,10,10307
4,10,13608


In [334]:
# Merge additional features of users, items, and transactions to the negative samples
neg_df = pd.merge(neg_df, articles, on='item_index')
neg_df.shape

(39700, 18)

In [335]:
neg_df.head()

Unnamed: 0,user_index,item_index,target,t_dat,product_type_no,graphical_appearance_no,department_no,index_code,index_group_no,section_no,garment_group_no,age_diff,mean_purchase_age,max_purchase_age,min_purchase_age,article_preference,item_purchase_frequency,item_avg_price_level
0,5,36378,0,2020-09-16,-1,1010016,1339,B,1,61,1017,0.0,28.0,28,28,1.0,5.0,0.025406
1,5,19136,0,2020-09-16,254,1010016,1641,A,1,18,1005,2.368476e-15,33.666667,43,27,0.0,0.0,0.033875
2,5,10125,0,2020-09-16,272,1010004,1717,A,1,11,1009,2.368476e-15,40.666667,50,29,0.0,0.0,0.031616
3,5,22815,0,2020-09-16,274,1010016,1723,A,1,15,1025,0.0,52.5,68,37,1.0,2.5,0.022018
4,39,22815,0,2020-09-19,274,1010016,1723,A,1,15,1025,0.0,52.5,68,37,1.0,2.5,0.022018


In [336]:
customers.head()

Unnamed: 0,FN,Active,club_member_status,fashion_news_frequency,age,time_diff_days,user_index,user_purchase_quant,RFM_Score
0,1.0,1.0,2,1,44,714,0,0.020607,234
1,0.0,0.0,2,0,30,714,1,0.017297,121
2,1.0,1.0,2,1,23,727,2,0.016722,412
3,0.0,0.0,2,0,26,724,3,0.016424,321
4,1.0,1.0,2,1,26,520,4,0.016403,132


In [337]:
neg_df = pd.merge(neg_df, customers, on='user_index')
neg_df.shape

(39700, 26)

In [338]:
neg_df.head()

Unnamed: 0,user_index,item_index,target,t_dat,product_type_no,graphical_appearance_no,department_no,index_code,index_group_no,section_no,...,item_purchase_frequency,item_avg_price_level,FN,Active,club_member_status,fashion_news_frequency,age,time_diff_days,user_purchase_quant,RFM_Score
0,5,36378,0,2020-09-16,-1,1010016,1339,B,1,61,...,5.0,0.025406,1.0,1.0,2,1,51,727,0.027817,244
1,5,19136,0,2020-09-16,254,1010016,1641,A,1,18,...,0.0,0.033875,1.0,1.0,2,1,51,727,0.027817,244
2,5,10125,0,2020-09-16,272,1010004,1717,A,1,11,...,0.0,0.031616,1.0,1.0,2,1,51,727,0.027817,244
3,5,22815,0,2020-09-16,274,1010016,1723,A,1,15,...,2.5,0.022018,1.0,1.0,2,1,51,727,0.027817,244
4,5,10139,0,2020-09-16,258,1010017,1543,D,2,53,...,2.0,0.011848,1.0,1.0,2,1,51,727,0.027817,244


In [339]:
missing_cols = set(pos_df.columns) - set(neg_df.columns)
missing_cols

{'article_engagement_ratio',
 'price',
 'quantity',
 'sales_channel_1',
 'sales_channel_2'}

In [340]:
neg_df.head()

Unnamed: 0,user_index,item_index,target,t_dat,product_type_no,graphical_appearance_no,department_no,index_code,index_group_no,section_no,...,item_purchase_frequency,item_avg_price_level,FN,Active,club_member_status,fashion_news_frequency,age,time_diff_days,user_purchase_quant,RFM_Score
0,5,36378,0,2020-09-16,-1,1010016,1339,B,1,61,...,5.0,0.025406,1.0,1.0,2,1,51,727,0.027817,244
1,5,19136,0,2020-09-16,254,1010016,1641,A,1,18,...,0.0,0.033875,1.0,1.0,2,1,51,727,0.027817,244
2,5,10125,0,2020-09-16,272,1010004,1717,A,1,11,...,0.0,0.031616,1.0,1.0,2,1,51,727,0.027817,244
3,5,22815,0,2020-09-16,274,1010016,1723,A,1,15,...,2.5,0.022018,1.0,1.0,2,1,51,727,0.027817,244
4,5,10139,0,2020-09-16,258,1010017,1543,D,2,53,...,2.0,0.011848,1.0,1.0,2,1,51,727,0.027817,244


In [341]:
# get price of item_index 13422 in transactions
transactions[transactions['item_index'] == 11563]['price'].values


array([0.04236], dtype=float16)

In [342]:
transactions.columns

Index(['t_dat', 'price', 'sales_channel_1', 'sales_channel_2', 'quantity',
       'article_engagement_ratio', 'user_index', 'item_index'],
      dtype='object')

In [343]:
def add_transaction_features(neg_df, transactions):
    # Calculate mean price and quantity for each item_index
    item_info = transactions.groupby('item_index').agg({'price': 'mean', 'quantity': 'mean', 'article_engagement_ratio': 'mean'}).reset_index()
    
    # Calculate mode of sales_channel_1 and sales_channel_2 for each item_index
    sales_channel_info = transactions.groupby('item_index').agg({'sales_channel_1': lambda x: x.mode()[0], 
                                                                  'sales_channel_2': lambda x: x.mode()[0]}).reset_index()
    
    # Merge the item_info and sales_channel_info dataframes
    item_sales_info = pd.merge(item_info, sales_channel_info, on='item_index')
    
    # Add the transaction features to neg_df based on item_index
    for idx in item_sales_info.index:
        item_idx = item_sales_info.at[idx, 'item_index']
        price = item_sales_info.at[idx, 'price']
        quantity = item_sales_info.at[idx, 'quantity']
        sales_channel_1 = item_sales_info.at[idx, 'sales_channel_1']
        sales_channel_2 = item_sales_info.at[idx, 'sales_channel_2']
        article_engagement_ratio = item_sales_info.at[idx, 'article_engagement_ratio']
        

        neg_df.loc[neg_df['item_index'] == item_idx, 'price'] = price
        neg_df.loc[neg_df['item_index'] == item_idx, 'quantity'] = quantity
        neg_df.loc[neg_df['item_index'] == item_idx, 'sales_channel_1'] = sales_channel_1
        neg_df.loc[neg_df['item_index'] == item_idx, 'sales_channel_2'] = sales_channel_2
        neg_df.loc[neg_df['item_index'] == item_idx, 'article_engagement_ratio'] = article_engagement_ratio
        
        # If mode of sales_channel_1 is True, set sales_channel_2 to False, and vice versa
        if sales_channel_1 == True:
            neg_df.loc[neg_df['item_index'] == item_idx, 'sales_channel_2'] = False
        elif sales_channel_2 == True:
            neg_df.loc[neg_df['item_index'] == item_idx, 'sales_channel_1'] = False
    
    return neg_df

In [344]:
neg_df = add_transaction_features(neg_df, transactions)
neg_df.shape

(39700, 31)

In [347]:
missing_cols = set(pos_df.columns) - set(neg_df.columns)
missing_cols

set()

In [348]:
pos_df.shape

(126622, 31)

In [349]:
neg_df.shape

(39700, 31)

In [350]:
# find columns that pos_df that are not in neg_df
pos_df.columns.difference(neg_df.columns)

Index([], dtype='object')

In [351]:
neg_df.columns.difference(pos_df.columns)

Index([], dtype='object')

In [352]:
pos_df.columns

Index(['t_dat', 'price', 'sales_channel_1', 'sales_channel_2', 'quantity',
       'article_engagement_ratio', 'user_index', 'item_index', 'FN', 'Active',
       'club_member_status', 'fashion_news_frequency', 'age', 'time_diff_days',
       'user_purchase_quant', 'RFM_Score', 'product_type_no',
       'graphical_appearance_no', 'department_no', 'index_code',
       'index_group_no', 'section_no', 'garment_group_no', 'age_diff',
       'mean_purchase_age', 'max_purchase_age', 'min_purchase_age',
       'article_preference', 'item_purchase_frequency', 'item_avg_price_level',
       'target'],
      dtype='object')

In [353]:
# order the columns in the same order
neg_df = neg_df[pos_df.columns]
neg_df.columns

Index(['t_dat', 'price', 'sales_channel_1', 'sales_channel_2', 'quantity',
       'article_engagement_ratio', 'user_index', 'item_index', 'FN', 'Active',
       'club_member_status', 'fashion_news_frequency', 'age', 'time_diff_days',
       'user_purchase_quant', 'RFM_Score', 'product_type_no',
       'graphical_appearance_no', 'department_no', 'index_code',
       'index_group_no', 'section_no', 'garment_group_no', 'age_diff',
       'mean_purchase_age', 'max_purchase_age', 'min_purchase_age',
       'article_preference', 'item_purchase_frequency', 'item_avg_price_level',
       'target'],
      dtype='object')

In [355]:
df_pos_neg = pd.concat([pos_df, neg_df])
df_pos_neg.sort_values(['t_dat', 'user_index'], inplace = True)

In [356]:
# print unique values of target
df_pos_neg.shape

(166322, 31)

In [357]:
# extracting time-based features

df_pos_neg['t_dat'] = pd.to_datetime(df_pos_neg['t_dat'])
df_pos_neg['year'] = df_pos_neg['t_dat'].dt.year
df_pos_neg['month'] = df_pos_neg['t_dat'].dt.month
df_pos_neg['day'] = df_pos_neg['t_dat'].dt.day

In [358]:
# one-hot encode garment_group_no and index_group_no columns
one_hot_cols = ['garment_group_no', 'index_group_no']
df_pos_neg = pd.get_dummies(df_pos_neg, columns=one_hot_cols, prefix=one_hot_cols)

df_pos_neg.shape

(166322, 58)

In [359]:
df_pos_neg.columns

Index(['t_dat', 'price', 'sales_channel_1', 'sales_channel_2', 'quantity',
       'article_engagement_ratio', 'user_index', 'item_index', 'FN', 'Active',
       'club_member_status', 'fashion_news_frequency', 'age', 'time_diff_days',
       'user_purchase_quant', 'RFM_Score', 'product_type_no',
       'graphical_appearance_no', 'department_no', 'index_code', 'section_no',
       'age_diff', 'mean_purchase_age', 'max_purchase_age', 'min_purchase_age',
       'article_preference', 'item_purchase_frequency', 'item_avg_price_level',
       'target', 'year', 'month', 'day', 'garment_group_no_1001',
       'garment_group_no_1002', 'garment_group_no_1003',
       'garment_group_no_1005', 'garment_group_no_1006',
       'garment_group_no_1007', 'garment_group_no_1008',
       'garment_group_no_1009', 'garment_group_no_1010',
       'garment_group_no_1011', 'garment_group_no_1012',
       'garment_group_no_1013', 'garment_group_no_1014',
       'garment_group_no_1016', 'garment_group_no_1017',


In [360]:
df_pos_neg.dtypes

t_dat                       datetime64[ns]
price                              float64
sales_channel_1                     object
sales_channel_2                     object
quantity                           float64
article_engagement_ratio           float64
user_index                           int64
item_index                           int64
FN                                 float16
Active                             float16
club_member_status                    int8
fashion_news_frequency               int64
age                                   int8
time_diff_days                       int32
user_purchase_quant                float64
RFM_Score                            int32
product_type_no                      int16
graphical_appearance_no              int32
department_no                        int16
index_code                          object
section_no                            int8
age_diff                           float64
mean_purchase_age                  float64
max_purchas

In [361]:
df_pos_neg.isnull().sum()

t_dat                       0
price                       0
sales_channel_1             0
sales_channel_2             0
quantity                    0
article_engagement_ratio    0
user_index                  0
item_index                  0
FN                          0
Active                      0
club_member_status          0
fashion_news_frequency      0
age                         0
time_diff_days              0
user_purchase_quant         0
RFM_Score                   0
product_type_no             0
graphical_appearance_no     0
department_no               0
index_code                  0
section_no                  0
age_diff                    0
mean_purchase_age           0
max_purchase_age            0
min_purchase_age            0
article_preference          0
item_purchase_frequency     0
item_avg_price_level        0
target                      0
year                        0
month                       0
day                         0
garment_group_no_1001       0
garment_gr

In [362]:
# final touches

# drop index_code
# df_pos_neg = df_pos_neg.drop(['index_code'], axis=1)

# convert sales_channel_ 1 to to boolean
df_pos_neg['sales_channel_1'] = df_pos_neg['sales_channel_1'].astype('bool')
df_pos_neg['sales_channel_2'] = df_pos_neg['sales_channel_2'].astype('bool')
df_pos_neg.to_pickle('lightgbm/df_pos_neg.pkl')
# df_pos_neg.dtypes

In [365]:
df_pos_neg.shape

(166322, 58)

In [None]:
# user_indices, item_indices = user_item_matrix.get_shape()

# print('Number of users: %d' % user_indices)
# print('Number of items: %d' % item_indices)

In [None]:
# import pickle

# # Save the matrix as a pickle file
# with open('user_item_matrix_200.pkl', 'wb') as f:
#     pickle.dump(user_item_matrix, f)

In [None]:
# # load user_item_matrix from pickle file

# with open('user_item_matrix_200.pkl', 'rb') as f:
#     user_item_matrix = pickle.load(f)

# user_item_matrix = user_item_matrix.toarray()

# # extract indices of non-zero elements
# user_purchased_indices, item_purchased_indices = user_item_matrix.nonzero()

# print('user_purchased_indices: ', user_indices)
# print('item_purchased_indices: ', item_indices)

In [None]:
# # save the article_id_indices_map and user_id_indices_map as pickle files
# with open('lightgbm/article_id_indices_map.pkl', 'wb') as f:
#     pickle.dump(article_id_indices_map, f)
# with open('lightgbm/customer_id_indices_map.pkl', 'wb') as f:
#     pickle.dump(customer_id_indices_map, f)

The feature engineering is now complete, and we can now train the model. Taerget encoding will be apploed ad-hoc to the categorical features with >30 unique values.