# **<a id="Content">HnM RecSys Notebook 9417</a>**

## **<a id="Content">Table of Contents</a>**
* [**<span>1. Imports</span>**](#Imports)  
* [**<span>2. Pre-Processing</span>**](#Pre-Processing)
* [**<span>3. Exploratory Data Analysis</span>**](#Exploratory-Data-Analysis)  
    * [**<span>3.1 Articles</span>**](#EDA::Articles)  
    * [**<span>3.2 Customers</span>**](#EDA::Customers)
    * [**<span>3.3 Transactions</span>**](#EDA::Transactions)
* [**<span>4. Helper FunctionsDecorators</span>**](#Helper-Functions)
* [**<span>5. Models</span>**](#Models) 
    * [**<span>5.1 Popularity</span>**](#Popularity-Model)   
    * [**<span>5.2 ALS</span>**](#Alternating-Least-Squares)  
    * [**<span>5.2 GBDT</span>**](#GBDT)  
    * [**<span>5.3 SGD/similar</span>**](#SGD)  
    * [**<span>5.4 NN</span>**](#NN)

## Imports

In [41]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import os
import re
import warnings
# import cudf # switch on P100 GPU for this to work in Kaggle
# import cupy as cp

# Importing data
articles = pd.read_csv('articles.csv')
print(articles.head())
print("--")
customers = pd.read_csv('customers.csv')
print(customers.head())
print("--")
transactions = pd.read_csv("transactions_train.csv")
print(transactions.head())
print("--")

   article_id  product_code          prod_name  product_type_no   
0   108775015        108775          Strap top              253  \
1   108775044        108775          Strap top              253   
2   108775051        108775      Strap top (1)              253   
3   110065001        110065  OP T-shirt (Idro)              306   
4   110065002        110065  OP T-shirt (Idro)              306   

  product_type_name  product_group_name  graphical_appearance_no   
0          Vest top  Garment Upper body                  1010016  \
1          Vest top  Garment Upper body                  1010016   
2          Vest top  Garment Upper body                  1010017   
3               Bra           Underwear                  1010016   
4               Bra           Underwear                  1010016   

  graphical_appearance_name  colour_group_code colour_group_name  ...   
0                     Solid                  9             Black  ...  \
1                     Solid               

## Pre-Processing

In [42]:
# ----- empty value stats -------------
print("Missing values: ")
print(customers.isnull().sum())
print("--\n")

print("FN Newsletter vals: ", customers['FN'].unique())
print("Active communication vals: ",customers['Active'].unique())
print("Club member status vals: ", customers['club_member_status'].unique())
print("Fashion News frequency vals: ", customers['fashion_news_frequency'].unique())
print("--\n")

# ---- data cleaning -------------

customers['FN'] = customers['FN'].fillna(0)
customers['Active'] = customers['Active'].fillna(0)

# replace club_member_status missing values with 'LEFT CLUB' --> no members with LEFT CLUB status in data
customers['club_member_status'] = customers['club_member_status'].fillna('LEFT CLUB')
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].fillna('None')
customers['fashion_news_frequency'] = customers['fashion_news_frequency'].replace('NONE', 'None')
customers['age'] = customers['age'].fillna(customers['age'].mean())
customers['age'] = customers['age'].astype(int)
articles['detail_desc'] = articles['detail_desc'].fillna('None')


print("Customers' Missing values: ")
print(customers.isnull().sum())
print("--\n")

Missing values: 
customer_id                    0
FN                        895050
Active                    907576
club_member_status          6062
fashion_news_frequency     16011
age                        15861
postal_code                    0
dtype: int64
--

FN Newsletter vals:  [nan  1.]
Active communication vals:  [nan  1.]
Club member status vals:  ['ACTIVE' nan 'PRE-CREATE' 'LEFT CLUB']
Fashion News frequency vals:  ['NONE' 'Regularly' nan 'Monthly']
--

Customers' Missing values: 
customer_id               0
FN                        0
Active                    0
club_member_status        0
fashion_news_frequency    0
age                       0
postal_code               0
dtype: int64
--



In [43]:
# ---- memory optimizations -------------

# reference: https://www.kaggle.com/arjanso/reducing-dataframe-memory-size-by-65

# iterate through all the columns of a dataframe and reduce the int and float data types to the smallest possible size, ex. customer_id should not be reduced from int64 to a samller value as it would have collisions
import numpy as np
import pandas as pd

def reduce_mem_usage(df):
    """Iterate over all the columns of a DataFrame and modify the data type
    to reduce memory usage, handling ordered Categoricals"""
    
    # check the memory usage of the DataFrame
    start_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage of dataframe is {:.2f} MB".format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type == 'category':
            if df[col].cat.ordered:
                # Convert ordered Categorical to an integer
                df[col] = df[col].cat.codes.astype('int16')
            else:
                # Convert unordered Categorical to a string
                df[col] = df[col].astype('str')
        
        elif col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min >= np.iinfo(np.int8).min and c_max <= np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min >= np.iinfo(np.int16).min and c_max <= np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min >= np.iinfo(np.int32).min and c_max <= np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min >= np.iinfo(np.int64).min and c_max <= np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min >= np.finfo(np.float16).min and c_max <= np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min >= np.finfo(np.float32).min and c_max <= np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
    
    # check the memory usage after optimization
    end_mem = df.memory_usage().sum() / 1024**2
    print("Memory usage after optimization is: {:.2f} MB".format(end_mem))

    # calculate the percentage of the memory usage reduction
    mem_reduction = 100 * (start_mem - end_mem) / start_mem
    print("Memory usage decreased by {:.1f}%".format(mem_reduction))
    
    return df

   

In [44]:
print("Articles Info: ")
print(articles.info())
print("Customer Info: ")
print(customers.info())
print("Transactions Info: ")
print(transactions.info())

Articles Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   article_id                    105542 non-null  int64 
 1   product_code                  105542 non-null  int64 
 2   prod_name                     105542 non-null  object
 3   product_type_no               105542 non-null  int64 
 4   product_type_name             105542 non-null  object
 5   product_group_name            105542 non-null  object
 6   graphical_appearance_no       105542 non-null  int64 
 7   graphical_appearance_name     105542 non-null  object
 8   colour_group_code             105542 non-null  int64 
 9   colour_group_name             105542 non-null  object
 10  perceived_colour_value_id     105542 non-null  int64 
 11  perceived_colour_value_name   105542 non-null  object
 12  perceived_colour_master_id    105542 non-n

In [45]:
# ---- memory optimizations -------------

# uses 8 bytes instead of given 64 byte string, reduces mem by 8x, 
# !!!! have to convert back before merging w/ sample_submissions.csv
# convert transactions['customer_id'] to 8 bytes int
# transactions['customer_id'] = transactions['customer_id'].astype('int64')
transactions['customer_id'] = transactions['customer_id'].apply(lambda x: int(x[-16:], 16)).astype('int64')
customers['customer_id'] = customers['customer_id'].apply(lambda x: int(x[-16:], 16)).astype('int64')

articles = reduce_mem_usage(articles)
customers = reduce_mem_usage(customers)
transactions = reduce_mem_usage(transactions)

# articles['article_id'] = articles['article_id'].astype('int32')
# transactions['article_id'] = transactions['article_id'].astype('int32') 
# # !!!! ADD LEADING ZERO BACK BEFORE SUBMISSION OF PREDICTIONS TO KAGGLE: 
# # Ex.: transactions['article_id'] = '0' + transactions.article_id.astype('str')

print("Articles Info: ")
print(articles.info())
print("Customer Info: ")
print(customers.info())
print("Transactions Info: ")
print(transactions.info())

Memory usage of dataframe is 20.13 MB
Memory usage after optimization is: 13.59 MB
Memory usage decreased by 32.5%
Memory usage of dataframe is 68.04 MB
Memory usage after optimization is: 48.41 MB
Memory usage decreased by 28.8%
Memory usage of dataframe is 1212.63 MB
Memory usage after optimization is: 697.26 MB
Memory usage decreased by 42.5%
Articles Info: 
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105542 entries, 0 to 105541
Data columns (total 25 columns):
 #   Column                        Non-Null Count   Dtype 
---  ------                        --------------   ----- 
 0   article_id                    105542 non-null  int32 
 1   product_code                  105542 non-null  int32 
 2   prod_name                     105542 non-null  object
 3   product_type_no               105542 non-null  int16 
 4   product_type_name             105542 non-null  object
 5   product_group_name            105542 non-null  object
 6   graphical_appearance_no       105542 non-null  i

## Helper-Functions

In [74]:
# reference: https://towardsdatascience.com/python-decorators-for-data-science-6913f717669a

# memoization decorator
def memoize(func):
    cache = {}
    def wrapper(*args):
        if args in cache:
            return cache[args]
        else:
            result = func(*args)
            cache[args] = result
            return result
    return wrapper

In [75]:
# time-based splitting strategy

def split_train_val_data(transactions, days=7):
    """
    Splits the transaction training data into a training set and a validation set of 7 days to prevent data leakage.
    """
    
    transactions['t_dat'] = pd.to_datetime(transactions['t_dat'])
    transactions = transactions.sort_values(by=['t_dat'])
    latest_transaction_date = transactions['t_dat'].max()
    
    training_set = transactions[transactions['t_dat'] < latest_transaction_date - pd.Timedelta(days=days)]
    validation_set = transactions[transactions['t_dat'] >= latest_transaction_date - pd.Timedelta(days=days)]
    
    print("Training set size:", len(training_set))
    print("Validation set size:", len(validation_set))
    print("Last date in training set:", training_set['t_dat'].max())
    print("Last date in validation set:", validation_set['t_dat'].max())
    
    return training_set, validation_set


### Alternating-Least-Squares

ALS simply factorizes the the user-item interaction matrix, and creates two latent factor matrices: one for users and one for items. <br>
Used Bayesian Optimization for hyperparameter tuning to find the best hyperparameters. <br>
This model can deal well with sparse data, but makes low quality predictions based on the implicit feedback. <br>

The sparsity of the data, due to the fact that most H&M customers only buy a few items, is a problem for the ALS model. The more sparse the data, the lower the recommendation quality.<br>

The final tuned model is also used in LightGBM to factorize the user-item matrix - for cosine similairty features.


In [79]:
import implicit
import scipy.sparse as sparse
from scipy.sparse.linalg import spsolve
import random
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler

  from .autonotebook import tqdm as notebook_tqdm


In [80]:
def preprocess_data(transactions_df, customers_df, articles_df, customers_col='customer_id', articles_col='article_id'):
    """
    Preprocesses customer and article IDs for use in a sparse matrix.
    
    Returns:
    - transactions_df: the input transaction DataFrame with two additional columns, 'user_index' and 'item_index',
                       that map customer and article IDs to their corresponding indices in a sparse matrix
    - customer_id_indices_map: a dictionary that maps customer IDs to their corresponding indices
    - article_id_indices_map: a dictionary that maps article IDs to their corresponding indices
    """
    # Create a list of unique customer IDs and product IDs
    all_customers = customers_df[customers_col].unique().tolist()
    all_articles = articles_df[articles_col].unique().tolist()

    # Create dicts mapping IDs to their corresponding indices
    customer_id_indices_map = {customer_id: i for i, customer_id in enumerate(all_customers)}
    article_id_indices_map = {article_id: i for i, article_id in enumerate(all_articles)}

    # Map customer and article IDs to their resp. indices in the transaction DataFrame
    transactions_df['user_index'] = transactions_df[customers_col].map(customer_id_indices_map)
    transactions_df['item_index'] = transactions_df[articles_col].map(article_id_indices_map)

    return transactions_df, all_customers, all_articles, customer_id_indices_map, article_id_indices_map


In [81]:
transactions, all_customers, all_articles, customer_id_indices_map, article_id_indices_map = preprocess_data(transactions, customers, articles)

print("Total num of customers: ", len(all_customers))
print("Total num of articles: ", len(all_articles))
print("Customer ID mapping: ", list(customer_id_indices_map.items())[:5])
print("Article ID mapping: ", list(article_id_indices_map.items())[:5])
transactions.head()

Total num of customers:  1371980
Total num of articles:  105542
Customer ID mapping:  [(6883939031699146327, 0), (-7200416642310594310, 1), (-6846340800584936, 2), (-94071612138601410, 3), (-283965518499174310, 4)]
Article ID mapping:  [(108775015, 0), (108775044, 1), (108775051, 2), (110065001, 3), (110065002, 4)]


Unnamed: 0,t_dat,customer_id,article_id,price,sales_channel_id,user_index,item_index
0,2018-09-20,-6846340800584936,663713001,0.0508,2,2,40179
1,2018-09-20,-6846340800584936,541518023,0.0305,2,2,10520
2,2018-09-20,-8334631767138808638,505221004,0.0152,2,7,6387
3,2018-09-20,-8334631767138808638,685687003,0.0169,2,7,46304
4,2018-09-20,-8334631767138808638,685687004,0.0169,2,7,46305


In [82]:
training_set, validation_set = split_train_val_data(transactions)

Training set size: 31521960
Validation set size: 266364
Last date in training set: 2020-09-14 00:00:00
Last date in validation set: 2020-09-22 00:00:00


<b>ALS strategy.</b><br> binary `implicit feedback`: based on whether a product was purchased or not (purchase quantity is not taken into consideration).

In [83]:
# Create a sparse matrix of all user-item (a.k.a customer_id-article_id) interactions
# Supply training set, val set or entire transactions df

def create_user_item_matrix(transactions_df):
    # all customers and articles in their resp. rows of transaction data indicate that an article was purchased, thus:
    interaction = np.ones(transactions_df.shape[0]) 
    user_rows = transactions_df['user_index'].values
    item_cols = transactions_df['item_index'].values

    user_item_matrix = sparse.csr_matrix((interaction, (user_rows, item_cols)), shape=(len(all_customers), len(all_articles)))
    
    return user_item_matrix

In [84]:
user_item_training_matrix = create_user_item_matrix(training_set)
user_item_validation_matrix = create_user_item_matrix(validation_set)

user_item_training_matrix

# print the first row value in the matrix
print(user_item_training_matrix[0])

  (0, 99)	1.0
  (0, 16003)	2.0
  (0, 16023)	1.0
  (0, 23996)	1.0
  (0, 29516)	1.0
  (0, 30327)	1.0
  (0, 38172)	1.0
  (0, 49478)	1.0
  (0, 50724)	1.0
  (0, 65667)	1.0
  (0, 76503)	1.0
  (0, 76590)	1.0
  (0, 78719)	1.0
  (0, 79278)	2.0
  (0, 83622)	1.0
  (0, 90060)	1.0
  (0, 93744)	1.0
  (0, 99926)	1.0
  (0, 100484)	1.0


In the above CSR matrix output row: `(x, yyyy)	z`: <br>
'x' (row index) corresponds to the index-encoded `customer_id` <br>
'y' (column index) corresponds to the index-encoded `article_id` <br>
'z' corresponds to total number of purchases of 'yyyy' by 'x'<br>

We have 1,371,980 customers and 105542 articles in transaction data (as seen previously). <br>
There are 27,079,047 interactions in the matrix. There are a total of 27,306,439 unique interactions (if we used the entire transaction dataset to create the matrix).

In [85]:
# Calculating Sparsity of sparse_ratings_matrix

sparse_matrix_size = user_item_training_matrix.shape[0]*user_item_training_matrix.shape[1] 
num_purchases = len(user_item_training_matrix.nonzero()[0]) 
sparsity = 100*(1 - (num_purchases/sparse_matrix_size))
print("ratings matrix sparsity: ", sparsity)


ratings matrix sparsity:  99.98129919611401


<b>note: the current ratings matrix has 99.98% sparsity, and could thus affect the quality of recommendations.</b> <br>
~31,000,000 (all) transactions sparsity: 99.98% <br>
1,000,000 transactions sparsity: 99.997% <br>
500,000 transactions sparsity: 99.997% <br>
100,000 transactions sparsity: 99.996% <br>

<b>Training:</b> <br>

In [86]:
import logging
import implicit

# default values taken from : http://yifanhu.net/PUB/cf.pdf

def train_als_model(user_item_matrix, factors=50, iterations=20, regularization=0.1, alpha = 50, random_state=69, use_gpu=False):
    """
    Trains an ALS model using implicit library and returns the trained model.
    
    Args:
    - user_item_matrix: a sparse user-item matrix in CSR format
    - factors: the number of latent factors to use (default=500)
    - alpha: multiply alpha to training to scale the confidence of the user-item interactions (default=1)
  
    Returns:
    - als_model: the trained ALS model
    """
    
    # set logging level to DEBUG
    logging.basicConfig(level=logging.DEBUG)
    
    
    als_model = implicit.als.AlternatingLeastSquares(factors=factors, iterations=iterations, regularization=regularization, 
                                                     random_state=random_state, use_gpu=use_gpu)
    als_model.fit(user_item_matrix*alpha, show_progress=True)
    
    return als_model


In [87]:
als_model = train_als_model(user_item_training_matrix, alpha=50)

DEBUG:implicit:Calculated transpose in 0.311s
DEBUG:implicit:Initialized factors in 0.7812304496765137
DEBUG:implicit:Running 20 ALS iterations
100%|██████████| 20/20 [08:41<00:00, 26.06s/it]


In [88]:
# Check the array of user and item latent factors in the training set
print(als_model.user_factors.shape)
print("---")
print(als_model.item_factors.shape)

(1371980, 50)
---
(105542, 50)


<b>Evaluation:</b> <br>

In [89]:
# Using KMAP@12 to validate the model

from implicit.evaluation import mean_average_precision_at_k as map_at_k

map_12 = map_at_k(als_model, user_item_training_matrix, user_item_validation_matrix, K=12, show_progress=True, num_threads=1)


100%|██████████| 75481/75481 [00:20<00:00, 3716.03it/s]


In [90]:
map_12

0.0037245391463783645

## Evaluation

In [91]:
# hyperparameter tuning

# references: https://github.com/fmfn/BayesianOptimization/blob/master/examples/basic-tour.ipynb
#             https://towardsdatascience.com/bayesian-optimization-concept-explained-in-layman-terms

from bayes_opt import BayesianOptimization
from bayes_opt.logger import JSONLogger
from bayes_opt.event import Events
from implicit.evaluation import mean_average_precision_at_k as map_at_k


# map_at_k is the objective function to be maximized
def map_at_k_als(factors, iterations, regularization, alpha):
    global user_item_training_matrix, user_item_validation_matrix
    als_model = implicit.als.AlternatingLeastSquares(factors=int(factors), iterations=int(iterations), 
                                                     regularization=regularization, random_state=69, use_gpu=False)
    als_model.fit(user_item_training_matrix*alpha, show_progress=True)
    map_12 = map_at_k(als_model, user_item_training_matrix, user_item_validation_matrix, K=12, show_progress=False, num_threads=1)
    return map_12


# Bounded region of parameter space
pbounds = {'factors': (10, 500), 'iterations': (1, 100), 'regularization': (0.01, 0.5), 'alpha': (10, 100)}

In [92]:
optimizer = BayesianOptimization(
    f=map_at_k_als,
    pbounds=pbounds,
    random_state=1,
    verbose=2
)

logger = JSONLogger(path="./als/als_strat1_logs.json")
optimizer.subscribe(Events.OPTIMIZATION_STEP, logger)

user_item_training_matrix, user_item_validation_matrix
optimizer.maximize(
    init_points=5, # number of bayesian optimization steps to perform
    n_iter=10, # how many steps of random exploration to perform -- diversifies exploration space
    # acq='ucb', # acquisition function type
    # kappa=2.576, # balance between exploration and exploitation
)

In [93]:
from bayes_opt.util import load_logs

loaded_optimizer = BayesianOptimization(
    f=map_at_k_als,
    pbounds=pbounds,
    verbose=1,
    random_state=2,
)


load_logs(loaded_optimizer, logs=["./als/als_strat1_logs.json"])
loaded_optimizer.max

{'target': 0.004267258343715801,
 'params': {'alpha': 20.51678125995256,
  'factors': 151.9606520546814,
  'iterations': 99.51984263986631,
  'regularization': 0.4375570316637724}}

In [96]:
# Final ALS model with best hyperparameters and all training data

best_hyperparams = loaded_optimizer.max['params']

user_item_train_matrix = create_user_item_matrix(transactions)
als_model = train_als_model(user_item_train_matrix, int(best_hyperparams['factors']), int(best_hyperparams['iterations']), int(best_hyperparams['regularization']), int(best_hyperparams['alpha']))

# Check the array of user and item latent factors in the training set
print(als_model.user_factors.shape)
print("---")
print(als_model.item_factors.shape)

DEBUG:implicit:Calculated transpose in 0.379s
DEBUG:implicit:Initialized factors in 2.5063974857330322
DEBUG:implicit:Running 99 ALS iterations
100%|██████████| 99/99 [1:06:43<00:00, 40.44s/it]


(1371980, 151)
---
(105542, 151)


<b> As we can see, the ALS model performs quite poorly on the test set, with a max MAP score od 0.0043 post-Bayesian Optimizaation. </b> <br>

In [98]:
# Save the als model as a pickle file
import pickle

with open('als/als_model_strat1.pkl', 'wb') as f:
    pickle.dump(als_model, f)


In [101]:
# Submission function that generates top 12 recommendations for each customer_id and appends to submission.csv
# if the customer_id is not in the training set, then the top 12 articles are recommended
# ensure that customer_id and article_id are strings


# reverse dictionary that maps index to article_id
index_article_id_map = {v: k for k, v in article_id_indices_map.items()}
index_customer_id_map = {v: k for k, v in customer_id_indices_map.items()}

# given an article_index, return the original article_id
def get_og_article_id(article_index):
    return '0' + str(index_article_id_map.get(article_index, None))


def submission_als(als_model):
    """ 
    Args:
    - als_model: the trained ALS model
 
    Returns:
    - submission.csv: a csv file with the top 12 recommendations for each customer_id
    """
    no_interaction_count = 0
    samp_sub = pd.read_csv('sample_submission.csv')
    sub_customer_ids = samp_sub['customer_id'].apply(lambda x: int(x[-16:], 16)).astype('int64') 

    # map sample_sub customer_id to sub_customer_ids
    customer_id_sub_map = dict(zip(sub_customer_ids, samp_sub['customer_id']))

    # iterate thru all customer_ids in all_customers
    # generate top 12 recommendations for each customer_id using the trained als model

    predictions = []
    for customer_index in range(0, len(all_customers)):
        if customer_index not in user_item_train_matrix.indptr:
            recommended_articles = latest_top_12_products
            no_interaction_count += 1
        else:
            # filter_already_liked_items=False ==> recommend items that the user has already interacted with as well
            recco_articles, scores = als_model.recommend(customer_index, user_item_train_matrix, N=12, filter_already_liked_items=False)
            recommended_articles = [article_id for article_id in recco_articles]
            
        article_ids = [get_og_article_id(article_id) for article_id in recommended_articles]
        # append the top 12 recommendations for each customer_id to predictions
        predictions.append((customer_id_sub_map[index_customer_id_map[customer_index]], ' '.join(article_ids)))

    submission_df = pd.DataFrame(predictions, columns=['customer_id', 'prediction'])
    submission_df.to_csv('als/als_strat_submission.csv', index=False)
    display(submission_df.head())
    print(submission_df.shape)

    return submission_df, no_interaction_count

In [102]:
predictions, no_interaction_count = submission_als(als_model)

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0568601006 0568597006 0568601007 0795440001 05...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0306847011 0288825017 0288825012 0283236034 02...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0306847011 0288825017 0288825012 0283236034 02...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0306847011 0288825017 0288825012 0283236034 02...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0306847011 0288825017 0288825012 0283236034 02...


(1371980, 2)


In [103]:
no_interaction_count/len(all_customers) * 100

95.03600635577779

As we can see, 95% of customers dont have any interaction rows in the sparse matrix, and thus the model is unable to make recommendations for these customers. <br> This is a cold start problem, and we will use the popularity model to recommend the top 12 items to these customers instead. <br>

Intuitively, we can see that ALS is not the best model for this dataset, since the ratings matrix is very sparse, and thus the model is unable to make recommendations for most customers. <br> 

In [1]:
# Submission function that just pust latest top 12 products for each customer_id and appends to submission.csv

def submission_pop(popularity_model):
    """ 
    Args:
    - als_model: the trained ALS model
 
    Returns:
    - submission.csv: a csv file with the top 12 recommendations for each customer_id
    """
    samp_sub = pd.read_csv('sample_submission.csv')
    sub_customer_ids = samp_sub['customer_id'].apply(lambda x: int(x[-16:], 16)).astype('int64') 

    # map sample_sub customer_id to sub_customer_ids
    customer_id_sub_map = dict(zip(sub_customer_ids, samp_sub['customer_id']))

    # iterate thru all customer_ids in all_customers
    # generate top 12 recommendations for each customer_id using the trained als model

    predictions = []
    for customer_index in range(0, len(all_customers)):
        recommended_articles = popularity_model
        article_ids = [get_og_article_id(article_id) for article_id in recommended_articles]
        # append the top 12 recommendations for each customer_id to predictions
        predictions.append((customer_id_sub_map[index_customer_id_map[customer_index]], ' '.join(article_ids)))

    submission_df = pd.DataFrame(predictions, columns=['customer_id', 'prediction'])
    submission_df.to_csv('pop/popularity_strat_submission.csv', index=False)
    display(submission_df.head())
    print(submission_df.shape)

    return submission_df

In [105]:
popularity_predictions = submission_pop(latest_top_12_products)

Unnamed: 0,customer_id,prediction
0,00000dbacae5abe5e23885899a1fa44253a17956c6d1c3...,0306847011 0288825017 0288825012 0283236034 02...
1,0000423b00ade91418cceaf3b26c6af3dd342b51fd051e...,0306847011 0288825017 0288825012 0283236034 02...
2,000058a12d5b43e67d225668fa1f8d618c13dc232df0ca...,0306847011 0288825017 0288825012 0283236034 02...
3,00005ca1c9ed5f5146b52ac8639a40ca9d57aeff4d1bd2...,0306847011 0288825017 0288825012 0283236034 02...
4,00006413d8573cd20ed7128e53b7b13819fe5cfc2d801f...,0306847011 0288825017 0288825012 0283236034 02...


(1371980, 2)
