# Business Case 4 - Recommender system
Ana Marta Silva: M20200971
Natalia Cristina Castañeda: M20200575
María Luisa Noguera: M20201005
Gustavo Tourinho: M20180846


In [1]:
#!pip install annoy
#!pip install nmslib
import os
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix, csr_matrix
from implicit.als import AlternatingLeastSquares
from implicit.bpr import BayesianPersonalizedRanking
from implicit.lmf import LogisticMatrixFactorization
from implicit.evaluation import ranking_metrics_at_k
from implicit.approximate_als import AnnoyAlternatingLeastSquares
from implicit.approximate_als import NMSLibAlternatingLeastSquares
from tqdm import tqdm
import scipy.sparse as sparse
import random
from scipy.sparse.linalg import spsolve
import implicit
from implicit.als import AlternatingLeastSquares
from sklearn.utils import check_random_state
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler

# Data Exploration

In [2]:
# Read csv files
df_retail = pd.read_csv('retail.csv')

In [3]:
df_retail.head()

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,12/1/2010 8:26,2.55,17850.0,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,12/1/2010 8:26,2.75,17850.0,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,12/1/2010 8:26,3.39,17850.0,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,12/1/2010 8:26,3.39,17850.0,United Kingdom


In [4]:
#inspect the type of each column 
df_retail.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 541909 entries, 0 to 541908
Data columns (total 8 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   InvoiceNo    541909 non-null  object 
 1   StockCode    541909 non-null  object 
 2   Description  540455 non-null  object 
 3   Quantity     541909 non-null  int64  
 4   InvoiceDate  541909 non-null  object 
 5   UnitPrice    541909 non-null  float64
 6   CustomerID   406829 non-null  float64
 7   Country      541909 non-null  object 
dtypes: float64(2), int64(1), object(5)
memory usage: 33.1+ MB


In [5]:
# Describe dataset
df_retail.describe(include='all')

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
count,541909.0,541909,540455,541909.0,541909,541909.0,406829.0,541909
unique,25900.0,4070,4223,,23260,,,38
top,573585.0,85123A,WHITE HANGING HEART T-LIGHT HOLDER,,10/31/2011 14:41,,,United Kingdom
freq,1114.0,2313,2369,,1114,,,495478
mean,,,,9.55225,,4.611114,15287.69057,
std,,,,218.081158,,96.759853,1713.600303,
min,,,,-80995.0,,-11062.06,12346.0,
25%,,,,1.0,,1.25,13953.0,
50%,,,,3.0,,2.08,15152.0,
75%,,,,10.0,,4.13,16791.0,


In [6]:
# Inspect Missing values
df_retail.isna().sum() # transactionid is only present when the event is transaction

InvoiceNo           0
StockCode           0
Description      1454
Quantity            0
InvoiceDate         0
UnitPrice           0
CustomerID     135080
Country             0
dtype: int64

In [7]:
#create the dataframe with new costumers
new_customers = df_retail.loc[pd.isnull(df_retail.CustomerID) == True]
new_customers

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
622,536414,22139,,56,12/1/2010 11:52,0.00,,United Kingdom
1443,536544,21773,DECORATIVE ROSE BATHROOM BOTTLE,1,12/1/2010 14:32,2.51,,United Kingdom
1444,536544,21774,DECORATIVE CATS BATHROOM BOTTLE,2,12/1/2010 14:32,2.51,,United Kingdom
1445,536544,21786,POLKADOT RAIN HAT,4,12/1/2010 14:32,0.85,,United Kingdom
1446,536544,21787,RAIN PONCHO RETROSPOT,2,12/1/2010 14:32,1.66,,United Kingdom
...,...,...,...,...,...,...,...,...
541536,581498,85099B,JUMBO BAG RED RETROSPOT,5,12/9/2011 10:26,4.13,,United Kingdom
541537,581498,85099C,JUMBO BAG BAROQUE BLACK WHITE,4,12/9/2011 10:26,4.13,,United Kingdom
541538,581498,85150,LADIES & GENTLEMEN METAL SIGN,1,12/9/2011 10:26,4.96,,United Kingdom
541539,581498,85174,S/4 CACTI CANDLES,1,12/9/2011 10:26,10.79,,United Kingdom


# Data preparation

In [8]:
#eliminate new customers
treated_df = df_retail.loc[pd.isnull(df_retail.CustomerID) == False]

In [9]:
#Create a dataframe of unique products
unique_products = treated_df[['StockCode', 'Description']].drop_duplicates() 
unique_products['StockCode'] = unique_products.StockCode.astype(str) 

In [10]:
unique_products.head()

Unnamed: 0,StockCode,Description
0,85123A,WHITE HANGING HEART T-LIGHT HOLDER
1,71053,WHITE METAL LANTERN
2,84406B,CREAM CUPID HEARTS COAT HANGER
3,84029G,KNITTED UNION FLAG HOT WATER BOTTLE
4,84029E,RED WOOLLY HOTTIE WHITE HEART.


In [11]:
treated_df['CustomerID'] = treated_df.CustomerID.astype(int)
#eliminate transactions with a value of zero because they consist of offerings and doesn't represent their preferences
treated_df = treated_df.loc[treated_df.UnitPrice != 0]

#create a dataframe with the relevant columns to create the recommendation system and group by aggregating the quantity 
treated_df = treated_df[['StockCode', 'Quantity', 'CustomerID']] 
grouped_df = treated_df.groupby(['CustomerID', 'StockCode']).sum().reset_index() 

# eliminate the interactions that result in quantity below 0 as they consist in returning products
grouped_df = grouped_df.query('Quantity > 0')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  treated_df['CustomerID'] = treated_df.CustomerID.astype(int)


In [12]:
#transform the columns of the grouped dataframe into category
user_category = grouped_df.CustomerID.astype('category')
item_category = grouped_df.StockCode.astype('category')

In [13]:
#build the sparse matrix
customers_df = list(np.sort(grouped_df.CustomerID.unique())) 
products_df = list(grouped_df.StockCode.unique()) 
quantity_df = list(grouped_df.Quantity) 

rows = user_category.cat.codes 
cols = item_category.cat.codes 

matrix_sparse = sparse.csr_matrix((quantity_df, (rows, cols)), shape=(len(customers_df), len(products_df)))

In [14]:
matrix_sparse

<4325x3649 sparse matrix of type '<class 'numpy.intc'>'
	with 265210 stored elements in Compressed Sparse Row format>

In [15]:
#size of the matrix
matrix_size = matrix_sparse.shape[0]*matrix_sparse.shape[1] 
matrix_size

15781925

In [16]:
matrix_sparse.shape[0]

4325

In [17]:
matrix_sparse.shape[1]

3649

In [18]:
number_purchases = len(matrix_sparse.nonzero()[0])
number_purchases

265210

In [19]:
#level of sparsity
sparsity = 100*(1 - (number_purchases/matrix_size))
sparsity

98.31953326352773

# Modelling

## Split Train and Test

In [20]:
#function to create the split between train and test
def train_test_split(matrix, pct_test = 0.2):
    test_set = matrix.copy() 
    test_set[test_set != 0] = 1 
    train_set = matrix.copy() 
    nonzero_inds = train_set.nonzero() 
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1]))
    random.seed(0)
    number_samples = int(np.ceil(pct_test*len(nonzero_pairs))) 
    samples = random.sample(nonzero_pairs, number_samples) 
    user_index = [index[0] for index in samples] 
    item_index = [index[1] for index in samples] 
    train_set[user_index, item_index] = 0 
    train_set.eliminate_zeros() 
    return train_set, test_set, list(set(user_index)) 

In [21]:
retail_train, retail_test, retail_users_altered = train_test_split(matrix_sparse, pct_test = 0.2)

# Fit the Alternating Least Squares (ALS) model



In [22]:

alpha = 15  
als_model = AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50, random_state=0)
als_fit = als_model.fit((retail_train * alpha).astype('double'))





HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




# Hyperparameter tunning

## different alpha

In [23]:
alpha = 80  
als_model2 = AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50, random_state=0)
als_model2.fit((retail_train * alpha).astype('double'))

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




In [24]:
alpha = 50  
als_model3 = AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50, random_state=0)
als_model3.fit((retail_train * alpha).astype('double'))

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




## different regularization 

In [25]:
alpha = 15  
als_model4 = AlternatingLeastSquares(factors=20, regularization=0.01, iterations=50, random_state=0)
als_model4.fit((retail_train * alpha).astype('double'))

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




# Fit the Bayesian Personalized Ranking model

In [26]:
alpha = 15 
bpr_model =BayesianPersonalizedRanking(factors=20, regularization=0.1, iterations=50, random_state=0)
bpr_model.fit((retail_train * alpha).astype('double'))

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




# Fit the Logistic Matrix Factorization model

In [27]:
alpha = 15  
lmf_model = LogisticMatrixFactorization(factors=20, regularization=0.1, iterations=50, random_state=0)
lmf_model.fit((retail_train * alpha).astype('double'))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 50/50 [00:01<00:00, 26.02it/s]


# Fit the Annoy ALS model

In [28]:
alpha = 15 
annoy_model =AnnoyAlternatingLeastSquares(factors=20, regularization=0.1, iterations=50, random_state=0)
annoy_model.fit((retail_train * alpha).astype('double'))

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




# Fit the NMSL ALS model

In [29]:
alpha = 15 
nms_model =NMSLibAlternatingLeastSquares(factors=20, regularization=0.1, iterations=50, random_state=0)
nms_model.fit((retail_train * alpha).astype('double'))

HBox(children=(FloatProgress(value=0.0, max=50.0), HTML(value='')))




# Evaluation


In [30]:
#create a model that recommends the top sold products( baseline model)
class PopularRecommender():
    def fit(self, item_users):
        self.item_id_sort = np.argsort(np.squeeze(np.asarray(item_users.sum(axis=1).reshape(-1))))[::-1]
    
    def recommend(self, userid, user_items, N=10, filter_already_liked_items=None, filter_items=None, recalculate_user=None):
        if filter_already_liked_items != None or filter_items != None or recalculate_user != None:
            raise NotImplementedError("filter_already_liked_items, filter_items and recalculate_user aren't support yet")
        
        return list(zip(self.item_id_sort[:N], range(1, N + 1)))

In [31]:
#fit the model
pop_model = PopularRecommender()
pop_model.fit(retail_train)

In [32]:
# evaluatio of models
eval_models = {'pop_model': pop_model, 'als_model': als_model,'als_model2':als_model2,'als_model3':als_model3,'als_model4':als_model4,'bpr_model':bpr_model, 'lmf_model':lmf_model, 'annoy_model':annoy_model, 'nms_model':nms_model}
eval_table = {}
for k, v in eval_models.items():
    eval_table[k] = ranking_metrics_at_k(v, retail_train.T, retail_test.T, K=10, show_progress=True, num_threads=0)
eval_table = pd.DataFrame(eval_table)
eval_table

HBox(children=(FloatProgress(value=0.0, max=3649.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3649.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3649.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3649.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3649.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3649.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3649.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3649.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=3649.0), HTML(value='')))




Unnamed: 0,pop_model,als_model,als_model2,als_model3,als_model4,bpr_model,lmf_model,annoy_model,nms_model
precision,0.181158,0.035754,0.016046,0.01958,0.037314,0.076443,0.011048,0.036104,0.037728
map,0.085186,0.009857,0.004456,0.005828,0.010125,0.040597,0.002787,0.009949,0.010367
ndcg,0.171152,0.030286,0.01399,0.017573,0.031248,0.088466,0.009099,0.030601,0.031754
auc,0.515691,0.502633,0.501553,0.50192,0.502757,0.504871,0.499211,0.502783,0.502624


#  Recalculate AUC for all models

In [33]:
def auc_score(predictions, test):
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr, tpr)   

In [34]:
def mean_auc_model(train_set, altered_users, predictions, test_set):
    store_auc = []
    popularity_auc = [] 
    pop_items = np.array(test_set.sum(axis = 0)).reshape(-1)
    item_vecs = predictions[1]
    for user in altered_users: 
        train_row = train_set[user,:].toarray().reshape(-1) 
        zero_inds = np.where(train_row == 0) 
        user_vec = predictions[0][user,:]
        pred = user_vec.dot(item_vecs).toarray()[0,zero_inds].reshape(-1)
        actual = test_set[user,:].toarray()[0,zero_inds].reshape(-1) 
        pop = pop_items[zero_inds] 
        store_auc.append(auc_score(pred, actual)) 
        popularity_auc.append(auc_score(pop, actual)) 
    
    return float('%.3f'%np.mean(store_auc))  
   

In [35]:
# calculate the AUC for all the models 
auc_models = {'als_model': als_model,'als_model2':als_model2,'als_model3':als_model3,'als_model4':als_model4,'bpr_model':bpr_model, 'lmf_model':lmf_model, 'annoy_model':annoy_model, 'nms_model':nms_model}
auc_table = {}
for k, v in auc_models.items():
    auc_table[k] = mean_auc_model(retail_train, retail_users_altered,[sparse.csr_matrix(v.item_factors), sparse.csr_matrix(v.user_factors.T)], retail_test)
auc_table

{'als_model': 0.872,
 'als_model2': 0.86,
 'als_model3': 0.865,
 'als_model4': 0.873,
 'bpr_model': 0.674,
 'lmf_model': 0.649,
 'annoy_model': 0.872,
 'nms_model': 0.872}

In [36]:
#put the results in a table
auc_table = pd.DataFrame(list(auc_table.values()), index=list(auc_table.keys()))
auc_table.columns = ['AUC']

In [37]:
auc_table

Unnamed: 0,AUC
als_model,0.872
als_model2,0.86
als_model3,0.865
als_model4,0.873
bpr_model,0.674
lmf_model,0.649
annoy_model,0.872
nms_model,0.872


# Recommendation example

In [38]:
#create array of customers and products ID
customers_arr = np.array(customers_df) 
products_arr = np.array(products_df) 

In [39]:
#function that will retrieve the products bought in the past by a specific customer in the train_set
def items_purchased(customer_id, mf_train, customers_list, products_list, item_lookup):
    cust_ind = np.where(customers_list == customer_id)[0][0]
    purchased_ind = mf_train[cust_ind,:].nonzero()[1] 
    prod_codes = products_list[purchased_ind] 
    return item_lookup.loc[item_lookup.StockCode.isin(prod_codes)]

In [40]:
#function that returns the top recommendations to the users
def recommend_items(customer_id, mf_train, user_v, item_v, customer_list, item_list, item_lookup, num_items = 10):
    cust_ind = np.where(customer_list == customer_id)[0][0] 
    pref_vec = mf_train[cust_ind,:].toarray() 
    pref_vec = pref_vec.reshape(-1) + 1 
    pref_vec[pref_vec > 1] = 0 
    rec_vector = user_v[cust_ind,:].dot(item_v.T) 
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1,1))[:,0] 
    recommend_vector = pref_vec*rec_vector_scaled 
    product_idx = np.argsort(recommend_vector)[::-1][:num_items] 
    rec_list = [] 
    for index in product_idx:
        code = item_list[index]
        rec_list.append([code, item_lookup.Description.loc[item_lookup.StockCode == code].iloc[0]]) 
    codes = [item[0] for item in rec_list]
    descriptions = [item[1] for item in rec_list]
    final_frame = pd.DataFrame({'StockCode': codes, 'Description': descriptions}) # Create a dataframe 
    return final_frame[['StockCode', 'Description']] # Switch order of columns around

# Example 1

In [41]:
items_purchased(18068, retail_train, customers_arr, products_arr, unique_products)

Unnamed: 0,StockCode,Description
32025,21194,PINK HONEYCOMB PAPER FAN


In [42]:
#top 10 recommendations
recommend_items(18068, retail_train, als_model.item_factors, als_model.user_factors, customers_arr, products_arr, unique_products, num_items = 10)

Unnamed: 0,StockCode,Description
0,84789,ENCHANTED BIRD PLANT CAGE
1,85188B,PINK METAL SWINGING BUNNY
2,84535A,ENGLISH ROSE NOTEBOOK A6 SIZE
3,85214,TUB 24 PINK FLOWER PEGS
4,35004G,SET OF 3 GOLD FLYING DUCKS
5,90039B,FIRE POLISHED GLASS BRACELET MONTAN
6,85136A,YELLOW SHARK HELICOPTER
7,90155,RESIN NECKLACE W PASTEL BEADS
8,85086A,CANDY SPOT HEART DECORATION
9,84199,GLOW IN DARK DOLPHINS


# Example 2

In [43]:
items_purchased(16986, retail_train, customers_arr, products_arr, unique_products)

Unnamed: 0,StockCode,Description
75887,84907,PINK YELLOW PATCH CUSHION COVER
95582,44092B,BLUE WHITE PLASTIC RINGS LAMPSHADE
326435,23381,PACK OF 12 VINTAGE LEAF TISSUES


In [44]:
#top 10 recommendations
recommend_items(16986, retail_train, als_model.item_factors, als_model.user_factors, customers_arr, products_arr, unique_products, num_items = 10)

Unnamed: 0,StockCode,Description
0,44092C,PURPLE/COPPER HANGING LAMPSHADE
1,20902,VINTAGE KEEPSAKE BOX PARIS DAYS
2,23296,SET OF 6 TEA TIME BAKING CASES
3,84678,CLASSICAL ROSE SMALL VASE
4,84006,MAGIC TREE -PAPER FLOWERS
5,23293,SET OF 12 FAIRY CAKE BAKING CASES
6,22393,PAPERWEIGHT VINTAGE COLLAGE
7,79302M,"ART LIGHTS,FUNK MONKEY"
8,23439,HAND WARMER RED LOVE HEART
9,84568,GIRLS ALPHABET IRON ON PATCHES


# Example 3

In [45]:
items_purchased(12361, retail_train, customers_arr, products_arr, unique_products)

Unnamed: 0,StockCode,Description
1256,21520,BOOZE & WOMEN GREETING CARD
3920,22806,SET OF 6 T-LIGHTS WEDDING CAKE
7222,22307,GOLD MUG BONE CHINA TREE OF LIFE
9834,22949,36 DOILIES DOLLY GIRL
98375,21414,SCALLOP SHELL SOAP DISH
238985,23342,MINT DINER WALL CLOCK
260521,23294,SET OF 6 SNACK LOAF BAKING CASES
260523,23295,SET OF 12 MINI LOAF BAKING CASES
324499,23342,MINT DINER CLOCK
417712,22949,SET OF 36 DOLLY GIRL PAPER DOILIES


In [46]:
#top 10 recommendations
recommend_items(12361, retail_train, als_model.item_factors, als_model.user_factors, customers_arr, products_arr, unique_products, num_items = 10)

Unnamed: 0,StockCode,Description
0,22176,BLUE OWL SOFT TOY
1,23273,HEART T-LIGHT HOLDER WILLIE WINKIE
2,23281,FOLDING BUTTERFLY MIRROR RED
3,85169A,IVORY LOVE BIRD CANDLE
4,23296,SET OF 6 TEA TIME BAKING CASES
5,23379,PACK OF 12 RED APPLE TISSUES
6,22245,"HOOK, 1 HANGER ,MAGIC GARDEN"
7,22393,PAPERWEIGHT VINTAGE COLLAGE
8,23439,HAND WARMER RED LOVE HEART
9,21993,FLORAL FOLK STATIONERY SET


# Cold start

In [47]:
#we will propose the top most popular products to new users of the website

In [48]:
# the auc of the popular model is 81.3%
mean_auc_model(retail_train, retail_users_altered,[sparse.csr_matrix(als_model4.item_factors), sparse.csr_matrix(als_model4.user_factors.T)],retail_test)

0.873

In [49]:
#top 10 popular products
user_items = retail_train.T.tocsr()
recommendations=pop_model.recommend(retail_train, user_items)
recommendations

[(1682, 1),
 (54, 2),
 (1871, 3),
 (4190, 4),
 (3761, 5),
 (991, 6),
 (3718, 7),
 (1428, 8),
 (1327, 9),
 (3167, 10)]