In [1]:
import numpy as np
import scipy
import pandas as pd
import math
import random
import sklearn
from nltk.corpus import stopwords
from scipy.sparse import csr_matrix
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse.linalg import svds
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt

In [2]:
cs_df = pd.read_csv('../data/cleaned_online_retail.csv')

In [3]:
cs_df['invoicedate'] = pd.to_datetime(cs_df['invoicedate'])


cs_df.dtypes

Unnamed: 0              int64
invoiceno               int64
stockcode              object
description            object
quantity                int64
invoicedate    datetime64[ns]
unitprice             float64
customerid              int64
country                object
amount                float64
dtype: object

In [4]:
cs_df["description_enc"] = cs_df["description"].astype('category').cat.codes
cs_df.head(10)

Unnamed: 0.1,Unnamed: 0,invoiceno,stockcode,description,quantity,invoicedate,unitprice,customerid,country,amount,description_enc
0,0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,15.3,3480
1,1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,3488
2,2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,22.0,816
3,3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,1686
4,4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,20.34,2601
5,5,536365,22752,SET 7 BABUSHKA NESTING BOXES,2,2010-12-01 08:26:00,7.65,17850,United Kingdom,15.3,2790
6,6,536365,21730,GLASS STAR FROSTED T-LIGHT HOLDER,6,2010-12-01 08:26:00,4.25,17850,United Kingdom,25.5,1342
7,7,536366,22633,HAND WARMER UNION JACK,6,2010-12-01 08:28:00,1.85,17850,United Kingdom,11.1,1454
8,8,536366,22632,HAND WARMER RED RETROSPOT,6,2010-12-01 08:28:00,1.85,17850,United Kingdom,11.1,1452
9,9,536367,84879,ASSORTED COLOUR BIRD ORNAMENT,32,2010-12-01 08:34:00,1.69,13047,United Kingdom,54.08,204


In [5]:
users_interactions_count_df = cs_df.groupby(['customerid', 'description_enc']).size().groupby('customerid').size()
print('# users: %d' % len(users_interactions_count_df))
users_with_enough_interactions_df = users_interactions_count_df[users_interactions_count_df >= 5].reset_index()[['customerid']]
print('# users with at least 5 times shopping: %d' % len(users_with_enough_interactions_df))

# users: 4338
# users with at least 5 times shopping: 4080


In [6]:
print('# of interactions: %d' % len(cs_df))
interactions_from_selected_users_df = cs_df.merge(users_with_enough_interactions_df, 
               how = 'right',
               left_on = 'customerid',
               right_on = 'customerid')
print('# of interactions from users with at least 5 interactions: %d' % len(interactions_from_selected_users_df))

# of interactions: 397884
# of interactions from users with at least 5 interactions: 397142


In [7]:
cs_df.dtypes

Unnamed: 0                  int64
invoiceno                   int64
stockcode                  object
description                object
quantity                    int64
invoicedate        datetime64[ns]
unitprice                 float64
customerid                  int64
country                    object
amount                    float64
description_enc             int16
dtype: object

In [8]:
def smooth_user_preference(x):
    return math.log(1+x, 2)


interactions_full_df = interactions_from_selected_users_df \
                    .groupby(['customerid', 'description_enc'])['quantity'].sum()\
                    .apply(smooth_user_preference).reset_index() 
print('# of unique user/item interactions: %d' % len(interactions_full_df))
interactions_full_df

# of unique user/item interactions: 266095


Unnamed: 0,customerid,description_enc,quantity
0,12347,65,6.988685
1,12347,67,3.700440
2,12347,68,3.700440
3,12347,104,6.189825
4,12347,106,6.189825
...,...,...,...
266090,18287,3143,3.700440
266091,18287,3155,4.954196
266092,18287,3156,3.700440
266093,18287,3194,5.614710


In [9]:
interactions_train_df, interactions_test_df = train_test_split(interactions_full_df,
                                   stratify=interactions_full_df['customerid'], 
                                   test_size=0.20,
                                   random_state=42)

print('# interactions on Train set: %d' % len(interactions_train_df))
print('# interactions on Test set: %d' % len(interactions_test_df))

# interactions on Train set: 212876
# interactions on Test set: 53219


In [10]:
#Indexing by personId to speed up the searches during evaluation
interactions_full_indexed_df = interactions_full_df.set_index('customerid')
interactions_train_indexed_df = interactions_train_df.set_index('customerid')
interactions_test_indexed_df = interactions_test_df.set_index('customerid')

In [11]:
def get_items_interacted(customerid, interactions_df):
    # Get the user's data and merge in the movie information.
    interacted_items = cs_df.loc[customerid]['description_enc']
    return set(interacted_items if type(interacted_items) == pd.Series else [interacted_items])

In [12]:
#Top-N accuracy metrics consts
EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS = 100

class ModelEvaluator:


    def get_not_interacted_items_sample(self, customerid, sample_size, seed=42):
        interacted_items = get_items_interacted(customerid, interactions_full_indexed_df)
        all_items = set(cs_df['customerid'])
        non_interacted_items = all_items - interacted_items

        random.seed(seed)
        non_interacted_items_sample = random.sample(non_interacted_items, sample_size)
        return set(non_interacted_items_sample)

    def _verify_hit_top_n(self, item_id, recommended_items, topn):        
            try:
                index = next(i for i, c in enumerate(recommended_items) if c == item_id)
            except:
                index = -1
            hit = int(index in range(0, topn))
            return hit, index

    def evaluate_model_for_user(self, model, customerid):
        #Getting the items in test set
        interacted_values_testset = interactions_test_indexed_df.loc[customerid]
        if type(interacted_values_testset['description_enc']) == pd.Series:
            person_interacted_items_testset = set(interacted_values_testset['description_enc'])
        else:
            person_interacted_items_testset = set([int(interacted_values_testset['description_enc'])])  
        interacted_items_count_testset = len(person_interacted_items_testset) 

        #Getting a ranked recommendation list from a model for a given user
        person_recs_df = model.recommend_items(customerid, 
                                               items_to_ignore=get_items_interacted(customerid, 
                                                                                    interactions_train_indexed_df), 
                                               topn=10000000000)

        hits_at_5_count = 0
        hits_at_10_count = 0
        #For each item the user has interacted in test set
        for item_id in person_interacted_items_testset:
            #Getting a random sample (100) items the user has not interacted 
            #(to represent items that are assumed to be no relevant to the user)
            non_interacted_items_sample = self.get_not_interacted_items_sample(customerid, 
                                                                          sample_size=EVAL_RANDOM_SAMPLE_NON_INTERACTED_ITEMS, 
                                                                          seed=item_id%(2**32))

            #Combining the current interacted item with the 100 random items
            items_to_filter_recs = non_interacted_items_sample.union(set([item_id]))

            #Filtering only recommendations that are either the interacted item or from a random sample of 100 non-interacted items
            valid_recs_df = person_recs_df[person_recs_df['customerid'].isin(items_to_filter_recs)]                    
            valid_recs = valid_recs_df['customerid'].values
            #Verifying if the current interacted item is among the Top-N recommended items
            hit_at_5, index_at_5 = self._verify_hit_top_n(item_id, valid_recs, 5)
            hits_at_5_count += hit_at_5
            hit_at_10, index_at_10 = self._verify_hit_top_n(item_id, valid_recs, 10)
            hits_at_10_count += hit_at_10

        #Recall is the rate of the interacted items that are ranked among the Top-N recommended items, 
        #when mixed with a set of non-relevant items
        recall_at_5 = hits_at_5_count / float(interacted_items_count_testset)
        recall_at_10 = hits_at_10_count / float(interacted_items_count_testset)

        person_metrics = {'hits@5_count':hits_at_5_count, 
                          'hits@10_count':hits_at_10_count, 
                          'interacted_count': interacted_items_count_testset,
                          'recall@5': recall_at_5,
                          'recall@10': recall_at_10}
        return person_metrics

    def evaluate_model(self, model):
        #print('Running evaluation for users')
        people_metrics = []
        for idx, customerid in enumerate(list(interactions_test_indexed_df.index.unique().values)):
            #if idx % 100 == 0 and idx > 0:
            #    print('%d users processed' % idx)
            person_metrics = self.evaluate_model_for_user(model, customerid)  
            person_metrics['customerid'] = customerid
            people_metrics.append(person_metrics)
        print('%d users processed' % idx)

        detailed_results_df = pd.DataFrame(people_metrics) \
                            .sort_values('interacted_count', ascending=False)
        
        global_recall_at_5 = detailed_results_df['hits@5_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        global_recall_at_10 = detailed_results_df['hits@10_count'].sum() / float(detailed_results_df['interacted_count'].sum())
        
        global_metrics = {'modelName': model.get_model_name(),
                          'recall@5': global_recall_at_5,
                          'recall@10': global_recall_at_10}    
        return global_metrics, detailed_results_df
    
model_evaluator = ModelEvaluator() 

In [13]:
#Computes the most popular items
item_popularity_df = interactions_full_df.groupby('customerid')['quantity'].sum().sort_values(ascending=False).reset_index()
item_popularity_df.head(10)

Unnamed: 0,customerid,quantity
0,14911,8008.859015
1,12748,4906.906435
2,14646,4778.910535
3,14298,4641.350686
4,17841,3930.101964
5,14096,3276.224832
6,14156,3273.206363
7,12415,3014.23454
8,13089,2953.639208
9,17511,2833.413686


In [14]:
class PopularityRecommender:
    
    MODEL_NAME = 'Popularity'
    
    def __init__(self, popularity_df, items_df=None):
        self.popularity_df = popularity_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=False):
        # Recommend the more popular items that the user hasn't seen yet.
        recommendations_df = self.popularity_df[~self.popularity_df['customerid'].isin(items_to_ignore)] \
                               .sort_values('quantity', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'description_enc', 
                                                          right_on = 'description_enc')[[]]


        return recommendations_df
    
popularity_model = PopularityRecommender(item_popularity_df, cs_df)

In [None]:
print('Evaluating Popularity recommendation model...')
pop_global_metrics, pop_detailed_results_df = model_evaluator.evaluate_model(popularity_model)
print('\nGlobal metrics:\n%s' % pop_global_metrics)
pop_detailed_results_df.head(10)

Evaluating Popularity recommendation model...


since Python 3.9 and will be removed in a subsequent version.
  non_interacted_items_sample = random.sample(non_interacted_items, sample_size)


In [15]:
users_items_pivot_matrix_df = interactions_train_df.pivot(index='customerid', 
                                                          columns='description_enc', 
                                                          values='quantity').fillna(0)

users_items_pivot_matrix_df.head(10)

description_enc,0,1,2,3,4,5,6,7,8,9,...,3637,3638,3639,3640,3641,3642,3643,3644,3645,3646
customerid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
12347,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12350,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12352,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12354,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12355,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12356,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12357,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12358,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
users_items_pivot_matrix = users_items_pivot_matrix_df.values
users_items_pivot_matrix[:10]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [18]:
users_ids = list(users_items_pivot_matrix_df.index)
users_ids[:10]

[12347, 12348, 12349, 12350, 12352, 12354, 12355, 12356, 12357, 12358]

In [19]:
users_items_pivot_sparse_matrix = csr_matrix(users_items_pivot_matrix)
users_items_pivot_sparse_matrix

<4080x3596 sparse matrix of type '<class 'numpy.float64'>'
	with 212876 stored elements in Compressed Sparse Row format>

In [20]:
#The number of factors to factor the user-item matrix.
NUMBER_OF_FACTORS_MF = 15
#Performs matrix factorization of the original user item matrix
#U, sigma, Vt = svds(users_items_pivot_matrix, k = NUMBER_OF_FACTORS_MF)
U, sigma, Vt = svds(users_items_pivot_sparse_matrix, k = NUMBER_OF_FACTORS_MF)

In [21]:
U.shape

(4080, 15)

In [22]:
Vt.shape

(15, 3596)

In [23]:
sigma = np.diag(sigma)
sigma.shape

(15, 15)

In [24]:
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt) 
all_user_predicted_ratings

array([[ 0.04413557,  0.24147553,  0.25537166, ...,  0.34179674,
         0.00242437,  0.18870729],
       [ 0.00955596,  0.03172375, -0.08681518, ..., -0.08894269,
        -0.01373387, -0.01059325],
       [ 0.03520091, -0.04550689,  0.21206142, ...,  0.08283794,
         0.00576793,  0.01335491],
       ...,
       [ 0.00539216,  0.01807092,  0.02498249, ...,  0.01540804,
        -0.00089621,  0.01738911],
       [ 0.00562794,  0.29046831,  0.31619701, ..., -0.0015833 ,
         0.00982582, -0.02139477],
       [-0.0038784 ,  0.20961388,  0.05851971, ...,  0.02030997,
        -0.0017169 ,  0.05438633]])

In [25]:
all_user_predicted_ratings_norm = (all_user_predicted_ratings - all_user_predicted_ratings.min()) / (all_user_predicted_ratings.max() - all_user_predicted_ratings.min())

In [26]:
#Converting the reconstructed matrix back to a Pandas dataframe
cf_preds_df = pd.DataFrame(all_user_predicted_ratings_norm, columns = users_items_pivot_matrix_df.columns, index=users_ids).transpose()
cf_preds_df.head(10)

Unnamed: 0_level_0,12347,12348,12349,12350,12352,12354,12355,12356,12357,12358,...,18272,18274,18276,18277,18278,18280,18281,18282,18283,18287
description_enc,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.275739,0.274367,0.275384,0.27449,0.27558,0.274305,0.274431,0.276283,0.275353,0.274714,...,0.2751,0.274467,0.274071,0.273661,0.274174,0.274123,0.273782,0.274202,0.274212,0.273835
1,0.283564,0.275246,0.272184,0.272376,0.272114,0.268494,0.274788,0.270052,0.282828,0.272636,...,0.274875,0.273,0.277184,0.275126,0.275075,0.275066,0.274796,0.274705,0.285506,0.2823
2,0.284115,0.270546,0.282397,0.27405,0.278015,0.284321,0.277011,0.271089,0.284867,0.273725,...,0.289397,0.274298,0.274041,0.274851,0.273294,0.273766,0.277098,0.274979,0.286527,0.276309
3,0.278866,0.275672,0.275988,0.273297,0.275434,0.278785,0.276284,0.275647,0.27977,0.274441,...,0.27523,0.273848,0.274627,0.274782,0.273949,0.273916,0.274971,0.273885,0.277263,0.277435
4,0.274508,0.274112,0.274045,0.273915,0.274132,0.273628,0.274169,0.273689,0.274805,0.274018,...,0.273792,0.273951,0.274187,0.2741,0.274211,0.274154,0.273965,0.274004,0.274114,0.27453
5,0.278221,0.276575,0.273322,0.274323,0.274112,0.273165,0.275036,0.273441,0.273941,0.274149,...,0.272862,0.273958,0.274363,0.274207,0.273876,0.273977,0.273869,0.274541,0.275694,0.27627
6,0.283072,0.273007,0.274376,0.274097,0.273642,0.270612,0.274771,0.268946,0.283457,0.273193,...,0.276377,0.273282,0.27636,0.273842,0.274218,0.27472,0.275525,0.274694,0.282194,0.27923
7,0.281818,0.288246,0.280588,0.273588,0.276937,0.281342,0.280997,0.281986,0.309502,0.277211,...,0.272809,0.274734,0.27597,0.272084,0.277192,0.275881,0.275926,0.275392,0.283241,0.28807
8,0.286708,0.272032,0.278129,0.274826,0.277374,0.279224,0.276167,0.27579,0.279682,0.274461,...,0.279279,0.274277,0.275191,0.274016,0.273304,0.274241,0.274989,0.274554,0.275317,0.275469
9,0.267961,0.272668,0.276432,0.274118,0.275024,0.274855,0.273583,0.270747,0.279635,0.273877,...,0.278815,0.274331,0.273185,0.27373,0.274486,0.274068,0.275046,0.274086,0.277198,0.273019


In [27]:
len(cf_preds_df.columns)

4080

In [34]:
class CFRecommender:
    
    MODEL_NAME = 'Collaborative Filtering'
    
    def __init__(self, cf_predictions_df, items_df=None):
        self.cf_predictions_df = cf_predictions_df
        self.items_df = items_df
        
    def get_model_name(self):
        return self.MODEL_NAME
        
    def recommend_items(self, user_id, items_to_ignore=[], topn=10, verbose=True):
        # Get and sort the user's predictions
        sorted_user_predictions = self.cf_predictions_df[user_id].sort_values(ascending=False) \
                                    .reset_index().rename(columns={user_id: 'quantity'})

        # Recommend the highest predicted rating movies that the user hasn't seen yet.
        recommendations_df = sorted_user_predictions[~sorted_user_predictions['customerid'].isin(items_to_ignore)] \
                               .sort_values('quantity', ascending = False) \
                               .head(topn)

        if verbose:
            if self.items_df is None:
                raise Exception('"items_df" is required in verbose mode')

            recommendations_df = recommendations_df.merge(self.items_df, how = 'left', 
                                                          left_on = 'customerid', 
                                                          right_on = 'customerid')[[]]


        return recommendations_df
    
cf_recommender_model = CFRecommender(cf_preds_df, cs_df)

In [35]:
print('Evaluating Collaborative Filtering (SVD Matrix Factorization) model...')
cf_global_metrics, cf_detailed_results_df = model_evaluator.evaluate_model(cf_recommender_model)
print('\nGlobal metrics:\n%s' % cf_global_metrics)
cf_detailed_results_df.head(10)

Evaluating Collaborative Filtering (SVD Matrix Factorization) model...


KeyError: 'customerid'