In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

from surprise import Reader, Dataset, KNNWithMeans, SVD, SVDpp, NMF, SlopeOne, KNNBasic
from surprise.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split

import implicit
import scipy.sparse as sparse
from sklearn.preprocessing import MinMaxScaler

# Collaborative Filtering

There are three main ways to approach recommender systems:
- Collaborative Filtering: This method makes automatic predictions (filtering) about the interests of a user by collecting preferences or taste information from many users (collaborating).
- Content-Based Filtering: This method uses only information about the description and attributes of the items users have previously consumed to model user's preferences.
- Hybrid methods: Recent research has demonstrated that a hybrid approach, combining collaborative filtering and content-based filtering could be more effective than pure approaches in some cases.

For extra info please refer to: https://www.kaggle.com/gspmoreira/recommender-systems-in-python-101?scriptVersionId=1477182

This notebook serves only as a POC of a simple collaborative filtering recommendation approach. There are various ways to improve upon it and should not be taken as an optimal solution.

In this case, an example dataset possessing various products will be used, namely an online retail dataset.

In [2]:
base_df = pd.read_excel("../lib/data/online_retail.xlsx")
base_df = base_df.dropna()
base_df['CustomerID'] = base_df['CustomerID'].astype(int)
base_df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom
...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680,France
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680,France
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680,France
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680,France


Firstly lets convert all prices to cents since doing it will make further analysis and other operations easier.

Additionaly, let's discard items with negative pricing.

In [3]:
base_df['Full_Price_C'] = base_df['Quantity'] * base_df['UnitPrice'] * 100
base_df['Full_Price_C'] = base_df['Full_Price_C'].astype(int)
base_df = base_df[base_df['Full_Price_C'] > 0]
base_df

Unnamed: 0,InvoiceNo,StockCode,Description,Quantity,InvoiceDate,UnitPrice,CustomerID,Country,Full_Price_C
0,536365,85123A,WHITE HANGING HEART T-LIGHT HOLDER,6,2010-12-01 08:26:00,2.55,17850,United Kingdom,1530
1,536365,71053,WHITE METAL LANTERN,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,2034
2,536365,84406B,CREAM CUPID HEARTS COAT HANGER,8,2010-12-01 08:26:00,2.75,17850,United Kingdom,2200
3,536365,84029G,KNITTED UNION FLAG HOT WATER BOTTLE,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,2034
4,536365,84029E,RED WOOLLY HOTTIE WHITE HEART.,6,2010-12-01 08:26:00,3.39,17850,United Kingdom,2034
...,...,...,...,...,...,...,...,...,...
541904,581587,22613,PACK OF 20 SPACEBOY NAPKINS,12,2011-12-09 12:50:00,0.85,12680,France,1019
541905,581587,22899,CHILDREN'S APRON DOLLY GIRL,6,2011-12-09 12:50:00,2.10,12680,France,1260
541906,581587,23254,CHILDRENS CUTLERY DOLLY GIRL,4,2011-12-09 12:50:00,4.15,12680,France,1660
541907,581587,23255,CHILDRENS CUTLERY CIRCUS PARADE,4,2011-12-09 12:50:00,4.15,12680,France,1660


Now let's compute the absolute amount of money each customer paid for each product and store the data in a dataframe.

In [4]:
ui_stats = base_df.groupby(['StockCode', 'Description', 'CustomerID']).agg({'Full_Price_C':np.sum}).reset_index()
ui_stats

Unnamed: 0,StockCode,Description,CustomerID,Full_Price_C
0,10002,INFLATABLE POLITICAL GLOBE,12451,1019
1,10002,INFLATABLE POLITICAL GLOBE,12510,2039
2,10002,INFLATABLE POLITICAL GLOBE,12583,4079
3,10002,INFLATABLE POLITICAL GLOBE,12637,1019
4,10002,INFLATABLE POLITICAL GLOBE,12673,85
...,...,...,...,...
268390,POST,POSTAGE,17404,9500
268391,POST,POSTAGE,17444,55094
268392,POST,POSTAGE,17788,495
268393,POST,POSTAGE,17828,65500


Let us analyze the price distribution of the aforementioned dataset.

It is possible to see some very clear outliers, however, we are more interested in the main point distribution.

In [5]:
fig = px.box(ui_stats, y="Full_Price_C", points="outliers")
fig.update_yaxes(range=[0, 5000])
fig.show()

Let's now create categories based on the amount spent. 

Note that while its is possible to encode the amount spent to a single continous interval between the min and max value for theses categories through some data manipulation, here, we are just doing it manually.

From the plot above some bins can be deduced:
- 1 = ]0, 1500]
- 2 = ]1500, 2000]
- 3 = ]2000, 5000]

Now in order to establish the last missing bin we choose a value that can be said to be a considerable amount of money to spend in a given item (250), meaning:
- 4 = ]5000, 25000]
- 5 = ]25000, + ∞[

In [6]:
def investment_level(x):
    if x <= 15*100:
        return 1
    if x <= 20*100:
        return 2
    if x <= 50*100:
        return 3
    if x <= 250*100:
        return 4
    return 5

ui_stats = ui_stats.assign(Investment=ui_stats['Full_Price_C'])
ui_stats['Investment'] = ui_stats['Investment'].apply(lambda x: investment_level(x))
ui_stats

Unnamed: 0,StockCode,Description,CustomerID,Full_Price_C,Investment
0,10002,INFLATABLE POLITICAL GLOBE,12451,1019,1
1,10002,INFLATABLE POLITICAL GLOBE,12510,2039,3
2,10002,INFLATABLE POLITICAL GLOBE,12583,4079,3
3,10002,INFLATABLE POLITICAL GLOBE,12637,1019,1
4,10002,INFLATABLE POLITICAL GLOBE,12673,85,1
...,...,...,...,...,...
268390,POST,POSTAGE,17404,9500,4
268391,POST,POSTAGE,17444,55094,5
268392,POST,POSTAGE,17788,495,1
268393,POST,POSTAGE,17828,65500,5


## Test with Surprise

Now that we have a workable dataset, we will try to create a predictor based on the Surprise framework (https://github.com/NicolasHug/Surprise).

To do that SVD based approach will be applied. In linear algebra, the singular value decomposition (SVD) is a factorization of a real or complex matrix that generalizes the eigendecomposition of a square normal matrix to any m * n matrix via an extension of the polar decomposition.

- Original discussion: https://sifter.org/~simon/journal/20061211.html
- Softer introduction: https://medium.com/@jonathan_hui/machine-learning-singular-value-decomposition-svd-principal-component-analysis-pca-1d45e885e491

In [7]:
train, test = train_test_split(ui_stats, test_size=0.2)
data = Dataset.load_from_df(ui_stats[['CustomerID', 'StockCode', 'Investment']], Reader(rating_scale=(1, 5)))
trainset = data.build_full_trainset()

algo = SVD(n_factors=10, n_epochs=10, lr_all=0.005, reg_all=0.4)
algo.fit(trainset)
test

Unnamed: 0,StockCode,Description,CustomerID,Full_Price_C,Investment
252804,82494L,WOODEN FRAME ANTIQUE WHITE,15440,885,1
144472,22792,FLUTED ANTIQUE CANDLE HOLDER,17191,1019,1
247144,18097C,WHITE TALL PORCELAIN T-LIGHT HOLDER,15483,508,1
14212,21033,JUMBO BAG CHARLIE AND LOLA TOYS,13162,2080,3
51822,21876,POTTERING MUG,13735,494,1
...,...,...,...,...,...
43674,21705,BAG 500g SWIRLY MARBLES,13069,936,1
2255,20668,DISCO BALL CHRISTMAS DECORATION,17340,1728,2
147098,22819,"BIRTHDAY CARD, RETRO SPOT",18109,1008,1
204158,23320,GIANT 50'S CHRISTMAS CRACKER,14112,3468,3


Once the model is trained a a pair of customer and item ids is passed to the prediction function in order to get a prediction of how much (in categorical terms) a user would be likely to spend on a given item. This value can then be used to decide wether to show the item as a possible recommendation or not.

In [8]:
pred = algo.predict(uid=16818, iid='85039B')
pred

Prediction(uid=16818, iid='85039B', r_ui=None, est=1.7327932798064485, details={'was_impossible': False})

## Test with Implicit

Now lets test implicit.

Here we will apply an Alternating Least Squares approach. Similarly to SVD is ALS is also a matrix factorization algorithm and was firstly suggested at Netflix kaggle competition.

Its details can be checked at: https://www.asc.ohio-state.edu/statistics/dmsl/GrandPrize2009_BPC_BellKor.pdf

In [9]:
grouped_df = ui_stats.copy()

grouped_df['Description'] = grouped_df['Description'].astype("category")

grouped_df['CustomerID'] = grouped_df['CustomerID'].astype("category")
grouped_df['CustomerID'] = grouped_df['CustomerID'].cat.codes

grouped_df['StockCode'] = grouped_df['StockCode'].astype("category")
grouped_df['StockCode'] = grouped_df['StockCode'].cat.codes

grouped_df

Unnamed: 0,StockCode,Description,CustomerID,Full_Price_C,Investment
0,0,INFLATABLE POLITICAL GLOBE,86,1019,1
1,0,INFLATABLE POLITICAL GLOBE,130,2039,3
2,0,INFLATABLE POLITICAL GLOBE,189,4079,3
3,0,INFLATABLE POLITICAL GLOBE,235,1019,1
4,0,INFLATABLE POLITICAL GLOBE,266,85,1
...,...,...,...,...,...
268390,3663,POSTAGE,3693,9500,4
268391,3663,POSTAGE,3724,55094,5
268392,3663,POSTAGE,3978,495,1
268393,3663,POSTAGE,4000,65500,5


Here we define two sparce matrices to be used in the algorithm, as well as the alpha to be used (a confidence scaling value).

In [10]:
sparse_content_person = sparse.csr_matrix((grouped_df['Investment'].astype(float), (grouped_df['StockCode'], grouped_df['CustomerID'])))
sparse_person_content = sparse.csr_matrix((grouped_df['Investment'].astype(float), (grouped_df['CustomerID'], grouped_df['StockCode'])))

model = implicit.als.AlternatingLeastSquares(factors=20, regularization=0.1, iterations=50)

alpha = 15
data = (sparse_content_person * alpha).astype('double')
model.fit(data)

100%|██████████| 50/50 [00:13<00:00,  3.79it/s]


### Find Similar Items
Let's try to find 10 items similar to a "WOODLAND STORAGE BOX LARGE"

In [11]:
# WOODLAND STORAGE BOX LARGE
item_id = 493
n_similar = 10

similar = model.similar_items(item_id, n_similar)

# Print the names of our most similar artists
for idx, score in similar:
    print(grouped_df.Description.loc[grouped_df.StockCode == idx].iloc[0])


WOODLAND STORAGE BOX LARGE 
WOODLAND STORAGE BOX SMALL
S/6 SEW ON CROCHET FLOWERS
MILK PAN BLUE POLKADOT
RED RETROSPOT TAPE
5 HOOK HANGER RED MAGIC TOADSTOOL
WOODLAND LARGE BLUE FELT HEART
WOODLAND DESIGN  COTTON TOTE BAG
PACK 3 FIRE ENGINE/CAR PATCHES
BLUE POLKADOT KIDS BAG


Now let's try to find recommended items for a given user.

As can be seen various kinds of bags are recommended for this user, meaning this user probably has a high ammount of money spent on other bag types.

In [12]:
# Create recommendations for user with id 2025
user_id = 140

# Use the implicit recommender.
recommended = model.recommend(user_id, sparse_person_content)

items = []
scores = []

# Get artist names from ids
for item in recommended:
    idx, score = item
    items.append(grouped_df.Description.loc[grouped_df.StockCode == idx].iloc[0])
    scores.append(score)

# Create a dataframe of artist names and scores
recommendations = pd.DataFrame({'item': items, 'score': scores})

recommendations

Unnamed: 0,item,score
0,LUNCH BAG APPLE DESIGN,0.912387
1,LUNCH BAG CARS BLUE,0.884595
2,LUNCH BAG WOODLAND,0.878396
3,LUNCH BAG DOILEY PATTERN,0.878369
4,JUMBO BAG RED RETROSPOT,0.873042
5,JUMBO BAG DOILEY PATTERNS,0.86503
6,LUNCH BAG RED RETROSPOT,0.860273
7,JUMBO BAG PEARS,0.851781
8,LUNCH BAG SUKI DESIGN,0.848288
9,LUNCH BAG PINK POLKADOT,0.830233


By ordering the items the user as bought we can verify that indeed he spends most of his money in various kinds of bags.

In [13]:
grouped_df.loc[grouped_df['CustomerID'] == 140].sort_values(by=['Investment'], ascending=False)[['Description', 'Investment', 'CustomerID']]

Unnamed: 0,Description,Investment,CustomerID
268172,POSTAGE,4,140
57718,JUMBO STORAGE BAG SUKI,3,140
3954,JUMBO BAG WOODLAND ANIMALS,3,140
185510,JUMBO BAG APPLES,3,140
186108,JUMBO BAG ALPHABET,3,140
186505,JUMBO BAG VINTAGE LEAF,3,140
96401,JUMBO SHOPPER VINTAGE RED PAISLEY,3,140
90367,DOG BOWL CHASING BALL DESIGN,3,140
90453,ILLUSTRATED CAT BOWL,2,140
196379,SET OF 2 CERAMIC CHRISTMAS REINDEER,2,140
