## Collaborative filtering with SVD, NMF and Co-Clustering

In [1]:
import random

import numpy as np
import pandas as pd
from surprise import NMF, SVD, CoClustering, Dataset, Reader, SVDpp, accuracy
from surprise.model_selection.split import train_test_split
from surprise.model_selection.validation import cross_validate

## Preprocess

using https://github.com/Apress/applied-recommender-systems-python/tree/main/Data

In [2]:
df = pd.read_excel("../data/Rec_sys_data.xlsx")

In [3]:
df

Unnamed: 0,InvoiceNo,StockCode,Quantity,InvoiceDate,DeliveryDate,Discount%,ShipMode,ShippingCost,CustomerID
0,536365,84029E,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.20,ExpressAir,30.12,17850
1,536365,71053,6,2010-12-01 08:26:00,2010-12-02 08:26:00,0.21,ExpressAir,30.12,17850
2,536365,21730,6,2010-12-01 08:26:00,2010-12-03 08:26:00,0.56,Regular Air,15.22,17850
3,536365,84406B,8,2010-12-01 08:26:00,2010-12-03 08:26:00,0.30,Regular Air,15.22,17850
4,536365,22752,2,2010-12-01 08:26:00,2010-12-04 08:26:00,0.57,Delivery Truck,5.81,17850
...,...,...,...,...,...,...,...,...,...
272399,569629,23399,12,2011-10-05 11:37:00,2011-10-08 11:37:00,0.11,Delivery Truck,5.81,15249
272400,569629,22727,4,2011-10-05 11:37:00,2011-10-08 11:37:00,0.26,Delivery Truck,5.81,15249
272401,569629,23434,12,2011-10-05 11:37:00,2011-10-08 11:37:00,0.42,Delivery Truck,5.81,15249
272402,569629,23340,12,2011-10-05 11:37:00,2011-10-07 11:37:00,0.08,Regular Air,15.22,15249


In [4]:
item_purchase_matrix = pd.pivot_table(
    df, values="Quantity", index="StockCode", columns="CustomerID", fill_value=0.0
).astype(float)

In [5]:
item_purchase_matrix

CustomerID,12346,12347,12348,12350,12352,12353,12354,12355,12356,12358,...,18269,18270,18272,18273,18278,18280,18281,18282,18283,18287
StockCode,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10002,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10080,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10120,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10125,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10133,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
C2,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
DOT,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
M,0.0,0.0,0.00,0.0,1.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
PADS,0.0,0.0,0.00,0.0,0.00,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
flat_item_purchase = (
    item_purchase_matrix.stack()
    .to_frame()
    .reset_index()
    .rename(columns={0: "Quantity"})
)

In [7]:
flat_item_purchase

Unnamed: 0,StockCode,CustomerID,Quantity
0,10002,12346,0.0
1,10002,12347,0.0
2,10002,12348,0.0
3,10002,12350,0.0
4,10002,12352,0.0
...,...,...,...
12903081,POST,18280,0.0
12903082,POST,18281,0.0
12903083,POST,18282,0.0
12903084,POST,18283,0.0


In [8]:
customers_filtered = (
    df.groupby("CustomerID").InvoiceNo.count().reset_index().query("InvoiceNo > 120")
)

In [9]:
items_filtered = (
    df.groupby("StockCode").Quantity.count().reset_index().query("Quantity > 120")
)

In [10]:
df1 = (
    flat_item_purchase.merge(
        customers_filtered,
        on="CustomerID",
        how="inner",
    )
    .drop(["Quantity"], axis=1)
    .merge(
        items_filtered,
        on="StockCode",
        how="inner",
    )
    .drop(["Quantity"], axis=1)
    .rename({"InvoiceNo": "Quantity"}, axis=1)
)

df1

Unnamed: 0,StockCode,CustomerID,Quantity
0,10133,12347,124
1,10133,12359,143
2,10133,12362,145
3,10133,12370,139
4,10133,12378,219
...,...,...,...
385667,POST,18226,208
385668,POST,18229,131
385669,POST,18231,124
385670,POST,18260,134


In [11]:
df1.describe()

Unnamed: 0,CustomerID,Quantity
count,385672.0,385672.0
mean,15360.985915,279.089789
std,1719.468125,337.879413
min,12347.0,121.0
25%,13996.25,151.0
50%,15413.0,198.0
75%,16840.0,290.0
max,18283.0,5095.0


In [13]:
reader = Reader(rating_scale=(0, df1.Quantity.max()))

In [14]:
dataset = Dataset.load_from_df(df1, reader)

In [15]:
train_set, test_set = train_test_split(dataset)

## NMF

In [16]:
nmf = NMF()
nmf.fit(train_set)
preds1 = nmf.test(test_set)

In [17]:
accuracy.mae(preds1)

MAE:  273.1830


273.18295244203006

In [18]:
accuracy.rmse(preds1)

RMSE: 431.0366


431.03661940567815

In [19]:
cross_validate(nmf, dataset, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    427.2182437.3399427.3972423.0850425.9486428.19784.8245  
MAE (testset)     272.7719275.3858272.3181271.1275271.8861272.69791.4491  
Fit time          3.80    3.81    4.26    3.87    3.75    3.90    0.19    
Test time         0.45    0.42    0.28    0.41    0.41    0.39    0.06    


{'test_rmse': array([427.21821924, 437.33991992, 427.3972159 , 423.08500869,
        425.94864574]),
 'test_mae': array([272.77188221, 275.38578836, 272.31811785, 271.12748684,
        271.88614567]),
 'fit_time': (3.8030028343200684,
  3.807250499725342,
  4.263010025024414,
  3.869279623031616,
  3.748997926712036),
 'test_time': (0.4509904384613037,
  0.42299866676330566,
  0.2799985408782959,
  0.40799927711486816,
  0.40999841690063477)}

## Co-Clustering

In [20]:
clustering = CoClustering()
clustering.fit(train_set)
preds2 = clustering.test(test_set)

In [21]:
accuracy.rmse(preds2)

RMSE: 7.0874


7.0873789504537905

In [22]:
accuracy.mae(preds2)

MAE:  5.6492


5.649175172460262

In [23]:
cross_validate(clustering, dataset, verbose=True)

Evaluating RMSE, MAE of algorithm CoClustering on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    7.2126  6.8308  6.9820  7.0325  7.2584  7.0633  0.1562  
MAE (testset)     5.7398  5.5259  5.7139  5.6237  5.8759  5.6959  0.1173  
Fit time          5.25    5.30    5.23    5.29    5.62    5.34    0.14    
Test time         0.35    0.51    0.52    0.34    0.52    0.45    0.08    


{'test_rmse': array([7.21256679, 6.83076977, 6.98204576, 7.03253093, 7.25840344]),
 'test_mae': array([5.73982713, 5.5259464 , 5.71387195, 5.62372022, 5.87591166]),
 'fit_time': (5.246999979019165,
  5.300997972488403,
  5.2251152992248535,
  5.289999961853027,
  5.615610122680664),
 'test_time': (0.34599995613098145,
  0.5090076923370361,
  0.5150022506713867,
  0.3439912796020508,
  0.5190067291259766)}

## SVD

In [24]:
svd = SVD()
svd.fit(train_set)
preds3 = svd.test(test_set)

In [25]:
accuracy.mae(preds3)

MAE:  4815.3470


4815.347040902314

In [26]:
accuracy.rmse(preds3)

RMSE: 4827.4272


4827.427237340904

In [27]:
cross_validate(svd, dataset)

{'test_rmse': array([4828.09822355, 4826.49133378, 4826.93774989, 4828.22080911,
        4828.9928206 ]),
 'test_mae': array([4816.41547935, 4813.97547158, 4814.67504602, 4816.95398916,
        4817.53108875]),
 'fit_time': (2.645998954772949,
  2.654064893722534,
  2.6572108268737793,
  2.6640796661376953,
  2.643000602722168),
 'test_time': (0.558243989944458,
  0.5399990081787109,
  0.32199811935424805,
  0.5149219036102295,
  0.33400702476501465)}