Notebook inspired by recometrics example NB:
https://nbviewer.org/github/david-cortes/recometrics/blob/master/examples/recometrics_example.ipynb
Data 360k lastfm, loading directly via tsv

In [2]:
import numpy as np
import pandas as pd
import recometrics
import implicit
from scipy.sparse import coo_matrix
from implicit.evaluation import train_test_split, mean_average_precision_at_k,  precision_at_k, AUC_at_k, ndcg_at_k, ranking_metrics_at_k
from sklearn.model_selection import train_test_split

In [3]:
%cd

C:\Users\chris


In [80]:
# data loading
lfm = pd.read_table('usersha1-artmbid-artname-plays.tsv',
                    sep='\t', header=None,
                    names=['UserId','ItemId', 'Artist','Count'])
lfm.head(3)

Unnamed: 0,UserId,ItemId,Artist,Count
0,00000c289a1829a808ac09c00daf10bc3c4e223b,3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897


In [81]:
# preprocessing
lfm = lfm.drop('Artist', axis=1)
lfm = lfm.loc[(lfm.Count > 0) & (lfm.UserId.notnull()) & (lfm.ItemId.notnull())]
lfm['UserId'] = pd.Categorical(lfm.UserId).codes
lfm['ItemId'] = pd.Categorical(lfm.ItemId).codes
lfm.head(3)

Unnamed: 0,UserId,ItemId,Count
0,0,37425,2137
1,0,152039,1099
2,0,112365,897


In [6]:
#lfm = lfm.sort_values(['Count'])
counts = lfm['Count']

In [7]:
capped_counts = np.clip(counts, a_min=1, a_max=np.percentile(counts, 95))

In [8]:
lfm['CountsCap'] = capped_counts

In [196]:
type(lfm.Count[0])

numpy.int64

In [82]:
X = coo_matrix((lfm.Count, (lfm.UserId, lfm.ItemId)))
X

<358858x160112 sparse matrix of type '<class 'numpy.int64'>'
	with 17309518 stored elements in COOrdinate format>

In [14]:
# data splitting considering all users for test
X_train, X_test  = \
    recometrics.split_reco_train_test(
        X, split_type="all",
        users_test_fraction = None,
        max_test_users = 10000,
        items_test_fraction = 0.3
    )
X_test

<358858x160112 sparse matrix of type '<class 'numpy.float64'>'
	with 5206192 stored elements in Compressed Sparse Row format>

In [18]:
X_train, X_test, users_test = \
    recometrics.split_reco_train_test(
        X, split_type="joined",
        users_test_fraction = None,
        max_test_users = 10000,
        items_test_fraction = 0.3
    )
X_test

<10000x160112 sparse matrix of type '<class 'numpy.float64'>'
	with 145109 stored elements in Compressed Sparse Row format>

In [62]:
alpha = [0.2, 0.4, 0.8, 5, 10, 20, 30, 40]
reg = [0.001, 0.01, 0.01, 2, 5, 10]

for a in range(len(alpha)) :
    for r in range(len(reg)) :
        model = implicit.als.AlternatingLeastSquares(factors=100, regularization=reg[r], random_state=123, num_threads=4)
        Cui = X_train * alpha[a]
        model.fit(Cui)

        metrics_wrmf = recometrics.calc_reco_metrics(
            X_train[:X_test.shape[0]], X_test,
            model.user_factors[:X_test.shape[0]], model.item_factors,
            k=5, all_metrics=True
        )
        all_metrics = [
            metrics_wrmf
        ]
        all_metrics = pd.concat([m.mean(axis=0).to_frame().T for m in all_metrics], axis=0)
        all_metrics.index = [
            "alpha: " + str(alpha[a]) + "reg: " + str(reg[r])    
        ]
        metrics_100_init = pd.concat([metrics_100_init, all_metrics], axis=0)
        print(reg[r])
        print(alpha[a])


100%|██████████| 15/15 [04:18<00:00, 17.24s/it]


0.001
0.2


100%|██████████| 15/15 [04:26<00:00, 17.79s/it]


0.01
0.2


100%|██████████| 15/15 [04:28<00:00, 17.93s/it]


0.01
0.2


100%|██████████| 15/15 [04:26<00:00, 17.78s/it]


2
0.2


100%|██████████| 15/15 [04:19<00:00, 17.33s/it]


5
0.2


100%|██████████| 15/15 [04:19<00:00, 17.27s/it]


10
0.2


100%|██████████| 15/15 [04:19<00:00, 17.28s/it]


0.001
0.4


100%|██████████| 15/15 [04:19<00:00, 17.27s/it]


0.01
0.4


100%|██████████| 15/15 [04:20<00:00, 17.33s/it]


0.01
0.4


100%|██████████| 15/15 [04:19<00:00, 17.28s/it]


2
0.4


100%|██████████| 15/15 [04:19<00:00, 17.31s/it]


5
0.4


100%|██████████| 15/15 [04:20<00:00, 17.34s/it]


10
0.4


100%|██████████| 15/15 [04:19<00:00, 17.27s/it]


0.001
0.8


100%|██████████| 15/15 [04:19<00:00, 17.30s/it]


0.01
0.8


100%|██████████| 15/15 [04:18<00:00, 17.26s/it]


0.01
0.8


100%|██████████| 15/15 [04:19<00:00, 17.31s/it]


2
0.8


100%|██████████| 15/15 [04:18<00:00, 17.25s/it]


5
0.8


100%|██████████| 15/15 [04:19<00:00, 17.29s/it]


10
0.8


100%|██████████| 15/15 [04:18<00:00, 17.24s/it]


0.001
5


100%|██████████| 15/15 [04:18<00:00, 17.23s/it]


0.01
5


100%|██████████| 15/15 [04:18<00:00, 17.25s/it]


0.01
5


100%|██████████| 15/15 [04:18<00:00, 17.24s/it]


2
5


100%|██████████| 15/15 [04:18<00:00, 17.26s/it]


5
5


100%|██████████| 15/15 [04:19<00:00, 17.29s/it]


10
5


100%|██████████| 15/15 [04:19<00:00, 17.30s/it]


0.001
10


100%|██████████| 15/15 [04:19<00:00, 17.27s/it]


0.01
10


100%|██████████| 15/15 [04:19<00:00, 17.29s/it]


0.01
10


100%|██████████| 15/15 [04:18<00:00, 17.24s/it]


2
10


100%|██████████| 15/15 [04:19<00:00, 17.28s/it]


5
10


100%|██████████| 15/15 [04:19<00:00, 17.27s/it]


10
10


100%|██████████| 15/15 [04:18<00:00, 17.24s/it]


0.001
20


100%|██████████| 15/15 [04:19<00:00, 17.30s/it]


0.01
20


100%|██████████| 15/15 [04:18<00:00, 17.26s/it]


0.01
20


100%|██████████| 15/15 [04:19<00:00, 17.29s/it]


2
20


100%|██████████| 15/15 [04:19<00:00, 17.27s/it]


5
20


100%|██████████| 15/15 [04:19<00:00, 17.33s/it]


10
20


100%|██████████| 15/15 [04:19<00:00, 17.32s/it]


0.001
30


100%|██████████| 15/15 [04:18<00:00, 17.25s/it]


0.01
30


100%|██████████| 15/15 [04:19<00:00, 17.32s/it]


0.01
30


100%|██████████| 15/15 [04:19<00:00, 17.31s/it]


2
30


100%|██████████| 15/15 [04:18<00:00, 17.25s/it]


5
30


100%|██████████| 15/15 [04:18<00:00, 17.26s/it]


10
30


100%|██████████| 15/15 [04:19<00:00, 17.30s/it]


0.001
40


100%|██████████| 15/15 [04:19<00:00, 17.28s/it]


0.01
40


100%|██████████| 15/15 [04:19<00:00, 17.33s/it]


0.01
40


100%|██████████| 15/15 [04:19<00:00, 17.30s/it]


2
40


100%|██████████| 15/15 [04:19<00:00, 17.30s/it]


5
40


100%|██████████| 15/15 [04:19<00:00, 17.30s/it]


10
40


In [63]:
metrics_100_init

Unnamed: 0,P@5,TP@5,R@5,AP@5,TAP@5,NDCG@5,Hit@5,RR@5,ROC_AUC,PR_AUC
alpha: 0.2reg: 0.001,0.25528,0.255328,0.08814,0.062186,0.180368,0.213283,0.7177,0.489598,0.964864,0.14585
alpha: 0.2reg: 0.01,0.25538,0.255428,0.088166,0.06214,0.180315,0.213273,0.719,0.489815,0.964942,0.145887
alpha: 0.2reg: 0.01,0.25538,0.255428,0.088166,0.06214,0.180315,0.213273,0.719,0.489815,0.964942,0.145887
alpha: 0.2reg: 2,0.25696,0.257008,0.088715,0.062639,0.181736,0.214794,0.7193,0.49164,0.965583,0.147057
alpha: 0.2reg: 5,0.25804,0.258088,0.0891,0.062942,0.18267,0.215666,0.7219,0.494468,0.965489,0.147316
alpha: 0.2reg: 10,0.25754,0.257588,0.088945,0.062797,0.182207,0.215202,0.7212,0.4939,0.965341,0.147213
alpha: 0.4reg: 0.001,0.2449,0.244983,0.084862,0.058616,0.169458,0.199803,0.7013,0.465807,0.969746,0.142101
alpha: 0.4reg: 0.01,0.2448,0.244897,0.084851,0.058615,0.169447,0.199663,0.7008,0.465307,0.969861,0.142181
alpha: 0.4reg: 0.01,0.2448,0.244897,0.084851,0.058615,0.169447,0.199663,0.7008,0.465307,0.969861,0.142181
alpha: 0.4reg: 2,0.248,0.248102,0.086068,0.05952,0.172069,0.203213,0.708,0.470343,0.97064,0.144058


In [59]:
metrics_init

Unnamed: 0,P@5,TP@5,R@5,AP@5,TAP@5,NDCG@5,Hit@5,RR@5,ROC_AUC,PR_AUC
alpha: 0.4reg: 0.001,0.23288,0.232968,0.08058,0.05532,0.160309,0.191023,0.6844,0.453045,0.975846,0.134152
alpha: 0.4reg: 0.01,0.2331,0.233188,0.080659,0.055398,0.160546,0.191176,0.6838,0.453073,0.975887,0.1342
alpha: 0.4reg: 0.1,0.23334,0.233428,0.080721,0.055569,0.161093,0.191781,0.684,0.455382,0.975866,0.134437
alpha: 0.4reg: 2,0.23434,0.234407,0.080947,0.055776,0.161763,0.192233,0.6832,0.453102,0.976321,0.134839
alpha: 0.4reg: 5,0.2349,0.234953,0.081224,0.055914,0.162109,0.192471,0.6839,0.454728,0.976285,0.135129
alpha: 0.8reg: 0.001,0.21636,0.216378,0.074985,0.049981,0.14432,0.172386,0.6548,0.417315,0.979155,0.127014
alpha: 0.8reg: 0.01,0.21638,0.216398,0.074991,0.050003,0.144352,0.172504,0.6543,0.417703,0.979157,0.127047
alpha: 0.8reg: 0.1,0.21724,0.217288,0.075288,0.050051,0.144611,0.173097,0.6569,0.418417,0.979247,0.127176
alpha: 0.8reg: 2,0.21918,0.219203,0.075898,0.050642,0.14642,0.174686,0.6567,0.41983,0.979453,0.127997
alpha: 0.8reg: 5,0.21848,0.218503,0.075656,0.050401,0.145664,0.174104,0.6579,0.419673,0.979433,0.128114


In [64]:
wrmf = implicit.als.AlternatingLeastSquares(factors=50, regularization=1, random_state=123, num_threads=4)
wrmf.fit(X_train.T)

100%|██████████| 15/15 [04:08<00:00, 16.55s/it]


In [65]:
k = 5 ## Top-K recommendations to evaluate
metrics_wrmf = recometrics.calc_reco_metrics(
    X_train[:X_test.shape[0]], X_test,
    wrmf.user_factors[:X_test.shape[0]], wrmf.item_factors,
    k=k, all_metrics=True
)

  warn("'B' has more items than 'X_test'.")


In [27]:
all_metrics = [
    metrics_wrmf
]
all_metrics = pd.concat([m.mean(axis=0).to_frame().T for m in all_metrics], axis=0)
all_metrics.index = [
    "WRMF (a.k.a. iALS)"
]
all_metrics

Unnamed: 0,P@5,TP@5,R@5,AP@5,TAP@5,NDCG@5,Hit@5,RR@5,ROC_AUC,PR_AUC
WRMF (a.k.a. iALS),0.22208,0.222162,0.077338,0.051299,0.147571,0.174771,0.6652,0.419487,0.975165,0.133003


In [16]:
metr_impl = ranking_metrics_at_k(model=wrmf, train_user_items=X_train, test_user_items=X_test, K=5, show_progress=True, num_threads=4)

100%|██████████| 358812/358812 [02:05<00:00, 2856.39it/s]


In [17]:
metr_impl

{'precision': 0.2058831248823816,
 'map': 0.13492709196511338,
 'ndcg': 0.21811472272896915,
 'auc': 0.5356746013660015}

In [89]:
# MPR function
def MPR(model, train_data, test_data, user_f, item_f, k) :
    zähler_liste = []
    nenner_liste = []
    #for idx, u in enumerate(test_data.toarray()) :
    for u in np.unique(test_data.tocoo().row) :
        #mask = (train_data[u].toarray()[0] == 0)
        #pred_temp = []
        #for i in item_f[mask] :
        #    pred_temp.append(np.inner(user_f[u], i)) # training beobachtungen auslassen. Pro user die trainings items aus train_data weglassen
        #df = pd.DataFrame({'r' : test_data[u].toarray()[0][mask], 'pred' : pred_temp}) #, 'train' : train_data[u].toarray()[0][mask]})
        recs = model.recommend(user_items=train_data, userid=list(range(0,train_data.shape[0])), filter_already_liked_items=True, N = k)
        pred_temp = recs[1][u]
        r = test_data[u].toarray()[0][recs[0][u]]
        r = np.clip(r, a_min=0, a_max=1)
        df = pd.DataFrame({'r' : r, 'pred' : pred_temp})
        df = df.sort_values(by=['pred'], ascending=False)
        #df = df.drop(df[df.train > 0].index)
        index = list(range(0,len(df)))
        df['rankui'] = [x / (len(df)-1) for x in index]
        #df = df.iloc[:k, ]
        zähler_liste.append(np.inner(df.r, df.rankui))
        if sum(df.r) != 0:
            nenner_liste.append(sum(df.r))
        else:
            nenner_liste.append(1)
        if u % 100 == 0:
            print(u)
    mpr_list = [m/n for m, n in zip(zähler_liste, nenner_liste)]
    name = 'MPR@' + str(k)
    return pd.DataFrame({name : mpr_list})
    

In [87]:
train_data = X_train[:X_test.shape[0]]
test_data = X_test
user_factors = wrmf.user_factors[:X_test.shape[0]]
item_factors = wrmf.item_factors[:X_test.shape[0]]

Testing with implicit split and eval function if parameters behave the same

In [83]:
impl_train, impl_test = implicit.evaluation.train_test_split(X, train_percentage=0.7, random_state=123)

In [86]:
alpha = (X.shape[0] * X.shape[1] - X.nnz) / sum(X.data)
alpha

15.359144024256032

In [91]:
impl_model = implicit.als.AlternatingLeastSquares(factors=64, regularization=0.05, random_state=123, num_threads=4)
impl_model.fit(alpha * impl_train)
impl_metr = ranking_metrics_at_k(model=impl_model, train_user_items=impl_train.T.tocsr(), test_user_items=impl_test.T.tocsr(), K=10, show_progress=True, num_threads=4)
impl_metr

100%|██████████| 15/15 [03:20<00:00, 13.37s/it]
  0%|          | 0/117793 [00:00<?, ?it/s]

IndexError: index 179659 is out of bounds for axis 1 with size 160112

Since no changes, now with data loading provided by implicit:

In [130]:
from implicit.datasets.lastfm import get_lastfm
artists, users, plays = get_lastfm()

In [186]:
from implicit.evaluation import train_test_split
train, test = train_test_split(user_plays)

In [184]:
user_plays = plays.T

In [180]:
(plays.shape[0] * plays.shape[1] - plays.nnz) / sum(plays.data)

27.80136771101889

In [191]:
1 - (user_plays.nnz / (user_plays.shape[0] * user_plays.shape[1]))

0.9998328790268004

In [194]:
(user_plays.nnz / (user_plays.shape[0] * user_plays.shape[1]))

0.00016712097319960392

In [187]:
#init
from implicit.als import AlternatingLeastSquares
alpha = 15
model = AlternatingLeastSquares(factors=128, regularization=0.001, iterations=15, use_native=True, use_cg=True)
model.fit(train * alpha)
p = ranking_metrics_at_k(model, train, test, K=10, num_threads=4)

100%|██████████| 15/15 [04:09<00:00, 16.66s/it]
100%|██████████| 358532/358532 [04:55<00:00, 1212.17it/s]


In [188]:
init_df = pd.DataFrame(p, index=['reg: 0.001'])
init_df

Unnamed: 0,precision,map,ndcg,auc
reg: 0.001,0.071979,0.029409,0.071648,0.532456


In [177]:
from implicit.als import AlternatingLeastSquares
regularizations = [0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 20, 40]
alpha = 15
for i in range(len(regularizations)):
    model = AlternatingLeastSquares(factors=128, regularization=regularizations[i], iterations=15, use_native=True, use_cg=True)
    model.fit(train * alpha)
    p = ranking_metrics_at_k(model, train, test, K=10, num_threads=4)
    init_df = pd.concat((init_df, pd.DataFrame(p, index=['reg: ' + str(regularizations[i])])), axis=0)
    print(i / len(regularizations))

100%|██████████| 15/15 [00:04<00:00,  3.52it/s]
100%|██████████| 3470/3470 [00:00<00:00, 9663.60it/s]
  0%|          | 0/15 [00:00<?, ?it/s]

0.0


100%|██████████| 15/15 [00:04<00:00,  3.43it/s]
100%|██████████| 3470/3470 [00:00<00:00, 9717.70it/s]
  0%|          | 0/15 [00:00<?, ?it/s]

0.1


100%|██████████| 15/15 [00:04<00:00,  3.43it/s]
100%|██████████| 3470/3470 [00:00<00:00, 9504.72it/s]
  0%|          | 0/15 [00:00<?, ?it/s]

0.2


100%|██████████| 15/15 [00:05<00:00,  2.74it/s]
100%|██████████| 3470/3470 [00:01<00:00, 3174.03it/s]
  0%|          | 0/15 [00:00<?, ?it/s]

0.3


100%|██████████| 15/15 [00:10<00:00,  1.47it/s]
100%|██████████| 3470/3470 [00:00<00:00, 8260.09it/s]
  0%|          | 0/15 [00:00<?, ?it/s]

0.4


100%|██████████| 15/15 [00:04<00:00,  3.01it/s]
100%|██████████| 3470/3470 [00:00<00:00, 9300.88it/s]
  0%|          | 0/15 [00:00<?, ?it/s]

0.5


100%|██████████| 15/15 [00:04<00:00,  3.41it/s]
100%|██████████| 3470/3470 [00:00<00:00, 9276.02it/s]
  0%|          | 0/15 [00:00<?, ?it/s]

0.6


100%|██████████| 15/15 [00:04<00:00,  3.42it/s]
100%|██████████| 3470/3470 [00:00<00:00, 9105.47it/s]
  0%|          | 0/15 [00:00<?, ?it/s]

0.7


100%|██████████| 15/15 [00:04<00:00,  3.41it/s]
100%|██████████| 3470/3470 [00:00<00:00, 8952.66it/s]
  0%|          | 0/15 [00:00<?, ?it/s]

0.8


100%|██████████| 15/15 [00:04<00:00,  3.39it/s]
100%|██████████| 3470/3470 [00:00<00:00, 9300.83it/s]

0.9





In [178]:
init_df

Unnamed: 0,precision,map,ndcg,auc
reg: 0.001,0.203908,0.09157,0.187735,0.532693
reg: 0.003,0.207597,0.09461,0.191718,0.532953
reg: 0.01,0.205582,0.094251,0.192341,0.533178
reg: 0.03,0.209476,0.097526,0.196607,0.53362
reg: 0.1,0.206607,0.094425,0.192672,0.534375
reg: 0.3,0.212209,0.096806,0.195736,0.533782
reg: 1,0.213337,0.099423,0.198824,0.534537
reg: 3,0.207666,0.093935,0.19188,0.532945
reg: 10,0.21074,0.096722,0.195408,0.533844
reg: 20,0.222492,0.104173,0.205851,0.534349


In [168]:
init_df # alpha=5

Unnamed: 0,precision,map,ndcg,auc
reg: 0.001,0.275647,0.142135,0.25626,0.540093
reg: 0.003,0.276945,0.145457,0.2602,0.540275
reg: 0.01,0.277662,0.143902,0.257661,0.539836
reg: 0.03,0.27305,0.142123,0.256762,0.539842
reg: 0.1,0.282684,0.148001,0.263153,0.540671
reg: 0.3,0.28241,0.145258,0.261059,0.540982
reg: 1,0.27756,0.143474,0.258623,0.540241
reg: 3,0.28019,0.147173,0.262406,0.540536
reg: 10,0.28733,0.15253,0.268325,0.54138
reg: 20,0.296826,0.159597,0.277751,0.54258


In [164]:
init_df # aöpha = 15

Unnamed: 0,precision,map,ndcg,auc
reg: 0.001,0.204694,0.092708,0.188691,0.532273
reg: 0.003,0.206743,0.095636,0.192071,0.532049
reg: 0.01,0.210774,0.096524,0.194036,0.533175
reg: 0.03,0.207358,0.093976,0.190589,0.532587
reg: 0.1,0.209306,0.094043,0.192157,0.533485
reg: 0.3,0.210023,0.096654,0.195236,0.534035
reg: 1,0.211594,0.099213,0.19821,0.534196
reg: 3,0.204728,0.092348,0.189119,0.532795
reg: 10,0.212961,0.097938,0.19685,0.533927
reg: 20,0.223448,0.103368,0.205401,0.534644


In [160]:
init_df # alpha = 27

Unnamed: 0,precision,map,ndcg,auc
reg: 0.001,0.177365,0.076519,0.162823,0.529095
reg: 0.003,0.175657,0.076139,0.162595,0.528929
reg: 0.01,0.175793,0.076655,0.163287,0.528527
reg: 0.03,0.175349,0.075697,0.161742,0.529638
reg: 0.1,0.179927,0.078557,0.166509,0.529492
reg: 0.3,0.181942,0.078665,0.167816,0.530604
reg: 1,0.177467,0.076356,0.164014,0.529878
reg: 3,0.178936,0.077334,0.165528,0.530628
reg: 10,0.178868,0.076127,0.165011,0.529261
reg: 20,0.186144,0.081093,0.171609,0.531458


In [152]:
pd.DataFrame(p, index=[1])

Unnamed: 0,precision,map,ndcg,auc
1,0.175938,0.098132,0.166902,0.51779


In [129]:
alpha = (train.shape[0] * train.shape[1] - train.nnz) / sum(train.data)
alpha

8.05218508037773

In [133]:
from implicit.evaluation import precision_at_k, train_test_split
from implicit.als import AlternatingLeastSquares
from implicit.datasets.movielens import get_movielens

movies, ratings = get_movielens("1m")
train, test = implicit.evaluation.train_test_split(ratings)

model = AlternatingLeastSquares(factors=128, regularization=0.01, iterations=15)
alpha = (ratings.shape[0] * ratings.shape[1] - ratings.nnz) / sum(ratings.data)
model.fit(train * alpha)

p = ranking_metrics_at_k(model, train, test, K=10, num_threads=4)
p

100%|██████████| 15/15 [00:04<00:00,  3.00it/s]
100%|██████████| 3470/3470 [00:00<00:00, 8260.02it/s]


{'precision': 0.26085471253373416,
 'map': 0.12917824423504048,
 'ndcg': 0.24068824646783163,
 'auc': 0.5385924544400514}