In [1]:
import numpy as np, pandas as pd
from scipy.sparse import coo_matrix
from implicit.bpr import BayesianPersonalizedRanking
import recometrics
from poismf import PoisMF
from pandas.api.types import CategoricalDtype
from scipy import sparse
from sklearn.model_selection import train_test_split
import recmetrics
import tqdm


In [2]:
data = pd.read_csv("ml100k_ratings.csv",sep=",")


In [3]:
train_df, test_df = train_test_split(data, test_size=0.3, random_state=42)


In [4]:
user_positives = test_df.groupby('userId')['movieId'].apply(set).to_dict()


In [32]:
users = train_df['userId'].unique()
items = train_df['movieId'].unique()
users = sorted(users)
items = sorted(items)


ratingMatrix = test_df.pivot(index='userId', columns='movieId', values='rating')
ratingMatrix = ratingMatrix.reindex(index=users, columns=items, fill_value=0)

display(ratingMatrix)

movieId,1,2,3,4,5,6,7,8,9,10,...,1667,1668,1670,1671,1672,1673,1676,1678,1679,1680
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,4.0,3.0,,5.0,,1.0,,3.0,...,0,0,0,0,0,0,0,0,0,0
2,,,,,,,,,,2.0,...,0,0,0,0,0,0,0,0,0,0
3,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
4,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
5,,3.0,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
940,,,,,,,4.0,5.0,,,...,0,0,0,0,0,0,0,0,0,0
941,,,,,,,4.0,,,,...,0,0,0,0,0,0,0,0,0,0
942,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0


In [21]:
train_df["tstamp"] =1
train_df =train_df.drop("rating",axis=1)
train_df.columns =["UserId", "ItemId", "Count"]

display(train_df)


Unnamed: 0,UserId,ItemId,Count
76513,907,628,1
60406,622,206,1
27322,18,480,1
53699,484,699,1
65412,871,690,1
...,...,...,...
6265,216,231,1
54886,343,276,1
76820,437,475,1
860,284,322,1


In [33]:
modelPF = PoisMF(reindex=True, method="tncg", use_float=True,
                    early_stop=False, reuse_prev=True,
                    k=20, niter=10, maxupd=200, l2_reg=1e3)\
                .fit(train_df)

In [34]:
display(modelPF.A.shape) # user
display(modelPF.B.shape) # items


(943, 20)

(1631, 20)

In [35]:
pred = np.matmul(modelPF.A,modelPF.B.T)

In [38]:
from recsys_metrics import *
import torch
# https://github.com/zuoxingdong/recsys_metrics?tab=readme-ov-file#Citation

In [36]:
display(pred.shape)

(943, 1631)

In [42]:
ratingMatrix[ratingMatrix >1] = 1

true = torch.tensor(ratingMatrix.fillna(0).to_numpy())


In [43]:
predv = torch.tensor(pred)


In [44]:
display(true)
display(predv)

tensor([[1., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)

tensor([[0.1471, 0.0535, 0.1636,  ..., 0.0018, 0.0018, 0.0018],
        [0.1648, 0.0599, 0.1833,  ..., 0.0020, 0.0020, 0.0020],
        [0.1650, 0.0600, 0.1836,  ..., 0.0020, 0.0020, 0.0020],
        ...,
        [0.0221, 0.0080, 0.0245,  ..., 0.0003, 0.0003, 0.0003],
        [0.0220, 0.0080, 0.0245,  ..., 0.0003, 0.0003, 0.0003],
        [0.0206, 0.0075, 0.0230,  ..., 0.0002, 0.0003, 0.0003]])

In [45]:
def calculate_metrics(pred_df, user_postivies, k_list = [10,50,100]):
  metrics = {
    'MAR@ALL': [],
    'MAR@1000': [],
  }


  grouped = pred_df.groupby('user')

  for user, user_data in tqdm.tqdm(grouped, desc="Processing Users"):
    positives = user_postivies.get(user, set())
    if len(positives) == 0:
      continue

    sorted_data = user_data.sort_values(by='score', ascending=False)
    all_items = sorted_data['item'].tolist()

    relevant_ranks1k = []
    relevant_ranks_all=[]
    for rank, item in enumerate(all_items[:1000], start=1):
      if item in positives:
        relevant_ranks1k.append(rank)
    #calculate relevant
    for rank, item in enumerate(all_items, start=1):
      if item in positives:
        relevant_ranks_all.append(rank)



    if relevant_ranks_all:
      metrics['MAR@ALL'].append(np.mean(relevant_ranks_all))
    if relevant_ranks1k:
      metrics['MAR@1000'].append(np.mean(relevant_ranks1k))
      

  
  final = {
    'MAR@ALL': np.mean(metrics['MAR@ALL']),
    'MAR@1000': np.mean(metrics['MAR@1000']),
  }

  return final

In [46]:
def metricStuff(pred,true,user_positive):

    print("NDCG@10",normalized_dcg(pred, true ,k=10))
    print("MRR@1000",mean_reciprocal_rank(pred, true,k=1000))
    print("MRR@all",mean_reciprocal_rank(pred, true))
    marData = pred.detach().numpy()
    marDF = pd.DataFrame(marData)
    marDF.index.name = "user"
    marDF.columns.name = "items"
    marDF = marDF.stack().reset_index()
    marDF.columns = ['user', 'item', 'score']
    print("MAR",calculate_metrics(marDF,user_positive))
    print("MAP",mean_average_precision(pred, true))
    print("P@50",precision(pred, true,k=50))
    print("R@50",recall(pred, true,k=50))
    

In [47]:
metricStuff(predv,true,user_positives)

NDCG@10 tensor(0.0241)
MRR@1000 tensor(0.0772)
MRR@all tensor(0.0772)


Processing Users: 100%|██████████| 943/943 [00:00<00:00, 1219.68it/s]


MAR {'MAR@ALL': np.float64(509.7278431614733), 'MAR@1000': np.float64(406.23317585109965)}
MAP tensor(0.0417)
P@50 tensor(0.0408)
R@50 tensor(0.0626)
