In [None]:
import numpy as np, pandas as pd
from scipy.sparse import coo_matrix
from implicit.bpr import BayesianPersonalizedRanking
from pandas.api.types import CategoricalDtype
from scipy import sparse
from sklearn.model_selection import train_test_split
import tqdm
from recsys_metrics import *
import torch
# https://github.com/zuoxingdong/recsys_metrics?tab=readme-ov-file#Citation

In [2]:
data = pd.read_csv("ml100k_ratings.csv",sep=",")


In [3]:
train_df, test_df = train_test_split(data, test_size=0.3, random_state=42)


In [4]:
train_df["rating"] =1
test_df["rating"]=1

In [5]:
user_positives = test_df.groupby('userId')['movieId'].apply(set).to_dict()


In [6]:
users = train_df['userId'].unique()
items = train_df['movieId'].unique()
users = sorted(users)
items = sorted(items)


ratingMatrix = test_df.pivot(index='userId', columns='movieId', values='rating')
ratingMatrix = ratingMatrix.reindex(index=users, columns=items, fill_value=0)

display(ratingMatrix)

movieId,1,2,3,4,5,6,7,8,9,10,...,1667,1668,1670,1671,1672,1673,1676,1678,1679,1680
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,,1.0,1.0,,1.0,,1.0,,1.0,...,0,0,0,0,0,0,0,0,0,0
2,,,,,,,,,,1.0,...,0,0,0,0,0,0,0,0,0,0
3,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
4,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
5,,1.0,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0
940,,,,,,,1.0,1.0,,,...,0,0,0,0,0,0,0,0,0,0
941,,,,,,,1.0,,,,...,0,0,0,0,0,0,0,0,0,0
942,,,,,,,,,,,...,0,0,0,0,0,0,0,0,0,0


In [7]:
users = train_df["userId"].unique()
movies = train_df["movieId"].unique()
shape = (len(users), len(movies))

# Create indices for users and movies
user_cat = CategoricalDtype(categories=sorted(users), ordered=True)
movie_cat = CategoricalDtype(categories=sorted(movies), ordered=True)
user_index = train_df["userId"].astype(user_cat).cat.codes
movie_index = train_df["movieId"].astype(movie_cat).cat.codes

# Conversion via COO matrix
coo = sparse.coo_matrix((train_df["rating"], (user_index, movie_index)), shape=shape)
csr = coo.tocsr()
#https://hippocampus-garden.com/pandas_sparse/

In [8]:
display(csr.shape)

(943, 1631)

In [9]:
bpr = BayesianPersonalizedRanking(factors=19, learning_rate=0.001,
                                  regularization=0.001, dtype=np.float64,
                                  iterations=100)

In [10]:
bpr.fit(csr)


  0%|          | 0/100 [00:00<?, ?it/s]

In [11]:
pred = np.matmul(bpr.user_factors,bpr.item_factors.T)

In [13]:
ratingMatrix[ratingMatrix >1] = 1

true = torch.tensor(ratingMatrix.fillna(0).to_numpy())


In [14]:
predv = torch.tensor(pred)


In [15]:
display(true)
display(predv)

tensor([[1., 0., 1.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)

tensor([[ 0.4786,  0.0619, -0.0410,  ...,  0.0106,  0.0079,  0.0126],
        [ 0.4795,  0.0542, -0.0401,  ...,  0.0110,  0.0061,  0.0136],
        [ 0.4809,  0.0571, -0.0405,  ...,  0.0114,  0.0075,  0.0120],
        ...,
        [ 0.4805,  0.0568, -0.0406,  ...,  0.0114,  0.0079,  0.0137],
        [ 0.4789,  0.0581, -0.0420,  ...,  0.0089,  0.0063,  0.0115],
        [ 0.4781,  0.0620, -0.0381,  ...,  0.0063,  0.0073,  0.0134]],
       dtype=torch.float64)

In [16]:
def calculate_metrics(pred_df, user_postivies, k_list = [10,50,100]):
  metrics = {
    'MAR@ALL': [],
    'MAR@1000': [],
  }


  grouped = pred_df.groupby('user')

  for user, user_data in tqdm.tqdm(grouped, desc="Processing Users"):
    positives = user_postivies.get(user, set())
    if len(positives) == 0:
      continue

    sorted_data = user_data.sort_values(by='score', ascending=False)
    all_items = sorted_data['item'].tolist()

    relevant_ranks1k = []
    relevant_ranks_all=[]
    for rank, item in enumerate(all_items[:1000], start=1):
      if item in positives:
        relevant_ranks1k.append(rank)
    #calculate relevant
    for rank, item in enumerate(all_items, start=1):
      if item in positives:
        relevant_ranks_all.append(rank)



    if relevant_ranks_all:
      metrics['MAR@ALL'].append(np.mean(relevant_ranks_all))
    if relevant_ranks1k:
      metrics['MAR@1000'].append(np.mean(relevant_ranks1k))
      

  
  final = {
    'MAR@ALL': np.mean(metrics['MAR@ALL']),
    'MAR@1000': np.mean(metrics['MAR@1000']),
  }

  return final

In [17]:
def metricStuff(pred,true,user_positive):

    print("NDCG@10",normalized_dcg(pred, true ,k=10))
    print("MRR@1000",mean_reciprocal_rank(pred, true,k=1000))
    print("MRR@all",mean_reciprocal_rank(pred, true))
    marData = pred.detach().numpy()
    marDF = pd.DataFrame(marData)
    marDF.index.name = "user"
    marDF.columns.name = "items"
    marDF = marDF.stack().reset_index()
    marDF.columns = ['user', 'item', 'score']
    print("MAR",calculate_metrics(marDF,user_positive))
    print("MAP",mean_average_precision(pred, true))
    print("P@50",precision(pred, true,k=50))
    print("R@50",recall(pred, true,k=50))
    

In [18]:
metricStuff(predv,true,user_positives)

NDCG@10 tensor(0.1565)
MRR@1000 tensor(0.3540)
MRR@all tensor(0.3540)


Processing Users: 100%|██████████| 943/943 [00:00<00:00, 1792.39it/s]


MAR {'MAR@ALL': np.float64(579.6927117266181), 'MAR@1000': np.float64(327.90701746972843)}
MAP tensor(0.1022)
P@50 tensor(0.1084)
R@50 tensor(0.2035)
