In [1]:
import numpy as np, pandas as pd
from poismf import PoisMF
from pandas.api.types import CategoricalDtype
from sklearn.model_selection import train_test_split
import tqdm
from recsys_metrics import *
import torch
# https://github.com/zuoxingdong/recsys_metrics?tab=readme-ov-file#Citation


In [2]:
df = pd.read_csv("netflix_user_movie.csv",sep=",",)

In [3]:
train_df, test_df = train_test_split(df, test_size=0.3, random_state=42)


In [4]:
test_ratings_matrix = test_df.pivot(index='user_id', columns='movie_id', values='rating')
display(test_ratings_matrix)

movie_id,16,18,29,44,45,46,47,55,57,77,...,17740,17742,17746,17747,17754,17756,17758,17762,17763,17764
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
684,,,,,,,,,,,...,,,,,,,,,,1.0
1333,,1.0,,,,,,,1.0,,...,,,,,,,,,,
2307,,,,,,,,,,,...,,,,,,,,,,
2757,,,,,,,,,,,...,,,,,,,,,,
2976,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2648298,,,,,,,,1.0,,,...,,,,,,,,,,
2648853,,1.0,,,,,,,,,...,,,,,,,,,,
2649097,,,,,,,,,,,...,,,,,,,,,,
2649285,,,,,,,,,,,...,,,,,,,,,,


In [5]:
user_positives = test_df.groupby('user_id')['movie_id'].apply(set).to_dict()


In [6]:
train_df =train_df.drop("timestamp",axis=1)
train_df.columns =["UserId", "ItemId", "Count"]

display(train_df)


Unnamed: 0,UserId,ItemId,Count
1480648,443193,7886,1
1179015,2338157,6408,1
1775987,782308,9528,1
1139411,2242958,6274,1
2788378,2080822,14601,1
...,...,...,...
1692743,1969676,9051,1
2356330,257710,12473,1
2229084,1384810,11982,1
2768307,1395543,14513,1


In [7]:
modelPF = PoisMF(reindex=True, method="tncg", use_float=True,
                    early_stop=False, reuse_prev=True,
                    k=20, niter=10, maxupd=200, l2_reg=1e3)\
                .fit(train_df)
# use the best gradient method with default values
# have k = 20 the same as the original experiement


In [8]:
display(modelPF.A.shape)
display(modelPF.B.shape)


(6163, 20)

(4105, 20)

In [9]:
pred = np.matmul(modelPF.A,modelPF.B.T)

In [10]:
from recsys_metrics import *
import torch
# https://github.com/zuoxingdong/recsys_metrics?tab=readme-ov-file#Citation

In [11]:
display(pred.shape)

(6163, 4105)

In [None]:
test_ratings_matrix[test_ratings_matrix >1] = 1
# binarized the data for the evaluation

true = torch.tensor(test_ratings_matrix.fillna(0).to_numpy())


In [13]:
predv = torch.tensor(pred)


In [14]:
display(true)
display(predv)

tensor([[0., 0., 0.,  ..., 0., 0., 1.],
        [0., 1., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)

tensor([[0.2040, 0.6408, 0.2832,  ..., 0.0087, 0.0134, 0.0108],
        [0.2045, 0.6424, 0.2839,  ..., 0.0087, 0.0135, 0.0108],
        [0.2048, 0.6434, 0.2843,  ..., 0.0087, 0.0135, 0.0108],
        ...,
        [0.0319, 0.1001, 0.0443,  ..., 0.0013, 0.0021, 0.0017],
        [0.0492, 0.1546, 0.0683,  ..., 0.0021, 0.0032, 0.0026],
        [0.0298, 0.0936, 0.0414,  ..., 0.0013, 0.0020, 0.0016]])

In [15]:
# Jinming Code

def calculate_metrics(pred_df, user_postivies, k_list = [10,50,100]):
  metrics = {
    'MAR@ALL': [],
    'MAR@1000': [],
  }


  grouped = pred_df.groupby('user')

  for user, user_data in tqdm.tqdm(grouped, desc="Processing Users"):
    positives = user_postivies.get(user, set())
    if len(positives) == 0:
      continue

    sorted_data = user_data.sort_values(by='score', ascending=False)
    all_items = sorted_data['item'].tolist()

    relevant_ranks1k = []
    relevant_ranks_all=[]
    for rank, item in enumerate(all_items[:1000], start=1):
      if item in positives:
        relevant_ranks1k.append(rank)
    #calculate relevant
    for rank, item in enumerate(all_items, start=1):
      if item in positives:
        relevant_ranks_all.append(rank)



    if relevant_ranks_all:
      metrics['MAR@ALL'].append(np.mean(relevant_ranks_all))
    if relevant_ranks1k:
      metrics['MAR@1000'].append(np.mean(relevant_ranks1k))
      

  
  final = {
    'MAR@ALL': np.mean(metrics['MAR@ALL']),
    'MAR@1000': np.mean(metrics['MAR@1000']),
  }

  return final

In [16]:
def metricStuff(pred,true,user_positive):

    print("NDCG@10",normalized_dcg(pred, true ,k=10))
    print("MRR@1000",mean_reciprocal_rank(pred, true,k=1000))
    print("MRR@all",mean_reciprocal_rank(pred, true))
    marData = pred.detach().numpy()
    marDF = pd.DataFrame(marData)
    marDF.index.name = "user"
    marDF.columns.name = "items"
    marDF = marDF.stack().reset_index()
    marDF.columns = ['user', 'item', 'score']
    print("MAR",calculate_metrics(marDF,user_positive))
    print("MAP",mean_average_precision(pred, true))
    print("P@50",precision(pred, true,k=50))
    print("R@50",recall(pred, true,k=50))
    

In [17]:
metricStuff(predv,true,user_positives)

NDCG@10 tensor(0.0568)
MRR@1000 tensor(0.1421)
MRR@all tensor(0.1421)


Processing Users: 100%|██████████| 6163/6163 [00:01<00:00, 4368.48it/s]


MAR {'MAR@ALL': np.float64(2141.644424031392), 'MAR@1000': np.float64(538.5314443132625)}
MAP tensor(0.0423)
P@50 tensor(0.0453)
R@50 tensor(0.0140)
