In [1]:
import numpy as np, pandas as pd
from scipy.sparse import coo_matrix
from poismf import PoisMF
from pandas.api.types import CategoricalDtype
from scipy import sparse
from sklearn.model_selection import train_test_split
import tqdm


In [2]:
train_df = pd.read_csv("pf_last_netflix_time_test.tsv",sep="\t",    names=[
        'user_id', 'movie_id', 'rating', 'timestamp'
    ],)
test_df = pd.read_csv("pf_last_netflix_time_train.tsv",sep="\t",    names=[
        'user_id', 'movie_id', 'rating', 'timestamp'
    ],)


In [3]:
test_users = test_df['user_id'].unique()
test_items = test_df['movie_id'].unique()


train_users = train_df['user_id'].unique()
train_items = train_df['movie_id'].unique()


user = list(set(train_users) & set(test_users))
item = list(set(train_items) & set(test_items))


In [4]:
train_df = train_df[train_df['user_id'].isin(user)]
train_df = train_df[train_df['movie_id'].isin(item)]

test_df = test_df[test_df['user_id'].isin(user)]
test_df = test_df[test_df['movie_id'].isin(item)]

In [5]:

test_ratings_matrix = test_df.pivot(index='user_id', columns='movie_id', values='rating')
train_ratings_matrix = train_df.pivot(index='user_id', columns='movie_id', values='rating')


In [6]:
display(train_ratings_matrix)
display(test_ratings_matrix)

movie_id,16,18,29,44,45,46,47,55,57,77,...,17740,17742,17746,17747,17754,17756,17758,17762,17763,17764
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
684,,,,,,,,,,,...,,,,,,,,,,
1333,,1.0,,,,,,1.0,,1.0,...,,,,,,,,1.0,1.0,1.0
2307,,,,,,,,,,,...,,,,,,,,,,
2757,,1.0,,,,,,,,,...,,,,,,,,,,1.0
2976,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2648298,,,,,,,,,,,...,,,,,,,,,,
2648853,,,,,,,,,,,...,,,,,,,,,,
2649097,,,,,,,,,,,...,,,,,,,,,,
2649285,,,,,,,,,,,...,,,,,,,,1.0,,


movie_id,16,18,29,44,45,46,47,55,57,77,...,17740,17742,17746,17747,17754,17756,17758,17762,17763,17764
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
684,,,,,,,,,,,...,,,,,,,,1.0,,1.0
1333,,,,1.0,,,,,1.0,,...,,,,,,,1.0,,,
2307,,,,,,,,,,,...,,,,,,,,,,1.0
2757,,,,,,,,,,,...,,,,,,,,,,
2976,,,,,,,,,,,...,,,,,,,,,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2648298,,,,,,,,,,,...,,,,,,,,,,
2648853,,,,,,,,,,,...,,,,,,,,,,1.0
2649097,,,,,,,,,,,...,,,,,,,,,,
2649285,,,,,,,,,,,...,,,,,,,,,,1.0


In [7]:
user_positives = test_df.groupby('user_id')['movie_id'].apply(set).to_dict()


In [8]:
train_df =train_df.drop("timestamp",axis=1)
train_df.columns =["UserId", "ItemId", "Count"]

display(train_df)


Unnamed: 0,UserId,ItemId,Count
0,684,963,1
1,1333,13882,1
2,2307,7040,1
3,2757,5191,1
4,2976,8764,1
...,...,...,...
1003726,464172,7875,1
1003727,1235018,12456,1
1003728,800136,6302,1
1003729,728335,17053,1


In [9]:
modelPF = PoisMF(reindex=True, method="tncg", use_float=True,
                    early_stop=False, reuse_prev=True,
                    k=20, niter=10, maxupd=200, l2_reg=1e3)\
                .fit(train_df)

In [10]:
display(modelPF.A.shape)
display(modelPF.B.shape)


(6163, 20)

(4105, 20)

In [11]:
pred = np.matmul(modelPF.A,modelPF.B.T)

In [12]:
from recsys_metrics import *
import torch
# https://github.com/zuoxingdong/recsys_metrics?tab=readme-ov-file#Citation

In [13]:
display(pred.shape)

(6163, 4105)

In [14]:
test_ratings_matrix[test_ratings_matrix >1] = 1

true = torch.tensor(test_ratings_matrix.fillna(0).to_numpy())


In [15]:
predv = torch.tensor(pred)


In [16]:
display(true)
display(predv)

tensor([[0., 0., 0.,  ..., 1., 0., 1.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 1.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)

tensor([[0.0829, 0.0135, 0.0967,  ..., 0.0034, 0.0351, 0.0061],
        [0.1710, 0.0277, 0.1994,  ..., 0.0070, 0.0724, 0.0126],
        [0.1017, 0.0165, 0.1185,  ..., 0.0042, 0.0430, 0.0075],
        ...,
        [0.0204, 0.0033, 0.0238,  ..., 0.0008, 0.0087, 0.0015],
        [0.1227, 0.0199, 0.1431,  ..., 0.0050, 0.0520, 0.0090],
        [0.0770, 0.0125, 0.0898,  ..., 0.0032, 0.0326, 0.0057]])

In [17]:
def calculate_metrics(pred_df, user_postivies, k_list = [10,50,100]):
  metrics = {
    'MAR@ALL': [],
    'MAR@1000': [],
  }


  grouped = pred_df.groupby('user')

  for user, user_data in tqdm.tqdm(grouped, desc="Processing Users"):
    positives = user_postivies.get(user, set())
    if len(positives) == 0:
      continue

    sorted_data = user_data.sort_values(by='score', ascending=False)
    all_items = sorted_data['item'].tolist()

    relevant_ranks1k = []
    relevant_ranks_all=[]
    for rank, item in enumerate(all_items[:1000], start=1):
      if item in positives:
        relevant_ranks1k.append(rank)
    #calculate relevant
    for rank, item in enumerate(all_items, start=1):
      if item in positives:
        relevant_ranks_all.append(rank)



    if relevant_ranks_all:
      metrics['MAR@ALL'].append(np.mean(relevant_ranks_all))
    if relevant_ranks1k:
      metrics['MAR@1000'].append(np.mean(relevant_ranks1k))
      

  
  final = {
    'MAR@ALL': np.mean(metrics['MAR@ALL']),
    'MAR@1000': np.mean(metrics['MAR@1000']),
  }

  return final

In [18]:
def metricStuff(pred,true,user_positive):

    print("NDCG@10",normalized_dcg(pred, true ,k=10))
    print("MRR@1000",mean_reciprocal_rank(pred, true,k=1000))
    print("MRR@all",mean_reciprocal_rank(pred, true))
    marData = pred.detach().numpy()
    marDF = pd.DataFrame(marData)
    marDF.index.name = "user"
    marDF.columns.name = "items"
    marDF = marDF.stack().reset_index()
    marDF.columns = ['user', 'item', 'score']
    print("MAR",calculate_metrics(marDF,user_positive))
    print("MAP",mean_average_precision(pred, true))
    print("P@50",precision(pred, true,k=50))
    print("R@50",recall(pred, true,k=50))
    

In [19]:
metricStuff(predv,true,user_positives)

NDCG@10 tensor(0.1479)
MRR@1000 tensor(0.3073)
MRR@all tensor(0.3073)


Processing Users: 100%|██████████| 6163/6163 [00:01<00:00, 4170.74it/s]


MAR {'MAR@ALL': np.float64(2152.4912393439195), 'MAR@1000': np.float64(492.8627471003942)}
MAP tensor(0.0873)
P@50 tensor(0.0787)
R@50 tensor(0.0109)
