In [1]:
import numpy as np, pandas as pd
from poismf import PoisMF
from pandas.api.types import CategoricalDtype
from sklearn.model_selection import train_test_split
import tqdm
from recsys_metrics import *
import torch
# https://github.com/zuoxingdong/recsys_metrics?tab=readme-ov-file#Citation


In [2]:
train_df = pd.read_csv("train.tsv",sep="\t",    names=[
        'user_id', 'movie_id', 'rating', 'timestamp'
    ],)
test_df = pd.read_csv("test.tsv",sep="\t",    names=[
        'user_id', 'movie_id', 'rating', 'timestamp'
    ],)


In [3]:
test_users = test_df['user_id'].unique()
test_items = test_df['movie_id'].unique()


train_users = train_df['user_id'].unique()
train_items = train_df['movie_id'].unique()


user = list(set(train_users) & set(test_users))
item = list(set(train_items) & set(test_items))


In [4]:
train_df = train_df[train_df['user_id'].isin(test_users)]
train_df = train_df[train_df['movie_id'].isin(test_items)]

test_df = test_df[test_df['user_id'].isin(user)]
test_df = test_df[test_df['movie_id'].isin(item)]


# some how the test and train split was sparse enough i guess that some of the few rows/ columns
# that had value were removed due to the split
#https://stackoverflow.com/questions/27965295/dropping-rows-from-dataframe-based-on-a-not-in-condition


In [5]:

test_ratings_matrix = test_df.pivot(index='user_id', columns='movie_id', values='rating')
train_ratings_matrix = train_df.pivot(index='user_id', columns='movie_id', values='rating')


In [6]:
display(train_ratings_matrix)
display(test_ratings_matrix)

movie_id,1,4,16,17,19,27,32,37,42,43,...,17726,17727,17729,17731,17733,17738,17754,17760,17767,17769
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59,,,,,,,,,,,...,,,,,,,,,,
83,,,,,,,,,,,...,,,,,,,,,,
94,,,,,,,,,,,...,,,,,,,,,,
116,,,,,,,,,,,...,,,,,,,,,,
126,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2649379,,,,,,,,,,,...,,,,,,,,,,
2649401,,,,,,,,,,,...,,,,,,,,,,
2649404,,,,,,,,,,,...,,,,,,,,,,
2649409,,,,,,,,,,,...,,,,,,,,,,


movie_id,1,4,16,17,19,27,32,37,42,43,...,17726,17727,17729,17731,17733,17738,17754,17760,17767,17769
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
59,,,,,,,,,,,...,,,,,,,,,,
83,,,,,,,,,,,...,,,,,,,,,,
94,,,,,,,,,,,...,,,,,,,,,,
116,,,,,,,,,,,...,,,,,,,,,,
126,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2649379,,,,,,,,,,,...,,,,,,,,,,
2649401,,,,,,,,,,,...,,,,,,,,,,
2649404,,,,,,,,,,,...,,,,,,,,,,
2649409,,,,,,,,,,,...,,,,,,,,,,


In [7]:
user_positives = test_df.groupby('user_id')['movie_id'].apply(set).to_dict()


In [8]:
del test_df

In [9]:
train_df["rating"] =1
train_df =train_df.drop("timestamp",axis=1)

train_df.columns =["UserId", "ItemId", "Count"]

display(train_df)


Unnamed: 0,UserId,ItemId,Count
0,2442,1,1
1,662870,1,1
2,525356,1,1
3,1910569,1,1
4,1434636,1,1
...,...,...,...
6237311,2497891,17769,1
6237312,439703,17769,1
6237313,685481,17769,1
6237314,77664,17769,1


In [10]:
modelPF = PoisMF(reindex=True, method="tncg", use_float=True,
                    early_stop=False, reuse_prev=True,
                    k=20, niter=10, maxupd=200, l2_reg=1e3)\
                .fit(train_df)
# use the best gradient method with default values
# have k = 20 the same as the original experiement


In [11]:
display(modelPF.A.shape) # user
display(modelPF.B.shape) # items


(139785, 20)

(5331, 20)

In [12]:
pred = np.matmul(modelPF.A,modelPF.B.T)

In [13]:
display(pred)

array([[0.00090841, 0.00026393, 0.00426085, ..., 0.00187193, 0.00055657,
        0.02486422],
       [0.00122777, 0.00024778, 0.00500965, ..., 0.00059249, 0.00044039,
        0.01351441],
       [0.00941936, 0.00214705, 0.04440563, ..., 0.00622868, 0.00297121,
        0.09830831],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00218122],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00218122],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.00218121]], dtype=float32)

In [14]:
display(pred.shape)

(139785, 5331)

Delete unused variables to free up space

In [15]:
del modelPF
del train_df

In [16]:
predv = torch.from_numpy(pred)
#https://medium.com/@heyamit10/converting-a-numpy-array-to-a-tensor-step-by-step-df329c44b035

In [17]:
del pred

In [18]:
marData = predv.detach().numpy()
marDF = pd.DataFrame(marData)
marDF.index.name = "user"
marDF.columns.name = "items"

# marDF = marDF.stack().reset_index()
# marDF.columns = ['user', 'item', 'score']
# print("MAR",calculate_metrics(marDF,user_positives))


In [19]:
# Jinming code
def calculate_metrics_matrix(pred_df, user_positives):
    metrics = {
        'MAR@ALL': [],
        'MAR@1000': [],
    }

    user_ids = pred_df.index.values  # Get actual user identifiers

    for user in tqdm.tqdm(user_ids, desc="Processing Users"):
        positives = user_positives.get(user, set())

        if len(positives) == 0:
            continue

        data = pred_df.loc[user]
        # display(data, positives)

        # this is rank
        rank = data.index.values
        # this is pred score
        score = data.values

        sorted_index = np.argsort(score)[::-1] 
        sorted_items = rank[sorted_index] 

        relevant_ranks1k = []
        relevant_ranks_all=[]

        for rank, item in enumerate(sorted_items[:1000], start=1):
            if item in positives:
                relevant_ranks1k.append(rank)
        #calculate relevant
        for rank, item in enumerate(sorted_items, start=1):
            if item in positives:
                relevant_ranks_all.append(rank)



        if relevant_ranks_all:
            metrics['MAR@ALL'].append(np.mean(relevant_ranks_all))
        if relevant_ranks1k:
            metrics['MAR@1000'].append(np.mean(relevant_ranks1k))



    final = {
        'MAR@ALL': np.mean(metrics['MAR@ALL']),
        'MAR@1000': np.mean(metrics['MAR@1000']),
    }

    return final

In [20]:
calculate_metrics_matrix(marDF,user_positives)

Processing Users: 100%|██████████| 139785/139785 [00:10<00:00, 13574.12it/s]


{'MAR@ALL': np.float64(2659.4111151670113),
 'MAR@1000': np.float64(474.9189729210325)}

In [21]:
del marDF
del user_positives

In [22]:
true = torch.tensor(test_ratings_matrix.values,dtype=torch.float16)

In [23]:
del test_ratings_matrix

In [24]:
display(true)
display(predv)

tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]], dtype=torch.float16)

tensor([[0.0009, 0.0003, 0.0043,  ..., 0.0019, 0.0006, 0.0249],
        [0.0012, 0.0002, 0.0050,  ..., 0.0006, 0.0004, 0.0135],
        [0.0094, 0.0021, 0.0444,  ..., 0.0062, 0.0030, 0.0983],
        ...,
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0022],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0022],
        [0.0000, 0.0000, 0.0000,  ..., 0.0000, 0.0000, 0.0022]])

In [25]:
true = true.nan_to_num(0)
predv = predv.nan_to_num(0)

In [26]:
display(true.shape)
display(predv.shape)


torch.Size([139785, 5331])

torch.Size([139785, 5331])

In [27]:
true[true<0]=0
true[true>1]=1
# daata needs to be binarized for evaluation

In [28]:
def metricStuff(pred,true):
    print("NDCG@10",normalized_dcg(pred, true ,k=10))
    print("MRR@1000",mean_reciprocal_rank(pred, true,k=1000))
    print("P@50",precision(pred, true,k=50))
    print("R@50",recall(pred, true,k=50))
    # code breaks when trying to compute metrics that are not @k
    # I.e. not enough memory too compute and not enough storage space on hard drive to write to
    print("MRR@all",mean_reciprocal_rank(pred, true))
    print("MAP",mean_average_precision(pred, true))
    

In [29]:
metricStuff(predv,true)

NDCG@10 tensor(0.1308)
MRR@1000 tensor(0.2661)
P@50 tensor(0.0769)
R@50 tensor(0.2927)
MRR@all tensor(0.2661)
MAP tensor(0.0959)
