In [1]:
import numpy as np, pandas as pd
from poismf import PoisMF
from pandas.api.types import CategoricalDtype
from scipy import sparse
from sklearn.model_selection import train_test_split
import tqdm


In [2]:
# Both splits are header-less, tab-separated files with the same four columns,
# so they are read with one shared call signature.
train_df, test_df = (
    pd.read_csv(
        tsv_path,
        sep="\t",
        names=['user_id', 'movie_id', 'rating', 'timestamp'],
    )
    for tsv_path in ("pf_last_full_train.tsv", "pf_last_full_test.tsv")
)


In [3]:
# Distinct ids seen in each split.
test_users, test_items = (test_df[col].unique() for col in ('user_id', 'movie_id'))
train_users, train_items = (train_df[col].unique() for col in ('user_id', 'movie_id'))

# Only ids that occur in BOTH splits can be trained on and evaluated.
user = list(set(train_users) & set(test_users))
item = list(set(train_items) & set(test_items))


In [4]:
# Number of users and movies shared by the train and test splits.
print(len(user),len(item))

139382 5331


In [5]:
# Keep only rows whose user AND movie both appear in the shared id lists.
train_df = train_df[train_df['user_id'].isin(user) & train_df['movie_id'].isin(item)]
test_df = test_df[test_df['user_id'].isin(user) & test_df['movie_id'].isin(item)]


# The train/test split is sparse enough that a few users/movies with ratings
# ended up in only one of the two splits; those rows are dropped here.
# https://stackoverflow.com/questions/27965295/dropping-rows-from-dataframe-based-on-a-not-in-condition


In [6]:

# Wide user x movie rating tables (NaN where a user has no rating for a movie).
test_ratings_matrix, train_ratings_matrix = (
    split.pivot(index='user_id', columns='movie_id', values='rating')
    for split in (test_df, train_df)
)


In [7]:
# Visual check of both pivots (train first, then test).
display(train_ratings_matrix, test_ratings_matrix)

movie_id,2,4,9,12,13,20,22,33,36,41,...,17738,17740,17745,17746,17753,17755,17756,17758,17767,17769
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,,,,,,,,,,,...,,,,,,,,,,
25,,,,,,,,,,,...,,,,,,,,,,
33,,,,,,,,,,,...,,,,,,,,,,
42,,,,,,,,,,,...,,,,,,,,,,
142,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2649376,,,,,,,,,,,...,,,,,,,,,,
2649388,,,,,,,,,,,...,,,,,,,,,,
2649401,,,,,,,,,,,...,,,,,,,,,,
2649421,,,,,,,,,,,...,,,,,,,,,,


movie_id,2,4,9,12,13,20,22,33,36,41,...,17738,17740,17745,17746,17753,17755,17756,17758,17767,17769
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
8,,,,,,,,,,,...,,,,,,,,,,
25,,,,,,,,,,,...,,,,,,,,,,
33,,,,,,,,,,,...,,,,,,,,,,
42,,,,,,,,,,,...,,,,,,,,,,
142,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2649376,,,,,,,,,,,...,,,,,,,,,,
2649388,,,,,,,,,,,...,,,,,,,,,,
2649401,,,,,,,,,,,...,,,,,,,,,,
2649421,,,,,,,,,,,...,,,,,,,,,,


In [8]:
# Map each test user_id to the set of movie_ids that user has in the test split;
# used later as the ground-truth "relevant items" for the MAR metric.
user_positives = test_df.groupby('user_id')['movie_id'].apply(set).to_dict()


In [9]:
# The train pivot was only displayed above; drop the name so its memory can be reclaimed.
del train_ratings_matrix

In [10]:
# Build the (UserId, ItemId, Count) frame that is passed to PoisMF.fit below.
# Every observed rating becomes an implicit-feedback count of 1.
# This is written as one chain that returns a new frame: `train_df` is a filtered
# slice at this point, so assigning a column into it in place can trigger pandas'
# chained-assignment warning. Columns are renamed by name rather than by position
# so a changed column order cannot silently swap ids.
train_df = (
    train_df
    .drop(columns="timestamp")
    .assign(rating=1)
    .rename(columns={"user_id": "UserId", "movie_id": "ItemId", "rating": "Count"})
)

display(train_df)


Unnamed: 0,UserId,ItemId,Count
0,65932,2,1
1,494639,2,1
2,1288603,2,1
3,1312846,2,1
4,349407,2,1
...,...,...,...
6143939,2549935,17769,1
6143940,1331645,17769,1
6143941,2198837,17769,1
6143942,2397206,17769,1


In [11]:
# Fit the Poisson factorization model on the implicit-feedback counts.
# k=20 is the number of latent factors (A and B each have 20 columns below);
# the remaining arguments are optimiser/regularisation settings -- see the
# poismf documentation for their exact meaning.
modelPF = (
    PoisMF(
        reindex=True,
        method="tncg",
        use_float=True,
        early_stop=False,
        reuse_prev=True,
        k=20,
        niter=10,
        maxupd=200,
        l2_reg=1e3,
    )
    .fit(train_df)
)

In [12]:
# Factor matrix shapes: A is per-user, B is per-item.
display(
    modelPF.A.shape,  # user
    modelPF.B.shape,  # items
)


(139382, 20)

(5331, 20)

In [13]:
# PoisMF(reindex=True) renumbers users/items internally: row i of A (B) belongs to
# modelPF.user_mapping_[i] (modelPF.item_mapping_[i]), not to the i-th sorted id.
# The ground-truth tensor is later taken from test_ratings_matrix.values, whose
# rows/columns follow the pivot's own id order. Reorder the factors to that order
# so `pred` lines up element-wise with it.
user_rows = pd.Index(modelPF.user_mapping_).get_indexer(test_ratings_matrix.index)
item_rows = pd.Index(modelPF.item_mapping_).get_indexer(test_ratings_matrix.columns)
if (user_rows < 0).any() or (item_rows < 0).any():
    # get_indexer returns -1 for ids the model never saw; scoring them would be meaningless.
    raise ValueError("test set contains users or items the model was not fitted on")

# Predicted score for every (user, item) pair: user factors times item factors.
pred = np.matmul(modelPF.A[user_rows], modelPF.B[item_rows].T)

In [14]:
# Peek at the dense score matrix (NumPy elides the middle of large arrays).
display(pred)

array([[2.0023547e-04, 2.6078155e-04, 2.6087293e-06, ..., 0.0000000e+00,
        3.0493089e-05, 8.6009437e-03],
       [1.5549948e-04, 2.0251854e-04, 3.2368425e-05, ..., 0.0000000e+00,
        3.6609330e-04, 1.0261791e-02],
       [1.2474683e-04, 1.6246708e-04, 1.1077263e-04, ..., 4.3360745e-03,
        2.3886221e-04, 6.6401260e-03],
       ...,
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        3.5808938e-05, 5.2674295e-04],
       [0.0000000e+00, 0.0000000e+00, 0.0000000e+00, ..., 0.0000000e+00,
        3.5817142e-05, 5.2686362e-04],
       [1.0540089e-04, 1.3727142e-04, 0.0000000e+00, ..., 0.0000000e+00,
        0.0000000e+00, 4.2912965e-03]], dtype=float32)

In [15]:
from recsys_metrics import *
import torch
# https://github.com/zuoxingdong/recsys_metrics?tab=readme-ov-file#Citation

In [16]:
# Sanity check: one row per user, one column per item.
display(pred.shape)

(139382, 5331)

Delete unused variables to free up space

In [17]:
# Neither the fitted model nor the long-format frames are referenced again below.
del modelPF, train_df, test_df


In [18]:
# Wrap the NumPy score matrix as a torch tensor for the torch-based metric functions.
predv = torch.from_numpy(pred)
#https://medium.com/@heyamit10/converting-a-numpy-array-to-a-tensor-step-by-step-df329c44b035

In [19]:
# From here on the scores are accessed through `predv` only.
del pred

In [20]:
# Score matrix as a DataFrame (rows = users, columns = items) for the MAR computation.
# NOTE(review): pd.DataFrame(marData) labels rows and columns 0..n-1, whereas
# `user_positives` (built from test_df) is keyed by raw user_id and holds raw
# movie_id values -- confirm the labels line up before trusting the MAR numbers.
marData = predv.detach().numpy()
marDF = pd.DataFrame(marData).rename_axis(index="user", columns="items")

# Earlier long-format variant, kept for reference:
# marDF = marDF.stack().reset_index()
# marDF.columns = ['user', 'item', 'score']
# print("MAR",calculate_metrics(marDF,user_positives))


In [21]:
def calculate_metrics_matrix(pred_df, user_positives, k=1000):
    """Mean average rank (MAR) of each user's relevant items.

    For every user (row label of ``pred_df``) that has at least one relevant
    item, the items (column labels) are ranked by descending score and the
    1-based ranks of the relevant items are averaged. The per-user averages
    are then averaged over users.

    Parameters
    ----------
    pred_df : pandas.DataFrame
        Score matrix; the index holds user identifiers and the columns hold
        item identifiers. Labels must be the same kind of ids used in
        ``user_positives``.
    user_positives : dict
        Maps a user identifier to the set of that user's relevant item ids.
        Users missing from the dict, or with an empty set, are skipped.
    k : int, default 1000
        Cutoff for the truncated metric; only relevant items ranked within
        the top ``k`` contribute to ``MAR@{k}``.

    Returns
    -------
    dict
        ``{'MAR@ALL': ..., f'MAR@{k}': ...}``. A value is NaN when no user
        contributed to that metric.
    """
    cutoff_key = f'MAR@{k}'
    per_user = {'MAR@ALL': [], cutoff_key: []}

    for user in tqdm.tqdm(pred_df.index.values, desc="Processing Users"):
        positives = user_positives.get(user, set())
        if len(positives) == 0:
            continue

        row = pred_df.loc[user]
        item_labels = row.index.values
        scores = row.values

        # Highest score first; position 0 is rank 1.
        ranked_items = item_labels[np.argsort(scores)[::-1]]

        # One vectorised membership test instead of a Python loop over every item.
        hit_ranks = np.flatnonzero(np.isin(ranked_items, list(positives))) + 1
        if hit_ranks.size == 0:
            continue
        per_user['MAR@ALL'].append(hit_ranks.mean())

        # The top-k hits are exactly the hits whose rank does not exceed k.
        top_k_hits = hit_ranks[hit_ranks <= k]
        if top_k_hits.size:
            per_user[cutoff_key].append(top_k_hits.mean())

    # np.mean([]) would emit a RuntimeWarning; return NaN explicitly instead.
    return {
        name: (np.mean(values) if values else np.float64('nan'))
        for name, values in per_user.items()
    }

In [22]:
# Compute MAR over all items and within the top 1000; the returned dict is the cell output.
calculate_metrics_matrix(marDF,user_positives)

Processing Users: 100%|██████████| 139382/139382 [00:08<00:00, 16323.99it/s]


{'MAR@ALL': np.float64(2653.1656732671295),
 'MAR@1000': np.float64(548.1876323819423)}

In [23]:
# MAR is done; release the DataFrame view of the scores and the positives lookup.
del marDF, user_positives

In [24]:
# Ground-truth ratings as a half-precision tensor (NaN where a user has no test rating).
true = torch.tensor(test_ratings_matrix.to_numpy(), dtype=torch.float16)

In [25]:
# The pivot has been copied into `true`; drop the DataFrame.
del test_ratings_matrix

In [26]:
# Inspect targets and scores side by side before cleaning.
display(true, predv)

tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]], dtype=torch.float16)

tensor([[2.0024e-04, 2.6078e-04, 2.6087e-06,  ..., 0.0000e+00, 3.0493e-05,
         8.6009e-03],
        [1.5550e-04, 2.0252e-04, 3.2368e-05,  ..., 0.0000e+00, 3.6609e-04,
         1.0262e-02],
        [1.2475e-04, 1.6247e-04, 1.1077e-04,  ..., 4.3361e-03, 2.3886e-04,
         6.6401e-03],
        ...,
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 3.5809e-05,
         5.2674e-04],
        [0.0000e+00, 0.0000e+00, 0.0000e+00,  ..., 0.0000e+00, 3.5817e-05,
         5.2686e-04],
        [1.0540e-04, 1.3727e-04, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         4.2913e-03]])

In [27]:
# Missing entries count as "not relevant" / "zero score".
true = torch.nan_to_num(true, nan=0)
predv = torch.nan_to_num(predv, nan=0)

In [28]:
# Targets and predictions must have identical shapes for the metric functions.
display(true.shape, predv.shape)


torch.Size([139382, 5331])

torch.Size([139382, 5331])

In [29]:
# Binarise relevance in place: anything below 0 becomes 0, any rating above 1 becomes 1.
true = true.clamp_(min=0, max=1)


In [30]:
def metricStuff(pred,true):
    """Print ranking metrics for score matrix ``pred`` against relevance matrix ``true``.

    Reports, in order: NDCG@10, MRR@1000, P@50, R@50, then the un-truncated
    MRR and MAP. Nothing is returned; each metric is written to stdout as
    ``<label> <value>``.
    """
    def emit(label, value):
        # Same "<label> <value>" layout for every line.
        print(label, value)

    inputs = (pred, true)

    emit("NDCG@10", normalized_dcg(*inputs, k=10))
    emit("MRR@1000", mean_reciprocal_rank(*inputs, k=1000))
    emit("P@50", precision(*inputs, k=50))
    emit("R@50", recall(*inputs, k=50))
    # The metrics below are not truncated at k. They have been seen to fail on
    # this data size: not enough memory to compute, and not enough disk space
    # to spill to.
    emit("MRR@all", mean_reciprocal_rank(*inputs))
    emit("MAP", mean_average_precision(*inputs))
    

In [31]:
# Print the ranking metrics for the model scores against the binarised test ratings.
metricStuff(predv,true)

NDCG@10 tensor(0.1198)
MRR@1000 tensor(0.2439)
P@50 tensor(0.0766)
R@50 tensor(0.2906)
MRR@all tensor(0.2439)
MAP tensor(0.0888)
