In [18]:
import numpy as np, pandas as pd
from scipy.sparse import coo_matrix
from implicit.bpr import BayesianPersonalizedRanking
from pandas.api.types import CategoricalDtype
from scipy import sparse
from sklearn.model_selection import train_test_split
import tqdm
from recsys_metrics import *
import torch
# https://github.com/zuoxingdong/recsys_metrics?tab=readme-ov-file#Citation

In [2]:
train_df = pd.read_csv("train.tsv",sep="\t",    names=[
        'user_id', 'movie_id', 'rating', 'timestamp'
    ],)
test_df = pd.read_csv("test.tsv",sep="\t",    names=[
        'user_id', 'movie_id', 'rating', 'timestamp'
    ],)



In [3]:
test_users = test_df['user_id'].unique()
test_items = test_df['movie_id'].unique()


train_users = train_df['user_id'].unique()
train_items = train_df['movie_id'].unique()


user = list(set(train_users) & set(test_users))
item = list(set(train_items) & set(test_items))

In [4]:
train_df = train_df[train_df['user_id'].isin(test_users)]
train_df = train_df[train_df['movie_id'].isin(test_items)]

test_df = test_df[test_df['user_id'].isin(user)]
test_df = test_df[test_df['movie_id'].isin(item)]


# some how the test and train split was sparse enough i guess that some of the few rows/ columns
# that had value were removed due to the split
#https://stackoverflow.com/questions/27965295/dropping-rows-from-dataframe-based-on-a-not-in-condition


In [5]:
user_positives = test_df.groupby('user_id')['movie_id'].apply(set).to_dict()


In [6]:

test_ratings_matrix = test_df.pivot(index='user_id', columns='movie_id', values='rating')
train_ratings_matrix = train_df.pivot(index='user_id', columns='movie_id', values='rating')


In [7]:
display(test_ratings_matrix.shape)
display(train_ratings_matrix.shape)

(139785, 5331)

(139785, 5331)

In [8]:
del test_df
del train_ratings_matrix

In [9]:

shape = (len(user), len(item))
# print(shape)

# Create indices for users and movies
user_cat = CategoricalDtype(categories=sorted(user), ordered=True)
movie_cat = CategoricalDtype(categories=sorted(item), ordered=True)
user_index = train_df["user_id"].astype(user_cat).cat.codes
movie_index = train_df["movie_id"].astype(movie_cat).cat.codes

# Conversion via COO matrix
coo = sparse.coo_matrix((train_df["rating"], (user_index, movie_index)), shape=shape)
csr = coo.tocsr()
#https://hippocampus-garden.com/pandas_sparse/

In [10]:
display(csr.shape)

(139785, 5331)

In [11]:
display(test_ratings_matrix.shape)

(139785, 5331)

In [12]:
bpr = BayesianPersonalizedRanking(factors=19, learning_rate=0.001,
                                  regularization=0.001, dtype=np.float64,
                                  iterations=100)

In [13]:
bpr.fit(csr)


  0%|          | 0/100 [00:00<?, ?it/s]

In [14]:
# reduce data type as we probably dont need that high of percision in calculation
user_factors = bpr.user_factors.astype(np.float16)
item_factors = bpr.item_factors.astype(np.float16)
# https://numpy.org/doc/stable/reference/generated/numpy.ndarray.astype.html

pred = np.matmul(user_factors, item_factors.T)


In [15]:
display(pred)

array([[ 0.0843 ,  0.0911 , -0.8716 , ...,  0.3374 ,  0.11237,  1.792  ],
       [ 0.0288 ,  0.0636 , -0.5713 , ...,  0.1048 ,  0.04755,  0.2057 ],
       [ 0.0103 ,  0.05405, -0.718  , ...,  0.1099 ,  0.0637 ,  0.2988 ],
       ...,
       [ 0.03026,  0.0398 , -0.4956 , ...,  0.1401 ,  0.0354 , -0.01949],
       [ 0.03677,  0.071  , -0.7856 , ...,  0.23   ,  0.07947,  0.95   ],
       [-0.01095,  0.02676, -0.4424 , ...,  0.1039 , -0.00436, -0.522  ]],
      dtype=float16)

Delete unused variables to free up space

In [16]:
del user
del item
del user_cat
del movie_cat
del user_index
del movie_index
del coo
del csr
del shape
del bpr
del train_df
del user_factors
del item_factors

In [19]:
predv = torch.from_numpy(pred)
#https://medium.com/@heyamit10/converting-a-numpy-array-to-a-tensor-step-by-step-df329c44b035

In [20]:
del pred

In [21]:
marData = predv.detach().numpy()
marDF = pd.DataFrame(marData)
marDF.index.name = "user"
marDF.columns.name = "items"

# marDF = marDF.stack().reset_index()
# marDF.columns = ['user', 'item', 'score']
# print("MAR",calculate_metrics(marDF,user_positives))


In [22]:
# Jinming code
def calculate_metrics_matrix(pred_df, user_positives):
    metrics = {
        'MAR@ALL': [],
        'MAR@1000': [],
    }

    user_ids = pred_df.index.values  # Get actual user identifiers

    for user in tqdm.tqdm(user_ids, desc="Processing Users"):
        positives = user_positives.get(user, set())

        if len(positives) == 0:
            continue

        data = pred_df.loc[user]
        # display(data, positives)

        # this is rank
        rank = data.index.values
        # this is pred score
        score = data.values

        sorted_index = np.argsort(score)[::-1] 
        sorted_items = rank[sorted_index] 

        relevant_ranks1k = []
        relevant_ranks_all=[]

        for rank, item in enumerate(sorted_items[:1000], start=1):
            if item in positives:
                relevant_ranks1k.append(rank)
        #calculate relevant
        for rank, item in enumerate(sorted_items, start=1):
            if item in positives:
                relevant_ranks_all.append(rank)



        if relevant_ranks_all:
            metrics['MAR@ALL'].append(np.mean(relevant_ranks_all))
        if relevant_ranks1k:
            metrics['MAR@1000'].append(np.mean(relevant_ranks1k))



    final = {
        'MAR@ALL': np.mean(metrics['MAR@ALL']),
        'MAR@1000': np.mean(metrics['MAR@1000']),
    }

    return final

In [23]:
calculate_metrics_matrix(marDF,user_positives)

Processing Users: 100%|██████████| 139785/139785 [00:13<00:00, 10180.79it/s]


{'MAR@ALL': np.float64(2640.8565514762868),
 'MAR@1000': np.float64(457.72093085348246)}

In [24]:
del marDF
del user_positives

In [25]:
true = torch.tensor(test_ratings_matrix.values,dtype=torch.float16)

In [26]:
del test_ratings_matrix

In [27]:
display(true)
display(predv)

tensor([[nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        ...,
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan],
        [nan, nan, nan,  ..., nan, nan, nan]], dtype=torch.float16)

tensor([[ 0.0843,  0.0911, -0.8716,  ...,  0.3374,  0.1124,  1.7920],
        [ 0.0288,  0.0636, -0.5713,  ...,  0.1048,  0.0475,  0.2057],
        [ 0.0103,  0.0540, -0.7178,  ...,  0.1099,  0.0637,  0.2988],
        ...,
        [ 0.0303,  0.0398, -0.4956,  ...,  0.1401,  0.0354, -0.0195],
        [ 0.0368,  0.0710, -0.7856,  ...,  0.2300,  0.0795,  0.9502],
        [-0.0109,  0.0268, -0.4424,  ...,  0.1039, -0.0044, -0.5220]],
       dtype=torch.float16)

In [28]:
true = true.nan_to_num(0)
predv = predv.nan_to_num(0)

In [29]:
display(true.shape)
display(predv.shape)


torch.Size([139785, 5331])

torch.Size([139785, 5331])

In [30]:
true[true<0]=0
true[true>1]=1
# binarized the data for the evaluation


In [31]:
def metricStuff(pred,true):
    print("NDCG@10",normalized_dcg(pred, true ,k=10))
    print("MRR@1000",mean_reciprocal_rank(pred, true,k=1000))
    print("P@50",precision(pred, true,k=50))
    print("R@50",recall(pred, true,k=50))
    # code breaks when trying to compute metrics that are not @k
    # I.e. not enough memory too compute and not enough storage space on hard drive to write to
    print("MRR@all",mean_reciprocal_rank(pred, true))
    print("MAP",mean_average_precision(pred, true))
    

In [32]:
metricStuff(predv,true)

NDCG@10 tensor(0.1478)
MRR@1000 tensor(0.2686)
P@50 tensor(0.0864)
R@50 tensor(0.3715)
MRR@all tensor(0.2686)
MAP tensor(0.1124)
