In [None]:
import numpy as np, pandas as pd
from poismf import PoisMF
from pandas.api.types import CategoricalDtype
from sklearn.model_selection import train_test_split
import tqdm
from recsys_metrics import *
import torch
# https://github.com/zuoxingdong/recsys_metrics?tab=readme-ov-file#Citation


The raw data file contains a few rows with characters that pandas cannot parse, so those rows are skipped when the file is loaded below.

In [None]:
# code from https://github.com/eifuentes/lastfm-dataset-1K/blob/master/preprocessing.ipynb
# Load the tab-separated listening log. The file is read without a header row,
# so column names are supplied explicitly. The listed line numbers are skipped;
# they are presumably 1-based, hence the "- 1" to get pandas' 0-based offsets.
df = pd.read_csv(
    "userid-timestamp-artid-artname-traid-traname.tsv",
    sep='\t',
    header=None,
    names=['user_id', 'timestamp', 'artist_id', 'artist_name', 'track_id', 'track_name'],
    skiprows=[
        line_no - 1
        for line_no in (
            2120260, 2446318, 11141081,
            11152099, 11152402, 11882087,
            12902539, 12935044, 17589539,
        )
    ],
)


In [None]:
# Preview the raw listening log exactly as it was loaded.
display(df)

Unnamed: 0,user_id,timestamp,artist_id,artist_name,track_id,track_name
0,user_000001,2009-05-04T23:08:57Z,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,user_000001,2009-05-04T13:54:10Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Composition 0919 (Live_2009_4_15)
2,user_000001,2009-05-04T13:52:04Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc2 (Live_2009_4_15)
3,user_000001,2009-05-04T13:42:52Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Hibari (Live_2009_4_15)
4,user_000001,2009-05-04T13:42:11Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc1 (Live_2009_4_15)
...,...,...,...,...,...,...
19098848,user_001000,2008-01-27T22:02:35Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,a490cabc-1e5c-4807-86c7-740c31a50009,Please Be Patient With Me
19098849,user_001000,2008-01-27T21:56:52Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,3e92e447-9e1f-440d-bc00-6734469880c5,Shake It Off
19098850,user_001000,2008-01-27T21:52:36Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,93d044e6-1bbb-46a6-ac8e-283382a89e6f,Side With The Seeds
19098851,user_001000,2008-01-27T21:49:12Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,5ac4386f-6146-4389-a762-4b43f362d2c8,Sky Blue Sky


In [None]:
# Keep only rows where user, track and timestamp are all present. The explicit
# .copy() detaches the filtered frame from the one it was sliced from, so the
# column assignments below write to an independent frame instead of a slice
# (avoids SettingWithCopyWarning and ambiguous writes).
df = df.loc[
    df.user_id.notnull() & df.track_id.notnull() & df.timestamp.notnull()
].copy()
# Replace the string identifiers with their integer category codes.
df['user_id'] = pd.Categorical(df.user_id).codes
df['track_id'] = pd.Categorical(df.track_id).codes

In [None]:
# Drop the descriptive columns, overwrite "timestamp" with a constant 1 (it is
# renamed to a per-row listen count later on), and remove any remaining rows
# that contain missing values.
df = (
    df
    .drop(columns=["track_name", "artist_name", "artist_id"])
    .assign(timestamp=1)
    .dropna()
)
display(df)

Unnamed: 0,user_id,timestamp,track_id
10,0,1,929335
12,0,1,267920
14,0,1,825810
15,0,1,195630
16,0,1,41330
...,...,...,...
19098848,991,1,616930
19098849,991,1,235234
19098850,991,1,554216
19098851,991,1,340123


In [None]:
def swap_columns(df, col1, col2):
    """Return ``df`` with the positions of columns ``col1`` and ``col2`` exchanged.

    The input frame is not modified; the result is obtained by selecting the
    columns in the new order. A ``ValueError`` is raised by ``list.index`` if
    either name is not a column of ``df``.
    """
    order = df.columns.tolist()
    first = order.index(col1)
    second = order.index(col2)
    order[first], order[second] = order[second], order[first]
    return df[order]
#https://www.statology.org/swap-columns-pandas/

In [None]:
# Rename to the UserId / ItemId / Count schema and move Count to the last
# position by swapping it with ItemId.
df = swap_columns(
    df.rename(columns={"user_id": "UserId", 'track_id': 'ItemId', 'timestamp': 'Count'}),
    'Count',
    'ItemId',
)

In [None]:
# Check the renamed frame: columns should now be UserId, ItemId, Count.
display(df)

Unnamed: 0,UserId,ItemId,Count
10,0,929335,1
12,0,267920,1
14,0,825810,1
15,0,195630,1
16,0,41330,1
...,...,...,...
19098848,991,616930,1
19098849,991,235234,1
19098850,991,554216,1
19098851,991,340123,1


In [None]:
# Distinct user codes present in the interaction log.
users = pd.unique(df["UserId"])
#https://www.statology.org/pandas-unique-values-in-column/

In [None]:
# One empty {item: listen count} mapping per user, filled in by the next cell.
userDict = {user: {} for user in users}

In [None]:
# Compress the log: every repeated listen of the same track by the same user
# adds +1 to that (user, item) entry, giving an implicit-feedback count.
# The rows are counted with a single vectorized groupby instead of a Python
# loop over every row of the frame; sort=False keeps the pairs in order of
# first appearance, so each user's dict is filled in the same order as before.
pair_counts = df.groupby(["UserId", "ItemId"], sort=False).size()
for (user, item), n_listens in pair_counts.items():
    userDict[user][item] = userDict[user].get(item, 0) + int(n_listens)

In [None]:
# Flatten the nested {user: {item: count}} mapping into (user, item, count)
# records and build the aggregated interaction table from them.
res = [
    (user, item, rating)
    for user, listened in userDict.items()
    for item, rating in listened.items()
]

data = pd.DataFrame(res, columns=["UserId", "ItemId", "Count"])

In [None]:
# Inspect the aggregated table: one row per (UserId, ItemId) with its listen count.
display(data)

Unnamed: 0,UserId,ItemId,Count
0,0,929335,27
1,0,267920,37
2,0,825810,22
3,0,195630,29
4,0,41330,7
...,...,...,...
3957802,991,946485,1
3957803,991,849399,1
3957804,991,920272,1
3957805,991,902961,1


In [None]:
# Hold out 30% of the (user, item, count) rows for evaluation; the fixed
# random_state makes the split reproducible.
train_df, test_df = train_test_split(data, test_size=0.3, random_state=42)

In [None]:
# Jinming code
# For every user in the held-out split, the set of ItemIds they interacted with.
user_positives = {
    user: set(item_ids)
    for user, item_ids in test_df.groupby('UserId')['ItemId']
}

In [None]:
# https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.unique.html
# Unique users and items seen in the training split, in ascending order; they
# define the row and column order of the evaluation matrix.
users = sorted(train_df['UserId'].unique())
items = sorted(train_df['ItemId'].unique())

# Held-out counts as a user x item table restricted to the ids known from
# training. fill_value only applies to labels added by the reindex, so cells
# that were already missing in the pivot stay NaN (visible in the output).
ratingMatrix = (
    test_df
    .pivot(index='UserId', columns='ItemId', values='Count')
    .reindex(index=users, columns=items, fill_value=0)
)
display(ratingMatrix)

ItemId,0,1,2,3,4,5,6,7,9,10,...,960386,960387,960388,960389,960391,960392,960394,960398,960400,960401
UserId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,,0.0,0.0,0.0,,,,0.0,0.0,...,,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0
1,0.0,,0.0,0.0,0.0,,,,0.0,0.0,...,,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0
2,0.0,,0.0,0.0,0.0,,,,0.0,0.0,...,,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0
3,0.0,,0.0,0.0,0.0,,,,0.0,0.0,...,,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0
4,0.0,,0.0,0.0,0.0,,,,0.0,0.0,...,,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
987,0.0,,0.0,0.0,0.0,,,,0.0,0.0,...,,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0
988,0.0,,0.0,0.0,0.0,,,,0.0,0.0,...,,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0
989,0.0,,0.0,0.0,0.0,,,,0.0,0.0,...,,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0
990,0.0,,0.0,0.0,0.0,,,,0.0,0.0,...,,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0


In [None]:
# Fit the Poisson matrix factorisation on the training interactions.
# method="tncg" is the gradient-based optimiser chosen for this run, and
# k = 20 latent factors matches the original experiment.
modelPF = PoisMF(
    k=20, method="tncg", niter=10, maxupd=200, l2_reg=1e3,
    reindex=True, use_float=True, early_stop=False, reuse_prev=True,
).fit(train_df)

In [None]:
# Shapes of the learned factor matrices: A holds the user factors and
# B the item factors.
display(modelPF.A.shape, modelPF.B.shape)

(991, 20)

(792004, 20)

In [None]:
# With reindex=True the model renumbers users and items internally: row i of
# A belongs to modelPF.user_mapping_[i] and row j of B to
# modelPF.item_mapping_[j], which is not the ascending id order used for the
# ground-truth matrix. Reorder both factor matrices by ascending original id
# so that pred[u, i] lines up with the sorted users/items of ratingMatrix.
user_order = np.argsort(modelPF.user_mapping_)
item_order = np.argsort(modelPF.item_mapping_)
pred = np.matmul(modelPF.A[user_order], modelPF.B[item_order].T)

In [None]:
from recsys_metrics import *
import torch
# https://github.com/zuoxingdong/recsys_metrics?tab=readme-ov-file#Citation

In [None]:
# The score matrix has one row per user and one column per item.
display(pred.shape)

(991, 792004)

In [None]:
# Binarise the held-out counts in place: any count above 1 becomes 1
# (listened / not listened).
ratingMatrix.mask(ratingMatrix > 1, 1, inplace=True)
# Missing cells are set to 0 so the evaluation functions receive a dense matrix.
true = torch.tensor(ratingMatrix.fillna(0).to_numpy())


In [None]:
# Copy the predicted score matrix into a torch tensor for the metric functions.
predv = torch.tensor(pred)


In [None]:
# Show the ground-truth tensor followed by the predicted-score tensor.
display(true, predv)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)

tensor([[6.4610e-02, 1.1285e-02, 1.3041e-01,  ..., 0.0000e+00, 0.0000e+00,
         1.2655e-02],
        [3.4702e-02, 7.7281e-02, 1.1675e-01,  ..., 1.0016e-03, 2.0871e-03,
         0.0000e+00],
        [0.0000e+00, 1.3298e-06, 3.3529e-02,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        ...,
        [0.0000e+00, 1.9530e-04, 3.5691e-04,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [8.1351e-05, 4.6972e-05, 1.1668e-05,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00],
        [5.3148e-05, 3.6211e-05, 0.0000e+00,  ..., 0.0000e+00, 0.0000e+00,
         0.0000e+00]])

In [None]:
# Jinming Code
def calculate_metrics(pred_df, user_postivies, k_list=(10, 50, 100)):
  """Compute the mean average rank (MAR) of held-out items per user.

  Parameters
  ----------
  pred_df : DataFrame with columns 'user', 'item' and 'score'; one row per
    scored (user, item) pair.
  user_postivies : dict mapping a user label (as it appears in
    pred_df['user']) to the collection of relevant item labels (as they
    appear in pred_df['item']).
  k_list : unused; kept only so existing callers keep working.

  Returns
  -------
  dict with
    'MAR@ALL'  - mean over users of the average 1-based rank of their
                 relevant items in the full score-sorted list;
    'MAR@1000' - the same, restricted to relevant items ranked in the
                 top 1000.
  Users without positives are skipped, and users without any hit do not
  contribute to the corresponding average. A metric is NaN when no user
  contributes to it.
  """
  top_n = 1000
  metrics = {
    'MAR@ALL': [],
    'MAR@1000': [],
  }

  grouped = pred_df.groupby('user')

  for user, user_data in tqdm.tqdm(grouped, desc="Processing Users"):
    positives = user_postivies.get(user, set())
    if len(positives) == 0:
      continue

    # Items ordered from highest to lowest score.
    ranked_items = user_data.sort_values(by='score', ascending=False)['item']

    # 1-based ranks of the relevant items, found with one vectorized
    # membership test instead of two Python passes over the ranked list.
    hit_mask = ranked_items.isin(list(positives)).to_numpy()
    relevant_ranks_all = np.flatnonzero(hit_mask) + 1
    # The top-1000 hits are exactly the hits whose rank is at most 1000.
    relevant_ranks1k = relevant_ranks_all[relevant_ranks_all <= top_n]

    if relevant_ranks_all.size:
      metrics['MAR@ALL'].append(relevant_ranks_all.mean())
    if relevant_ranks1k.size:
      metrics['MAR@1000'].append(relevant_ranks1k.mean())

  # np.mean([]) would emit a RuntimeWarning; report NaN explicitly instead.
  final = {
    name: (np.mean(values) if values else float('nan'))
    for name, values in metrics.items()
  }

  return final

In [None]:
def metricStuff(pred,true,user_positive):
    """Print the ranking metrics for a score tensor against a relevance tensor.

    ``pred`` and ``true`` are handed unchanged to the recsys_metrics functions.
    For the MAR figures the scores are reshaped to long form with one
    (user, item, score) row per matrix cell; the user and item labels there
    are the row and column positions of ``pred``, so ``user_positive`` is
    looked up with those positions by ``calculate_metrics``.
    """
    print("NDCG@10",normalized_dcg(pred, true ,k=10))
    print("MRR@1000",mean_reciprocal_rank(pred, true,k=1000))
    print("MRR@all",mean_reciprocal_rank(pred, true))

    # Long-form table of scores for the rank-based MAR computation.
    long_scores = (
        pd.DataFrame(pred.detach().numpy())
        .rename_axis(index="user", columns="items")
        .stack()
        .reset_index()
    )
    long_scores.columns = ['user', 'item', 'score']
    print("MAR",calculate_metrics(long_scores,user_positive))

    print("MAP",mean_average_precision(pred, true))
    print("P@50",precision(pred, true,k=50))
    print("R@50",recall(pred, true,k=50))
    

In [None]:
# The MAR computation labels users and items by their row/column position in
# the score matrix, while user_positives is keyed by the original UserId and
# ItemId values. Translate the held-out positives to matrix positions (the
# order of ratingMatrix's index and columns) so the lookups actually match.
# Held-out items or users that never occur in training have no row/column
# and are left out.
user_position = {user: pos for pos, user in enumerate(ratingMatrix.index)}
item_position = {item: pos for pos, item in enumerate(ratingMatrix.columns)}
positional_positives = {
    user_position[user]: {
        item_position[item] for item in held_out if item in item_position
    }
    for user, held_out in user_positives.items()
    if user in user_position
}
metricStuff(predv,true,positional_positives)

NDCG@10 tensor(0.0012)
MRR@1000 tensor(0.0079)
MRR@all tensor(0.0081)


Processing Users: 100%|██████████| 991/991 [08:20<00:00,  1.98it/s]  


MAR {'MAR@ALL': np.float64(396351.38148028974), 'MAR@1000': np.float64(514.8955094800378)}
MAP tensor(0.0013)
P@50 tensor(0.0013)
R@50 tensor(0.0001)
