In [None]:
import numpy as np, pandas as pd
from scipy.sparse import coo_matrix
from implicit.bpr import BayesianPersonalizedRanking
from pandas.api.types import CategoricalDtype
from scipy import sparse
from sklearn.model_selection import train_test_split
import tqdm
from recsys_metrics import *
import torch
# https://github.com/zuoxingdong/recsys_metrics?tab=readme-ov-file#Citation

In [2]:
# code from https://github.com/eifuentes/lastfm-dataset-1K/blob/master/preprocessing.ipynb
df = pd.read_csv(
    "userid-timestamp-artid-artname-traid-traname.tsv", sep='\t', header=None,
    names=[
        'user_id', 'timestamp', 'artist_id', 'artist_name', 'track_id', 'track_name'
    ],
    skiprows=[
        2120260-1, 2446318-1, 11141081-1,
        11152099-1, 11152402-1, 11882087-1,
        12902539-1, 12935044-1, 17589539-1
    ]
)


In [3]:
display(df)

Unnamed: 0,user_id,timestamp,artist_id,artist_name,track_id,track_name
0,user_000001,2009-05-04T23:08:57Z,f1b1cf71-bd35-4e99-8624-24a6e15f133a,Deep Dish,,Fuck Me Im Famous (Pacha Ibiza)-09-28-2007
1,user_000001,2009-05-04T13:54:10Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Composition 0919 (Live_2009_4_15)
2,user_000001,2009-05-04T13:52:04Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc2 (Live_2009_4_15)
3,user_000001,2009-05-04T13:42:52Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Hibari (Live_2009_4_15)
4,user_000001,2009-05-04T13:42:11Z,a7f7df4a-77d8-4f12-8acd-5c60c93f4de8,坂本龍一,,Mc1 (Live_2009_4_15)
...,...,...,...,...,...,...
19098848,user_001000,2008-01-27T22:02:35Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,a490cabc-1e5c-4807-86c7-740c31a50009,Please Be Patient With Me
19098849,user_001000,2008-01-27T21:56:52Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,3e92e447-9e1f-440d-bc00-6734469880c5,Shake It Off
19098850,user_001000,2008-01-27T21:52:36Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,93d044e6-1bbb-46a6-ac8e-283382a89e6f,Side With The Seeds
19098851,user_001000,2008-01-27T21:49:12Z,9e53f84d-ef44-4c16-9677-5fd4d78cbd7d,Wilco,5ac4386f-6146-4389-a762-4b43f362d2c8,Sky Blue Sky


In [4]:
df = df.drop("track_name", axis=1)
df = df.drop("artist_name", axis=1)
df =df.drop("artist_id",axis=1)
df = df.dropna()
display(df)

Unnamed: 0,user_id,timestamp,track_id
10,user_000001,2009-05-04T13:06:09Z,f7c1f8f8-b935-45ed-8fc8-7def69d92a10
12,user_000001,2009-05-04T12:55:34Z,475d4e50-cebb-4cd0-8cd4-c3df97987962
14,user_000001,2009-05-03T15:48:25Z,dc394163-2b78-4b56-94e4-658597a29ef8
15,user_000001,2009-05-03T15:37:56Z,340d9a0b-9a43-4098-b116-9f79811bd508
16,user_000001,2009-05-03T15:14:53Z,0b04407b-f517-4e00-9e6a-494795efc73e
...,...,...,...
19098848,user_001000,2008-01-27T22:02:35Z,a490cabc-1e5c-4807-86c7-740c31a50009
19098849,user_001000,2008-01-27T21:56:52Z,3e92e447-9e1f-440d-bc00-6734469880c5
19098850,user_001000,2008-01-27T21:52:36Z,93d044e6-1bbb-46a6-ac8e-283382a89e6f
19098851,user_001000,2008-01-27T21:49:12Z,5ac4386f-6146-4389-a762-4b43f362d2c8


In [5]:
df = df.loc[(df.user_id.notnull()) & (df.track_id.notnull()) & (df.timestamp.notnull())]
df['user_id'] = pd.Categorical(df.user_id).codes
df['track_id'] = pd.Categorical(df.track_id).codes

In [6]:
df["timestamp"]=1
display(df)

Unnamed: 0,user_id,timestamp,track_id
10,0,1,929335
12,0,1,267920
14,0,1,825810
15,0,1,195630
16,0,1,41330
...,...,...,...
19098848,991,1,616930
19098849,991,1,235234
19098850,991,1,554216
19098851,991,1,340123


In [7]:
def swap_columns(df, col1, col2):
    col_list = list(df.columns)
    x, y = col_list.index(col1), col_list.index(col2)
    col_list[y], col_list[x] = col_list[x], col_list[y]
    df = df[col_list]
    return df
#https://www.statology.org/swap-columns-pandas/

In [8]:
df.rename(columns={'timestamp': 'rating', 'track_id': 'item_id'}, inplace=True)
df = swap_columns(df,'rating','item_id')

In [9]:
display(df)

Unnamed: 0,user_id,item_id,rating
10,0,929335,1
12,0,267920,1
14,0,825810,1
15,0,195630,1
16,0,41330,1
...,...,...,...
19098848,991,616930,1
19098849,991,235234,1
19098850,991,554216,1
19098851,991,340123,1


In [10]:
users =df["user_id"].unique()
#https://www.statology.org/pandas-unique-values-in-column/

In [11]:
userDict ={}
for i in users:
    userDict[i] ={}

In [12]:
for index, row in df.iterrows():
    user = row["user_id"]
    item = row["item_id"]
    rating = row["rating"]
    if item in userDict[user]:
        userDict[user][item]+=1
    else:
        userDict[user][item]=1
# compressed the data such that if a user listen to a song again at another date we just add +1 i.e. implict value 

In [13]:
res = []
for user, items in userDict.items():
    for item, rating in items.items():
        res.append((user, item, rating))

data = pd.DataFrame(res, columns=["user_id", "item_id", "rating"])

In [14]:
display(data)

Unnamed: 0,user_id,item_id,rating
0,0,929335,27
1,0,267920,37
2,0,825810,22
3,0,195630,29
4,0,41330,7
...,...,...,...
3957802,991,946485,1
3957803,991,849399,1
3957804,991,920272,1
3957805,991,902961,1


In [15]:
Total = data['rating'].sum()
print(Total/3957807)


4.279171268331174


In [16]:
train_df, test_df = train_test_split(data, test_size=0.3, random_state=42)


In [17]:
train_df["rating"]=1
test_df["rating"]=1


In [18]:
user_positives = test_df.groupby('user_id')['item_id'].apply(set).to_dict()


In [19]:
users = train_df['user_id'].unique()
items = train_df['item_id'].unique()
users = sorted(users)
items = sorted(items)


ratingMatrix = test_df.pivot(index='user_id', columns='item_id', values='rating')
ratingMatrix = ratingMatrix.reindex(index=users, columns=items, fill_value=0)
display(ratingMatrix)

item_id,0,1,2,3,4,5,6,7,9,10,...,960386,960387,960388,960389,960391,960392,960394,960398,960400,960401
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,,0.0,0.0,0.0,,,,0.0,0.0,...,,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0
1,0.0,,0.0,0.0,0.0,,,,0.0,0.0,...,,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0
2,0.0,,0.0,0.0,0.0,,,,0.0,0.0,...,,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0
3,0.0,,0.0,0.0,0.0,,,,0.0,0.0,...,,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0
4,0.0,,0.0,0.0,0.0,,,,0.0,0.0,...,,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
987,0.0,,0.0,0.0,0.0,,,,0.0,0.0,...,,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0
988,0.0,,0.0,0.0,0.0,,,,0.0,0.0,...,,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0
989,0.0,,0.0,0.0,0.0,,,,0.0,0.0,...,,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0
990,0.0,,0.0,0.0,0.0,,,,0.0,0.0,...,,0.0,0.0,,0.0,0.0,0.0,,0.0,0.0


In [20]:
users = train_df["user_id"].unique()
movies = train_df["item_id"].unique()
shape = (len(users), len(movies))

# Create indices for users and movies
user_cat = CategoricalDtype(categories=sorted(users), ordered=True)
movie_cat = CategoricalDtype(categories=sorted(movies), ordered=True)
user_index = train_df["user_id"].astype(user_cat).cat.codes
movie_index = train_df["item_id"].astype(movie_cat).cat.codes

# Conversion via COO matrix
coo = sparse.coo_matrix((train_df["rating"], (user_index, movie_index)), shape=shape)
csr = coo.tocsr()
#https://hippocampus-garden.com/pandas_sparse/

In [21]:
bpr = BayesianPersonalizedRanking(factors=19, learning_rate=0.001,
                                  regularization=0.001, dtype=np.float64,
                                  iterations=100)

In [22]:
bpr.fit(csr)


  0%|          | 0/100 [00:00<?, ?it/s]

In [23]:
pred = np.matmul(bpr.user_factors,bpr.item_factors.T)

In [25]:
ratingMatrix[ratingMatrix >1] = 1
true = torch.tensor(ratingMatrix.fillna(0).values)

In [26]:
predv = torch.tensor(pred)

In [27]:
display(true)
display(predv)

tensor([[0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        ...,
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.],
        [0., 0., 0.,  ..., 0., 0., 0.]], dtype=torch.float64)

tensor([[ 1.0748e-01,  1.4325e-01,  5.3304e-02,  ...,  4.8125e-04,
         -6.6858e-02,  6.2816e-02],
        [-4.1097e-01, -3.6063e-01, -3.3188e-02,  ..., -6.9342e-02,
         -4.9440e-02, -1.3477e-01],
        [-2.6312e-01,  2.3301e-02,  5.4005e-02,  ..., -7.6244e-02,
         -1.0199e-01,  1.7833e-02],
        ...,
        [-8.0547e-02, -6.4211e-01, -2.2300e-01,  ...,  4.8072e-02,
          3.5078e-01, -3.2304e-01],
        [-2.6900e-01,  6.7991e-02,  6.2687e-02,  ..., -7.0269e-02,
         -1.5158e-01,  3.0630e-02],
        [ 1.3429e-01, -4.9618e-01, -2.0111e-01,  ...,  5.6860e-02,
          4.1281e-01, -2.3964e-01]], dtype=torch.float64)

In [28]:
def calculate_metrics(pred_df, user_postivies, k_list = [10,50,100]):
  metrics = {
    'MAR@ALL': [],
    'MAR@1000': [],
  }


  grouped = pred_df.groupby('user')

  for user, user_data in tqdm.tqdm(grouped, desc="Processing Users"):
    positives = user_postivies.get(user, set())
    if len(positives) == 0:
      continue

    sorted_data = user_data.sort_values(by='score', ascending=False)
    all_items = sorted_data['item'].tolist()

    relevant_ranks1k = []
    relevant_ranks_all=[]
    for rank, item in enumerate(all_items[:1000], start=1):
      if item in positives:
        relevant_ranks1k.append(rank)
    #calculate relevant
    for rank, item in enumerate(all_items, start=1):
      if item in positives:
        relevant_ranks_all.append(rank)



    if relevant_ranks_all:
      metrics['MAR@ALL'].append(np.mean(relevant_ranks_all))
    if relevant_ranks1k:
      metrics['MAR@1000'].append(np.mean(relevant_ranks1k))
      

  
  final = {
    'MAR@ALL': np.mean(metrics['MAR@ALL']),
    'MAR@1000': np.mean(metrics['MAR@1000']),
  }

  return final

In [29]:
def metricStuff(pred,true,user_positive):
    print("NDCG@10",normalized_dcg(pred, true ,k=10))
    print("MRR@1000",mean_reciprocal_rank(pred, true,k=1000))
    print("MRR@all",mean_reciprocal_rank(pred, true))
    marData = pred.detach().numpy()
    marDF = pd.DataFrame(marData)
    marDF.index.name = "user"
    marDF.columns.name = "items"
    marDF = marDF.stack().reset_index()
    marDF.columns = ['user', 'item', 'score']

    print("MAR",calculate_metrics(marDF,user_positive))
    print("MAP",mean_average_precision(pred, true))
    print("P@50",precision(pred, true,k=50))
    print("R@50",recall(pred, true,k=50))
    

In [30]:
metricStuff(predv,true,user_positives) 

NDCG@10 tensor(0.0998)
MRR@1000 tensor(0.2340)
MRR@all tensor(0.2341)


Processing Users: 100%|██████████| 991/991 [06:25<00:00,  2.57it/s]  


MAR {'MAR@ALL': np.float64(394811.64859217714), 'MAR@1000': np.float64(508.6044509127765)}
MAP tensor(0.0243)
P@50 tensor(0.0913)
R@50 tensor(0.0075)
