In [1]:
!pip install numpy pandas scikit-learn tqdm scipy



In [2]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics
import scipy as sp
from concurrent.futures import ThreadPoolExecutor

In [3]:
! pip install -U LibRecommender

Collecting LibRecommender
  Downloading LibRecommender-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (30 kB)
Downloading LibRecommender-1.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.1/2.1 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: LibRecommender
Successfully installed LibRecommender-1.5.1


In [4]:
def abs_loss(gt_ratings, pred_ratings, isbn_to_index):
    loss = 0
    count = 0

    for _, row in gt_ratings.iterrows():
        user_id = row['User-ID']
        isbn = row['ISBN']
        gt_rating = row['Rating']

        item_index = isbn_to_index.get(isbn)
        if item_index is None or user_id not in pred_ratings:
            continue

        pred_rating = pred_ratings[user_id].get(item_index)
        if pred_rating is None:
            continue

        loss += np.abs(gt_rating - pred_rating)
        count += 1

    return loss / count if count > 0 else 0


def l2_loss(gt_ratings, pred_ratings, isbn_to_index):
    loss = 0
    count = 0

    for _, row in gt_ratings.iterrows():
        user_id = row['User-ID']
        isbn = row['ISBN']
        gt_rating = row['Rating']

        item_index = isbn_to_index.get(isbn)
        if item_index is None or user_id not in pred_ratings:
            continue

        pred_rating = pred_ratings[user_id].get(item_index)
        if pred_rating is None:
            continue

        loss += (gt_rating - pred_rating) ** 2
        count += 1

    return np.sqrt(loss / count) if count > 0 else 0

def dcg(relevances, rank=10):
    relevances = np.asarray(relevances)[:rank]
    n_relevances = len(relevances)
    if n_relevances == 0:
        return 0.

    discounts = np.log2(np.arange(n_relevances) + 2)
    return np.sum(relevances / discounts)


def ndcg(relevances, rank=10):
    best_dcg = dcg(sorted(relevances, reverse=True), rank)
    if best_dcg == 0:
        return 0.

    return dcg(relevances, rank) / best_dcg


def precision_at_k(recommended_items, relevant_items, k):
    recommended_at_k = recommended_items[:k]
    relevant_and_recommended = [item for item in recommended_at_k if item in relevant_items]
    return len(relevant_and_recommended) / k


def recall_at_k(recommended_items, relevant_items, k):
    """Calculate recall at k."""
    if relevant_items.size == 0:  # Use size attribute to check if the array is empty
        return 0.0  # Return 0 recall if there are no relevant items

    recommended_at_k = recommended_items[:k]
    relevant_and_recommended = [item for item in recommended_at_k if item in relevant_items]
    return len(relevant_and_recommended) / len(relevant_items)



In [5]:
books = pd.read_csv('/content/Books.csv', delimiter=';', low_memory=False)
ratings = pd.read_csv('/content/Ratings.csv', delimiter=';')
users = pd.read_csv('/content/Users.csv', delimiter=';')

  users = pd.read_csv('/content/Users.csv', delimiter=';')


In [6]:
df_prep_step_1 = pd.merge(books, ratings.query("`Rating` > 0"), on='ISBN', how='inner')

# join users data
df_prep_step_2 = pd.merge(df_prep_step_1, users, on='User-ID', how='inner')

In [7]:
df_prep = df_prep_step_2.drop(['Publisher'], axis=1)
df_isbn = df_prep.drop_duplicates()

In [8]:
df_isbn['Original_NaN'] = df_isbn['Age'].isna()

# Convert 'Age' to numeric, turning non-numeric values into NaN
df_isbn['Age'] = pd.to_numeric(df_isbn['Age'], errors='coerce')

# Drop rows where 'Age' is NaN and were not originally NaN
df_isbn = df_isbn[~(users['Age'].isna() & ~df_isbn['Original_NaN'])]

# Drop the 'Original_NaN' column as it's no longer needed
df_isbn.drop(columns=['Original_NaN'], inplace=True)

  df_isbn = df_isbn[~(users['Age'].isna() & ~df_isbn['Original_NaN'])]
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_isbn.drop(columns=['Original_NaN'], inplace=True)


In [9]:
df_isbn['User-ID'] = pd.to_numeric(df_isbn['User-ID'], errors='coerce')

# Drop rows where 'Age' is NaN and were not originally NaN
df_isbn = df_isbn[~(df_isbn['User-ID'].isna())]

In [10]:
age_outliers = df_isbn.query("Age > 100 or Age < 6")

user_outliers = age_outliers["User-ID"].to_list()

In [11]:
# exclude age outliers
df_isbn = df_isbn[~df_isbn["User-ID"].isin(user_outliers)]

In [12]:
df_isbn.drop_duplicates(inplace=True)

In [13]:
y = df_isbn['Rating']
X = df_isbn.drop('Rating', axis=1)

In [14]:
user_id_mapping = {uid: i for i, uid in enumerate(df_isbn['User-ID'].unique())}

In [15]:
from sklearn.model_selection import train_test_split
df_isbn['User-ID'] = df_isbn['User-ID'].apply(user_id_mapping.get)
train_ratings, test_ratings = train_test_split(df_isbn[['User-ID', 'ISBN', 'Rating']], test_size=0.3, random_state=42)

In [16]:
df_isbn.rename(columns={"User-ID": "user"}, inplace=True)

In [17]:
df_isbn.rename(columns={"ISBN": "item"}, inplace=True)

In [18]:
df_isbn.rename(columns={"Rating": "label"}, inplace=True)

In [19]:
df_isbn[['user', 'item', 'label']]

Unnamed: 0,user,item,label
0,0,0002005018,5.0
1,0,074322678X,5.0
2,0,0887841740,5.0
3,0,1552041778,5.0
4,0,1567407781,6.0
...,...,...,...
98717,16449,9068062506,5.0
98718,16450,0595179355,10.0
98720,16451,3492270174,8.0
98721,16452,0156012073,9.0


In [20]:
from libreco.algorithms import SVDpp
from libreco.data import DataInfo, DatasetPure, split_by_ratio
train_data, test_data = split_by_ratio(df_isbn[['user', 'item', 'label']], test_size=0.2)

train_data, data_info = DatasetPure.build_trainset(train_data)
test_data = DatasetPure.build_testset(test_data)

model = SVDpp(task="rating", data_info=data_info, embed_size=16, n_epochs=20, lr=0.01, reg=None)

model.fit(train_data, verbose=2, neg_sampling=False)

Instructions for updating:
non-resource variables are not supported in the long term


Training start time: [35m2024-08-09 22:10:29[0m


train: 100%|██████████| 481/481 [00:03<00:00, 123.18it/s]


Epoch 1 elapsed: 3.912s
	 [32mtrain_loss: 43.314[0m


train: 100%|██████████| 481/481 [00:02<00:00, 227.16it/s]


Epoch 2 elapsed: 2.127s
	 [32mtrain_loss: 14.9528[0m


train: 100%|██████████| 481/481 [00:03<00:00, 151.30it/s]


Epoch 3 elapsed: 3.194s
	 [32mtrain_loss: 5.4387[0m


train: 100%|██████████| 481/481 [00:02<00:00, 233.37it/s]


Epoch 4 elapsed: 2.071s
	 [32mtrain_loss: 2.3867[0m


train: 100%|██████████| 481/481 [00:01<00:00, 246.98it/s]


Epoch 5 elapsed: 1.956s
	 [32mtrain_loss: 1.8032[0m


train: 100%|██████████| 481/481 [00:01<00:00, 241.61it/s]


Epoch 6 elapsed: 2.001s
	 [32mtrain_loss: 1.9454[0m


train: 100%|██████████| 481/481 [00:02<00:00, 234.66it/s]


Epoch 7 elapsed: 2.060s
	 [32mtrain_loss: 2.3339[0m


train: 100%|██████████| 481/481 [00:02<00:00, 237.27it/s]


Epoch 8 elapsed: 2.038s
	 [32mtrain_loss: 2.6347[0m


train: 100%|██████████| 481/481 [00:03<00:00, 156.25it/s]


Epoch 9 elapsed: 3.095s
	 [32mtrain_loss: 2.6226[0m


train: 100%|██████████| 481/481 [00:02<00:00, 207.31it/s]


Epoch 10 elapsed: 2.334s
	 [32mtrain_loss: 2.5304[0m


train: 100%|██████████| 481/481 [00:02<00:00, 234.44it/s]


Epoch 11 elapsed: 2.058s
	 [32mtrain_loss: 2.3323[0m


train: 100%|██████████| 481/481 [00:01<00:00, 242.05it/s]


Epoch 12 elapsed: 1.995s
	 [32mtrain_loss: 2.2991[0m


train: 100%|██████████| 481/481 [00:02<00:00, 240.45it/s]


Epoch 13 elapsed: 2.008s
	 [32mtrain_loss: 2.2118[0m


train: 100%|██████████| 481/481 [00:02<00:00, 239.86it/s]


Epoch 14 elapsed: 2.012s
	 [32mtrain_loss: 2.1934[0m


train: 100%|██████████| 481/481 [00:02<00:00, 161.16it/s]


Epoch 15 elapsed: 2.997s
	 [32mtrain_loss: 2.0795[0m


train: 100%|██████████| 481/481 [00:02<00:00, 209.67it/s]


Epoch 16 elapsed: 2.307s
	 [32mtrain_loss: 2.0385[0m


train: 100%|██████████| 481/481 [00:01<00:00, 243.97it/s]


Epoch 17 elapsed: 1.981s
	 [32mtrain_loss: 1.9821[0m


train: 100%|██████████| 481/481 [00:01<00:00, 242.83it/s]


Epoch 18 elapsed: 1.988s
	 [32mtrain_loss: 1.9697[0m


train: 100%|██████████| 481/481 [00:01<00:00, 240.80it/s]


Epoch 19 elapsed: 2.004s
	 [32mtrain_loss: 1.9239[0m


train: 100%|██████████| 481/481 [00:02<00:00, 228.62it/s]

Epoch 20 elapsed: 2.114s
	 [32mtrain_loss: 1.9564[0m





In [21]:
isbn_to_index = {isbn: index for index, isbn in enumerate(df_isbn['item'].unique())}
index_to_isbn = {index: isbn for isbn, index in isbn_to_index.items()}

In [22]:
user_indices = test_data.user_indices
item_indices = test_data.item_indices
predictions = {}

In [39]:
for user_id in np.unique(user_indices):
    if not np.isnan(user_id):
        recommendations = model.recommend_user(user=user_id, n_rec=10)

        predicted_isbns = [index_to_isbn[idx] for idx in recommendations]
        predictions[user_id] = predicted_isbns

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1


In [24]:
def average_precision(predicted, relevant, isbn_to_index, k=5):
    if len(relevant) == 0:
        return 0.0

    predicted_indices = [isbn_to_index.get(isbn) for isbn in predicted if isbn in isbn_to_index]

    score = 0.0
    num_hits = 0.0

    for i, p in enumerate(predicted_indices[:k]):
        if p in relevant and p not in predicted_indices[:i]:
            num_hits += 1.0
            score += num_hits / (i + 1.0)

    return score / min(len(relevant), k)


In [47]:
def precision_at_k(predicted_items, relevant_items, isbn_to_index, k=None):
    predicted_indices = [isbn_to_index.get(isbn) for isbn in predicted_items if isbn in isbn_to_index]

    if k is None:
        k = len(relevant_items)  # Set K to the number of relevant items for this user
    recommended_at_k = predicted_indices[:k]
    relevant_and_recommended = [item for item in recommended_at_k if item in relevant_items]
    return len(relevant_and_recommended) / k


In [51]:
def recall_at_k(predicted_items, relevant_items, isbn_to_index, k=None):
    predicted_indices = [isbn_to_index.get(isbn) for isbn in predicted_items if isbn in isbn_to_index]
    if k is None:
        k = len(relevant_items)  # Set K to the number of relevant items for this user

    recommended_at_k = predicted_indices[:k]
    relevant_and_recommended = [item for item in recommended_at_k if item in relevant_items]
    return len(relevant_and_recommended) / len(relevant_items)


In [55]:
mean_ap = 0
total_abs_loss = 0
total_l2_loss = 0
total_precision_at_k = 0
total_recall_at_k = 0
total_ndcg = 0
total_dcg = 0

count_users_with_relevant_items = 0
count = 0

for user_id in np.unique(user_indices):
    relevant_indices = np.where(user_indices == user_id)[0]
    relevant_items = test_data.user_indices[relevant_indices]

    if relevant_items.size == 0:
        continue

    predicted_isbns = predictions.get(user_id, [])

    ap = average_precision(predicted_isbns, relevant_items, isbn_to_index, k=5)
    mean_ap += ap

    precision = precision_at_k(predicted_isbns, relevant_items, isbn_to_index, k=5)
    recall = recall_at_k(predicted_isbns, relevant_items, isbn_to_index, k=5)
    total_precision_at_k += precision
    total_recall_at_k += recall

    predicted_item_indices = [isbn_to_index.get(isbn) for isbn in predicted_isbns if isbn in isbn_to_index]
    relevances = [1 if item in relevant_items else 0 for item in predicted_item_indices]

    dcg_value = dcg(relevances, rank=5)
    ndcg_value = ndcg(relevances, rank=5)

    total_dcg += dcg_value
    total_ndcg += ndcg_value

    count_users_with_relevant_items += 1

mean_ap = mean_ap / count_users_with_relevant_items if count_users_with_relevant_items > 0 else 0
mean_precision_at_k = total_precision_at_k / count_users_with_relevant_items if count_users_with_relevant_items > 0 else 0
mean_recall_at_k = total_recall_at_k / count_users_with_relevant_items if count_users_with_relevant_items > 0 else 0

mean_dcg = total_dcg / count_users_with_relevant_items if count_users_with_relevant_items > 0 else 0
mean_ndcg = total_ndcg / count_users_with_relevant_items if count_users_with_relevant_items > 0 else 0

# Print out the results
print(f"Mean Average Precision: {mean_ap}")
print(f"Mean Precision at K: {mean_precision_at_k}")
print(f"Mean Recall at K: {mean_recall_at_k}")
print(f"Mean DCG: {mean_dcg}")
print(f"Mean NDCG: {mean_ndcg}")

Mean Average Precision: 0.8087370713119216
Mean Precision at K: 0.1999999999999933
Mean Recall at K: 0.8066554267007163
Mean DCG: 1.0
Mean NDCG: 1.0
