In [4]:
!pip install numpy pandas scikit-learn tqdm scipy



In [93]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import scipy as sp

In [7]:
books = pd.read_csv('/kaggle/input/books-csv/Books.csv', delimiter=';', low_memory=False)
ratings = pd.read_csv('/kaggle/input/books-csv/Ratings.csv', delimiter=';')
users = pd.read_csv('/kaggle/input/books-csv/Users.csv', delimiter=';')

  users = pd.read_csv('/kaggle/input/books-csv/Users.csv', delimiter=';')


In [8]:
df_prep_step_1 = pd.merge(books, ratings.query("`Rating` > 0"), on='ISBN', how='inner')

# join users data
df_prep_step_2 = pd.merge(df_prep_step_1, users, on='User-ID', how='inner')

In [9]:
df_prep = df_prep_step_2.drop(['Publisher'], axis=1)
df_isbn = df_prep.drop_duplicates()

In [10]:
df_isbn['Original_NaN'] = df_isbn['Age'].isna()

df_isbn['Age'] = pd.to_numeric(df_isbn['Age'], errors='coerce')

df_isbn = df_isbn[~(users['Age'].isna() & ~df_isbn['Original_NaN'])]

df_isbn.drop(columns=['Original_NaN'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_isbn['Original_NaN'] = df_isbn['Age'].isna()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_isbn['Age'] = pd.to_numeric(df_isbn['Age'], errors='coerce')
  df_isbn = df_isbn[~(users['Age'].isna() & ~df_isbn['Original_NaN'])]


In [11]:
df_isbn['User-ID'] = pd.to_numeric(df_isbn['User-ID'], errors='coerce')

df_isbn = df_isbn[~(df_isbn['User-ID'].isna())]

In [12]:
age_outliers = df_isbn.query("Age > 100 or Age < 6")

user_outliers = age_outliers["User-ID"].to_list()

In [13]:
# exclude age outliers
df_isbn = df_isbn[~df_isbn["User-ID"].isin(user_outliers)]

In [14]:
y = df_isbn['Rating']
X = df_isbn.drop('Rating', axis=1)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=True)

In [86]:
import heapq

In [104]:
class ContentRecommender():
    def __init__(self):
        super().__init__()
        self.tfidf_matrix = None
        self.isbn_to_idx = None  
        self.tfidf = None
        self.books = None

    def fit(self, books):
        books['Author'] = books['Author'].fillna('missing')
        books['description'] = books['Title'] + ' ' + books['Author']
        self.tfidf = TfidfVectorizer(stop_words='english')
        self.tfidf_matrix = self.tfidf.fit_transform(books['description'])
        self.isbn_to_idx = {isbn: i for i, isbn in enumerate(books['ISBN'])}
        self.books = books

    def predict(self, user_ratings, num_recommendations=5):
        user_predictions = {}
        for user_id, group in tqdm(user_ratings.groupby('User-ID'), desc='Generating recommendations'):
            user_indices = [self.isbn_to_idx.get(isbn) for isbn in group['ISBN'] if isbn in self.isbn_to_idx]
            if not user_indices:
                continue 

            average_vector = self.tfidf_matrix[user_indices].mean(axis=0)

            if isinstance(average_vector, sp.sparse.csr_matrix):
                average_vector = average_vector.toarray()

            average_vector_np = np.asarray(average_vector).reshape(1, -1)

            user_sim_scores = cosine_similarity(average_vector_np, self.tfidf_matrix)[0]

            top_indices = heapq.nlargest(num_recommendations, range(len(user_sim_scores)),
                                         key=lambda x: user_sim_scores[x] if x not in user_indices else float('-inf'))

            recommended_books = [self.books['ISBN'].iloc[idx] for idx in top_indices]
            user_predictions[user_id] = recommended_books

        return user_predictions
   

In [109]:
recommender = ContentRecommender()
recommender.fit(X_train[['ISBN', 'Title', 'Author', 'Year']])
y_predicted = recommender.predict(X_test, 5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books['Author'] = books['Author'].fillna('missing')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  books['description'] = books['Title'] + ' ' + books['Author']
Generating recommendations: 100%|██████████| 21185/21185 [32:50<00:00, 10.75it/s]


In [144]:
test_ratings = X_test
test_ratings['Rating'] = y_test

In [145]:
test_users = test_ratings['User-ID'].unique()
len(test_users)

21185

In [131]:
filtered_df = test_ratings[test_ratings['User-ID'].isin(y_predicted.keys())]
test_users = filtered_df['User-ID'].unique()
len(test_users)

16538

In [179]:
from src.evaluation import average_precision

rating_mask = test_ratings['Rating'] >= 5
mean_ap = 0
count_users_with_relevant_items = 0  

for user_id in tqdm(test_users):
    user_mask = test_ratings['User-ID'] == user_id
    relevant_items = test_ratings.loc[user_mask & rating_mask].sort_values(by='Rating', ascending=False)[
        'ISBN'
    ]
    if len(relevant_items) == 0:
        continue

    predicted_isbns = y_predicted.get(user_id, [])
    if len(predicted_isbns) == 0:
        continue  # Skip users with no predictions

    ap = average_precision(predicted_isbns, relevant_items.tolist(), 5)
    mean_ap += ap
    count_users_with_relevant_items += 1


if count_users_with_relevant_items > 0:
    mean_ap /= count_users_with_relevant_items
else:
    mean_ap = 0  
mean_ap

100%|██████████| 21185/21185 [00:18<00:00, 1138.40it/s]


0.5186530349313265