In [1]:
# Import data
import pandas as pd

# Load the interactions CSV.
interaction_df = pd.read_csv('goodreads_interactions.csv')

# Keep only rows flagged as read and only the columns used below. One .loc
# call selects rows and columns together instead of chaining two indexers.
interaction_df_clean = interaction_df.loc[
    interaction_df['is_read'] == 1,
    ['user_id', 'book_id', 'rating'],
]

# Number of distinct books per user.
books_per_user = interaction_df_clean.groupby('user_id')['book_id'].nunique()

# 95th percentile of the books-per-user counts.
threshold = books_per_user.quantile(0.95)

# Users at or below the threshold; users above it are dropped.
valid_users = books_per_user[books_per_user <= threshold].index

# Restrict the cleaned interactions to the retained users.
filtered_df = interaction_df_clean[interaction_df_clean['user_id'].isin(valid_users)]

# defaultdict is used by the functions defined in later cells.
from collections import defaultdict

# user_id -> set of book_ids present for that user in filtered_df.
user_items = filtered_df.groupby('user_id')['book_id'].apply(set).to_dict()

In [2]:
# def compute_overlap_similarity(user_items):
#     similarity = defaultdict(dict)
#     users = list(user_items.keys())

#     for i in range(len(users)):
#         for j in range(i + 1, len(users)):
#             u1, u2 = users[i], users[j]
#             books_u1 = user_items[u1]
#             books_u2 = user_items[u2]
            
#             overlap = len(books_u1 & books_u2)
#             if overlap > 0:
#                 similarity[u1][u2] = overlap
#                 similarity[u2][u1] = overlap
                
#     return similarity


def recommend(user_id, user_items, similarity, top_k=5):
    """Suggest books for ``user_id`` based on the books of similar users.

    Every book held by a similar user, and not already held by ``user_id``,
    accumulates that user's similarity score. The books with the highest
    totals are returned.

    Args:
        user_id: User to produce suggestions for.
        user_items: Mapping of user -> collection of books.
        similarity: Mapping of user -> {other_user: score}.
        top_k: Maximum number of books to return.

    Returns:
        List of at most ``top_k`` books, highest total score first. Empty when
        ``user_id`` has no entry in ``similarity``.
    """
    neighbours = similarity.get(user_id, {})
    tally = defaultdict(int)

    for neighbour, weight in neighbours.items():
        for candidate in user_items[neighbour]:
            # Anything the target user already has is not a suggestion.
            if candidate in user_items[user_id]:
                continue
            tally[candidate] += weight

    # Largest total first; sorted() is stable, so equal totals keep the order
    # in which the books were first scored.
    ordered = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    return [pair[0] for pair in ordered[:top_k]]

In [9]:
import logging
# from collections import defaultdict

# Logging setup: INFO level, each record formatted as "time - level - message".
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

def compute_overlap_similarity(user_items, min_overlap=5, log_every=10):
    """Count the books shared by each pair of users and keep the strong pairs.

    Rather than intersecting the book sets of every pair of users, this builds
    an inverted index (book -> users who have it) and only visits pairs that
    share at least one book. The work therefore grows roughly with the number
    of (user, user, shared book) co-occurrences instead of with the square of
    the number of users.

    Args:
        user_items: Mapping of user -> set of books. Values are expected to be
            sets, i.e. each book appears at most once per user.
        min_overlap: Exclusive lower bound. A pair is kept only when it shares
            strictly more than ``min_overlap`` books.
        log_every: A progress line is logged every ``log_every`` users. A
            falsy value (``0`` or ``None``) turns progress logging off.

    Returns:
        ``defaultdict(dict)`` where ``similarity[u][v]`` is the number of books
        shared by ``u`` and ``v``. Each kept pair is stored in both directions;
        users with no kept pair have no entry.
    """
    similarity = defaultdict(dict)
    users = list(user_items.keys())
    total_users = len(users)

    # Inverted index: book -> positions (in `users`) of everyone who has it.
    # Users are visited in order, so every position list is ascending.
    readers_by_book = defaultdict(list)
    for position, user in enumerate(users):
        for book in user_items[user]:
            readers_by_book[book].append(position)

    for i, u1 in enumerate(users):
        if log_every and i % log_every == 0:
            logging.info(f"Processing user {i+1}/{total_users}")

        # shared[j] = number of books u1 has in common with the later user j.
        shared = {}
        for book in user_items[u1]:
            # Walk the ascending list backwards and stop at the first position
            # that is not after u1, so each unordered pair is counted once.
            for j in reversed(readers_by_book[book]):
                if j <= i:
                    break
                shared[j] = shared.get(j, 0) + 1

        if min_overlap < 0:
            # A negative threshold also admits pairs with nothing in common,
            # so every later user has to be considered.
            later_positions = range(i + 1, total_users)
        else:
            # Ascending order reproduces the insertion order of a plain
            # pair-by-pair scan.
            later_positions = sorted(shared)

        for j in later_positions:
            overlap = shared.get(j, 0)
            if overlap > min_overlap:
                u2 = users[j]
                similarity[u1][u2] = overlap
                similarity[u2][u1] = overlap

    logging.info("Similarity computation completed.")
    return similarity

In [10]:
# `user_items` (user_id -> set of book_ids) is built in the data-preparation cell.

# Overlap scores between the users in `user_items`.
similarity = compute_overlap_similarity(user_items)

# Up to five suggestions for one example user.
user_to_recommend = 2
recommendations = recommend(
    user_id=user_to_recommend,
    user_items=user_items,
    similarity=similarity,
    top_k=5,
)
print(recommendations)

2025-06-15 14:36:30,701 - INFO - Processing user 1/794674
2025-06-15 14:36:48,750 - INFO - Processing user 11/794674


KeyboardInterrupt: 