In [2]:
import numpy as np
import pandas as pd
import networkx as nx

In [3]:
# Load the raw ratings table; the CSV uses ';' as its field separator.
data = pd.read_csv('../data/Ratings.csv', sep=';')

In [5]:
# Drop users and books that have very few ratings to reduce noise (optional).
user_threshold = 5
book_threshold = 5

# Both rating counts are taken over the unfiltered table.
user_counts = data['User-ID'].value_counts()
book_counts = data['ISBN'].value_counts()

active_users = user_counts.loc[user_counts >= user_threshold].index
popular_books = book_counts.loc[book_counts >= book_threshold].index

# A row survives only if its user AND its book both meet their threshold.
keep_row = data['User-ID'].isin(active_users) & data['ISBN'].isin(popular_books)
filtered_data = data[keep_row]

# Bipartite graph: users on side 0, books on side 1.
B = nx.Graph()

users = list(filtered_data['User-ID'].unique())
books = list(filtered_data['ISBN'].unique())

B.add_nodes_from(users, bipartite=0)  # user nodes
B.add_nodes_from(books, bipartite=1)  # book nodes

# One (user, book) edge per remaining rating row.
edges = list(zip(filtered_data['User-ID'], filtered_data['ISBN']))
B.add_edges_from(edges)

In [6]:
from tqdm import tqdm

In [7]:
def pagerank(G, alpha=0.85, max_iter=100, tol=1e-06):
    """
    Compute PageRank for each node in the graph by power iteration.

    Each sweep uses only the ranks from the previous sweep. A node's new
    rank is the teleport term (1 - alpha) / N plus alpha times the rank it
    receives: an equal share from every neighbor, plus an equal share of
    the rank held by nodes that have no neighbors at all (so the ranks
    keep summing to 1 instead of leaking mass).

    Rank is gathered along G.neighbors(), so this is written for
    undirected graphs such as the user-book graph built in this notebook.

    Parameters:
    - G: The graph (NetworkX graph); only G.nodes() and G.neighbors() are used
    - alpha: Damping factor, typically 0.85
    - max_iter: Maximum number of iterations
    - tol: Tolerance to check convergence (compared against the summed
      absolute change of all ranks between two sweeps)

    Returns:
    - A dictionary of nodes with PageRank as value (empty for an empty graph)
    """
    nodes = list(G.nodes())
    N = len(nodes)
    if N == 0:
        # Nothing to rank; also avoids dividing by N below.
        return {}

    # The neighbor count of a node never changes during the iteration, so
    # compute it once instead of re-listing neighbors for every edge on
    # every sweep.
    degree = {node: sum(1 for _ in G.neighbors(node)) for node in nodes}
    isolated = [node for node in nodes if degree[node] == 0]
    teleport = (1 - alpha) / N

    # Initialize with equal probability for each node
    ranks = {node: 1 / N for node in nodes}

    for i in tqdm(range(max_iter)):
        prev_ranks = ranks

        # Rank that each node hands to every one of its neighbors.
        share = {
            node: prev_ranks[node] / degree[node]
            for node in nodes
            if degree[node]
        }
        # Isolated nodes have nobody to hand rank to; spread it over all nodes.
        isolated_share = sum(prev_ranks[node] for node in isolated) / N

        ranks = {}
        for node in nodes:
            rank_sum = 0
            for neighbor in G.neighbors(node):
                rank_sum += share[neighbor]
            ranks[node] = teleport + alpha * (rank_sum + isolated_share)

        # Check for convergence
        if sum(abs(ranks[node] - prev_ranks[node]) for node in nodes) < tol:
            print(f"Converged after {i + 1} iterations.")
            break

    return ranks


In [8]:
# Run PageRank over the user-book graph.
pr = pagerank(B)

# Two-column table (node, score) for easier analysis.
pr_df = pd.DataFrame(list(pr.items()), columns=['Node', 'PageRank'])

# Split the table by node kind, highest PageRank first.
is_user = pr_df['Node'].isin(users)
is_book = pr_df['Node'].isin(books)
user_pr = pr_df[is_user].sort_values(by='PageRank', ascending=False)
book_pr = pr_df[is_book].sort_values(by='PageRank', ascending=False)

# Report the top 10 books, then (optionally) the top 10 users.
for heading, ranking in (
    ("Top 10 Books by PageRank:", book_pr),
    ("\nTop 10 Users by PageRank:", user_pr),
):
    print(heading)
    print(ranking.head(10))


 82%|████████▏ | 82/100 [04:53<01:04,  3.59s/it]

Converged after 83 iterations.
Top 10 Books by PageRank:
             Node  PageRank
22222  0971880107  0.001388
22206  0316666343  0.000655
22208  0385504209  0.000483
23629  0060928336  0.000388
22659  0312195516  0.000361
22302  059035342X  0.000329
22720  0142001740  0.000327
23362  0679781587  0.000326
26012  067976402X  0.000323
23031  0671027360  0.000322

Top 10 Users by PageRank:
         Node  PageRank
988     11676  0.006272
3022    35859  0.002464
12358  153662  0.002276
15843  198711  0.001844
8029    98391  0.001652
6284    76352  0.001632
16958  212898  0.001299
1396    16795  0.001262
16304  204864  0.001232
18071  227447  0.001136



