# PAGERANK FOR AMAZON BOOK REVIEWS
## AMD - University of Milan
### Fatemeh Amirian 34015A

This output was taken on a 20% subset. Change if you wish to see different results on different fractions of the data.

# PHASE ZERO: SET UP

In [1]:
!apt-get update -qq
!apt-get install openjdk-11-jdk-headless -qq
!wget https://dlcdn.apache.org/spark/spark-3.5.6/spark-3.5.6-bin-hadoop3.tgz
!tar xf spark-3.5.6-bin-hadoop3.tgz
!pip install -q findspark pyspark==3.5.6 graphframes kaggle

W: Skipping acquire of configured file 'main/source/Sources' as repository 'https://r2u.stat.illinois.edu/ubuntu jammy InRelease' does not seem to provide it (sources.list entry misspelt?)
--2025-06-11 21:39:43--  https://dlcdn.apache.org/spark/spark-3.5.6/spark-3.5.6-bin-hadoop3.tgz
Resolving dlcdn.apache.org (dlcdn.apache.org)... 151.101.2.132, 2a04:4e42::644
Connecting to dlcdn.apache.org (dlcdn.apache.org)|151.101.2.132|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 400923510 (382M) [application/x-gzip]
Saving to: ‘spark-3.5.6-bin-hadoop3.tgz.1’


2025-06-11 21:39:58 (290 MB/s) - ‘spark-3.5.6-bin-hadoop3.tgz.1’ saved [400923510/400923510]



In [2]:
import os
import time
import zipfile
from collections import defaultdict
from math import sqrt
import numpy as np
import findspark
import matplotlib.pyplot as plt
from graphframes import GraphFrame
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import (
    DoubleType,
    LongType,
    StringType,
    StructField,
    StructType
)

from pyspark.sql.functions import col
from pyspark.sql.types import BooleanType
import re

In [3]:
# Set environment
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.6-bin-hadoop3"

try:
    spark.stop()
except:
    pass

try:
    sc.stop()
except:
    pass

findspark.init()

# Setup Spark
spark = SparkSession.builder \
    .appName("PageRankForAmazonBookReviews") \
    .master("local[*]") \
    .config("spark.driver.memory", "6g") \
    .config("spark.jars.packages", "graphframes:graphframes:0.8.4-spark3.5-s_2.12") \
    .getOrCreate()

sc = spark.sparkContext

# Test GraphFrames import
from graphframes import GraphFrame
print("GraphFrames imported successfully!")

GraphFrames imported successfully!


In [4]:
spark

Downloading the dataset!

In [5]:
# Replace "xxxxxx" with your actual Kaggle username and API key
os.environ['KAGGLE_USERNAME'] = "xxx"
os.environ['KAGGLE_KEY'] = "xxx"

# Download Amazon Books Review dataset
!kaggle datasets download -d mohamedbakhet/amazon-books-reviews

Dataset URL: https://www.kaggle.com/datasets/mohamedbakhet/amazon-books-reviews
License(s): CC0-1.0
amazon-books-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)


In [6]:
# prompt: unzip the data

with zipfile.ZipFile('amazon-books-reviews.zip', 'r') as zip_ref:
    zip_ref.extractall('.')

# PHASE ONE : DATA PROCESSING

In [7]:
# reading the dataset and selecting the required columns
selected_book_reviews = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .option("quote", '"') \
    .option("escape", '"') \
    .option("multiLine", "true") \
    .csv("/content/Books_rating.csv") \
    .select("Id", "Title", "User_id", "review/score")

# taking a sample since colab is limited
selected_book_reviews = selected_book_reviews.sample(0.2, seed=42)


# getting a glimpse of the dataset
selected_book_reviews.printSchema()
selected_book_reviews.cache()
selected_book_reviews.show(5, truncate=False)


root
 |-- Id: string (nullable = true)
 |-- Title: string (nullable = true)
 |-- User_id: string (nullable = true)
 |-- review/score: double (nullable = true)

+----------+-----------------------------+--------------+------------+
|Id        |Title                        |User_id       |review/score|
+----------+-----------------------------+--------------+------------+
|0826414346|Dr. Seuss: American Icon     |A2RSSXTDZDUSH4|5.0         |
|0595344550|Whispers of the Wicked Saints|AUR0VA5H0C66C |1.0         |
|0595344550|Whispers of the Wicked Saints|ACO23CG8K8T77 |5.0         |
|0595344550|Whispers of the Wicked Saints|AJV5HX8BBZKEP |4.0         |
|0595344550|Whispers of the Wicked Saints|A2XXVRH6VJ8S7Q|5.0         |
+----------+-----------------------------+--------------+------------+
only showing top 5 rows



In [8]:
# how many rows do we have in the dataset?
print(f"Total number of reviews: {selected_book_reviews.count()}")

Total number of reviews: 600871


In [9]:
# do we have any missing values in the columns we selected?
selected_book_reviews.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in selected_book_reviews.columns]).show()

+---+-----+-------+------------+
| Id|Title|User_id|review/score|
+---+-----+-------+------------+
|  0|   41| 112696|           0|
+---+-----+-------+------------+



In [10]:
# we drop the rows with missing User_id values
selected_book_reviews = selected_book_reviews.na.drop(subset=["User_id"])

In [11]:
# we need to fill the missing values in the Title column with a placeholder - unknown
selected_book_reviews = selected_book_reviews.fillna("unknown", subset=["Title"])

# let's check again for missing values
selected_book_reviews.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in selected_book_reviews.columns]).show()

+---+-----+-------+------------+
| Id|Title|User_id|review/score|
+---+-----+-------+------------+
|  0|    0|      0|           0|
+---+-----+-------+------------+



In [12]:
# we wanna check to see if we have any duplicate reviews by the same user for the same book Id?
duplicate_reviews = selected_book_reviews.groupBy("User_id", "Id").count().filter("count > 1")

# seeing the first few duplicate reviews
print("Duplicate reviews (User_id, Book_id):")
duplicate_reviews.show(5)

# counting the number of duplicate reviews (user-book id pairs)
num_duplicate_reviews = duplicate_reviews.count()
print(f"Number of user-book pairs with duplicate reviews: {num_duplicate_reviews}")


Duplicate reviews (User_id, Book_id):
+--------------+----------+-----+
|       User_id|        Id|count|
+--------------+----------+-----+
|A2PBKFCD7YI23H|B000GY0PV4|    2|
|A1L43KWWR05PCS|B000NWXNIG|    2|
|A2CA0MYM4FCQSJ|0195813618|    2|
|A2SPUG1DO8TH3R|0897501446|    2|
|A3V1EPSE6XDC0C|B00005NKL9|    2|
+--------------+----------+-----+
only showing top 5 rows

Number of user-book pairs with duplicate reviews: 1788


In [13]:
# removing duplicates based on User_id and Id
cleaned_reviews = selected_book_reviews.dropDuplicates(["User_id", "Id"])
print(f"Number of reviews after removing duplicates: {cleaned_reviews.count()}")

cleaned_reviews = cleaned_reviews.cache()
selected_book_reviews.unpersist()

Number of reviews after removing duplicates: 486012


DataFrame[Id: string, Title: string, User_id: string, review/score: double]

In [14]:
# checking to see how many unique books and user Ids we have in the dataset
unique_books = cleaned_reviews.select("Id").distinct().count()
unique_users = cleaned_reviews.select("User_id").distinct().count()

print(f"Unique books: {unique_books:,}")
print(f"Unique users: {unique_users:,}")


# See books with most reviews - we will likely see some of these books in the top lists
cleaned_reviews.groupBy("Id", "Title") \
    .count() \
    .orderBy(col("count").desc()) \
    .show(5)

Unique books: 108,488
Unique users: 305,881
+----------+--------------------+-----+
|        Id|               Title|count|
+----------+--------------------+-----+
|B000IEZE3G|Harry Potter and ...|  794|
|B000Q032UY|The Hobbit or The...|  736|
|B000GQG5MA|The Hobbit; Or, T...|  735|
|B000GQG7D2|          The Hobbit|  719|
|B000PC54NG|          The Hobbit|  717|
+----------+--------------------+-----+
only showing top 5 rows



### **Attention!!!** Here we notice something important. We seem to have similar books (same titles) but with different book ids. we need to test this hypothesis.

First we normalize the titles (at least as much as we can). <br>
1. Make all of them lower case
2. Remove versions such as (CD) - (Audio Book) **but we will exculde generic titles such as poems because in that case we would lose information**
<br>

The idea is that I want to see the true books by their content and not by their version. If i am ranking the books by their importance, I dont care if it was paper back or hard cover and people do tend to leave diffrent reviwes for diferent editions of the same book. <br>
However, Perfect normalization will not be achieved in this attempt but it will make our ranking more meaningful.

In [15]:
# normalizing the titles
# define a list of generic titles that we want to keep as is - i took a wild guess with this based on my experince in reading books
# also we make it into a function because we will need it for mappings

def normalize_titles(df, title_column="Title"):

    # Define generic titles to preserve as-is
    generic_titles = [
        "poems", "selected poems", "collected poems",
        "essays", "letters", "stories", "collected stories",
        "short stories", "anthology", "complete works",
        "selected works", "collected works", "memoir",
        "biography", "autobiography", "diary", "journals"
    ]

    @F.udf(returnType=BooleanType())
    def contains_generic_title(title):
        if title is None:
            return False
        for generic in generic_titles:
            if re.search(r'\b' + re.escape(generic) + r'\b', title):
                return True
        return False

    df_with_lowercase = df.withColumn(
        "LowercasedTitle",
        F.lower(F.col(title_column))
    )

    df_normalized = df_with_lowercase.withColumn(
        "Title_Norm",
        F.when(
            contains_generic_title(F.col("LowercasedTitle")),
            F.col("LowercasedTitle")
        ).otherwise(
            F.trim(F.regexp_replace(
                F.col("LowercasedTitle"),
                r"\[.*?\]|\(.*?\)|audiobook|unabridged|cd",
                ""
            ))
        )
    )

    df_normalized = df_normalized.drop("LowercasedTitle")

    return df_normalized


cleaned_reviews_norm = normalize_titles(cleaned_reviews)

cleaned_reviews_norm = cleaned_reviews_norm.cache()
cleaned_reviews.unpersist()

orig_count = cleaned_reviews_norm.select("Title").distinct().count()
norm_count = cleaned_reviews_norm.select("Title_Norm").distinct().count()

print(f"Unique original titles: {orig_count:,}")
print(f"Unique normalized titles: {norm_count:,}")

# what percent of titles were reduced?
print(f"Reduction: {orig_count - norm_count:,} ({(orig_count - norm_count)/orig_count*100:.2f}%)")

# show some examples to see if we did a good job
print("\nSample data with normalized titles:")
cleaned_reviews_norm.select("Title", "Title_Norm").distinct().orderBy(F.rand()).show(10, truncate=False)

Unique original titles: 104,134
Unique normalized titles: 100,522
Reduction: 3,612 (3.47%)

Sample data with normalized titles:
+------------------------------------------------------------------------------------+------------------------------------------------------------------------------------+
|Title                                                                               |Title_Norm                                                                          |
+------------------------------------------------------------------------------------+------------------------------------------------------------------------------------+
|Books Into Film: The Stuff That Dreams Are Made of                                  |books into film: the stuff that dreams are made of                                  |
|Political Ideals                                                                    |political ideals                                                                    |
|Monsieur Be

Since This looks somewhat okay, we abondon the told title colomns and continue with our new colomn.

In [16]:
# Rename NormalizedTitle to Title_Norm
cleaned_reviews_norm = cleaned_reviews_norm.withColumnRenamed("NormalizedTitle", "Title_Norm")

In [17]:
cleaned_reviews_norm = cleaned_reviews_norm.drop("Title")
cleaned_reviews_norm = cleaned_reviews_norm.drop("LowercasedTitle")

In [18]:
# Verify the change
print("Schema after replacing Title with normalized version:")
cleaned_reviews_norm.printSchema()

Schema after replacing Title with normalized version:
root
 |-- Id: string (nullable = true)
 |-- User_id: string (nullable = true)
 |-- review/score: double (nullable = true)
 |-- Title_Norm: string (nullable = false)



In [19]:
cleaned_reviews_norm.show(5, truncate=False)

+----------+--------------+------------+-----------------------------------------------------+
|Id        |User_id       |review/score|Title_Norm                                           |
+----------+--------------+------------+-----------------------------------------------------+
|B0007JGWB0|A100Z0BS6Z89IN|5.0         |seven pillars of wisdom,: a triumph                  |
|B0000CJ9GZ|A102DWIIFU8MWF|4.0         |the richest man in babylon                           |
|B000G643YM|A1042BIXF6ZMAC|5.0         |little women                                         |
|B000Q032UY|A104QFFOEJL0NW|5.0         |the hobbit or there and back again                   |
|0884196372|A106EO2I13BL2L|5.0         |image maker, the: recognize your true worth and value|
+----------+--------------+------------+-----------------------------------------------------+
only showing top 5 rows



#### Now the next problem is that titles sometimes have different book IDs (despite being the same book) and we need to verify this before thinking of a solution.

In [20]:
# grouping books by Title and count how many distinct IDs exist for each title
title_id_counts = cleaned_reviews_norm.groupBy("Title_Norm").agg(F.countDistinct("Id").alias("distinct_id_count"))

# Filter titles with more than one distinct ID
titles_with_same_name_diff_id = title_id_counts.filter(col("distinct_id_count") > 1)

print("Number of titles with the same title but different IDs:")
duplicate_titles = titles_with_same_name_diff_id.count()
print(duplicate_titles)

# top ten books with the most distinct IDs - same title
print("\nTop five books with the same title and different IDs ordered by the count of distinct ids:")
titles_with_same_name_diff_id.orderBy(col("distinct_id_count").desc()).show(10, truncate=False)

Number of titles with the same title but different IDs:
5442

Top five books with the same title and different IDs ordered by the count of distinct ids:
+--------------------------------------------------------------+-----------------+
|Title_Norm                                                    |distinct_id_count|
+--------------------------------------------------------------+-----------------+
|persuasion                                                    |18               |
|wuthering heights                                             |17               |
|great expectations                                            |15               |
|emma                                                          |14               |
|the picture of dorian gray                                    |13               |
|an inquiry into the nature and causes of the wealth of nations|12               |
|the white company                                             |12               |
|pride and prejud

For the purpose of truly having distincs Human - Book reviews and also easier handling in the functions, we will create integer indexes.

In [21]:
from pyspark.sql.window import Window

def add_generated_ids_norm(df):

    title_window = Window.orderBy("Title_Norm")
    titles_with_id = df.select("Title_Norm").distinct() \
                      .withColumn("book_id", F.dense_rank().over(title_window))

    user_window = Window.orderBy("User_id")
    users_with_id = df.select("User_id").distinct() \
                     .withColumn("user_id_int", F.dense_rank().over(user_window))

    result = df.join(titles_with_id, "Title_Norm") \
               .join(users_with_id, "User_id") \
               .select("User_id", "user_id_int", "Id", "Title_Norm", "book_id",
                       F.col("review/score").cast("double").alias("score"))

    return result


df_with_ids = add_generated_ids_norm(cleaned_reviews_norm)
df_with_ids = df_with_ids.cache()
cleaned_reviews_norm.unpersist()
df_with_ids.show(10)

+--------------+-----------+----------+--------------------+-------+-----+
|       User_id|user_id_int|        Id|          Title_Norm|book_id|score|
+--------------+-----------+----------+--------------------+-------+-----+
|A1005YJDO9VCIY|        124|0977390403|the gift of the a...|  78415|  5.0|
|A1006V961PBMKA|        127|0742516814|fighting the forc...|  26873|  3.0|
|A100HWDN5JMK8G|        139|1413703860|         jake's gold|  39401|  5.0|
|A1018G2FPJBJ0S|        190|B000KW0HGK|essays of ralph w...|  24925|  4.0|
|A10AKE9TAADHVV|        769|B0006ASR90|insects: a guide ...|  37991|  4.0|
|A10AKE9TAADHVV|        769|0679886524|the berenstain be...|  72425|  5.0|
|A10AKE9TAADHVV|        769|B0006BQLVA|pond life - a gui...|  57329|  4.0|
|A10AKE9TAADHVV|        769|1559716843|     fun with nature|  29306|  5.0|
|A10AZVYK32ZJSE|        803|B000871DZ6|alice's adventure...|   5163|  5.0|
|A10AZVYK32ZJSE|        803|1596009772|alice's adventure...|   5159|  5.0|
+--------------+---------

Checking to see if there is any collision?

In [22]:
book_collisions = df_with_ids.select("book_id", "Title_Norm").distinct().groupBy("book_id").count().filter(F.col("count") > 1).count()
user_collisions = df_with_ids.select("user_id_int", "User_id").distinct().groupBy("user_id_int").count().filter(F.col("count") > 1).count()

print(f"Book collisions: {book_collisions}")
print(f"User collisions: {user_collisions}")

Book collisions: 0
User collisions: 0


In [23]:
# only keeping what we need
indexed_data = df_with_ids.select("user_id_int", "book_id", "Title_Norm", "score")
indexed_data.show(10)

+-----------+-------+--------------------+-----+
|user_id_int|book_id|          Title_Norm|score|
+-----------+-------+--------------------+-----+
|        124|  78415|the gift of the a...|  5.0|
|        127|  26873|fighting the forc...|  3.0|
|        139|  39401|         jake's gold|  5.0|
|        190|  24925|essays of ralph w...|  4.0|
|        769|  37991|insects: a guide ...|  4.0|
|        769|  72425|the berenstain be...|  5.0|
|        769|  57329|pond life - a gui...|  4.0|
|        769|  29306|     fun with nature|  5.0|
|        803|   5163|alice's adventure...|  5.0|
|        803|   5159|alice's adventure...|  5.0|
+-----------+-------+--------------------+-----+
only showing top 10 rows



Here we can see that we run into more duplicate human-book pairs which makes sense since we basically turned many different book variations into one id, so we remove duplicates again.

In [24]:
# removng duplicate user-book pairs
indexed_data_unique = indexed_data.dropDuplicates(["user_id_int", "book_id"])

indexed_data_unique = indexed_data_unique.cache()
indexed_data.unpersist()
df_with_ids.unpersist()

a = indexed_data.count()
b = indexed_data_unique.count()

print("Original count:", a)
print("After removing duplicates:", b)
print("Duplicates removed:", a-b)

Original count: 486012
After removing duplicates: 450647
Duplicates removed: 35365


In [25]:
indexed_data_unique.show(5)

+-----------+-------+--------------------+-----+
|user_id_int|book_id|          Title_Norm|score|
+-----------+-------+--------------------+-----+
|         63|  32121|gulliver's travel...|  5.0|
|        479|   6854|anglo-saxon attit...|  3.0|
|        492|  14943|chef prudhomme's ...|  5.0|
|        668|  56569|pigeon feathers &...|  5.0|
|        768|   5398|all the weyrs of ...|  5.0|
+-----------+-------+--------------------+-----+
only showing top 5 rows



In [26]:
# Count unique books
unique_books = indexed_data_unique.select("book_id").distinct().count()

# Count unique users
unique_users = indexed_data_unique.select("user_id_int").distinct().count()

# Calculate average reviews per user
reviews_per_user = indexed_data_unique.groupBy("user_id_int").count()
avg_reviews_per_user = reviews_per_user.select(F.avg("count")).first()[0]

# Print the statistics
print(f"Number of unique books: {unique_books:,}")
print(f"Number of unique users: {unique_users:,}")
print(f"Average reviews per user: {avg_reviews_per_user:.2f}")

# Additional distribution statistics
reviews_distribution = reviews_per_user.select(
    F.min("count").alias("min_reviews"),
    F.expr("percentile(count, 0.25)").alias("Q1"),
    F.expr("percentile(count, 0.5)").alias("Q2"),
    F.expr("percentile(count, 0.75)").alias("Q3"),
    F.max("count").alias("max_reviews")
)

reviews_distribution.show()

# Show top 10 most active reviewers
print("\nTop 10 Most Active Reviewers:")
reviews_per_user.orderBy(F.col("count").desc()).show(10)

Number of unique books: 100,522
Number of unique users: 305,881
Average reviews per user: 1.47
+-----------+---+---+---+-----------+
|min_reviews| Q1| Q2| Q3|max_reviews|
+-----------+---+---+---+-----------+
|          1|1.0|1.0|1.0|       1127|
+-----------+---+---+---+-----------+


Top 10 Most Active Reviewers:
+-----------+-----+
|user_id_int|count|
+-----------+-----+
|      10567| 1127|
|     260617|  694|
|     263952|  369|
|      74959|  302|
|      45290|  276|
|      63429|  242|
|      29398|  224|
|     135961|  205|
|      52011|  168|
|      82014|  161|
+-----------+-----+
only showing top 10 rows



So most people only leave one review.

### **ATTENTION** <br>
Because during debugging the spark session would often crash and i had to restart the kernel, i saved the final data so each time that i would re-run the spark, i wouldnt have to re-run the data processing too. <br>
be careful this snippet of code will make and save inside the relative directory. You might wanna skip the next 2 blocks.

In [27]:
final_data = indexed_data_unique

In [28]:
# Convert to pandas DataFrame for reliable local storage
#pandas_df = final_data.toPandas()
#os.makedirs("pandas_data", exist_ok=True)
#pandas_df.to_csv("pandas_data/final_data.csv", index=False)
#pandas_df.to_pickle("pandas_data/final_data.pkl")

#print(f"Successfully saved DataFrame with {len(pandas_df)} rows")

ONLY USE IF THE SPARK CRASHES AND YOU ARE TRYING TO RESTART THE SESSION.

In [29]:
#final_data = spark.read \.option("header", "true") \.option("inferSchema", "true") \.csv("pandas_data/final_data.csv")
#final_data.show(5)


## PHASE ONE (1.1): MAPPINGS

In [30]:
# title - book id
def create_title_mapping(df):
    title_rows = df.select("book_id", "Title_Norm").distinct().collect()

    title_mapping = {row["book_id"]: row["Title_Norm"] for row in title_rows}

    print(f"Created mapping for {len(title_mapping)} unique books")
    return title_mapping

title_mapping = create_title_mapping(final_data)

# Preview a few entries
print("\nSample title mappings:")
for book_id, title in list(title_mapping.items())[:4]:
    print(f"Book ID: {book_id} ---> Title: {title}")

Created mapping for 100522 unique books

Sample title mappings:
Book ID: 40576 ---> Title: just another kid
Book ID: 12034 ---> Title: book of fresh flowers: a complete guide to selecting and arranging
Book ID: 59360 ---> Title: quick-strip paper piecing
Book ID: 63534 ---> Title: second foundation


In [31]:
# average rating - book id
def create_book_rating_mapping(df):
    avg_ratings = df.groupBy("book_id") \
                    .agg(F.avg("score").alias("avg_rating"),
                         F.count("score").alias("num_ratings")) \
                    .collect()

    rating_mapping = {row["book_id"]: {
                          "avg_rating": float(row["avg_rating"]),
                          "num_ratings": row["num_ratings"]
                      } for row in avg_ratings}

    print(f"Created rating mapping for {len(rating_mapping)} books")
    return rating_mapping

book_rating_mapping = create_book_rating_mapping(final_data)

# Preview a few entries
print("\nSample book rating mappings:")
sample_books = list(book_rating_mapping.keys())[:5]
for book_id in sample_books:
    rating_info = book_rating_mapping[book_id]
    print(f"Book ID: {book_id} ---> Rating: {rating_info['avg_rating']:.2f}/5 ({rating_info['num_ratings']} reviews)")

Created rating mapping for 100522 books

Sample book rating mappings:
Book ID: 6654 ---> Rating: 5.00/5 (1 reviews)
Book ID: 44906 ---> Rating: 3.78/5 (94 reviews)
Book ID: 65867 ---> Rating: 5.00/5 (6 reviews)
Book ID: 34239 ---> Rating: 5.00/5 (3 reviews)
Book ID: 89537 ---> Rating: 4.67/5 (3 reviews)


We need genres for two thing:
1. showing the genre of top books across all models.
2. making topic sensitive pagerank
<br>


First we read it from the second csv file and we normalize its titles for comparing against ours.

In [32]:
# book genre - book id

book_data = spark.read \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .option("quote", '"') \
    .option("escape", '"') \
    .option("multiLine", "true") \
    .csv("/content/books_data.csv") \
    .select("Title", "categories")

book_data_norm = normalize_titles(book_data)

genre_by_title = {row["Title_Norm"]: row["categories"]
                  for row in book_data_norm.select("Title_Norm", "categories").collect()
                  if row["categories"] is not None}

print(f"Loaded metadata for {len(genre_by_title)} books with genres")

def create_genre_mapping(df):
    books = df.select("book_id", "Title_Norm").distinct().collect()

    genre_mapping = {row["book_id"]: genre_by_title.get(row["Title_Norm"], "Unknown")
                    for row in books}

    matched = sum(1 for genre in genre_mapping.values() if genre != "Unknown")
    print(f"Created genre mapping for {len(genre_mapping)} books")
    print(f"Matched genres: {matched} ({matched/len(genre_mapping)*100:.1f}%)")

    return genre_mapping

genre_mapping = create_genre_mapping(final_data)

# Preview a few entries
print("\nSample genre mappings:")
for book_id in list(genre_mapping.keys())[:4]:
    title = title_mapping.get(book_id, "Unknown")
    genre = genre_mapping[book_id]
    print(f"Book ID: {book_id} ---> Genre: {genre}")

Loaded metadata for 165975 books with genres
Created genre mapping for 100522 books
Matched genres: 83214 (82.8%)

Sample genre mappings:
Book ID: 40576 ---> Genre: ['Family & Relationships']
Book ID: 12034 ---> Genre: ['Gardening']
Book ID: 59360 ---> Genre: ['Crafts & Hobbies']
Book ID: 63534 ---> Genre: Unknown


What genres are most common? You can use this info for gerring topic sensitive pagerank later.

In [33]:
# Get the top 10 genres
genre_counts = {}
for genre in genre_mapping.values():
    genre_counts[genre] = genre_counts.get(genre, 0) + 1

sorted_genres = sorted(genre_counts.items(), key=lambda x: x[1], reverse=True)

print("\nTop 10 genres by book count:")
for i, (genre, count) in enumerate(sorted_genres[:10]):
    print(f"{i+1}. {genre}: {count} books")


Top 10 genres by book count:
1. Unknown: 17308 books
2. ['Fiction']: 13826 books
3. ['Religion']: 4648 books
4. ['History']: 4428 books
5. ['Juvenile Fiction']: 3692 books
6. ['Biography & Autobiography']: 3484 books
7. ['Business & Economics']: 2866 books
8. ['Computers']: 2440 books
9. ['Social Science']: 1665 books
10. ['Juvenile Nonfiction']: 1585 books


# **PHASE TWO: BIULDING THE GRAPH**

### What is the Graph logic?

Writing a fucntion that makes the graph - Books connected if they share 2 or more reviews from at least 2 unique people. <br>
BOOKA <--> BOOKB if USER-A and USER-B reviewed both A and B

One more think to point out is that pagerank needs directed graph and since our book network has no direction, we have to double each link and make it **bidirectional**.
<br>
BOOK1 -> BOOK2
BOOK1 <- BOOK2

In [34]:
from pyspark.sql import functions as F
from graphframes import GraphFrame


def build_book_graph(df, threshold, purpose):
    user_books = df.select("user_id_int", "book_id").cache()
    book_pairs = user_books.alias("ub1").join(
        user_books.alias("ub2"),
        F.col("ub1.user_id_int") == F.col("ub2.user_id_int")
    ).filter(
        F.col("ub1.book_id") < F.col("ub2.book_id")
    ).select(
        F.col("ub1.book_id").alias("src"),
        F.col("ub2.book_id").alias("dst")
    )

    edges_df = book_pairs.groupBy("src", "dst").count() \
        .filter(F.col("count") >= threshold) \
        .select("src", "dst").cache()

    edges_df = edges_df.union(edges_df.select(F.col("dst").alias("src"), F.col("src").alias("dst")))

    vertices_df = edges_df.select("src").union(edges_df.select("dst")).distinct().withColumnRenamed("src", "id")

    graph = GraphFrame(vertices_df, edges_df)

    if purpose == "pagerank_builtin":
        # Only return GraphFrame for built-in PageRank
        return {"graph": graph}
    elif purpose == "pagerank_rdd":
        # Return RDD of edges and the number of nodes for RDD-based algorithms
        edges_rdd = graph.edges.rdd.map(lambda row: (row["src"], row["dst"]))
        nodes_count = vertices_df.count()
        return {"edges_rdd": edges_rdd, "nodes_count": nodes_count}
    elif purpose == "pagerank_python":
        # Return local lists for pure Python algorithms (be careful with large graphs!!!)
        links = [(row["src"], row["dst"]) for row in edges_df.collect()]
        pages = [row["id"] for row in vertices_df.collect()]
        return {"pages": pages, "links": links}
    else:
        raise ValueError(f"Unknown purpose: {purpose}")

# **PHASE THREE: OBTAINING A BENCHMRK**

Non personalized classic pagerak from the graphframe. Its results will be a testing ground for us.

If you refer to the graphframe user giude you can see this: <br>
link: https://graphframes.io/docs/_site/user-guide.html#pagerank

You can see we have 2 oprions, either fixed iteration or fixed tolerance (not at the same time). I Went with fixed iterations, you can easily test the other one as well.

In [35]:
data_1 = build_book_graph(final_data, threshold=2, purpose="pagerank_builtin")



In [36]:
# Count vertices and edges in the graph (just to know the size of the graph)
num_vertices = data_1['graph'].vertices.count()
num_edges = data_1['graph'].edges.count()

print(f"Number of vertices in the graph: {num_vertices:,}")
print(f"Number of edges in the graph: {num_edges:,}")

Number of vertices in the graph: 7,283
Number of edges in the graph: 106,028


In [37]:
# since the graph was bidirectional, we can divide the number of edges by 2 to get the actual number of connections
print(f"Number of unique connections (edges/2): {num_edges // 2:,}")

Number of unique connections (edges/2): 53,014


In [38]:
# running the built-in PageRank algorithm in oder to get a benchmark
# ATTENTION: This may take some time and it is slow, run this at your own risk!

result_one = data_1['graph'].pageRank(resetProbability=0.15, maxIter=20)



In [39]:
# First extract the top 20 book IDs - the benchmark for PageRank
# ATTENTION: This one too will take long, run this at your own risk!

top_20_books = result_one.vertices.orderBy(result_one.vertices.pagerank.desc()).limit(20).collect()
top_20_ids = [row["id"] for row in top_20_books]



In [40]:
# Create a formatted table with all required information
print("\nTop 20 Books by PageRank:")
print("-" * 20)
print(f"{'#':^4} | {'Title':^40} | {'Genre':^20} | {'Avg Rating':^10} | {'PageRank':^10}")
print("-" * 20)

for i, row in enumerate(top_20_books):
    book_id = row["id"]
    pagerank_score = row["pagerank"]

    title = title_mapping.get(book_id, "Unknown Title")
    title_display = (title[:37] + "...") if len(title) > 40 else title

    genre = genre_mapping.get(book_id, "Unknown")
    genre_display = (genre[:17] + "...") if len(genre) > 20 else genre

    rating_info = book_rating_mapping.get(book_id, {"avg_rating": 0.0})
    avg_rating = rating_info["avg_rating"]

    print(f"{i+1:^4} | {title_display:<40} | {genre_display:<20} | {avg_rating:^10.2f} | {pagerank_score:.6f}")

print(f"\nTop 20 book IDs: {top_20_ids}")


Top 20 Books by PageRank:
--------------------
 #   |                  Title                   |        Genre         | Avg Rating |  PageRank 
--------------------
 1   | pride and prejudice                      | ['Juvenile Nonfic... |    4.53    | 40.470887
 2   | the great gatsby                         | ['Fiction']          |    4.18    | 32.696410
 3   | wuthering heights                        | ['Fiction']          |    4.08    | 31.546651
 4   | the catcher in the rye                   | ['Young Adult Fic... |    3.97    | 30.978957
 5   | the hobbit                               | ['Juvenile Fiction'] |    4.67    | 30.261263
 6   | to kill a mockingbird                    | ['Performing Arts']  |    4.60    | 29.508966
 7   | fahrenheit 451                           | ['Comics & Graphi... |    4.20    | 27.744051
 8   | of mice and men                          | ['Fiction']          |    4.37    | 26.925210
 9   | brave new world                          | ['Reference']   

The above list is our benchmark, ideally we want the result of our following models to be identical or at least very close to this.

# **PHASE FOUR: ATTEMPTING TO WRITE THE ALGORITHM FROM SCRATCH**

First we write a function using only python, just to see if we can implement this logic ourselves or not.

In [41]:
def pagerank_python(pages, links, beta=0.85, max_iter=100, tol=1e-6, silent=False):

    if not pages:
        raise ValueError("Pages list cannot be empty")
    if not links:
        raise ValueError("Links list cannot be empty")
    if not (0 <= beta <= 1):
        raise ValueError(f"Beta must be between 0 and 1, got {beta}")
    if max_iter <= 0:
        raise ValueError(f"Max iterations must be positive, got {max_iter}")
    if tol <= 0:
        raise ValueError(f"Tolerance must be positive, got {tol}")

    N = len(pages)
    if not silent:
        print(f"Starting PageRank with {N:,} pages")

    ranks = {p: 1.0 / N for p in pages}
    initial_sum = sum(ranks.values())
    if not silent:
        print(f"Initialized ranks (sum={initial_sum:.8f})")

    adjacency = defaultdict(list)
    for src, dst in links:
        adjacency[src].append(dst)

    nodes_with_outlinks = len(adjacency)
    total_edges = len(links)
    if not silent:
        print(f"Built adjacency list ({nodes_with_outlinks:,} nodes with outlinks, {total_edges:,} total edges)")
        print(f"Starting PageRank iterations...")
        print("-" * 60)

    for iteration in range(max_iter):
        new_ranks = {p: (1 - beta) / N for p in pages}

        for src in pages:
            neighbors = adjacency.get(src, [])
            if not neighbors:
                continue
            share = ranks[src] / len(neighbors)
            for dst in neighbors:
                new_ranks[dst] += beta * share

        diff = (sum((new_ranks[p] - ranks[p])**2 for p in pages))**0.5  # L2 norm
        rank_sum = sum(new_ranks.values())

        if not silent:
            print(f"Iteration {iteration + 1:3d}: L2 diff={diff:.6f}, sum={rank_sum:.8f}")

        ranks = new_ranks
        if diff < tol:
            break

    if not silent:
        print("-" * 60)
        print(f"Converged after {iteration + 1} iterations")

        final_sum = sum(ranks.values())
        if abs(final_sum - 1.0) > 1e-6:
            print(f"Warning: Final sum ({final_sum:.8f}) deviates from 1.0")
        else:
            print(f"Rank sum validation passed")

    return sorted(ranks.items(), key=lambda x: -x[1]), iteration + 1

In [42]:
# first we make the graph for this specific function
data_2 = build_book_graph(final_data, threshold=2, purpose="pagerank_python")

In [43]:
result_two = pagerank_python(data_2['pages'], data_2['links'], beta=0.85, max_iter=100, tol=1e-6, silent=False)

Starting PageRank with 7,283 pages
Initialized ranks (sum=1.00000000)
Built adjacency list (7,283 nodes with outlinks, 106,028 total edges)
Starting PageRank iterations...
------------------------------------------------------------
Iteration   1: L2 diff=0.023783, sum=1.00000000
Iteration   2: L2 diff=0.007672, sum=1.00000000
Iteration   3: L2 diff=0.002466, sum=1.00000000
Iteration   4: L2 diff=0.001287, sum=1.00000000
Iteration   5: L2 diff=0.000847, sum=1.00000000
Iteration   6: L2 diff=0.000657, sum=1.00000000
Iteration   7: L2 diff=0.000505, sum=1.00000000
Iteration   8: L2 diff=0.000420, sum=1.00000000
Iteration   9: L2 diff=0.000341, sum=1.00000000
Iteration  10: L2 diff=0.000287, sum=1.00000000
Iteration  11: L2 diff=0.000239, sum=1.00000000
Iteration  12: L2 diff=0.000203, sum=1.00000000
Iteration  13: L2 diff=0.000170, sum=1.00000000
Iteration  14: L2 diff=0.000144, sum=1.00000000
Iteration  15: L2 diff=0.000122, sum=1.00000000
Iteration  16: L2 diff=0.000104, sum=1.00000000

In [44]:
# TOP 20 Books by Custom PageRank - PURE PYTHON implementation


custom_pagerank_sorted, iterations_to_converge = result_two
custom_pagerank_top20 = custom_pagerank_sorted[:20]
custom_pagerank_ids = [book_id for book_id, score in custom_pagerank_top20]

print(f"\nTop 20 Books by Custom PageRank (converged in {iterations_to_converge} iterations):")
print("-" * 60)
print(f"{'#':^4} | {'Title':^40} | {'Genre':^20} | {'Avg Rating':^10} | {'PageRank':^10}")
print("-" * 100)

for i, (book_id, pagerank_score) in enumerate(custom_pagerank_top20):
    title = title_mapping.get(book_id, "Unknown Title")
    title_display = (title[:37] + "...") if len(title) > 40 else title

    genre = genre_mapping.get(book_id, "Unknown")
    genre_display = (genre[:17] + "...") if len(genre) > 20 else genre

    rating_info = book_rating_mapping.get(book_id, {"avg_rating": 0.0})
    avg_rating = rating_info["avg_rating"]

    print(f"{i+1:^4} | {title_display:<40} | {genre_display:<20} | {avg_rating:^10.2f} | {pagerank_score:.6f}")

print(f"\nTop 20 book IDs from custom PageRank: {custom_pagerank_ids}")


Top 20 Books by Custom PageRank (converged in 45 iterations):
------------------------------------------------------------
 #   |                  Title                   |        Genre         | Avg Rating |  PageRank 
----------------------------------------------------------------------------------------------------
 1   | pride and prejudice                      | ['Juvenile Nonfic... |    4.53    | 0.005558
 2   | the great gatsby                         | ['Fiction']          |    4.18    | 0.004491
 3   | wuthering heights                        | ['Fiction']          |    4.08    | 0.004333
 4   | the catcher in the rye                   | ['Young Adult Fic... |    3.97    | 0.004255
 5   | the hobbit                               | ['Juvenile Fiction'] |    4.67    | 0.004156
 6   | to kill a mockingbird                    | ['Performing Arts']  |    4.60    | 0.004053
 7   | fahrenheit 451                           | ['Comics & Graphi... |    4.20    | 0.003810
 8   | of mic

# **PHASE FOUR (4.1): USING SPARK RDD FOR PAGERNAK**

In [45]:
data_3 = build_book_graph(final_data, threshold=2, purpose="pagerank_rdd")

In [56]:
def pagerank_rdd(edges_rdd, nodes_count, damping_factor=0.85, max_iter=100, tolerance=1e-6, silent=False):


    # Input validation
    if not edges_rdd:
        raise ValueError("Edges RDD cannot be empty")
    if nodes_count <= 0:
        raise ValueError(f"Nodes count must be positive, got {nodes_count}")
    if not (0 <= damping_factor <= 1):
        raise ValueError(f"Damping factor must be between 0 and 1, got {damping_factor}")
    if max_iter <= 0:
        raise ValueError(f"Max iterations must be positive, got {max_iter}")
    if tolerance <= 0:
        raise ValueError(f"Tolerance must be positive, got {tolerance}")

    if not silent:
        print(f"Starting PageRank RDD with {nodes_count:,} nodes")

    if not silent:
        print("Building adjacency list...")
    adjacency_rdd = edges_rdd.groupByKey().mapValues(list).cache()
    if not silent:
        print("adjacency list built and cached")

    if not silent:
        print("Identifying all nodes in the graph...")
    all_nodes = edges_rdd.flatMap(lambda x: [x[0], x[1]]).distinct().collect()

    teleport_prob = (1 - damping_factor) / len(all_nodes)
    if not silent:
        print(f"Teleportation probability set: {teleport_prob:.8f}")

    ranks = {node: 1.0 / len(all_nodes) for node in all_nodes}
    initial_sum = sum(ranks.values())
    if not silent:
        print(f"Initial ranks set (sum={initial_sum:.8f})")
        print(f"Starting PageRank iterations...")
        print("-" * 60)

    for iteration in range(max_iter):
        old_ranks = ranks.copy()

        ranks_bc = edges_rdd.context.broadcast(ranks)

        contributions = adjacency_rdd.flatMap(
            lambda node_neighbors: [
                (neighbor, damping_factor * ranks_bc.value[node_neighbors[0]] / len(node_neighbors[1]))
                for neighbor in node_neighbors[1]
            ]
        ).reduceByKey(lambda a, b: a + b).collectAsMap()

        ranks = {node: teleport_prob + contributions.get(node, 0) for node in all_nodes}

        diff = sum((ranks[node] - old_ranks[node])**2 for node in all_nodes)**0.5 # L2 norm

        total_sum = sum(ranks.values())

        if not silent:
            print(f"Iteration {iteration + 1:3d}: L2 diff={diff:.6f}, sum={total_sum:.8f}")

        ranks_bc.unpersist()

        if diff < tolerance:
            if not silent:
                print("-" * 60)
                print(f"Converged after {iteration + 1} iterations")
            break

    if not silent:
        if iteration + 1 == max_iter:
            print("-" * 60)
            print(f"Reached maximum iterations ({max_iter}) without full convergence")

        print(f"Final rank sum: {sum(ranks.values()):.8f}")

        final_sum = sum(ranks.values())
        if abs(final_sum - 1.0) > 1e-6:
            print(f"Warning: Final sum ({final_sum:.8f}) deviates from 1.0")
        else:
            print(f"Rank sum validation passed")

    adjacency_rdd.unpersist()

    return sorted(ranks.items(), key=lambda x: -x[1]), (iteration + 1)

carefull, this takes longer than the python version to run.

In [57]:
# Extract the edges_rdd and nodes_count from your graph
edges_rdd = data_3["edges_rdd"]
nodes_count = data_3["nodes_count"]

print(f"edges_rdd partitions: {edges_rdd.getNumPartitions()}")
print(f"edges_rdd is cached??: {edges_rdd.is_cached}")

edges_rdd_fixed = edges_rdd.repartition(4).cache() # in local you can change this to 8
edges_rdd_fixed.count()

ranks, iterations = pagerank_rdd(edges_rdd_fixed, nodes_count)


edges_rdd partitions: 400
edges_rdd is cached??: False
Starting PageRank RDD with 7,283 nodes
Building adjacency list...
adjacency list built and cached
Identifying all nodes in the graph...
Teleportation probability set: 0.00002060
Initial ranks set (sum=1.00000000)
Starting PageRank iterations...
------------------------------------------------------------
Iteration   1: L2 diff=0.023783, sum=1.00000000
Iteration   2: L2 diff=0.007672, sum=1.00000000
Iteration   3: L2 diff=0.002466, sum=1.00000000
Iteration   4: L2 diff=0.001287, sum=1.00000000
Iteration   5: L2 diff=0.000847, sum=1.00000000
Iteration   6: L2 diff=0.000657, sum=1.00000000
Iteration   7: L2 diff=0.000505, sum=1.00000000
Iteration   8: L2 diff=0.000420, sum=1.00000000
Iteration   9: L2 diff=0.000341, sum=1.00000000
Iteration  10: L2 diff=0.000287, sum=1.00000000
Iteration  11: L2 diff=0.000239, sum=1.00000000
Iteration  12: L2 diff=0.000203, sum=1.00000000
Iteration  13: L2 diff=0.000170, sum=1.00000000
Iteration  14: 

In [48]:
# TOP 20 Books by RDD PageRank implementation
rdd_iterations = iterations
rdd_pagerank_top20 = ranks[:20]
rdd_pagerank_ids = [book_id for book_id, score in rdd_pagerank_top20]

print(f"\nTop 20 Books by RDD PageRank (converged in {rdd_iterations} iterations):")
print("-" * 60)
print(f"{'#':^4} | {'Title':^40} | {'Genre':^20} | {'Avg Rating':^10} | {'PageRank':^10}")
print("-" * 100)

for i, (book_id, pagerank_score) in enumerate(rdd_pagerank_top20):
    title = title_mapping.get(book_id, "Unknown Title")
    title_display = (title[:37] + "...") if len(title) > 40 else title

    genre = genre_mapping.get(book_id, "Unknown")
    genre_display = (genre[:17] + "...") if len(genre) > 20 else genre

    rating_info = book_rating_mapping.get(book_id, {"avg_rating": 0.0})
    avg_rating = rating_info["avg_rating"]

    print(f"{i+1:^4} | {title_display:<40} | {genre_display:<20} | {avg_rating:^10.2f} | {pagerank_score:.6f}")

print(f"\nTop 20 book IDs from RDD PageRank: {rdd_pagerank_ids}")


Top 20 Books by RDD PageRank (converged in 45 iterations):
------------------------------------------------------------
 #   |                  Title                   |        Genre         | Avg Rating |  PageRank 
----------------------------------------------------------------------------------------------------
 1   | pride and prejudice                      | ['Juvenile Nonfic... |    4.53    | 0.005558
 2   | the great gatsby                         | ['Fiction']          |    4.18    | 0.004491
 3   | wuthering heights                        | ['Fiction']          |    4.08    | 0.004333
 4   | the catcher in the rye                   | ['Young Adult Fic... |    3.97    | 0.004255
 5   | the hobbit                               | ['Juvenile Fiction'] |    4.67    | 0.004156
 6   | to kill a mockingbird                    | ['Performing Arts']  |    4.60    | 0.004053
 7   | fahrenheit 451                           | ['Comics & Graphi... |    4.20    | 0.003810
 8   | of mice a

## COMPARING THE RESULTS BETWEEN THREE LISTS <br>
both in content (book titles) and their rankings.

In [49]:
# are they even similar?
common_all = set(top_20_ids) & set(custom_pagerank_ids) & set(rdd_pagerank_ids)
common_builtin_python = set(top_20_ids) & set(custom_pagerank_ids)
common_builtin_rdd = set(top_20_ids) & set(rdd_pagerank_ids)
common_python_rdd = set(custom_pagerank_ids) & set(rdd_pagerank_ids)

# Print overlap statistics
print(f"Common books in all three lists: {len(common_all)}/20")
print(f"Common books between Built-in and Python: {len(common_builtin_python)}/20")
print(f"Common books between Built-in and RDD: {len(common_builtin_rdd)}/20")
print(f"Common books between Python and RDD: {len(common_python_rdd)}/20")

# Check if the top books match across implementations
print("\nTop 5 books comparison:")
for i in range(5):
    if i < len(top_20_ids) and i < len(custom_pagerank_ids) and i < len(rdd_pagerank_ids):
        print(f"Position {i+1}: Built-in={top_20_ids[i]}, Python={custom_pagerank_ids[i]}, RDD={rdd_pagerank_ids[i]}")
        match = (top_20_ids[i] == custom_pagerank_ids[i] == rdd_pagerank_ids[i])
        print(f"  Match: {'yes' if match else 'no'}")

Common books in all three lists: 20/20
Common books between Built-in and Python: 20/20
Common books between Built-in and RDD: 20/20
Common books between Python and RDD: 20/20

Top 5 books comparison:
Position 1: Built-in=58130, Python=58130, RDD=58130
  Match: yes
Position 2: Built-in=78919, Python=78919, RDD=78919
  Match: yes
Position 3: Built-in=99734, Python=99734, RDD=99734
  Match: yes
Position 4: Built-in=73915, Python=73915, RDD=73915
  Match: yes
Position 5: Built-in=79741, Python=79741, RDD=79741
  Match: yes


The only difference is 2,3 of positions. Neglegable since top 20 books are the same.

# **PHASE FIVE: TOPIC SENSITIVE PAGERANK**

For this one too we will be using pure python and rdd version.
The only diffrence is that here we dont teleport to ANY place, Instead we travel to destinations that we are intrested in.

In [50]:
def pagerank_python_topic_sensitive(pages, links, genre_mapping, target_genres=None,
                                   beta=0.85, max_iter=100, tol=1e-6, silent=False):


    # Input validation
    if not pages:
        raise ValueError("Pages list cannot be empty")
    if not links:
        raise ValueError("Links list cannot be empty")
    if not isinstance(genre_mapping, dict):
        raise ValueError("Genre mapping must be a dictionary")
    if not (0 <= beta <= 1):
        raise ValueError(f"Beta must be between 0 and 1, got {beta}")
    if max_iter <= 0:
        raise ValueError(f"Max iterations must be positive, got {max_iter}")
    if tol <= 0:
        raise ValueError(f"Tolerance must be positive, got {tol}")

    N = len(pages)
    if not silent:
        print(f"Starting Topic-Sensitive PageRank with {N:,} pages")

    ranks = {p: 1.0 / N for p in pages}
    initial_sum = sum(ranks.values())
    if not silent:
        print(f"Initialized ranks (sum={initial_sum:.8f})")

    adjacency = defaultdict(list)
    for src, dst in links:
        adjacency[src].append(dst)

    nodes_with_outlinks = len(adjacency)
    total_edges = len(links)
    if not silent:
        print(f"Built adjacency list ({nodes_with_outlinks:,} nodes with outlinks, {total_edges:,} total edges)")

    if target_genres is None:
        # Original behavior: uniform teleportation to all pages
        teleport_prob = (1 - beta) / N
        teleport_probs = {p: teleport_prob for p in pages}
        if not silent:
            print(f"Using uniform teleportation (original PageRank behavior)")
    else:
        topic_pages = [p for p in pages if any(str(g) in str(genre_mapping.get(p, "")) for g in target_genres)]

        if not topic_pages:
            # going back to uniform if no target pages found
            teleport_prob = (1 - beta) / N
            teleport_probs = {p: teleport_prob for p in pages}
            if not silent:
                print(f"Warning: No pages found for target genres {target_genres}. Using uniform teleportation.")
        else:
            topic_teleport_prob = (1 - beta) / len(topic_pages)
            teleport_probs = {p: topic_teleport_prob if p in topic_pages else 0.0 for p in pages}
            if not silent:
                print(f"Topic-sensitive teleportation: {len(topic_pages):,} pages in {target_genres}")

    if not silent:
        print(f"Starting PageRank iterations...")
        print("-" * 60)

    for iteration in range(max_iter):
        new_ranks = {p: teleport_probs[p] for p in pages}

        for src in pages:
            neighbors = adjacency.get(src, [])
            share = ranks[src] / len(neighbors)
            for dst in neighbors:
                new_ranks[dst] += beta * share

        diff = (sum((new_ranks[p] - ranks[p])**2 for p in pages))**0.5  # L2 norm
        rank_sum = sum(new_ranks.values())

        if not silent:
            print(f"Iteration {iteration + 1:3d}: L2 diff={diff:.6f}, sum={rank_sum:.8f}")

        ranks = new_ranks
        if diff < tol:
            break

    if not silent:
        print("-" * 60)
        print(f"Converged after {iteration + 1} iterations")

        final_sum = sum(ranks.values())
        if abs(final_sum - 1.0) > 1e-6:
            print(f"Warning: Final sum ({final_sum:.8f}) deviates from 1.0")
        else:
            print(f"Rank sum validation passed")

    return sorted(ranks.items(), key=lambda x: -x[1]), iteration + 1

In [51]:
# lets choose a genre
result_two_fiction = pagerank_python_topic_sensitive(data_2['pages'], data_2['links'],genre_mapping, target_genres=['Fiction'], beta=0.85, max_iter=100, tol=1e-6, silent=False)

Starting Topic-Sensitive PageRank with 7,283 pages
Initialized ranks (sum=1.00000000)
Built adjacency list (7,283 nodes with outlinks, 106,028 total edges)
Topic-sensitive teleportation: 3,102 pages in ['Fiction']
Starting PageRank iterations...
------------------------------------------------------------
Iteration   1: L2 diff=0.023970, sum=1.00000000
Iteration   2: L2 diff=0.008106, sum=1.00000000
Iteration   3: L2 diff=0.002680, sum=1.00000000
Iteration   4: L2 diff=0.001514, sum=1.00000000
Iteration   5: L2 diff=0.001022, sum=1.00000000
Iteration   6: L2 diff=0.000827, sum=1.00000000
Iteration   7: L2 diff=0.000632, sum=1.00000000
Iteration   8: L2 diff=0.000546, sum=1.00000000
Iteration   9: L2 diff=0.000432, sum=1.00000000
Iteration  10: L2 diff=0.000380, sum=1.00000000
Iteration  11: L2 diff=0.000305, sum=1.00000000
Iteration  12: L2 diff=0.000270, sum=1.00000000
Iteration  13: L2 diff=0.000218, sum=1.00000000
Iteration  14: L2 diff=0.000193, sum=1.00000000
Iteration  15: L2 dif

In [52]:
# Extract the top 20 Fiction books from topic-sensitive PageRank
fiction_pagerank_sorted, iterations = result_two_fiction
fiction_pagerank_top20 = fiction_pagerank_sorted[:20]

print(f"\nTop 20 Fiction Books (Topic-Sensitive PageRank, converged in {iterations} iterations):")
print("-" * 60)
print(f"{'#':^4} | {'Title':^40} | {'Genre':^20} | {'Avg Rating':^10} | {'PageRank':^10}")
print("-" * 100)

for i, (book_id, pagerank_score) in enumerate(fiction_pagerank_top20):
    title = title_mapping.get(book_id, "Unknown Title")
    title_display = (title[:37] + "...") if len(title) > 40 else title

    genre = genre_mapping.get(book_id, "Unknown")
    genre_display = (genre[:17] + "...") if len(genre) > 20 else genre

    rating_info = book_rating_mapping.get(book_id, {"avg_rating": 0.0})
    avg_rating = rating_info["avg_rating"]

    print(f"{i+1:^4} | {title_display:<40} | {genre_display:<20} | {avg_rating:^10.2f} | {pagerank_score:.6f}")


Top 20 Fiction Books (Topic-Sensitive PageRank, converged in 47 iterations):
------------------------------------------------------------
 #   |                  Title                   |        Genre         | Avg Rating |  PageRank 
----------------------------------------------------------------------------------------------------
 1   | pride and prejudice                      | ['Juvenile Nonfic... |    4.53    | 0.006128
 2   | the hobbit                               | ['Juvenile Fiction'] |    4.67    | 0.004701
 3   | wuthering heights                        | ['Fiction']          |    4.08    | 0.004697
 4   | the great gatsby                         | ['Fiction']          |    4.18    | 0.004563
 5   | the catcher in the rye                   | ['Young Adult Fic... |    3.97    | 0.004469
 6   | to kill a mockingbird                    | ['Performing Arts']  |    4.60    | 0.004309
 7   | fahrenheit 451                           | ['Comics & Graphi... |    4.20    | 0.00408

In [53]:
def pagerank_rdd_topic_sensitive(edges_rdd, nodes_count, genre_mapping, target_genres=None,
                                damping_factor=0.85, max_iter=100, tolerance=1e-6, silent=False):

    # Input validation
    if not edges_rdd:
        raise ValueError("Edges RDD cannot be empty")
    if nodes_count <= 0:
        raise ValueError(f"Nodes count must be positive, got {nodes_count}")
    if not isinstance(genre_mapping, dict):
        raise ValueError("Genre mapping must be a dictionary")
    if not (0 <= damping_factor <= 1):
        raise ValueError(f"Damping factor must be between 0 and 1, got {damping_factor}")
    if max_iter <= 0:
        raise ValueError(f"Max iterations must be positive, got {max_iter}")
    if tolerance <= 0:
        raise ValueError(f"Tolerance must be positive, got {tolerance}")

    if not silent:
        print(f"Starting Topic-Sensitive PageRank RDD with {nodes_count:,} nodes")

    if not silent:
        print("Building adjacency list...")
    adjacency_rdd = edges_rdd.groupByKey().mapValues(list).cache()
    if not silent:
        print("Adjacency list built and cached")

    if not silent:
        print("Identifying all nodes in the graph...")
    all_nodes = edges_rdd.flatMap(lambda x: [x[0], x[1]]).distinct().collect()

    if target_genres is None:
        teleport_prob = (1 - damping_factor) / len(all_nodes)
        teleport_probs = {node: teleport_prob for node in all_nodes}
        if not silent:
            print(f"Using uniform teleportation (original PageRank behavior)")
            print(f"Teleportation probability set: {teleport_prob:.8f}")
    else:
        topic_nodes = [n for n in all_nodes if any(str(g) in str(genre_mapping.get(n, "")) for g in target_genres)]

        if not topic_nodes:

            teleport_prob = (1 - damping_factor) / len(all_nodes)
            teleport_probs = {node: teleport_prob for node in all_nodes}
            if not silent:
                print(f"Warning: No nodes found for target genres {target_genres}. Using uniform teleportation.")
                print(f"Teleportation probability set: {teleport_prob:.8f}")
        else:
            topic_teleport_prob = (1 - damping_factor) / len(topic_nodes)
            teleport_probs = {n: topic_teleport_prob if n in topic_nodes else 0.0 for n in all_nodes}
            if not silent:
                print(f"Topic-sensitive teleportation: {len(topic_nodes):,} nodes in {target_genres}")
                print(f"Topic teleportation probability: {topic_teleport_prob:.8f}")

    ranks = {node: 1.0 / len(all_nodes) for node in all_nodes}
    initial_sum = sum(ranks.values())
    if not silent:
        print(f"Initial ranks set (sum={initial_sum:.8f})")
        print(f"Starting PageRank iterations...")
        print("-" * 60)

    for iteration in range(max_iter):
        old_ranks = ranks.copy()

        ranks_bc = edges_rdd.context.broadcast(ranks)

        contributions = adjacency_rdd.flatMap(
            lambda node_neighbors: [
                (neighbor, damping_factor * ranks_bc.value[node_neighbors[0]] / len(node_neighbors[1]))
                for neighbor in node_neighbors[1]
            ]
        ).reduceByKey(lambda a, b: a + b).collectAsMap()

        ranks = {node: teleport_probs[node] + contributions.get(node, 0) for node in all_nodes}

        diff = sum((ranks[node] - old_ranks[node])**2 for node in all_nodes)**0.5  # L2 norm
        total_sum = sum(ranks.values())

        if not silent:
            print(f"Iteration {iteration + 1:3d}: L2 diff={diff:.6f}, sum={total_sum:.8f}")

        ranks_bc.unpersist()

        if diff < tolerance:
            if not silent:
                print("-" * 60)
                print(f"Converged after {iteration + 1} iterations")
            break

    if not silent:
        if iteration + 1 == max_iter:
            print("-" * 60)
            print(f"Reached maximum iterations ({max_iter}) without full convergence")

        print(f"Final rank sum: {sum(ranks.values()):.8f}")

        final_sum = sum(ranks.values())
        if abs(final_sum - 1.0) > 1e-6:
            print(f"Warning: Final sum ({final_sum:.8f}) deviates from 1.0")
        else:
            print(f"Rank sum validation passed")

    adjacency_rdd.unpersist()

    return sorted(ranks.items(), key=lambda x: -x[1]), (iteration + 1)

In [54]:
ranks_topic_sensitive, iterations = pagerank_rdd_topic_sensitive(edges_rdd_fixed, nodes_count, genre_mapping, target_genres=["Fiction"], damping_factor=0.85, max_iter=100, tolerance=1e-6, silent=False)

Starting Topic-Sensitive PageRank RDD with 7,283 nodes
Building adjacency list...
Adjacency list built and cached
Identifying all nodes in the graph...
Topic-sensitive teleportation: 3,102 nodes in ['Fiction']
Topic teleportation probability: 0.00004836
Initial ranks set (sum=1.00000000)
Starting PageRank iterations...
------------------------------------------------------------
Iteration   1: L2 diff=0.023970, sum=1.00000000
Iteration   2: L2 diff=0.008106, sum=1.00000000
Iteration   3: L2 diff=0.002680, sum=1.00000000
Iteration   4: L2 diff=0.001514, sum=1.00000000
Iteration   5: L2 diff=0.001022, sum=1.00000000
Iteration   6: L2 diff=0.000827, sum=1.00000000
Iteration   7: L2 diff=0.000632, sum=1.00000000
Iteration   8: L2 diff=0.000546, sum=1.00000000
Iteration   9: L2 diff=0.000432, sum=1.00000000
Iteration  10: L2 diff=0.000380, sum=1.00000000
Iteration  11: L2 diff=0.000305, sum=1.00000000
Iteration  12: L2 diff=0.000270, sum=1.00000000
Iteration  13: L2 diff=0.000218, sum=1.000

In [55]:
# Extract top 20 Fiction books from the RDD-based topic-sensitive PageRank
rdd_fiction_top20 = ranks_topic_sensitive[:20]

print(f"\nTop 20 Fiction Books (RDD Topic-Sensitive PageRank, converged in {iterations} iterations):")
print("-" * 60)
print(f"{'#':^4} | {'Title':^40} | {'Genre':^20} | {'Avg Rating':^10} | {'PageRank':^10}")
print("-" * 100)

for i, (book_id, pagerank_score) in enumerate(rdd_fiction_top20):
    title = title_mapping.get(book_id, "Unknown Title")
    title_display = (title[:37] + "...") if len(title) > 40 else title

    genre = genre_mapping.get(book_id, "Unknown")
    genre_display = (genre[:17] + "...") if len(genre) > 20 else genre

    rating_info = book_rating_mapping.get(book_id, {"avg_rating": 0.0})
    avg_rating = rating_info["avg_rating"]

    print(f"{i+1:^4} | {title_display:<40} | {genre_display:<20} | {avg_rating:^10.2f} | {pagerank_score:.6f}")


Top 20 Fiction Books (RDD Topic-Sensitive PageRank, converged in 47 iterations):
------------------------------------------------------------
 #   |                  Title                   |        Genre         | Avg Rating |  PageRank 
----------------------------------------------------------------------------------------------------
 1   | pride and prejudice                      | ['Juvenile Nonfic... |    4.53    | 0.006128
 2   | the hobbit                               | ['Juvenile Fiction'] |    4.67    | 0.004701
 3   | wuthering heights                        | ['Fiction']          |    4.08    | 0.004697
 4   | the great gatsby                         | ['Fiction']          |    4.18    | 0.004563
 5   | the catcher in the rye                   | ['Young Adult Fic... |    3.97    | 0.004469
 6   | to kill a mockingbird                    | ['Performing Arts']  |    4.60    | 0.004309
 7   | fahrenheit 451                           | ['Comics & Graphi... |    4.20    | 0.0

You can use the above functions for the classic pagerank too: <br>

classic_ranks, iterations = pagerank_rdd_topic_sensitive(<br>
&nbsp;&nbsp;&nbsp;&nbsp;    edges_rdd_fixed, <br>
&nbsp;&nbsp;&nbsp;&nbsp;    nodes_count, <br>
&nbsp;&nbsp;&nbsp;&nbsp;    genre_mapping, <br>
&nbsp;&nbsp;&nbsp;&nbsp;    target_genres=None,  # This makes it behave like classic PageRank<br>
&nbsp;&nbsp;&nbsp;&nbsp;    damping_factor=0.85, <br>
&nbsp;&nbsp;&nbsp;&nbsp;    max_iter=100, <br>
&nbsp;&nbsp;&nbsp;&nbsp;    tolerance=1e-6, <br>
&nbsp;&nbsp;&nbsp;&nbsp;    silent=False<br>
)<br>

You can use the above functions for the classic pagerank too: <br>

classic_ranks, iterations = pagerank_python_topic_sensitive(<br>
&nbsp;&nbsp;&nbsp;&nbsp;data_2['pages'], <br>
&nbsp;&nbsp;&nbsp;&nbsp;data_2['links'],<br>
&nbsp;&nbsp;&nbsp;&nbsp;genre_mapping, <br>
&nbsp;&nbsp;&nbsp;&nbsp;target_genres=None,  # This makes it behave like classic PageRank<br>
&nbsp;&nbsp;&nbsp;&nbsp;beta=0.85, <br>
&nbsp;&nbsp;&nbsp;&nbsp;max_iter=100, <br>
&nbsp;&nbsp;&nbsp;&nbsp;tol=1e-6, <br>
&nbsp;&nbsp;&nbsp;&nbsp;silent=False<br>
)<br>