In [0]:
# Configure access to the ADLS Gen2 storage account used by this notebook.
#
# SECURITY: the account key must never be committed in the notebook source.
# It is read from the environment instead (on Databricks, a cluster
# environment variable can be backed by a secret scope, or the lookup below
# can be swapped for a secret-scope read). Any key that was previously
# pasted into this cell should be treated as leaked and rotated.
import os

storage_account_key = os.environ.get("GOODREADS_STORAGE_ACCOUNT_KEY")
if not storage_account_key:
    # Fail fast with an actionable message rather than a later, opaque
    # authentication error on the first read.
    raise RuntimeError(
        "Environment variable GOODREADS_STORAGE_ACCOUNT_KEY is not set; "
        "it must hold the access key for storage account goodreadsreviews60301511."
    )

spark.conf.set(
    "fs.azure.account.key.goodreadsreviews60301511.dfs.core.windows.net",
    storage_account_key,
)

In [0]:
# Locations of the processed books and authors Parquet datasets in the lakehouse container.
books_path = "abfss://lakehouse@goodreadsreviews60301511.dfs.core.windows.net/processed/books/"
authors_path = "abfss://lakehouse@goodreadsreviews60301511.dfs.core.windows.net/processed/authors/"

# Read both datasets into DataFrames.
books = spark.read.parquet(books_path)
authors = spark.read.parquet(authors_path)

# Preview five rows of each dataset (books first, then authors) to confirm the load.
for silver_frame in (books, authors):
    silver_frame.show(5)

# Print each schema (books first, then authors) to check column names and types.
for silver_frame in (books, authors):
    silver_frame.printSchema()

+----------+------------------+------------+-------------+----------+--------+--------------+-----------+--------------------+---------+--------------------+--------------------+---------+---------------+-------------+-----------------+-------------------+----------------+--------------------+--------------------+-------+-------------+-------+--------------------+--------------------+
|      isbn|text_reviews_count|country_code|language_code|      asin|is_ebook|average_rating|kindle_asin|         description|   format|                link|           publisher|num_pages|publication_day|       isbn13|publication_month|edition_information|publication_year|                 url|           image_url|book_id|ratings_count|work_id|               title|title_without_series|
+----------+------------------+------------+-------------+----------+--------+--------------+-----------+--------------------+---------+--------------------+--------------------+---------+---------------+-------------+------

In [0]:
from pyspark.sql.functions import col, length, trim, count, when

# Read the reviews dataset from the processed layer.
# NOTE: a later cell in this notebook overwrites this same path with the
# cleaned data, so on a re-run this cell reads already-cleaned rows.
reviews = spark.read.parquet(
    "abfss://lakehouse@goodreadsreviews60301511.dfs.core.windows.net/processed/reviews/"
)

# Peek at rows and schema
reviews.show(5, truncate=False)
reviews.printSchema()

# Basic profiling, computed in ONE aggregation job instead of one job per metric.
# count(when(cond, 1)) counts only rows where cond is true: when() without
# otherwise() yields NULL for non-matching rows, and count() skips NULLs.
is_blank_text = col("review_text").isNull() | (trim(col("review_text")) == "")
profile = reviews.agg(
    count("*").alias("total_rows"),
    count(when(col("review_id").isNull(), 1)).alias("null_review_id"),
    count(when(col("book_id").isNull(), 1)).alias("null_book_id"),
    count(when(col("user_id").isNull(), 1)).alias("null_user_id"),
    count(when(col("rating").isNull(), 1)).alias("null_rating"),
    count(when(is_blank_text, 1)).alias("empty_text"),
).first()

# Unpack into the same variable names the notebook used before.
total_rows = profile["total_rows"]
null_review_id = profile["null_review_id"]
null_book_id = profile["null_book_id"]
null_user_id = profile["null_user_id"]
null_rating = profile["null_rating"]
empty_text = profile["empty_text"]

print(f"Total rows: {total_rows}")
print(f"NULL review_id: {null_review_id}, NULL book_id: {null_book_id}, NULL user_id: {null_user_id}, NULL rating: {null_rating}")
print(f"Empty/NULL review_text: {empty_text}")

+--------------------------------+--------+--------------------------------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
from pyspark.sql.functions import col, trim, length

# Cleaning pipeline over the `reviews` DataFrame loaded from Parquet in an
# earlier cell. Each predicate is named up front, then applied in order.

# Rows must carry all three identifying keys.
has_all_keys = (
    col("review_id").isNotNull()
    & col("book_id").isNotNull()
    & col("user_id").isNotNull()
)

# Rating, once cast to int, must fall in the closed range 1..5.
rating_in_range = (
    col("rating_int").isNotNull()
    & (col("rating_int") >= 1)
    & (col("rating_int") <= 5)
)

# Trimmed review text must be present and at least 10 characters long.
text_long_enough = (
    col("review_text").isNotNull()
    & (length(col("review_text")) >= 10)
)

df = (
    reviews
    .filter(has_all_keys)
    .withColumn("rating_int", col("rating").cast("int"))
    .filter(rating_in_range)
    .withColumn("review_text", trim(col("review_text")))
    .filter(text_long_enough)
    # One row per review_id; which duplicate survives is arbitrary.
    .dropDuplicates(["review_id"])
)

# Final column layout, exposing the integer rating under the name "rating".
reviews_clean = df.select(
    "review_id",
    "book_id",
    "user_id",
    col("rating_int").alias("rating"),
    "review_text",
    "n_votes",
    "date_added",
)

In [0]:
# Persist the cleaned reviews to the processed layer and verify the result.
#
# `reviews_clean` is lazily evaluated and its plan still reads from the
# target path. Overwriting that path directly means the write can remove the
# very files the plan needs to read, which can fail or lose data depending on
# the runtime's commit behaviour. To avoid that, the cleaned rows are first
# materialised to a separate staging path, and the target is then overwritten
# from the staged copy (whose plan no longer depends on the target).
reviews_target_path = (
    "abfss://lakehouse@goodreadsreviews60301511.dfs.core.windows.net/processed/reviews/"
)
reviews_staging_path = (
    "abfss://lakehouse@goodreadsreviews60301511.dfs.core.windows.net/processed/reviews_staging/"
)

# Step 1: materialise the cleaned data away from its own input.
reviews_clean.write.mode("overwrite").parquet(reviews_staging_path)

# Step 2: overwrite the target from the staged copy.
spark.read.parquet(reviews_staging_path).write.mode("overwrite").parquet(
    reviews_target_path
)

# Sanity check: re-read from disk and inspect schema and a few rows
reviews_verified = spark.read.parquet(reviews_target_path)
reviews_verified.printSchema()
reviews_verified.show(5, truncate=False)
print(f"Written cleaned rows: {reviews_verified.count()}")

root
 |-- review_id: string (nullable = true)
 |-- book_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- n_votes: long (nullable = true)
 |-- date_added: string (nullable = true)

+--------------------------------+--------+--------------------------------+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
# HomeWork Part 1

from pyspark.sql.functions import col, lit

# Build the curated gold table: reviews joined to books, with author details
# attached when both books and authors expose an author_id column.
print("Creating curated gold table...")

# Inner join on book_id between the verified reviews ("r") and books ("b").
reviews_books = reviews_verified.alias("r").join(
    books.alias("b"),
    col("r.book_id") == col("b.book_id"),
    how="inner"
)

# The author join is only possible when author_id is present on both sides.
has_author_link = all(
    'author_id' in frame.columns for frame in (books, authors)
)

if has_author_link:
    # Left join so that books without a matching author row are still kept.
    gold_source = reviews_books.join(
        authors.alias("a"),
        col("b.author_id") == col("a.author_id"),
        how="left"
    )
    author_columns = [col("a.author_id"), col("a.name")]
else:
    # No linking column: fall back to constant placeholder author values.
    print("No author link found - creating curated reviews without author information")
    gold_source = reviews_books
    author_columns = [
        lit("unknown").alias("author_id"),
        lit("Unknown Author").alias("name"),
    ]

# Same output column order in both cases; only the author columns differ.
curated_reviews_gold = gold_source.select(
    col("r.review_id"),
    col("r.book_id"),
    col("b.title"),
    *author_columns,
    col("r.user_id"),
    col("r.rating"),
    col("r.review_text"),
    col("b.language_code").alias("language"),
    col("r.n_votes"),
    col("r.date_added")
)

# Report schema, a ten-row sample, and the total row count.
print("Curated Reviews Gold DataFrame Schema:")
curated_reviews_gold.printSchema()
print("\nCurated Reviews Gold DataFrame Sample Rows:")
curated_reviews_gold.show(10, truncate=True)

print(f"\nTotal records in curated gold table: {curated_reviews_gold.count()}")

Creating curated gold table...
No author link found - creating curated reviews without author information
Curated Reviews Gold DataFrame Schema:
root
 |-- review_id: string (nullable = true)
 |-- book_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- author_id: string (nullable = false)
 |-- name: string (nullable = false)
 |-- user_id: string (nullable = true)
 |-- rating: integer (nullable = true)
 |-- review_text: string (nullable = true)
 |-- language: string (nullable = true)
 |-- n_votes: long (nullable = true)
 |-- date_added: string (nullable = true)


Curated Reviews Gold DataFrame Sample Rows:
+--------------------+--------+--------------------+---------+--------------+--------------------+------+--------------------+--------+-------+--------------------+
|           review_id| book_id|               title|author_id|          name|             user_id|rating|         review_text|language|n_votes|          date_added|
+--------------------+--------+-------