In [0]:
spark.conf.set(
    "fs.azure.account.key.goodreadsreviewsgen2.dfs.core.windows.net",
    "WnTNNhXumIt06S3pwQEB8Vzkqh9D/VRtLq7ULy2OFppgW0zCb4QVwU1tthtk2Pz4HTxUNmHCXgC9+AStYAOSmQ=="
)


In [0]:
# Load the books dataset from the silver layer
books = spark.read.parquet("abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/processed/books/")
authors = spark.read.parquet("abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/processed/authors/")

# Display first 5 rows of each dataset
books.show(5)
authors.show(5)

# Print the schema to confirm structure
books.printSchema()
authors.printSchema()


+----------+------------------+------------+-------------+----------+--------+--------------+-----------+--------------------+---------+--------------------+--------------------+---------+---------------+-------------+-----------------+-------------------+----------------+--------------------+--------------------+-------+-------------+-------+--------------------+--------------------+
|      isbn|text_reviews_count|country_code|language_code|      asin|is_ebook|average_rating|kindle_asin|         description|   format|                link|           publisher|num_pages|publication_day|       isbn13|publication_month|edition_information|publication_year|                 url|           image_url|book_id|ratings_count|work_id|               title|title_without_series|
+----------+------------------+------------+-------------+----------+--------+--------------+-----------+--------------------+---------+--------------------+--------------------+---------+---------------+-------------+------

In [0]:
from pyspark.sql.functions import col, length, trim, count, when

# Read raw (uncleaned) reviews from the silver layer
reviews = spark.read.parquet("abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/processed/reviews/")

# Peek at rows and schema
reviews.show(5, truncate=False)
reviews.printSchema()

# Basic profiling: count total and potential issues
total_rows = reviews.count()
null_review_id = reviews.filter(col("review_id").isNull()).count()
null_book_id = reviews.filter(col("book_id").isNull()).count()
null_user_id = reviews.filter(col("user_id").isNull()).count()
null_rating = reviews.filter(col("rating").isNull()).count()
empty_text = reviews.filter((col("review_text").isNull()) | (trim(col("review_text")) == "")).count()

print(f"Total rows: {total_rows}")
print(f"Null review_id: {null_review_id}, Null book_id: {null_book_id}, Null user_id: {null_user_id}, Null rating: {null_rating}")
print(f"Empty/Null review_text: {empty_text}")


+--------------------------------+--------+--------------------------------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------+----------+
|user_id                         |book_id |review_id                       |rating|review_text                                                             

In [0]:
from pyspark.sql.functions import col, length, trim

# Read reviews again from processed zone (silver)
reviews = spark.read.parquet("abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/processed/reviews/")

# Drop rows missing critical keys
clean_reviews = reviews.filter(
    col("review_id").isNotNull() &
    col("book_id").isNotNull() &
    col("user_id").isNotNull() &
    col("rating").isNotNull()
)

# Drop reviews that are blank or too short
clean_reviews = clean_reviews.filter(
    trim(col("review_text")) != ""
).filter(length(col("review_text")) > 30)

# Drop any invalid ratings (outside 0–5 range)
clean_reviews = clean_reviews.filter((col("rating") >= 0) & (col("rating") <= 5))

# Show results
print("Before cleaning:", reviews.count())
print("After cleaning:", clean_reviews.count())
clean_reviews.show(5, truncate=False)


Before cleaning: 14772919
After cleaning: 14772919
+--------------------------------+--------+--------------------------------+------+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------+------------------------------+------------------------------+------------------------------+-------+----------+
|user_id                         |book_id |review_id                       |rating|review_text          

In [0]:
# Write the cleaned reviews back to the silver layer (overwrite mode)
clean_reviews.write.mode("overwrite").parquet(
    "abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/processed/reviews/"
)

# Sanity check: read from disk and inspect schema + few rows
reviews_verified = spark.read.parquet(
    "abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/processed/reviews/"
)

reviews_verified.printSchema()
reviews_verified.show(5, truncate=False)

print("Verified cleaned rows:", reviews_verified.count())


root
 |-- user_id: string (nullable = true)
 |-- book_id: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- rating: long (nullable = true)
 |-- review_text: string (nullable = true)
 |-- date_added: string (nullable = true)
 |-- date_updated: string (nullable = true)
 |-- read_at: string (nullable = true)
 |-- started_at: string (nullable = true)
 |-- n_votes: long (nullable = true)
 |-- n_comments: long (nullable = true)

+--------------------------------+--------+--------------------------------+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
    from pyspark.sql import functions as F

# Create a minimal book_authors bridge since raw file is missing
print("⚠️ No book_authors.json found — creating a temporary bridge")

# Sample small subsets to avoid long execution
sample_books = books.select("book_id").limit(1000)
sample_authors = authors.select("author_id").limit(1000)

# Create a dummy relationship (1 author per book just for joining)
book_authors = sample_books.crossJoin(sample_authors.limit(1))

# Save it to the processed zone
book_authors.write.mode("overwrite").parquet("abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/processed/book_authors/")

print("✅ Temporary book_authors parquet created at /processed/book_authors/")


⚠️ No book_authors.json found — creating a temporary bridge
✅ Temporary book_authors parquet created at /processed/book_authors/


In [0]:
from pyspark.sql import functions as F

# Step 2: Curate the Gold Table
curated_reviews = (
    reviews_clean.alias("r")
    .join(book_authors.alias("ba"), "book_id", "inner")
    .join(authors.alias("a"), "author_id", "inner")
    .join(books.alias("b"), "book_id", "inner")
    .select(
        F.col("r.review_id"),
        F.col("r.book_id"),
        F.col("b.title"),
        F.col("a.author_id"),
        F.col("a.name"),
        F.col("r.user_id"),
        F.col("r.rating"),
        F.col("r.review_text"),
        F.lit(None).alias("language"),  # placeholder for missing column
        F.col("r.n_votes"),
        F.col("r.date_added")
    )
)

# Verify schema and preview
curated_reviews.printSchema()
curated_reviews.show(5, truncate=False)


root
 |-- review_id: string (nullable = true)
 |-- book_id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- author_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- rating: long (nullable = true)
 |-- review_text: string (nullable = true)
 |-- language: void (nullable = true)
 |-- n_votes: long (nullable = true)
 |-- date_added: string (nullable = true)

+--------------------------------+--------+---------------------------------------------------------------------+---------+----------------+--------------------------------+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
# Step 3: Save curated DataFrame to the Gold zone (optimized for speed)
curated_reviews.coalesce(10) \
    .write.format("delta") \
    .option("mergeSchema", "true") \
    .mode("overwrite") \
    .save("abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/gold/curated_reviews/")

print("✅ Curated DataFrame successfully saved to Gold Zone.")


✅ Curated DataFrame successfully saved to Gold Zone.


In [0]:
# Step 4: Verify that the Gold table is correctly saved and readable
verified = spark.read.format("delta").load(
    "abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/gold/curated_reviews/"
)

print("✅ Total curated rows:", verified.count())
verified.show(5, truncate=False)


✅ Total curated rows: 7516
+--------------------------------+--------+---------------------------------------------------------------------+---------+----------------+--------------------------------+------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
dbutils.fs.ls("abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/gold/curated_reviews/")


[FileInfo(path='abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/gold/curated_reviews/_delta_log/', name='_delta_log/', size=0, modificationTime=1762195834000),
 FileInfo(path='abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/gold/curated_reviews/part-00000-58f2c7fb-08f3-49d1-9f6a-a78616c5aed5.c000.snappy.parquet', name='part-00000-58f2c7fb-08f3-49d1-9f6a-a78616c5aed5.c000.snappy.parquet', size=40510599, modificationTime=1762195885000),
 FileInfo(path='abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/gold/curated_reviews/part-00000-b1880438-0676-46a9-ac14-571a51e9966f.c000.snappy.parquet', name='part-00000-b1880438-0676-46a9-ac14-571a51e9966f.c000.snappy.parquet', size=40499072, modificationTime=1762196548000),
 FileInfo(path='abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/gold/curated_reviews/part-00000-ee7f1666-df18-4c57-95ea-5ec8d4a82f3a.c000.snappy.parquet', name='part-00000-ee7f1666-df18-4c57-95ea-5ec8d4a82f3a.c000.snappy.parquet', siz

In [0]:
spark.sql("""
DROP TABLE IF EXISTS hive_metastore.default.curated_reviews
""")

spark.sql("""
CREATE TABLE hive_metastore.default.curated_reviews
USING DELTA
LOCATION 'abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/gold/curated_reviews/'
""")


DataFrame[]

In [0]:
spark.sql("""
SELECT review_id, book_id, title, author_id, name, user_id,
       rating, review_text, n_votes, date_added
FROM hive_metastore.default.curated_reviews
LIMIT 5
""").show(truncate=False)


+--------------------------------+--------+---------------------------------------------------------------------+---------+----------------+--------------------------------+------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [0]:
dbutils.fs.ls("abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/gold/curated_reviews/")


[FileInfo(path='abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/gold/curated_reviews/_delta_log/', name='_delta_log/', size=0, modificationTime=1762195834000),
 FileInfo(path='abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/gold/curated_reviews/part-00000-58f2c7fb-08f3-49d1-9f6a-a78616c5aed5.c000.snappy.parquet', name='part-00000-58f2c7fb-08f3-49d1-9f6a-a78616c5aed5.c000.snappy.parquet', size=40510599, modificationTime=1762195885000),
 FileInfo(path='abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/gold/curated_reviews/part-00000-b1880438-0676-46a9-ac14-571a51e9966f.c000.snappy.parquet', name='part-00000-b1880438-0676-46a9-ac14-571a51e9966f.c000.snappy.parquet', size=40499072, modificationTime=1762196548000),
 FileInfo(path='abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/gold/curated_reviews/part-00000-ee7f1666-df18-4c57-95ea-5ec8d4a82f3a.c000.snappy.parquet', name='part-00000-ee7f1666-df18-4c57-95ea-5ec8d4a82f3a.c000.snappy.parquet', siz