In [0]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, trim, lower, upper, initcap, length, regexp_replace, avg, count

# Obtain the Spark session for this notebook (getOrCreate returns the
# already-running session when one exists, otherwise builds a new one).
spark = (
    SparkSession.builder
    .appName("Goodreads Cleaning Gold Layer")
    .getOrCreate()
)


In [0]:
# ===============================
# Load curated dataset (Delta)
# ===============================

# Authentication: configure fixed-SAS access to the ADLS Gen2 account.
#
# The SAS token is a credential, so it is fetched at run time from a Databricks
# secret scope instead of being pasted into the notebook. A token embedded in
# the source is exposed to everyone who can read the notebook and, because a
# SAS token carries a fixed expiry, silently breaks the notebook once it lapses.
# NOTE(review): the scope/key names below are placeholders — point them at the
# secret holding the container SAS token, and revoke any token that was
# previously committed in this cell.
storage_host = "goodreadsreviewsgen2.dfs.core.windows.net"
sas_token = dbutils.secrets.get(scope="goodreads", key="adls-sas-token")

# Use SAS authentication for this storage account.
spark.conf.set(f"fs.azure.account.auth.type.{storage_host}", "SAS")
# Supply the token through the fixed-token provider.
spark.conf.set(
    f"fs.azure.sas.token.provider.type.{storage_host}",
    "org.apache.hadoop.fs.azurebfs.sas.FixedSASTokenProvider")
spark.conf.set(f"fs.azure.sas.fixed.token.{storage_host}", sas_token)

# Load the curated reviews table; it is stored in Delta format (not plain Parquet).
df = spark.read.load(
    "abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/gold/curated_reviews",
    format="delta",
)

# Render the loaded DataFrame for a quick visual inspection
display(df)


review_id,book_id,title,author_id,name,user_id,rating,review_text,n_votes,date_added
56d3bfe96ef9ed0b14cd1fe5a981150e,20566082,"A Kitty in the Lion's Den (Sweet Water, #3)",604031,Ronald J. Fields,6909165b2c5b492a56c052d96c408ae1,4,3.5 Lots of sex and violence! Exactly what I want when I pick up a JS book.,0,Fri Jan 08 16:59:51 -0800 2016
396b842d094b11e8e0de9526943ea2ae,17279616,Štyri hostiny,604031,Ronald J. Fields,918b32591b04968a9afb560bdc9833c4,4,"Kupila som si tuto knizku len preto, aby som mala nieco ""intelektualskejsie"" na citanie pri vsetkych detektivkach a historickych romanoch. Ale bol to skvely vyber. Chvilu som ju nechala doma odpocivat a az neskor som sa pustila do styroch hostin, na ktore pozyva Jakob Seinfeld chlapca Zajdeho. Jeden z troch otcov svojho syna. Traja otcovia, jeden syn? Vselico sa stava a nic nie je nemozne. Aspon nie v tomto spolocenskom romane od Meira Shaleva. Musim priznat, ze mi trvalo dlhsie, kym som sa zacitala a knizka nema ""grady"", skor si tak plynie. Treba jej dat sancu, ma svoje caro. Poziciam si uryvok (jemne ale len jemne spoilerovy): ""Rozpravka o lejaku, o vode stupajucej vo vadi, o klamarskom revizionistovi, o manzelovi, ktory otala, a o manzelke, ktora bola neverna, prisla o dceru a prisla zit a pracovat do mastale vdovca, ktoreho kravy dojila a ktoreho deti vychovavala... Rozpravka o handliarovi, ktory nevie soferovat, o dietati, nad ktorym nema vladu smrt ani ziadostivost, o papierovych lodkach, o odstrihnutom vrkoci... No vrkoc bol schovany, had ustipol, albin prisiel, klamar klamal, manzel otalal, zena otehotnela a tam, v mastali, zila, pracovala, spala, plakala a porodila syna, nad ktorym smrt nemala vladu... Samozrejme, ze to tak bolo. Ak to tak nebolo, tak teda ako?"" Pribeh o vasni a o ludskom osude sa pise na zadnej obalke. A nic viac k tomu netreba.",2,Thu Mar 24 05:25:34 -0700 2016
48791f72b34f076a6c1ac944b04b9bda,12349663,"Naoki Urasawa's 20th Century Boys, Volume 19 (20th Century Boys, #19)",604031,Ronald J. Fields,997c4810e3a16bcad347c6455d1625e5,3,"This series is getting close to the end, you can feel the build-up, but sadly the long coming revelation and return of the main character Kenji felt a bit trite and predictable. Ok, so the whole of 20th Century Boys is very contrived and written to keep the suspense going seemingly for ever, but the simple and overused explanation of amnesia felt like too much of a gimmick. Maybe it's yet another homage to Japanese popular culture, but I must admit to feeling a bit disappointed... Oh well, onward to the last few volumes.",0,Wed Oct 23 00:42:02 -0700 2013
47ff25e965e3fc8422595bf6be804ad6,1403505,"The Pity Party: 8th Grade in the Life of Me, Cass",604031,Ronald J. Fields,0e62ee0b7d9061398472a6fa9884293c,5,"I liked this book, but it didn't sound as if it took place in 1982",0,Sun Oct 25 14:36:40 -0700 2009
77ccc576a568543e7f73d6dd041a1ab2,25402549,The Possession of Lawrence Eugene Davis,604031,Ronald J. Fields,02db8f5a7c61e788d2f2f6638301d45f,4,Possession is my first book by E. E. Ottoman and only my second from this publisher. The title and description just caught my attention. I am always looking for a different take on the classic paranormal romance book. I enjoyed the historical events like the dust bowl and World War references. I wish I could have learned more about Lawrence before the actual action of the novel began. I wanted to be more involved with his character before he is faced with the possession. I like the twists and turn of the story. The different paranormal aspects was interesting.,0,Tue Aug 25 14:38:39 -0700 2015
d929ca1000f9b057bb65ec80e4bd4e60,1626346,Das Schlimmste kommt noch oder Fast eine Jugend,604031,Ronald J. Fields,29e263fab9ac7d09d9b601a846f78a09,4,Gut geschriebenes Buch ohne unerwartete Wendungen oder grosse Gedanken. Schreibweise ist flussig und melancholisch umgesetzt. Passend zum Inhalt. Weiteres kann ich empfehlen fur diese Epoche. Das schweigen des Lichtes- Niel Flemming The great gatsby. John. Fitzgerarld In der Nacht- Dennis Lehane,0,Sat Feb 18 11:15:15 -0800 2017
69651ff1d326891a8762be2fb14c9886,427479,Terence Conran's DIY By Design: Over 30 Projects To Make and More Than 100 Design Ideas For Every Room In Your Home,604031,Ronald J. Fields,e0270e239207276514a4cc1909c9085b,0,the Book of Shelves. London flat 1990. This doesn't apply in the dust of Provence.,0,Wed Jul 27 07:24:21 -0700 2011
80685244fde0f40d32f4f8a0fcaab906,25501126,Counting Calories: How to Count Calories and Lose Weight Fast,604031,Ronald J. Fields,a6e798f1ba498ca899569c34fdf4e645,3,I thought this book was a little vague and didnt flow well.,0,Sat Aug 27 12:50:01 -0700 2016
21f7910a9d542be85800d765ba15d6c8,13648085,"Midnight Rescue (Killer Instincts, #1)",604031,Ronald J. Fields,d5a1240c232f152292e552f7c44ae9ab,4,"Find this review and more at kimberlyfaye reads . Of all the romance genres, romantic suspense is probably the one I shy away from the most. I've definitely enjoyed some good books in that genre, but have had nearly as many failures as successes, so I'm not always quick to pick one up. After thoroughly enjoying Elle's Brazen books and her new adult Off-Campus series - and seeing an author whose work I love talk about this series on Twitter, I knew I had to give it a try. I am so flipping glad I did, too. I *really* enjoyed this and have already chucked aside other review books to start the second book in the series. It didn't take long at all for me to get sucked into Abby and Kane's story and to get involved in the lives of the other characters in this book. I like that while it was obvious this was primarily about Abby and Kane, we were introduced to several other characters and got the backstories of several of them. Abby was a badass and I liked her from the start. She was rough around the edges and not like most of the women I find in the books I typically read, but that was what I liked about her. She was edgy, brave and driven. But, as much as I liked those qualities, I couldn't help but appreciate the way Kane was able to draw another, softer side from her - all while showing her that sex can be about more than power. Kane was hot, sexy and protective - just like I like my alpha males. They were an explosive match and the slow burn of sexual tension was nearly as intense as the action surrounding the rescue storyline. Midnight Rescue was sexy, suspenseful and all-consuming. I read this one from cover to cover, so to speak, in the matter of just a few hours. Unfortunately for me, I started it right before bed one night and actually dreamt about it. That's how invested in the story I was. 
I'm only a chapter or so into the second book right now, but I can tell you already there will be no rest until I finish it. I need more of all of the characters. I've already started pairing people off in my head, so I can't wait to see if it all plays out like I want it to. And, of course, I want to see what kind of situations they find themselves in along the way. I'm completely and totally hooked. And the best thing? By waiting until now to read them, I can totally binge-read. And, believe me, I will. I received a copy of this book from the publisher, via NetGalley, in exchange for an honest review.",1,Sat Mar 21 16:44:44 -0700 2015
d8de0d6447a21aebfecf9203de5cc2b9,3209316,Emma,604031,Ronald J. Fields,da13bcc8cfc339e415d928799e214c62,4,"1/2 My opinions of this work are two-fold, as Emma was my first full length journey into the world of audiobooks. The format (audiobook), was exceptional. A full 5 star effort. I have tried numerous times to suffer through an audiobook and they just drive me batty. I do not have the patience for them, usually because the individual perfoming them is substandard. Michael Page is fantastic! The number of men and women in this novel were evenly matched, but the protag is female. I attempted to read another audio version of Emma (this one by a woman) and it was horrendous! Mr. Page is talented enough to perform any gender, any age, and beat out a woman on the performance of a another woman...wonderful! If you are to try this in audio format, make sure you do not grab the one by Flo Gibson. All of her female sharacters sound like Mrs. Bennet from P and P. Argh! Awful. The story itself was very enjoyable, but the humor of all of Emma's blunders was the real pleasure. I am not a person who normally reads work that is focused on characters and their interactions. I tend to enjoy setting/world building far more. The minutia of people's emotions is not my standard fuel for literary enjoyment. A well written story will always trump my preferences however, and Emma was written with enjoyable skill. I suspect Austen had a smile hovering at the corner of her mouth the whole of the time she was writing it. I surely did while reading it. :)",0,Tue Jul 12 12:21:43 -0700 2011


In [0]:
# Discard rows that are missing any of the fields needed downstream.
df = df.na.drop(subset=["rating", "review_text", "book_id", "author_id"])


In [0]:
# De-duplicate in two passes: first on the review identifier, then keep a
# single row per (user_id, book_id) pair.
df = (
    df.drop_duplicates(["review_id"])
      .drop_duplicates(["user_id", "book_id"])
)


In [0]:
from pyspark.sql.functions import col, lower, trim

# Normalize the free-text columns: lowercase first, then strip surrounding whitespace.
df = (
    df.withColumn("review_text", trim(lower(col("review_text"))))
      .withColumn("title", trim(lower(col("title"))))
      .withColumn("name", trim(lower(col("name"))))
)


In [0]:
from pyspark.sql.functions import col, length

# Add the character length of each review, then keep only reviews with at
# least 10 characters.
df = (
    df.withColumn("review_length", length(col("review_text")))
      .where(col("review_length") >= 10)
)


In [0]:
# Print summary statistics (count, mean, stddev, min, quartiles, max) for the
# two numeric columns.
# NOTE(review): the recorded output shows min rating 0 and min n_votes -2 —
# confirm whether those values are valid before they feed the averages.
df.select(["rating", "n_votes"]).summary().show()


+-------+------------------+-----------------+
|summary|            rating|          n_votes|
+-------+------------------+-----------------+
|  count|              7516|             7516|
|   mean|3.6768227780734435|1.163650878126663|
| stddev|1.2797622517028802|5.754071825311664|
|    min|                 0|               -2|
|    25%|                 3|                0|
|    50%|                 4|                0|
|    75%|                 5|                1|
|    max|                 5|              208|
+-------+------------------+-----------------+



In [0]:
from pyspark.sql.functions import avg, col, count

# Per-book features: mean rating and number of reviews for each book_id.
agg_df = (
    df.groupby("book_id")
      .agg(
          avg(col("rating")).alias("avg_rating_per_book"),
          count(col("review_id")).alias("num_reviews"),
      )
)
display(agg_df)


book_id,avg_rating_per_book,num_reviews
20566082,4.0,1
25402549,4.285714285714286,7
13648085,2.75,8
780912,3.25,4
218094,3.0,1
18628484,4.571428571428571,7
581166,3.0,2
9848978,5.0,1
2168736,3.4285714285714284,7
26201145,4.666666666666667,3


In [0]:
# Root of the Gold layer in the lakehouse container, plus the feature-table
# path derived from it.
# NOTE(review): `target` is not referenced by the later cells visible in this
# notebook — confirm whether the write step was meant to use it.
gold_root = "abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/gold"
target = gold_root + "/features_v1"

# Optional sanity check: list what currently exists under the Gold root
display(dbutils.fs.ls(gold_root))

path,name,size,modificationTime
abfss://lakehouse@goodreadsreviewsgen2.dfs.core.windows.net/gold/curated_reviews/,curated_reviews/,0,1762195834000


In [0]:
# Persist the per-book features as a Delta table at /mnt/tmp/features_v1
# (not the ADLS Gold path), replacing any existing data and schema there.
agg_df.write.save(
    "/mnt/tmp/features_v1",
    format="delta",
    mode="overwrite",
    overwriteSchema="true",
)

print("✅ Successfully written locally to /mnt/tmp/features_v1")


✅ Successfully written locally to /mnt/tmp/features_v1


In [0]:
# Read the just-written feature table back from storage and render it.
display(spark.read.load("/mnt/tmp/features_v1", format="delta"))


book_id,avg_rating_per_book,num_reviews
25402549,4.285714285714286,7
780912,3.25,4
18628484,4.571428571428571,7
2471304,4.0,1
23246792,3.5,4
13340336,3.0,1
2168736,3.4285714285714284,7
23246799,5.0,2
831611,3.0,1
7943805,3.6666666666666665,3


In [0]:
# Re-load the written feature table from /mnt/tmp as a final verification step
check_df = spark.read.load("/mnt/tmp/features_v1", format="delta")

# Show the first 10 rows to confirm the columns and values look as expected
display(check_df.limit(10))


book_id,avg_rating_per_book,num_reviews
25402549,4.285714285714286,7
780912,3.25,4
18628484,4.571428571428571,7
2471304,4.0,1
23246792,3.5,4
13340336,3.0,1
2168736,3.4285714285714284,7
23246799,5.0,2
831611,3.0,1
7943805,3.6666666666666665,3
