In [5]:
pip install delta-spark==2.4.0

Note: you may need to restart the kernel to use updated packages.


In [6]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.functions import regexp_replace, col, when, trim, to_date
from pyspark.sql.types import IntegerType, StringType

In [9]:
#boxoffice_data_2024 
spark = ( SparkSession 
         .builder 
         .appName("BoxOffice Silver") 
         .getOrCreate() 
        )  
bronze_path = "hdfs://hdfs-nn:9000/demo/bronze/boxoffice_data_2024.csv" 
silver_path = "hdfs://hdfs-nn:9000/demo/silver/boxoffice_data_2024/"  
boxoffice_bronze_df = ( spark.read .option("header", "true") # set to "false" if your CSV has no header
                       .option("inferSchema", "true") # or define schema explicitly
                       .option("delimiter", ",") 
                       .csv(bronze_path)
                      ) 
boxoffice_bronze_df.show(5)
boxoffice_bronze_df.printSchema()  
boxoffice_silver_df = ( boxoffice_bronze_df 
                       .withColumn( "gross", regexp_replace(col("gross"), "[^0-9.]", "")
                                   .cast("double") ) 
                      ) 
boxoffice_silver_df.select("gross").show(10) 
boxoffice_silver_df.printSchema() 
boxoffice_silver_df = ( boxoffice_silver_df 
                       .withColumnRenamed("Year", "year") 
                       .withColumnRenamed("Title", "title") 
                      )  
boxoffice_silver_df.write .mode("overwrite") .parquet(silver_path)   
spark.read.parquet(silver_path)

+----+--------------------+------------+
|Year|               Title|       Gross|
+----+--------------------+------------+
|1984|   Beverly Hills Cop|$234,760,478|
|1984|        Ghostbusters|$229,376,332|
|1984|Indiana Jones and...|$179,876,727|
|1984|            Gremlins|$148,171,538|
|1984|      The Karate Kid| $90,817,155|
+----+--------------------+------------+
only showing top 5 rows

root
 |-- Year: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- Gross: string (nullable = true)

+------------+
|       gross|
+------------+
|2.34760478E8|
|2.29376332E8|
|1.79876727E8|
|1.48171538E8|
| 9.0817155E7|
| 8.1198894E7|
| 8.0038626E7|
| 7.6572547E7|
| 7.6471046E7|
| 6.9821334E7|
+------------+
only showing top 10 rows

root
 |-- Year: integer (nullable = true)
 |-- Title: string (nullable = true)
 |-- gross: double (nullable = true)



DataFrame[year: int, title: string, gross: double]

In [10]:
#movies_info

spark = (
    SparkSession
    .builder
    .appName("Movies Info Silver")
    .getOrCreate()
)

bronze_path = "hdfs://hdfs-nn:9000/demo/bronze/movies_info.json"
silver_path = "hdfs://hdfs-nn:9000/demo/silver/movies_info/"

info_bronze_df = (
    spark.read
        .option("multiLine", "true")
        .option("mode", "PERMISSIVE")
        .option("columnNameOfCorruptRecord", "_corrupt_record")
        .json(bronze_path)
)

info_bronze_df.show(5, truncate=False)

info_silver_df = (info_bronze_df.select(
        F.col("name").alias("title"),
        F.to_date(
            F.regexp_extract(F.col("released"), r"^([A-Za-z]+ \d{1,2}, \d{4})", 1),
            "MMMM d, yyyy"
        ).alias("released"),
        F.col("runtime").cast("int"),
        F.col("budget").cast("int"),
        F.col("director").cast("string"),
        F.col("writer").cast("string"),
        F.col("gross").cast("int")))

(
    info_silver_df
        .write
        .mode("overwrite")
        .parquet(silver_path)
)

spark.read.parquet(silver_path)

info_silver_df.show(5, truncate=False)

+--------+------------------+--------------+---------------+---------+---------+----------------------------------------------+------+-----------------------------+-------+-----+--------------+-------+-----------------------+----+
|budget  |company           |country       |director       |genre    |gross    |name                                          |rating|released                     |runtime|score|star          |votes  |writer                 |year|
+--------+------------------+--------------+---------------+---------+---------+----------------------------------------------+------+-----------------------------+-------+-----+--------------+-------+-----------------------+----+
|19000000|Warner Bros.      |United Kingdom|Stanley Kubrick|Drama    |46998772 |The Shining                                   |R     |June 13, 1980 (United States)|146    |8.4  |Jack Nicholson|927000 |Stephen King           |1980|
|4500000 |Columbia Pictures |United States |Randal Kleiser |Adventure|588531

In [11]:
#boxoffice_info

from pyspark.sql import SparkSession, functions as F

spark = (
    SparkSession.builder
    .appName("Join Boxoffice + MoviesInfo")
    .getOrCreate()
)

boxoffice_silver_path = "hdfs://hdfs-nn:9000/demo/silver/boxoffice_data_2024/"
moviesinfo_silver_path = "hdfs://hdfs-nn:9000/demo/silver/movies_info/"
joined_path = "hdfs://hdfs-nn:9000/demo/silver/boxoffice_info"

# 1) Read silver datasets
boxoffice = (
    spark.read.parquet(boxoffice_silver_path)
    .select(
        F.trim(F.col("title")).alias("title"),
        F.col("year").cast("int").alias("year"),
        F.col("gross").cast("long").alias("gross")
    )
)

movies = (
    spark.read.parquet(moviesinfo_silver_path)
    .select(
        F.trim(F.col("title")).alias("title"),
        F.to_date(F.col("released")).alias("released"),
        F.col("runtime").cast("int").alias("runtime"),
        F.col("director").cast("string").alias("director"),
        F.col("writer").cast("string").alias("writer"),
        F.col("budget").cast("long").alias("budget")
    )
)

# 2) De-dup / consolidate movies info ao nível (title, released, runtime)
#    Mantém budget máximo e escolhe um director/writer não nulo.
movies_1 = (
    movies
    .groupBy("title", "released", "runtime")
    .agg(
        F.max("budget").alias("budget"),
        F.first("director", ignorenulls=True).alias("director"),
        F.first("writer", ignorenulls=True).alias("writer"),
    )
)

# 3) Join: title + year(released)
joined = (
    boxoffice.alias("b")
    .join(
        movies_1.alias("m"),
        on=[
            F.col("b.title") == F.col("m.title"),
            F.col("b.year") == F.year(F.col("m.released"))
        ],
        how="left"
    )
    .select(
        F.col("m.released").alias("released"),
        F.col("b.title").alias("title"),
        F.col("b.gross").cast("long").alias("gross"),
        F.col("m.budget").cast("long").alias("budget"),
        F.col("m.director").alias("director"),
        F.col("m.writer").alias("writer"),
        F.col("m.runtime").cast("int").alias("runtime")
    )
)

joined.write.mode("overwrite").parquet(joined_path)
spark.read.parquet(joined_path).show(10, truncate=False)


+----------+------------------------------------+---------+--------+----------------+-----------------+-------+
|released  |title                               |gross    |budget  |director        |writer           |runtime|
+----------+------------------------------------+---------+--------+----------------+-----------------+-------+
|1984-12-05|Beverly Hills Cop                   |234760478|14000000|Martin Brest    |Daniel Petrie Jr.|105    |
|1984-06-08|Ghostbusters                        |229376332|30000000|Ivan Reitman    |Dan Aykroyd      |105    |
|1984-05-23|Indiana Jones and the Temple of Doom|179876727|28000000|Steven Spielberg|Willard Huyck    |118    |
|1984-06-08|Gremlins                            |148171538|11000000|Joe Dante       |Chris Columbus   |106    |
|1984-06-22|The Karate Kid                      |90817155 |8000000 |John G. Avildsen|Robert Mark Kamen|126    |
|1984-03-23|Police Academy                      |81198894 |4500000 |Hugh Wilson     |Neal Israel      |9

In [12]:
#dataset_piracy

spark = (
    SparkSession
    .builder
    .appName("Piracy Silver")
    .getOrCreate()
)

bronze_path = "hdfs://hdfs-nn:9000/demo/bronze/dataset_piracy.csv"
silver_path = "hdfs://hdfs-nn:9000/demo/silver/dataset_piracy/"

piracy_bronze_df = (
    spark.read
         .option("header", "true")      # o header da primeira linha
         .option("inferSchema", "true") # deixa o Spark inferir tipos
         .option("delimiter", ",")
         .csv(bronze_path)
)

piracy_bronze_df.show(5)
piracy_bronze_df.printSchema()

df = piracy_bronze_df

df = df.dropDuplicates()

cols_to_drop = [c for c in ["", "IMDb-rating", "id", "storyline", "rt_hours", "rt_mins", "language", "industry"] if c in df.columns]
df = df.drop(*cols_to_drop)

df = (
    df
    .withColumn("posted_date", to_date(col("posted_date"), "dd MMM, yyyy"))
    .withColumn("release_date", to_date(col("release_date"), "MMM dd yyyy"))
)

categorical_cols = [
    f.name for f in df.schema.fields
    if isinstance(f.dataType, StringType)
]

df = df.na.fill("Missing", subset=categorical_cols)

for c in ["views", "downloads"]:
    if c in df.columns:
        df = df.withColumn(
            c,
            regexp_replace(col(c).cast(StringType()), "[^0-9]", "").cast(IntegerType())
        )

df = df.filter(
    col("title").isNotNull() &
    (col("title") != "Missing") &
    col("posted_date").isNotNull()
)

piracy_silver_df = df

piracy_silver_df.show(30)
piracy_silver_df.printSchema()

pdf = piracy_silver_df.toPandas()   
pdf.to_csv("piracy_silver_sample.csv", index=False)

piracy_silver_df.describe(['_c0','downloads',"posted_date",'release_date','title','views', "writer", "director", "appropriate_for"]).toPandas()

(
    piracy_silver_df
        .write
        .mode("overwrite")
        .parquet(silver_path)
)

spark.read.parquet(silver_path)

+--------------------+----------------+---------------+-----------+---------+------+-------------------+--------+------------+------------+--------+---------+-----+-----+------+
|                 _c0|     IMDb-rating|appropriate_for|   director|downloads|    id|           industry|language| posted_date|release_date|run_time|storyline|title|views|writer|
+--------------------+----------------+---------------+-----------+---------+------+-------------------+--------+------------+------------+--------+---------+-----+-----+------+
|                   0|             4.8|              R|  John Swab|      304|372092|Hollywood / English| English|20 Feb, 2023| Jan 28 2023|     105|      Doc| null| null|  null|
| facilitates a fr...|        trading |           null|       null|     null|  null|               null|    null|        null|        null|    null|     null| null| null|  null|
|prosecutorial len...| Doc is left to |           null|       null|     null|  null|               null|    nu

DataFrame[_c0: string, appropriate_for: string, director: string, downloads: int, posted_date: date, release_date: date, run_time: string, title: string, views: int, writer: string]

In [13]:
#audience_reviews

spark = SparkSession.builder.appName("SilverAudienceReviews").getOrCreate()


df = spark.read.option("header", True).csv("audience_reviews.csv")

# Limpeza: considera vazios, espaços e strings não-numéricas como NULL
df_clean = df.withColumn(
    "Rating_num",
    when(trim(col("Rating")) == "", None)
    .when(col("Rating").rlike("^[0-9.]+$"), col("Rating").cast("double"))
    .otherwise(None)
)

# manter só as que têm um rating válido:
df_clean = df_clean.filter(col("Rating_num").isNotNull())

# Seleciona apenas as colunas essenciais (Show, Rating_num)
df_final = df_clean.select("Show", "Rating_num")
df_final.write.format("parquet").mode("overwrite").save("hdfs://hdfs-nn:9000/demo/silver/audience_reviews")

df_final.show(5)
df_final.printSchema()

+----+----------+
|Show|Rating_num|
+----+----------+
|Test|       0.0|
|Loot|       2.5|
|Loot|       4.5|
|Loot|       0.5|
|Loot|       4.5|
+----+----------+
only showing top 5 rows

root
 |-- Show: string (nullable = true)
 |-- Rating_num: double (nullable = true)



In [14]:
#critic_reviews

spark = SparkSession.builder.appName("SilverCriticReviews").getOrCreate()

bronze_path = "hdfs://hdfs-nn:9000/demo/bronze/critic_reviews.csv"

df = (
    spark.read
         .option("header", "true")      # o header da primeira linha
         .option("inferSchema", "true") # deixa o Spark inferir tipos
         .option("delimiter", ",")
         .csv(bronze_path)
)

df = df.select("Show", "Sentiment")


#df_clean = df_clean.withColumn("Sentiment", col("Sentiment").cast("int"))


#df_final = df_clean.select("Show", "Sentiment")


df.write.format("parquet").mode("overwrite").save("hdfs://hdfs-nn:9000/demo/silver/critic_reviews")

df.show(5)
df.printSchema()

+----------------+---------+
|            Show|Sentiment|
+----------------+---------+
|Mare of Easttown|        1|
|Mare of Easttown|        1|
|Mare of Easttown|        1|
|Mare of Easttown|        1|
|Mare of Easttown|        1|
+----------------+---------+
only showing top 5 rows

root
 |-- Show: string (nullable = true)
 |-- Sentiment: string (nullable = true)



In [15]:
#tv_show_links

spark = SparkSession.builder.appName("SilverTVShowLinks").getOrCreate()


df = spark.read.option("header", True).csv("tv_show_links.csv")


df_final = df.select("Show", "Network", "Critic Score", "Audience Score")


df_final = df_final.dropDuplicates()


df_final.write.format("parquet").mode("overwrite").save("hdfs://hdfs-nn:9000/demo/silver/tv_show_links")

df_final.show(5)
df_final.printSchema()

+--------------------+----------+------------+--------------+
|                Show|   Network|Critic Score|Audience Score|
+--------------------+----------+------------+--------------+
|         Outer Banks|   Netflix|         78%|           79%|
|           Cobra Kai|   Netflix|         93%|           92%|
|The Haunting of H...|   Netflix|         93%|           91%|
|         Blue Bloods|Paramount+|          --|           88%|
|  Person of Interest|   HBO MAX|         92%|           76%|
+--------------------+----------+------------+--------------+
only showing top 5 rows

root
 |-- Show: string (nullable = true)
 |-- Network: string (nullable = true)
 |-- Critic Score: string (nullable = true)
 |-- Audience Score: string (nullable = true)



In [16]:
#rotten_tomatoes_movies

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, trim, to_date, regexp_replace
from pyspark.sql.types import IntegerType

spark = SparkSession.builder.appName("SilverRottenTomatoes").getOrCreate()

df = (spark.read
      .option("header", True)
      .option("multiLine", True)
      .option("quote", "\"")
      .option("escape", "\"")
      .csv("rotten_tomatoes_movies.csv"))

# Normalizar strings: trim + vazio -> null
string_cols = [
    "movie_title", "rating", "genre", "directors", "writers",
    "cast", "studio_name", "tomatometer_status"
]
for c in string_cols:
    df = df.withColumn(c, when(trim(col(c)) == "", None).otherwise(trim(col(c))))

# Datas
df = df.withColumn("in_theaters_date", to_date(col("in_theaters_date"), "yyyy-MM-dd"))
df = df.withColumn("on_streaming_date", to_date(col("on_streaming_date"), "yyyy-MM-dd"))

# Numéricos (para não irem como texto)
df = df.withColumn("runtime_in_minutes", when(trim(col("runtime_in_minutes")) == "", None)
                                  .otherwise(trim(col("runtime_in_minutes"))))
df = df.withColumn("runtime_in_minutes", when(col("runtime_in_minutes") == "0", None)
                                  .otherwise(col("runtime_in_minutes")).cast(IntegerType()))

df = df.withColumn("tomatometer_rating", regexp_replace(col("tomatometer_rating"), "%", "").cast(IntegerType()))
df = df.withColumn("audience_rating",    regexp_replace(col("audience_rating"), "%", "").cast(IntegerType()))
df = df.withColumn("tomatometer_count",  col("tomatometer_count").cast(IntegerType()))
df = df.withColumn("audience_count",     col("audience_count").cast(IntegerType()))

# Validar status (resto -> null)
valid_status = ["Certified Fresh", "Fresh", "Rotten"]
df = df.withColumn("tomatometer_status",
                   when(col("tomatometer_status").isin(valid_status), col("tomatometer_status")).otherwise(None))

# Dedup por chave
df = df.dropDuplicates(["movie_title", "in_theaters_date", "runtime_in_minutes"])

# Seleção final (NÃO inclui critics_consensus)
df_final = df.select(
    "movie_title", "rating", "genre", "directors", "writers",
    "in_theaters_date", "on_streaming_date", "runtime_in_minutes",
    "studio_name", "tomatometer_status", "tomatometer_rating",
    "tomatometer_count", "audience_rating", "audience_count"
)

df_final.write.mode("overwrite").parquet("hdfs://hdfs-nn:9000/demo/silver/rotten_tomatoes_movies")
df_final.show(50)

+--------------------+------+--------------------+--------------------+--------------------+----------------+-----------------+------------------+--------------------+------------------+------------------+-----------------+---------------+--------------+
|         movie_title|rating|               genre|           directors|             writers|in_theaters_date|on_streaming_date|runtime_in_minutes|         studio_name|tomatometer_status|tomatometer_rating|tomatometer_count|audience_rating|audience_count|
+--------------------+------+--------------------+--------------------+--------------------+----------------+-----------------+------------------+--------------------+------------------+------------------+-----------------+---------------+--------------+
|             #Horror|    NR|              Horror|        Tara Subkoff|        Tara Subkoff|      2015-11-20|       2016-04-05|                90|    Lowland Pictures|            Rotten|                50|               12|            

In [17]:
#full_data

spark = (SparkSession.builder
         .appName("Silverfull_data")
         .enableHiveSupport()
         .getOrCreate())
spark.sparkContext.setLogLevel("WARN")
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

HDFS_NN = "hdfs://hdfs-nn:9000"
BRONZE_BASE = f"{HDFS_NN}/demo/bronze"
SILVER_BASE = f"{HDFS_NN}/demo/silver"

BRONZE_FULL = f"{BRONZE_BASE}/full_data.csv"
OUT_FULL    = f"{SILVER_BASE}/full_data"


raw  = spark.read.text(BRONZE_FULL)
norm = raw.select(F.regexp_replace(F.col("value"), r"\s{2,}", "\t").alias("line"))
parts = norm.select(F.split(F.col("line"), "\t").alias("p"))


f = (parts.select(
        F.trim(F.col("p")[0]).alias("Ceremony"),
        F.trim(F.col("p")[1]).alias("Year"),
        F.trim(F.col("p")[2]).alias("Class"),
        F.trim(F.col("p")[3]).alias("CanonicalCategory"),
        F.trim(F.col("p")[4]).alias("Category"),
        F.trim(F.col("p")[5]).alias("NomId"),
        F.trim(F.col("p")[6]).alias("Film"),
        F.trim(F.col("p")[7]).alias("FilmId"),
        F.trim(F.col("p")[8]).alias("Name"),
        F.trim(F.col("p")[9]).alias("Nominees"),
        F.trim(F.col("p")[10]).alias("NomineeIds"),
        F.trim(F.col("p")[11]).alias("Winner"),
        F.trim(F.col("p")[12]).alias("Detail"),
        F.trim(F.col("p")[13]).alias("Note"),
        F.trim(F.col("p")[14]).alias("Citation"),
        F.trim(F.col("p")[15]).alias("MultifilmNomination"),
     )
     .filter(F.col("Ceremony") != "Ceremony")
)


f = (f.withColumn("Film", F.lower(F.trim(F.col("Film"))))
       .withColumn("CanonicalCategory",
            F.when(F.col("CanonicalCategory").isNotNull(),
                F.lower(F.trim(F.col("CanonicalCategory")))))
      .withColumn("Winner",
            F.when(F.col("Winner").isNull() | (F.trim(F.col("Winner")) == ""), F.lit(False))
                 .otherwise(F.lit(True)))
      .withColumn("MultifilmNomination",
            F.when(F.col("MultifilmNomination").isNull() | (F.trim(F.col("MultifilmNomination")) == ""), F.lit(False))
                 .otherwise(F.lit(True)))

)


missing_cols = ["Film", "FilmId", "Name", "Nominees"]

for colname in missing_cols:
    if colname in f.columns:
        f = f.withColumn(
            colname,
            F.when(F.col(colname).isNull() | (F.trim(F.col(colname)) == ""), F.lit("Missing/Unknown"))
             .otherwise(F.col(colname)))



keep_f = [x for x in ["Year","Film","CanonicalCategory","FilmId","Name","Nominees","Winner","MultifilmNomination"] if x in f.columns]
f = f.select(*keep_f)

(f.repartition(20)
 .write.mode("overwrite")
 .parquet(OUT_FULL))
print(f"OK — FULL_DATA → {OUT_FULL}")

f.show(5)

OK — FULL_DATA → hdfs://hdfs-nn:9000/demo/silver/full_data
+-------+--------------------+--------------------+---------+-------------------+-------------------+------+-------------------+
|   Year|                Film|   CanonicalCategory|   FilmId|               Name|           Nominees|Winner|MultifilmNomination|
+-------+--------------------+--------------------+---------+-------------------+-------------------+------+-------------------+
|1927/28|           the noose|actor in a leadin...|tt0019217|Richard Barthelmess|Richard Barthelmess|  true|              false|
|1927/28|the patent leathe...|actor in a leadin...|tt0018253|Richard Barthelmess|Richard Barthelmess|  true|              false|
|1927/28|    the last command|actor in a leadin...|tt0019071|      Emil Jannings|      Emil Jannings|  true|              false|
|1927/28|the way of all flesh|actor in a leadin...|tt0019553|      Emil Jannings|      Emil Jannings|  true|              false|
|1927/28|     a ship comes in|actress 

In [19]:
#the_oscar_award

spark = (SparkSession.builder
         .appName("Silverthe_oscar_award")
         .enableHiveSupport()
         .getOrCreate())
spark.sparkContext.setLogLevel("WARN")
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

HDFS_NN = "hdfs://hdfs-nn:9000"
BRONZE_BASE = f"{HDFS_NN}/demo/bronze"
SILVER_BASE = f"{HDFS_NN}/demo/silver"

BRONZE_OSCARS = f"{BRONZE_BASE}/the_oscar_award.csv"
OUT_OSCARS    = f"{SILVER_BASE}/the_oscar_award"

# Ler
o = (spark.read.option("header", True).csv(BRONZE_OSCARS))


o = (o.withColumn("film", F.lower(F.trim(F.col("film"))))
       .withColumn("canon_category", F.when(F.col("canon_category").isNotNull(), F.lower(F.trim(F.col("canon_category")))))
)

missing_cols = ["film"]

for colname in missing_cols:
    if colname in o.columns:
        o = o.withColumn(
            colname,
            F.when(F.col(colname).isNull() | (F.trim(F.col(colname)) == ""), F.lit("Unknown"))
             .otherwise(F.col(colname)))


keep_o = [x for x in ["year_film","year_ceremony","canon_category","name","film","winner"] if x in o.columns]
o = o.select(*keep_o)

(o.repartition(12)
   .write.mode("overwrite")
   .parquet(OUT_OSCARS))

print(f"OK — OSCAR_AWARD → {OUT_OSCARS}")

o.show(5)

OK — OSCAR_AWARD → hdfs://hdfs-nn:9000/demo/silver/the_oscar_award
+---------+-------------+--------------------+-------------------+--------------------+------+
|year_film|year_ceremony|      canon_category|               name|                film|winner|
+---------+-------------+--------------------+-------------------+--------------------+------+
|     1927|         1928|actor in a leadin...|Richard Barthelmess|           the noose| False|
|     1927|         1928|actor in a leadin...|Richard Barthelmess|the patent leathe...| False|
|     1927|         1928|actor in a leadin...|      Emil Jannings|    the last command|  True|
|     1927|         1928|actor in a leadin...|      Emil Jannings|the way of all flesh|  True|
|     1927|         1928|actress in a lead...|     Louise Dresser|     a ship comes in| False|
+---------+-------------+--------------------+-------------------+--------------------+------+
only showing top 5 rows



In [20]:
#oscar_fulldata

spark = (
    SparkSession
    .builder
    .appName("Join TheOscarAward + FullData")
    .getOrCreate())
spark.sparkContext.setLogLevel("WARN")
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

oscar_silver_path = "hdfs://hdfs-nn:9000/demo/silver/the_oscar_award"
full_silver_path  = "hdfs://hdfs-nn:9000/demo/silver/full_data"

joined_path = "hdfs://hdfs-nn:9000/demo/silver/oscar_fulldata"

# 1) Read silver datasets
o = (
    spark.read.parquet(oscar_silver_path)
    .select(
        F.col("year_film").cast("int").alias("year_film"),
        F.col("year_ceremony").cast("int").alias("year_ceremony"),
        F.lower(F.trim(F.col("canon_category"))).alias("canon_category"),
        F.col("name").alias("name"),
        F.lower(F.trim(F.col("film"))).alias("film"),
        F.col("winner").cast("boolean").alias("winner")
    )
)

f = (
    spark.read.parquet(full_silver_path)
    .select(
        F.col("Year").cast("int").alias("year_film"),  # <- assumimos Year == year_film
        F.lower(F.trim(F.col("CanonicalCategory"))).alias("canon_category"),
        F.lower(F.trim(F.col("Film"))).alias("film"),
        F.col("FilmId").alias("FilmId"),
        F.col("Nominees").alias("Nominees"),
        F.col("MultifilmNomination").cast("boolean").alias("MultifilmNomination")
    )
)

# (optional) dedup full_data no nível da chave de join (para não duplicar linhas)
f_1 = (
    f.groupBy("year_film", "film", "canon_category")
     .agg(
         F.first("FilmId", ignorenulls=True).alias("FilmId"),
         F.first("Nominees", ignorenulls=True).alias("Nominees"),
         F.max("MultifilmNomination").alias("MultifilmNomination")
     )
)

# 2) Join: year_film + film + canon_category
joined = (
    o.alias("o")
    .join(
        f_1.alias("f"),
        on=[
            F.col("o.year_film") == F.col("f.year_film"),
            F.col("o.film") == F.col("f.film"),
            F.col("o.canon_category") == F.col("f.canon_category")
        ],
        how="left"
    )
    .select(
        F.col("o.year_film"),
        F.col("o.year_ceremony"),
        F.col("o.canon_category"),
        F.col("o.film"),
        F.col("o.winner"),
        F.col("f.FilmId"),
        F.col("f.Nominees"),
        F.col("f.MultifilmNomination")
    )
)

joined.write.mode("overwrite").parquet(joined_path)
spark.read.parquet(joined_path).show(30, truncate=False)

+---------+-------------+--------------------------------+-------------------------------------------------+------+---------+--------------------------------------------------------------------------------------------------------------------------------------+-------------------+
|year_film|year_ceremony|canon_category                  |film                                             |winner|FilmId   |Nominees                                                                                                                              |MultifilmNomination|
+---------+-------------+--------------------------------+-------------------------------------------------+------+---------+--------------------------------------------------------------------------------------------------------------------------------------+-------------------+
|1944     |1945         |music (original song)           |minstrel man                                     |false |tt0037076|Harry Revel, Paul Webster       

In [22]:
#actorfilms

spark = SparkSession.builder.appName("LimpezaActorFilms").getOrCreate()

df = spark.read.option("header", True).csv("hdfs://hdfs-nn:9000/demo/bronze/actorfilms.csv")

df_clean = df.dropDuplicates()

df_clean = (
    df_clean.withColumn("Year", col("Year").cast("int"))
    .withColumn("Votes", col("Votes").cast("int"))
    .withColumn("Rating", col("Rating").cast("double"))
)

output_path = "hdfs://hdfs-nn:9000/demo/silver/actorfilms"

df_clean.write.format("parquet").mode("overwrite").save(output_path)
df_clean.show(5)

+-------------------+---------+------------------+----+------+------+---------+
|              Actor|  ActorID|              Film|Year| Votes|Rating|   FilmID|
+-------------------+---------+------------------+----+------+------+---------+
|       Fred Astaire|nm0000001|  Finian's Rainbow|1968|  3377|   6.2|tt0062974|
|Olivia de Havilland|nm0000014|           Raffles|1939|   833|   6.3|tt0032963|
|   Marlene Dietrich|nm0000017|Destry Rides Again|1939| 10411|   7.7|tt0031225|
|       Kirk Douglas|nm0000018|     Lust for Life|1956| 10760|   7.4|tt0049456|
|    Charlton Heston|nm0000032|Planet of the Apes|2001|211293|   5.7|tt0133152|
+-------------------+---------+------------------+----+------+------+---------+
only showing top 5 rows



In [23]:
#credits_netflix

spark = (SparkSession.builder
         .appName("Silvercredits")
         .enableHiveSupport()
         .getOrCreate())
spark.sparkContext.setLogLevel("WARN")
spark.conf.set("spark.sql.sources.partitionOverwriteMode", "dynamic")

HDFS_NN = "hdfs://hdfs-nn:9000"
BRONZE_BASE = f"{HDFS_NN}/demo/bronze"
SILVER_BASE = f"{HDFS_NN}/demo/silver"

BRONZE_CREDITS = f"{BRONZE_BASE}/credits_netflix.csv"
OUT_CREDITS    = f"{SILVER_BASE}/credits_netflix"


c = (spark.read.option("header", True).csv(BRONZE_CREDITS))


c = (c.withColumn("name", F.regexp_replace(F.trim(F.col("name")), r"\s+", " "))
       .withColumn("role", F.upper(F.trim(F.col("role"))))
       .filter(F.col("role").isin("ACTOR","DIRECTOR"))
)
keep_c = [x for x in ["person_id", "name","role"] if x in c.columns]
c = c.select(*keep_c)

(c.repartition(20).write.mode("overwrite").parquet(OUT_CREDITS))
print(f"OK — CREDITS → {OUT_CREDITS}")

c.show(5)

OK — CREDITS → hdfs://hdfs-nn:9000/demo/silver/credits_netflix
+---------+---------------+-----+
|person_id|           name| role|
+---------+---------------+-----+
|     3748| Robert De Niro|ACTOR|
|    14658|   Jodie Foster|ACTOR|
|     7064|  Albert Brooks|ACTOR|
|     3739|  Harvey Keitel|ACTOR|
|    48933|Cybill Shepherd|ACTOR|
+---------+---------------+-----+
only showing top 5 rows



In [24]:
#credits_amazon

spark = SparkSession.builder.appName("LimpezaCredits").getOrCreate()

df = spark.read.option("header", True).csv("hdfs://hdfs-nn:9000/demo/bronze/credits_amazon.csv")

df_clean = df.dropDuplicates()

cols_to_drop = ["character","id"] 
df_clean = df_clean.drop(*cols_to_drop)

df_clean = (
    df_clean.withColumn("person_id", col("person_id").cast("int"))
)

output_path = "hdfs://hdfs-nn:9000/demo/silver/credits_amazon"

df_clean.write.format("parquet").mode("overwrite").save(output_path)

df_clean.show(5)

+---------+-----------------+-----+
|person_id|             name| role|
+---------+-----------------+-----+
|    36507|     Carl Switzer|ACTOR|
|    58342|Ronald R. Rondell|ACTOR|
|    55732|      Bessie Love|ACTOR|
|   175617|     Joseph Forte|ACTOR|
|   151228|     Merle Oberon|ACTOR|
+---------+-----------------+-----+
only showing top 5 rows



In [25]:
#credits

from pyspark.sql import SparkSession, functions as F

spark = (SparkSession.builder
         .appName("JoinCredits")
         .getOrCreate())

HDFS_NN = "hdfs://hdfs-nn:9000"
SILVER_BASE = f"{HDFS_NN}/demo/silver"

AMZ_PATH = f"{SILVER_BASE}/credits_amazon"
NFX_PATH = f"{SILVER_BASE}/credits_netflix"
OUT_PATH = f"{SILVER_BASE}/credits"

amz = spark.read.parquet(AMZ_PATH)
nfx = spark.read.parquet(NFX_PATH)

def clean_credits(df):
    return (df
        .select(
            F.col("person_id").cast("int").alias("person_id"),
            F.regexp_replace(F.trim(F.col("name")), r"\s+", " ").alias("name"),
            F.upper(F.trim(F.col("role"))).alias("role")
        )
        .filter(F.col("person_id").isNotNull())
        .filter(F.col("name").isNotNull())
        .filter(F.col("role").isin("ACTOR", "DIRECTOR"))
        .dropDuplicates(["person_id", "role"])   # mesmo id pode ter 2 roles; mantém 1 linha por role
    )

amz_c = clean_credits(amz)
nfx_c = clean_credits(nfx)

credits_joined = (
    amz_c
    .unionByName(nfx_c)
    .dropDuplicates(["person_id", "role"])  # remove duplicados cross-source
)

credits_joined.write.mode("overwrite").parquet(OUT_PATH)
print(f"OK — CREDITS JOINED → {OUT_PATH}")

credits_joined.show(10, truncate=False)
credits_joined.groupBy("role").count().show()


OK — CREDITS JOINED → hdfs://hdfs-nn:9000/demo/silver/credits
+---------+-------------------+-----+
|person_id|name               |role |
+---------+-------------------+-----+
|8        |Arsher Ali         |ACTOR|
|19       |Guido Caprino      |ACTOR|
|36       |Livia Brito Pestana|ACTOR|
|45       |David Lewis        |ACTOR|
|46       |Jack La Rue        |ACTOR|
|53       |Phil LaMarr        |ACTOR|
|56       |Michael Rosenbaum  |ACTOR|
|59       |Juliette Goglia    |ACTOR|
|60       |Jamie Chung        |ACTOR|
|61       |Ryan Hansen        |ACTOR|
+---------+-------------------+-----+
only showing top 10 rows

+--------+------+
|    role| count|
+--------+------+
|DIRECTOR|  9209|
|   ACTOR|112751|
+--------+------+



In [26]:
#titles_netflix

from pyspark.sql import SparkSession, functions as F

spark = (SparkSession.builder
         .appName("SilverTitlesNetflix_FIXED")
         .enableHiveSupport()
         .getOrCreate())
spark.sparkContext.setLogLevel("WARN")

HDFS_NN     = "hdfs://hdfs-nn:9000"
BRONZE_BASE = f"{HDFS_NN}/demo/bronze"
SILVER_BASE = f"{HDFS_NN}/demo/silver"

BRONZE_TITLES_NF = f"{BRONZE_BASE}/titles_netflix.csv"
OUT_TITLES_NF    = f"{SILVER_BASE}/titles_netflix"

raw = (spark.read.format("csv")
       .option("header", True)
       .option("multiLine", True)     # important if description has newlines [web:36]
       .option("quote", "\"")
       .option("escape", "\"")
       .load(BRONZE_TITLES_NF))

t = (raw
     .withColumnRenamed("title", "title_netflix")
     .withColumn("title_netflix", F.trim(F.col("title_netflix")))
     .withColumn("type", F.upper(F.trim(F.col("type"))))
     .withColumn("release_year", F.col("release_year").cast("int"))
     .withColumn("runtime", F.col("runtime").cast("int"))
     .withColumn("age_certification", F.upper(F.trim(F.col("age_certification"))))
     .withColumn("genres", F.lower(F.trim(F.col("genres"))))
     .withColumn("production_countries", F.lower(F.trim(F.col("production_countries"))))
     .withColumn("imdb_score", F.col("imdb_score").cast("double"))
     .withColumn("tmdb_popularity", F.col("tmdb_popularity").cast("double"))
     .withColumn("tmdb_score", F.col("tmdb_score").cast("double"))
)

# Fill missing categoricals
for c in ["title_netflix", "age_certification", "genres", "production_countries", "type"]:
    if c in t.columns:
        t = t.withColumn(
            c,
            F.when(F.col(c).isNull() | (F.trim(F.col(c)) == ""), F.lit("unknown"))
             .otherwise(F.col(c))
        )

# Keep ONLY what you want (prevents any stray columns like description leaking forward) [web:47]
t = (t.select(
        "title_netflix", "runtime", "type", "release_year", "age_certification",
        "genres", "production_countries", "imdb_score", "tmdb_popularity", "tmdb_score"
     )
     .dropDuplicates()
)

t.write.mode("overwrite").parquet(OUT_TITLES_NF)
print(f"OK — TITLES NETFLIX (fixed) → {OUT_TITLES_NF}")
t.show(5, truncate=False)


OK — TITLES NETFLIX (fixed) → hdfs://hdfs-nn:9000/demo/silver/titles_netflix
+------------------------------+-------+-----+------------+-----------------+------------------------------------------+--------------------+----------+---------------+----------+
|title_netflix                 |runtime|type |release_year|age_certification|genres                                    |production_countries|imdb_score|tmdb_popularity|tmdb_score|
+------------------------------+-------+-----+------------+-----------------+------------------------------------------+--------------------+----------+---------------+----------+
|Richard Pryor: Live in Concert|78     |MOVIE|1979        |R                |['comedy', 'documentation']               |['us']              |8.1       |4.718          |7.5       |
|Heartland                     |44     |SHOW |2007        |TV-PG            |['drama', 'family']                       |['ca']              |8.4       |74.638         |8.3       |
|Freddy vs. Jason      

In [27]:
# titles_amazon

from pyspark.sql import SparkSession, functions as F

spark = (SparkSession.builder
         .appName("SilverTitlesAmazon_FIXED")
         .enableHiveSupport()
         .getOrCreate())
spark.sparkContext.setLogLevel("WARN")

HDFS_NN     = "hdfs://hdfs-nn:9000"
BRONZE_BASE = f"{HDFS_NN}/demo/bronze"
SILVER_BASE = f"{HDFS_NN}/demo/silver"

BRONZE_TITLES_AMZ = f"{BRONZE_BASE}/titles_amazon.csv"
OUT_TITLES_AMZ    = f"{SILVER_BASE}/titles_amazon"

raw = (spark.read.format("csv")
       .option("header", True)
       .option("multiLine", True)     # important if description has newlines [web:36]
       .option("quote", "\"")
       .option("escape", "\"")
       .load(BRONZE_TITLES_AMZ))

df = (raw
      .withColumnRenamed("title", "title_amazon")
      .withColumn("title_amazon", F.trim(F.col("title_amazon")))
      .withColumn("type", F.upper(F.trim(F.col("type"))))
      .withColumn("runtime", F.col("runtime").cast("int"))
      .withColumn("release_year", F.col("release_year").cast("int"))
      .withColumn("age_certification", F.upper(F.trim(F.col("age_certification"))))
      .withColumn("genres", F.lower(F.trim(F.col("genres"))))
      .withColumn("production_countries", F.lower(F.trim(F.col("production_countries"))))
      .withColumn("imdb_score", F.col("imdb_score").cast("double"))
      .withColumn("imdb_votes", F.col("imdb_votes").cast("int"))
      .withColumn("tmdb_popularity", F.col("tmdb_popularity").cast("double"))
      .withColumn("tmdb_score", F.col("tmdb_score").cast("double"))
)

for c in ["title_amazon", "age_certification", "genres", "production_countries", "type"]:
    if c in df.columns:
        df = df.withColumn(
            c,
            F.when(F.col(c).isNull() | (F.trim(F.col(c)) == ""), F.lit("unknown"))
             .otherwise(F.col(c))
        )

df = (df.select(
        "title_amazon", "runtime", "type", "release_year", "age_certification",
        "genres", "production_countries", "imdb_score", "imdb_votes",
        "tmdb_popularity", "tmdb_score"
      )
      .dropDuplicates()
)

df.write.mode("overwrite").parquet(OUT_TITLES_AMZ)
print(f"OK — TITLES AMAZON (fixed) → {OUT_TITLES_AMZ}")
df.show(5, truncate=False)


OK — TITLES AMAZON (fixed) → hdfs://hdfs-nn:9000/demo/silver/titles_amazon
+----------------------+-------+-----+------------+-----------------+-------------------------------------+--------------------+----------+----------+---------------+----------+
|title_amazon          |runtime|type |release_year|age_certification|genres                               |production_countries|imdb_score|imdb_votes|tmdb_popularity|tmdb_score|
+----------------------+-------+-----+------------+-----------------+-------------------------------------+--------------------+----------+----------+---------------+----------+
|'Neath Brooklyn Bridge|61     |MOVIE|1942        |unknown          |['romance', 'war', 'comedy', 'drama']|['us']              |5.8       |659       |1.809          |6.9       |
|The Medicine Man      |66     |MOVIE|1930        |unknown          |['comedy', 'romance']                |['us']              |4.9       |249       |2.306          |5.0       |
|Sex Madness           |57     |MOV

In [20]:
#titles

from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = (SparkSession.builder
         .appName("Titles_Union_Dedup_Simple")
         .getOrCreate())
spark.sparkContext.setLogLevel("WARN")

HDFS_NN     = "hdfs://hdfs-nn:9000"
SILVER_BASE = f"{HDFS_NN}/demo/silver"

NETFLIX_PATH = f"{SILVER_BASE}/titles_netflix"
AMAZON_PATH  = f"{SILVER_BASE}/titles_amazon"
OUT_PATH     = f"{SILVER_BASE}/titles"

# helper: normalize title
def clean_title(c):
    return F.regexp_replace(F.trim(c), r"\s+", " ")

# --- Read
nf  = spark.read.parquet(NETFLIX_PATH)
amz = spark.read.parquet(AMAZON_PATH)

# NOTE: in your silver, column names are likely: titlenetflix/titleamazon (not title_netflix/title_amazon) [file:29]
# If so, change them below accordingly.

nf2 = (nf
    .withColumn("title", clean_title(F.col("title_netflix")))     # or "title_netflix" if that is your real col name
    .withColumn("type", F.lower(F.trim(F.col("type"))))
    .withColumn("netflix", F.lit(1))
    .withColumn("amazon", F.lit(0))
    .drop("titlenetflix")
)

amz2 = (amz
    .withColumn("title", clean_title(F.col("title_amazon")))      # or "title_amazon" if that is your real col name
    .withColumn("type", F.lower(F.trim(F.col("type"))))
    .withColumn("netflix", F.lit(0))
    .withColumn("amazon", F.lit(1))
    .drop("titleamazon")
)

u = nf2.unionByName(amz2, allowMissingColumns=True)

group_key = ["title", "release_year", "type"]   # change to "release_year" if that’s your real name

out0 = (u
    .groupBy(*group_key)
    .agg(
        F.max("netflix").alias("netflix"),
        F.max("amazon").alias("amazon"),
        F.first("age_certification", ignorenulls=True).alias("age_certification"),
        F.first("genres", ignorenulls=True).alias("genres"),
        F.first("runtime", ignorenulls=True).alias("runtime"),
        F.first("production_countries", ignorenulls=True).alias("production_countries"),
        F.first("imdb_score", ignorenulls=True).alias("imdb_score"),
        F.first("imdb_votes", ignorenulls=True).alias("imdb_votes"),
        F.first("tmdb_popularity", ignorenulls=True).alias("tmdb_popularity"),
        F.first("tmdb_score", ignorenulls=True).alias("tmdb_score"),
    )
)

w = Window.orderBy(F.col("title"), F.col("release_year"), F.col("type"))

out = (out0
    .withColumn("id_num", F.row_number().over(w))
    .withColumn("id_title", F.lpad(F.col("id_num").cast("string"), 4, "0"))
    .drop("id_num")
    .select(
        "id_title", "title", F.col("release_year").alias("release_year"), "runtime", "type", "netflix", "amazon",
        "age_certification", "genres", "production_countries",
        "imdb_score", "imdb_votes", "tmdb_popularity", "tmdb_score"
    )
)

out.write.mode("overwrite").parquet(OUT_PATH)
print(f"OK → {OUT_PATH}")
out.show(50, truncate=False)


OK → hdfs://hdfs-nn:9000/demo/silver/titles
+--------+-----------------------------------------+------------+-------+-----+-------+------+-----------------+---------------------------------------------------------------+--------------------+----------+----------+------------------+----------+
|id_title|title                                    |release_year|runtime|type |netflix|amazon|age_certification|genres                                                         |production_countries|imdb_score|imdb_votes|tmdb_popularity   |tmdb_score|
+--------+-----------------------------------------+------------+-------+-----+-------+------+-----------------+---------------------------------------------------------------+--------------------+----------+----------+------------------+----------+
|0001    |#ABtalks                                 |2018        |68     |show |1      |0     |TV-PG            |[]                                                             |[]                  |9.6      

In [21]:
spark.stop()