### 0. Import Libraries

In [None]:
import pyspark.sql.functions as F

### 1. Extract Data

In [None]:
# Load the Delta table at Tables/IMDb_Movies_with_People; every aggregate
# in the transform step is derived from this DataFrame.
silver_reader = spark.read.format("delta")
df_silver_data = silver_reader.load("Tables/IMDb_Movies_with_People")

# Show the loaded rows in the notebook output for a quick visual check.
display(df_silver_data)

### 2. Transform Data

In [None]:
# Per-genre summary: for each distinct value of the "genres" column, compute
# the mean of "averageRating" and the number of input rows, then sort so the
# highest mean rating comes first.
# NOTE(review): "movie_count" is a row count (count("*")); it equals the number
# of movies only if the input has one row per movie -- confirm.
# NOTE(review): if "genres" holds multi-value strings, each distinct
# combination forms its own group -- confirm against the table schema.
genre_metrics = [
    F.avg("averageRating").alias("avg_rating"),
    F.count("*").alias("movie_count"),
]
df_top_movies_by_genre = (
    df_silver_data
    .groupBy("genres")
    .agg(*genre_metrics)
    .orderBy(F.col("avg_rating").desc())
)

display(df_top_movies_by_genre)

In [None]:
# Per-year summary: mean "averageRating" and number of input rows for each
# "startYear", sorted by year in ascending order.
# NOTE(review): "movie_count" counts rows, not distinct titles -- confirm the
# input has one row per movie if a true movie count is expected.
mean_rating_expr = F.avg("averageRating").alias("avg_rating")
row_count_expr = F.count("*").alias("movie_count")

rows_by_year = df_silver_data.groupBy("startYear")
df_avg_ratings_by_year = (
    rows_by_year
    .agg(mean_rating_expr, row_count_expr)
    .orderBy("startYear")
)

display(df_avg_ratings_by_year)

In [None]:
# Per-person summary: number of input rows for each ("nconst", "primaryName")
# pair, sorted so the people with the most rows come first.
# NOTE(review): presumably one row corresponds to one credit of a person on a
# movie, which would make "movie_count" the number of credited movies --
# verify against the source table.
person_keys = ["nconst", "primaryName"]
df_most_prolific_people = (
    df_silver_data
    .groupBy(*person_keys)
    .agg(F.count("*").alias("movie_count"))
    .orderBy("movie_count", ascending=False)
)

display(df_most_prolific_people)

### 3. Load Data

In [None]:
# Persist each aggregate as its own Delta table. mode('overwrite') replaces
# whatever was previously stored at the target path.
# The mapping keeps insertion order, so the tables are written in the order
# listed here.
gold_outputs = {
    'Tables/IMDb_Top_Movies_by_Genre': df_top_movies_by_genre,
    'Tables/IMDb_Average_Ratings_by_Year': df_avg_ratings_by_year,
    'Tables/IMDb_Prolific_People': df_most_prolific_people,
}
for target_path, gold_df in gold_outputs.items():
    gold_df.write.format('delta').mode('overwrite').save(target_path)