In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import avg, col, count, desc

# Job settings.
# project_id / bq_dataset identify the BigQuery dataset used by the read and
# write cells; temp_bucket is handed to the BigQuery connector further down.
project_id, bq_dataset = "de2025-471807", "netflix"
temp_bucket = "netflix-group5-temp"
gcs_data_bucket = "netflix_data_25"  # NOTE(review): not referenced in the visible cells

# Describe the Spark application: standalone master URL, application name,
# and the driver/executor resource settings.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("BigQueryMoviesSimple")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# Build the session from that configuration (or reuse one that already exists).
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# The BigQuery connector uses this Cloud Storage bucket for its temporary export data.
spark.conf.set("temporaryGcsBucket", temp_bucket)

# Point the Hadoop filesystem layer at the GCS connector classes so that
# paths with the gs:// scheme can be resolved.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for fs_property, fs_class in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(fs_property, fs_class)

# Read the source table from BigQuery.
# The table reference has the form project_id.dataset.table_name and assumes
# movies.csv was uploaded as table 'movies' in the netflix dataset; if the
# table was given a different name at upload time, replace 'movies' below.
df = (
    spark.read
    .format("bigquery")
    .load(f"{project_id}.{bq_dataset}.movies")
)

# Inspect what was loaded: column types first, then a handful of full-width rows.
print("Movies dataset schema:")
df.printSchema()
print("\nFirst 5 rows:")
df.show(n=5, truncate=False)



In [None]:
# Simple aggregations on the movies data.
# Aggregation 1: row count per primary genre, largest group first.
movies_by_genre = (
    df.groupBy("genre_primary")
    .count()
    .orderBy(desc("count"))
)
print("Movies count by genre:")
movies_by_genre.show(20)

# Aggregation 2: mean IMDB rating and row count per content type,
# sorted from the highest average rating to the lowest.
avg_rating_by_type = (
    df.groupBy("content_type")
    .agg(
        avg("imdb_rating").alias("avg_imdb_rating"),
        count("*").alias("movie_count"),
    )
    .orderBy(desc("avg_imdb_rating"))
)
print("\nAverage IMDB rating by content type:")
avg_rating_by_type.show()

# Aggregation 3: row count per release year, most recent year first,
# so the 20 rows displayed are the 20 latest years.
movies_by_year = (
    df.groupBy("release_year")
    .count()
    .orderBy(desc("release_year"))
)
print("\nMovies count by release year (top 20):")
movies_by_year.show(20)


In [None]:
# Write aggregated results to BigQuery
# Note: The movies table is already in BigQuery (uploaded via UI), so we only write aggregations


def write_aggregation(agg_df, table_name):
    """Write one aggregated DataFrame to a BigQuery table, replacing its contents.

    The destination is ``{project_id}.{bq_dataset}.{table_name}``, built from
    the notebook-level ``project_id`` and ``bq_dataset`` settings.

    Args:
        agg_df: Spark DataFrame holding the aggregation to persist.
        table_name: Name of the destination table inside the dataset; also
            used in the progress message.
    """
    print(f"Writing '{table_name}' aggregation to BigQuery...")
    (
        agg_df.write.format("bigquery")
        .option("table", f"{project_id}.{bq_dataset}.{table_name}")
        .mode("overwrite")  # re-running the cell replaces the previous result
        .save()
    )


# Each aggregation is stored in a table named after its notebook variable.
write_aggregation(movies_by_genre, "movies_by_genre")
write_aggregation(avg_rating_by_type, "avg_rating_by_type")
write_aggregation(movies_by_year, "movies_by_year")

print("\nAll aggregations written to BigQuery successfully!")


In [None]:
# Stop the SparkSession (`spark`) now that the aggregations have been shown and written.
# Keep this as the last cell: every DataFrame above was created through `spark`.
spark.stop()