In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import avg, col, count, desc
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# =========================================
# START SPARK SESSION
# =========================================

# Deployment settings: GCP project, BigQuery dataset and the GCS buckets in use.
project_id = "dejadsgl"
bq_dataset = "netflix"
temp_bucket = "netflix-group5-temp_gl"
gcs_data_bucket = "netflix_data_25"  # not referenced in the code visible here

# Build the Spark configuration as one fluent chain (every SparkConf setter
# returns the conf object itself).
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("AggregationsAnalytics")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# Reuse an already running session when there is one, otherwise create it.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Bucket the BigQuery connector uses for its temporary export data.
spark.conf.set('temporaryGcsBucket', temp_bucket)

# Register the GCS connector classes so Hadoop can resolve the gs:// scheme.
conf = spark.sparkContext._jsc.hadoopConfiguration()
for hadoop_key, hadoop_impl in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(hadoop_key, hadoop_impl)

print("Spark session started.")

# =========================================
# LOAD TABLES
# =========================================

# Name of the BigQuery table to read. Kept in a single variable so the load
# path and the log message cannot drift apart.
table_name = "unified_review_dataset"

# Load data from BigQuery
df = (
    spark.read
    .format("bigquery")
    .load(f"{project_id}.{bq_dataset}.{table_name}")
)
print(f"\nLoaded table: {table_name}")
df.printSchema()

print("Done.")
# Print the first 10 rows of the loaded table without truncating cell values.
df.show(10, truncate=False)

# Make sure the date columns needed by the daily/weekly aggregations exist.
df = (
    df
    # calendar date of the review
    .withColumn("day", F.to_date("review_date"))
    # start of the week; note that date_trunc returns a timestamp, not a date
    .withColumn("week_start", F.date_trunc("week", "day"))
)

# =============================================================
# DAILY TOP 10 MOVIES
# =============================================================
# One row per (day, movie) with its event count and mean rating.
daily_views = (
    df
    .groupBy("day", "movie_id")
    .agg(
        F.count("*").alias("num_events"),          # number of reviews / events
        F.avg("rating").alias("avg_rating")        # optional: average rating
    )
)

# Rank movies within each day by event count. movie_id is a secondary sort
# key so that movies tied on num_events are ordered the same way on every
# run; with num_events alone, row_number() breaks ties arbitrarily and the
# top 10 could change between executions.
w_day = Window.partitionBy("day").orderBy(F.desc("num_events"), F.asc("movie_id"))

# Keep the ten highest-ranked movies of every day.
daily_top10_per_movie = (
    daily_views
    .withColumn("rank", F.row_number().over(w_day))
    .filter(F.col("rank") <= 10)
)

# =============================================================
# WEEKLY TOP 10 MOVIES
# =============================================================
# One row per (week_start, movie) with its event count and mean rating.
weekly_views = (
    df
    .groupBy("week_start", "movie_id")
    .agg(
        F.count("*").alias("num_events"),
        F.avg("rating").alias("avg_rating")
    )
)

# Rank movies within each week by event count. movie_id is a secondary sort
# key so that movies tied on num_events are ordered the same way on every
# run; with num_events alone, row_number() breaks ties arbitrarily and the
# top 10 could change between executions.
w_week = Window.partitionBy("week_start").orderBy(F.desc("num_events"), F.asc("movie_id"))

# Keep the ten highest-ranked movies of every week.
weekly_top10_per_movie = (
    weekly_views
    .withColumn("rank", F.row_number().over(w_week))
    .filter(F.col("rank") <= 10)
)

# =============================================================
# Recency per user
# =============================================================
# Latest review per user, plus the number of days between that review and
# the current date.
latest_review = F.max("review_date").alias("last_interaction_date")
user_recency = df.groupBy("user_id").agg(latest_review)
user_recency = user_recency.withColumn(
    "days_since_last_interaction",
    F.datediff(F.current_date(), "last_interaction_date"),
)

# Activity during the last 30 days
last_30d = F.date_sub(F.current_date(), 30)

# Restrict to reviews on or after the cut-off, then count events and
# distinct titles per user.
recent_reviews = df.where(F.col("review_date") >= last_30d)
user_activity_30d = recent_reviews.groupBy("user_id").agg(
    F.count("*").alias("events_30d"),
    F.countDistinct("movie_id").alias("unique_titles_30d"),
)

# =============================================================
# Base user set (all unique users from df)
# =============================================================
all_users = df.select("user_id").distinct()

# Left joins keep every user; metrics that are still null afterwards are
# replaced with these defaults.
missing_metric_defaults = {
    "days_since_last_interaction": 9999,
    "events_30d": 0,
    "unique_titles_30d": 0
}
user_base = (
    all_users
    .join(user_recency, on="user_id", how="left")
    .join(user_activity_30d, on="user_id", how="left")
    .na.fill(missing_metric_defaults)
)

# Segment rules are evaluated top to bottom; the first match wins.
days_inactive = F.col("days_since_last_interaction")
segment_label = (
    F.when(F.col("events_30d") >= 20, "Power user")
    .when(days_inactive <= 7, "Active")
    .when((days_inactive > 7) & (days_inactive <= 30), "At-risk")
    .otherwise("Dormant")
)
user_segments = user_base.withColumn("segment", segment_label)

# =============================================================
# Regional viewing patterns
# =============================================================

# All genre rows of the same country form one window partition.
w_region = Window.partitionBy("location_country")

# Total number of events in the row's country.
country_total_events = F.sum("events").over(w_region)

# Per (country, primary genre): event count, distinct users, distinct titles,
# and the genre's percentage share of the country's events.
regional_viewing_patterns = (
    df
    .groupBy("location_country", "genre_primary")
    .agg(
        F.count("*").alias("events"),
        F.countDistinct("user_id").alias("unique_users"),
        F.countDistinct("movie_id").alias("unique_titles"),
    )
    .withColumn("event_share_pct", 100 * F.col("events") / country_total_events)
)

# =============================================================
# Device usage statistics
# =============================================================

# Unpartitioned window whose frame spans every row, used for the grand total.
w_all_devices = Window.rowsBetween(Window.unboundedPreceding,
                                   Window.unboundedFollowing)

# Total number of events across all device types.
total_device_events = F.sum("events").over(w_all_devices)

# Per device type: event count, distinct users, distinct titles, and the
# device's percentage share of all events.
device_usage_stats = (
    df
    .groupBy("device_type")
    .agg(
        F.count("*").alias("events"),
        F.countDistinct("user_id").alias("unique_users"),
        F.countDistinct("movie_id").alias("unique_titles"),
    )
    .withColumn("event_share_pct", 100 * F.col("events") / total_device_events)
)

# =============================================================
# Churn risk scores per user (0-100 scale)
# =============================================================

# Score before clamping: inactivity risk minus the activity discount.
raw_churn_score = F.col("base_risk") - F.col("activity_bonus")

churn_risk_scores = (
    user_base
    # base risk: every day of inactivity adds 2 points, capped at 100
    .withColumn(
        "base_risk",
        F.least(F.col("days_since_last_interaction") * 2, F.lit(100))
    )
    # activity shield: active users get a discount on their risk
    .withColumn(
        "activity_bonus",
        F.least(F.col("events_30d") * 3, F.lit(40))  # at most 40 points discount
    )
    # base_risk is capped at 100 and activity_bonus is never negative
    # (events_30d is a count with nulls filled as 0), so the difference can
    # never exceed 100; only the lower bound has to be clamped.
    .withColumn(
        "churn_risk_score",
        F.when(raw_churn_score < 0, 0).otherwise(raw_churn_score)
    )
    # drop the intermediate base_risk / activity_bonus columns
    .select(
        "user_id",
        "days_since_last_interaction",
        "events_30d",
        "unique_titles_30d",
        "churn_risk_score"
    )
)

In [None]:
# Stop the Spark session created at the top of the script.
spark.stop()