In [9]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import avg, col, count, desc

# =========================================
# 0. START SPARK SESSION
# =========================================

# Configuration
project_id = "dejadsgl"
bq_dataset = "netflix"
temp_bucket = "netflix-group5-temp_gl"
gcs_data_bucket = "netflix_data_25"

# Spark configuration
sparkConf = SparkConf()
sparkConf.setMaster("spark://spark-master:7077")
sparkConf.setAppName("SparkFeatureDataset")
sparkConf.set("spark.driver.memory", "2g")
sparkConf.set("spark.executor.cores", "1")
sparkConf.set("spark.driver.cores", "1")

# Create the Spark session
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Use the Cloud Storage bucket for temporary BigQuery export data used by the connector
spark.conf.set('temporaryGcsBucket', temp_bucket)

# Setup hadoop fs configuration for schema gs://
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

print("Spark session started.")

# =========================================
# 1. LOAD ALL TABLES
# =========================================

# Load data from BigQuery
df = spark.read \
            .format("bigquery") \
            .load(f"{project_id}.{bq_dataset}.unified_review_dataset")
print(f"\nLoaded table: unified_review_dataset")
df.printSchema()

print("Done.")

Spark session started.

Loaded table: unified_review_dataset
root
 |-- review_id: string (nullable = false)
 |-- user_id: string (nullable = false)
 |-- movie_id: string (nullable = false)
 |-- review_date: date (nullable = true)
 |-- device_type: string (nullable = false)
 |-- is_verified_watch: boolean (nullable = true)
 |-- review_text: string (nullable = false)
 |-- sentiment: string (nullable = false)
 |-- sentiment_score: double (nullable = true)
 |-- rating: long (nullable = true)
 |-- rating_sentiment_score: double (nullable = true)
 |-- rating_sentiment_label: string (nullable = false)
 |-- helpful_votes: double (nullable = true)
 |-- total_votes: double (nullable = true)

Done.


In [20]:
# -------------------------------------------------------------------
# A. CONTENT METRICS
# -------------------------------------------------------------------

from pyspark.sql import functions as F

# Basis: zorg dat review_date een echte date-kolom is
df = df_reviews.withColumn("review_date", F.to_date("review_date"))

# =============================================================
# 1. Total views per movie  (genre hebben we hier niet)
# =============================================================
total_views_per_movie = (
    df
    .groupBy("movie_id")
    .agg(
        F.count("*").alias("total_reviews"),
        F.sum(F.when(F.col("is_verified_watch") == True, 1).otherwise(0)).alias("verified_reviews")
    )
)

# Resultaat bekijken:
total_views_per_movie.show(10, truncate=False)

# =============================================================
# 2. Average completion rate (proxy: gemiddelde rating)
# =============================================================
average_completion_proxy = (
    df
    .groupBy("movie_id")
    .agg(
        F.avg("rating").alias("avg_rating"),
        F.avg("sentiment_score").alias("avg_text_sentiment_score")
    )
)

average_completion_proxy.show(10, truncate=False)

# =============================================================
# 3. Popularity trends (daily & weekly) op basis van review-volume
# =============================================================

# Daily: aantal reviews per film per dag
popularity_daily = (
    df
    .groupBy("movie_id", "review_date")
    .agg(F.count("*").alias("daily_reviews"))
)

popularity_daily.show(10, truncate=False)

# Weekly: aantal reviews per film per week
df_week = df.withColumn("week_start", F.date_trunc("week", F.col("review_date")))

popularity_weekly = (
    df_week
    .groupBy("movie_id", "week_start")
    .agg(F.count("*").alias("weekly_reviews"))
)

popularity_weekly.show(10, truncate=False)

# =============================================================
# 4. Regional preferences (USA vs Canada)
# =============================================================
# LET OP: de review dataset heeft GEEN land-kolom.
# Deze query kan pas draaien als er bijvoorbeeld een kolom
# 'location_country' is toegevoegt (via join met een user- of watch-tabel).

# regional_preferences = (
#     df
#     .filter(F.col("location_country").isin("USA", "Canada"))
#     .groupBy("movie_id", "location_country")
#     .agg(F.count("*").alias("total_reviews"))
# )
# regional_preferences.show(10, truncate=False)

# =============================================================
# 5. Device-specific performance
# =============================================================

device_specific_performance = (
    df
    .groupBy("device_type")
    .agg(
        F.count("*").alias("total_reviews"),
        F.sum(F.when(F.col("is_verified_watch") == True, 1).otherwise(0)).alias("verified_reviews"),
        F.avg("rating").alias("avg_rating"),
        F.avg("sentiment_score").alias("avg_sentiment_score")
    )
)

device_specific_performance.show(10, truncate=False)

print("Done.\n")

+----------+-------------+----------------+
|movie_id  |total_reviews|verified_reviews|
+----------+-------------+----------------+
|movie_0205|14           |11              |
|movie_0563|11           |10              |
|movie_0909|9            |9               |
|movie_0690|14           |7               |
|movie_0146|16           |12              |
|movie_0928|14           |11              |
|movie_0059|14           |10              |
|movie_0738|14           |11              |
|movie_0103|19           |13              |
|movie_0015|15           |13              |
+----------+-------------+----------------+
only showing top 10 rows

+----------+------------------+------------------------+
|movie_id  |avg_rating        |avg_text_sentiment_score|
+----------+------------------+------------------------+
|movie_0205|3.7857142857142856|0.6701428571428572      |
|movie_0563|3.8181818181818183|0.6416363636363637      |
|movie_0909|3.0               |0.5793333333333333      |
|movie_0690|4.07

In [22]:
# -------------------------------------------------------------------
# B. User Engagement Metrics
# -------------------------------------------------------------------

from pyspark.sql import functions as F

# Basis: goede datums + korte alias
df = df_reviews.withColumn("review_date", F.to_date("review_date"))
today = F.current_date()

# =============================================================
# 1. Days since last watch (laatste review)
# =============================================================
user_last_watch = (
    df
    .groupBy("user_id")
    .agg(F.max("review_date").alias("last_review_date"))
    .withColumn(
        "days_since_last_watch",
        F.datediff(today, "last_review_date")
    )
)

user_last_watch.show(10, truncate=False)

# =============================================================
# 2. Average daily watch time (7d, 30d rolling)
#    → proxy: gemiddelde verified views per dag
# =============================================================

# 7 dagen terug (inclusief vandaag)
last_7_days = df.filter(F.col("review_date") >= F.date_sub(today, 6))
# 30 dagen terug
last_30_days = df.filter(F.col("review_date") >= F.date_sub(today, 29))

avg_watch_7d = (
    last_7_days
    .groupBy("user_id")
    .agg(
        (
            F.sum(F.when(F.col("is_verified_watch") == True, 1).otherwise(0))
            / F.lit(7)
        ).alias("avg_daily_verified_views_7d")
    )
)

avg_watch_30d = (
    last_30_days
    .groupBy("user_id")
    .agg(
        (
            F.sum(F.when(F.col("is_verified_watch") == True, 1).otherwise(0))
            / F.lit(30)
        ).alias("avg_daily_verified_views_30d")
    )
)

avg_watch_7d.show(10, truncate=False)
avg_watch_30d.show(10, truncate=False)

# =============================================================
# 3. Binge-watching score
#    → max verified views op één dag in laatste 30 dagen
# =============================================================
binge_base = (
    last_30_days
    .groupBy("user_id", "review_date")
    .agg(
        F.sum(F.when(F.col("is_verified_watch") == True, 1).otherwise(0))
         .alias("verified_views_per_day")
    )
)

binge_score = (
    binge_base
    .groupBy("user_id")
    .agg(F.max("verified_views_per_day").alias("binge_watching_score"))
)

binge_score.show(10, truncate=False)

# =============================================================
# 4. Content diversity index
#    → (# unieke films) / (# verified views) in laatste 30 dagen
# =============================================================
diversity_base = (
    last_30_days
    .groupBy("user_id")
    .agg(
        F.sum(F.when(F.col("is_verified_watch") == True, 1).otherwise(0))
         .alias("total_verified_views_30d"),
        F.countDistinct(
            F.when(F.col("is_verified_watch") == True, F.col("movie_id"))
        ).alias("distinct_movies_30d")
    )
    .withColumn(
        "content_diversity_index",
        F.when(
            F.col("total_verified_views_30d") > 0,
            F.col("distinct_movies_30d") / F.col("total_verified_views_30d")
        ).otherwise(F.lit(0.0))
    )
)

diversity_base.select(
    "user_id", "total_verified_views_30d",
    "distinct_movies_30d", "content_diversity_index"
).show(10, truncate=False)

# =============================================================
# 5. Engagement trend (increasing / decreasing)
#    → compare verified views last_7 vs previous_7 days
# =============================================================

last_14_days = df.filter(F.col("review_date") >= F.date_sub(today, 13))

eng_base = (
    last_14_days
    .withColumn(
        "period",
        F.when(F.col("review_date") >= F.date_sub(today, 6), "last_7")
         .otherwise("prev_7")
    )
    .groupBy("user_id", "period")
    .agg(
        F.sum(F.when(F.col("is_verified_watch") == True, 1).otherwise(0))
         .alias("verified_views")
    )
)

eng_trend = (
    eng_base
    .groupBy("user_id")
    .pivot("period", ["prev_7", "last_7"])
    .agg(F.first("verified_views"))
    .fillna(0, subset=["prev_7", "last_7"])
    .withColumn(
        "engagement_change_pct",
        F.when(
            F.col("prev_7") > 0,
            (F.col("last_7") - F.col("prev_7")) / F.col("prev_7")
        ).otherwise(F.lit(None))
    )
    .withColumn(
        "engagement_trend",
        F.when(F.col("engagement_change_pct") > 0.1, "increasing")
         .when(F.col("engagement_change_pct") < -0.1, "decreasing")
         .otherwise("stable")
    )
)

eng_trend.show(10, truncate=False)

print("Done.\n")

+----------+----------------+---------------------+
|user_id   |last_review_date|days_since_last_watch|
+----------+----------------+---------------------+
|user_04438|2025-11-30      |-17                  |
|user_04300|2024-02-13      |639                  |
|user_01411|2025-01-13      |304                  |
|user_03424|2025-12-17      |-34                  |
|user_00391|2025-03-06      |252                  |
|user_09487|2024-10-08      |401                  |
|user_09937|2025-06-06      |160                  |
|user_09809|2024-08-14      |456                  |
|user_04103|2024-03-19      |604                  |
|user_08494|2025-03-13      |245                  |
+----------+----------------+---------------------+
only showing top 10 rows

+----------+---------------------------+
|user_id   |avg_daily_verified_views_7d|
+----------+---------------------------+
|user_04438|0.0                        |
|user_04545|0.14285714285714285        |
|user_03984|0.14285714285714285        |


In [24]:
# -------------------------------------------------------------------
# C. Churn Risk Indicators
# -------------------------------------------------------------------

from pyspark.sql import functions as F

# =============================================================
# 1. Viewing frequency drop > 50%
#    → vergelijk verified views in vorige 7 dagen vs laatste 7 dagen
# =============================================================

last_14_days = df.filter(F.col("review_date") >= F.date_sub(today, 13))

freq_base = (
    last_14_days
    .withColumn(
        "period",
        F.when(F.col("review_date") >= F.date_sub(today, 6), "last_7")
         .otherwise("prev_7")
    )
    .groupBy("user_id", "period")
    .agg(
        F.sum(F.when(F.col("is_verified_watch") == True, 1).otherwise(0))
         .alias("verified_views")
    )
)

freq_pivot = (
    freq_base
    .groupBy("user_id")
    .pivot("period", ["prev_7", "last_7"])
    .agg(F.first("verified_views"))
    .fillna(0, subset=["prev_7", "last_7"])
    .withColumn(
        "engagement_change_pct",
        F.when(
            F.col("prev_7") > 0,
            (F.col("last_7") - F.col("prev_7")) / F.col("prev_7")
        ).otherwise(F.lit(None))
    )
)

churn_freq_drop = (
    freq_pivot
    .withColumn(
        "freq_drop_gt_50pct",
        F.col("engagement_change_pct") < -0.5
    )
)

churn_freq_drop.show(10, truncate=False)

# =============================================================
# 2. No activity in 7+ days
#    → laatste review ouder dan 7 dagen
# =============================================================

user_last_watch = (
    df
    .groupBy("user_id")
    .agg(F.max("review_date").alias("last_review_date"))
    .withColumn(
        "days_since_last_watch",
        F.datediff(today, "last_review_date")
    )
)

churn_no_activity = (
    user_last_watch
    .withColumn(
        "no_activity_7_plus_days",
        F.col("days_since_last_watch") >= 7
    )
)

churn_no_activity.show(10, truncate=False)

# =============================================================
# 3. Declining completion rates
#    → proxy: dalende gemiddelde rating (laatste 7 vs vorige 7 dagen)
# =============================================================

ratings_last_14 = (
    last_14_days
    .withColumn(
        "period",
        F.when(F.col("review_date") >= F.date_sub(today, 6), "last_7")
         .otherwise("prev_7")
    )
    .groupBy("user_id", "period")
    .agg(F.avg("rating").alias("avg_rating"))
)

ratings_pivot = (
    ratings_last_14
    .groupBy("user_id")
    .pivot("period", ["prev_7", "last_7"])
    .agg(F.first("avg_rating"))
)

churn_declining_completion = (
    ratings_pivot
    .withColumn("rating_change", F.col("last_7") - F.col("prev_7"))
    .withColumn(
        "declining_completion_rate",
        F.col("rating_change") < 0
    )
)

churn_declining_completion.show(10, truncate=False)

# =============================================================
# 4. Negative review patterns
#    → > 50% negatieve reviews in laatste 90 dagen
# =============================================================

reviews_90d = df.filter(F.col("review_date") >= F.date_sub(today, 89))

neg_review_patterns = (
    reviews_90d
    .groupBy("user_id")
    .agg(
        F.count("*").alias("total_reviews_90d"),
        F.sum(F.when(F.col("sentiment") == "negative", 1).otherwise(0))
         .alias("negative_reviews_90d")
    )
    .withColumn(
        "negative_review_ratio",
        F.when(
            F.col("total_reviews_90d") > 0,
            F.col("negative_reviews_90d") / F.col("total_reviews_90d")
        ).otherwise(F.lit(0.0))
    )
    .withColumn(
        "negative_review_pattern",
        F.col("negative_review_ratio") > 0.5
    )
)

neg_review_patterns.show(10, truncate=False)

print("Done.\n")

+----------+------+------+---------------------+------------------+
|user_id   |prev_7|last_7|engagement_change_pct|freq_drop_gt_50pct|
+----------+------+------+---------------------+------------------+
|user_00201|1     |1     |0.0                  |false             |
|user_03424|0     |0     |NULL                 |NULL              |
|user_03984|0     |1     |NULL                 |NULL              |
|user_04438|0     |0     |NULL                 |NULL              |
|user_04545|0     |1     |NULL                 |NULL              |
|user_03917|0     |1     |NULL                 |NULL              |
|user_05838|0     |1     |NULL                 |NULL              |
|user_00836|0     |1     |NULL                 |NULL              |
|user_09775|0     |1     |NULL                 |NULL              |
|user_02612|0     |1     |NULL                 |NULL              |
+----------+------+------+---------------------+------------------+
only showing top 10 rows

+----------+----------

In [None]:
spark.stop()