In [4]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import avg, col, count, desc

# -----------------------------------------
# 0. Spark session bootstrap
# -----------------------------------------

# Project-level settings referenced by this and later cells
project_id = "dejadsgl"
bq_dataset = "netflix"
temp_bucket = "netflix-group5-temp_gl"
gcs_data_bucket = "netflix_data_25"

# Assemble the Spark configuration in one fluent chain: cluster master,
# application name, then the driver/executor resource settings.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("SparkIntegrationDataset")
    .setAll([
        ("spark.driver.memory", "2g"),
        ("spark.executor.cores", "1"),
        ("spark.driver.cores", "1"),
    ])
)

# Obtain the session (an already-running one is reused by getOrCreate)
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Cloud Storage bucket the BigQuery connector uses for its temporary export data
spark.conf.set("temporaryGcsBucket", temp_bucket)

# Register the filesystem implementations for the gs:// scheme on the Hadoop configuration
conf = spark.sparkContext._jsc.hadoopConfiguration()
for hadoop_key, hadoop_impl in (
    ("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem"),
    ("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS"),
):
    conf.set(hadoop_key, hadoop_impl)

print("Spark session started.")

# -----------------------------------------
# 1. Load all tables
# -----------------------------------------

# BigQuery table names to read; each becomes a key in `tables`
titles = ["Movies_cleaned", "Users_cleaned", "Watch_history_cleaned", "Reviews_cleaned"]

# Maps table name -> cached Spark DataFrame
tables = {}

for title in titles:
    # Read <project>.<dataset>.<table> through the BigQuery connector
    reader = spark.read.format("bigquery")
    df = reader.load(f"{project_id}.{bq_dataset}.{title}")

    # Mark the DataFrame for caching, then register it under its table name
    df.cache()
    tables[title] = df

    print(f"\nLoaded table: {title}")
    df.printSchema()

print("Done.")

Spark session started.

Loaded table: Movies_cleaned
root
 |-- movie_id: string (nullable = false)
 |-- title: string (nullable = false)
 |-- content_type: string (nullable = false)
 |-- genre_primary: string (nullable = false)
 |-- genre_secondary: string (nullable = true)
 |-- rating: string (nullable = false)
 |-- language: string (nullable = false)
 |-- country_of_origin: string (nullable = false)
 |-- is_netflix_original: boolean (nullable = true)
 |-- added_to_platform: date (nullable = true)
 |-- release_year: long (nullable = true)
 |-- duration_minutes: double (nullable = true)
 |-- imdb_rating: double (nullable = true)
 |-- production_budget: double (nullable = true)
 |-- box_office_revenue: double (nullable = true)
 |-- number_of_seasons: double (nullable = true)
 |-- number_of_episodes: double (nullable = true)


Loaded table: Users_cleaned
root
 |-- user_id: string (nullable = false)
 |-- email: string (nullable = false)
 |-- first_name: string (nullable = false)
 |-- last

In [11]:
# List the columns of the reviews table. The DataFrame is fetched by name from
# the `tables` dict built by the loading loop instead of relying on the `df`
# variable that loop leaves behind, so this cell does not depend on which
# table happened to be loaded last.
print(tables["Reviews_cleaned"].columns)

['review_id', 'user_id', 'movie_id', 'review_date', 'device_type', 'is_verified_watch', 'review_text', 'sentiment', 'rating', 'helpful_votes', 'total_votes', 'sentiment_score']


In [15]:
from pyspark.sql import functions as F

# ============================================================
# 2. SELECT THE REVIEWS DATAFRAME
# ============================================================
# Fetch the reviews table explicitly from the `tables` dict filled by the
# loading loop. Relying on the loop's leftover `df` variable only works while
# "Reviews_cleaned" is the last table loaded and nothing has rebound `df`.
reviews_df = tables["Reviews_cleaned"]

print("Row count in df:", reviews_df.count())
print("Columns:", reviews_df.columns)

# ============================================================
# 3. SENTIMENT DERIVED FROM rating (numeric review score)
# ============================================================
print("\nSTEP 3: Adding rating-based sentiment")

rating_col = "rating"   # name of the numeric rating column

# Cast once so the score and the label are derived from the same numeric value.
rating_value = F.col(rating_col).cast("double")

df_with_sentiment = (
    reviews_df
    # (rating - 3) / 2: a rating of 3 maps to 0.0, 5 maps to 1.0, 1 maps to -1.0
    .withColumn(
        "rating_sentiment_score",
        (rating_value - F.lit(3.0)) / F.lit(2.0)
    )
    # The final branch is guarded by isNotNull() instead of using otherwise():
    # a missing (or non-numeric) rating now gets a NULL label, matching its NULL
    # score, rather than being reported as "negative". Every non-null rating is
    # labelled exactly as before.
    .withColumn(
        "rating_sentiment_label",
        F.when(rating_value >= 4, "positive")
         .when(rating_value == 3, "neutral")
         .when(rating_value.isNotNull(), "negative")
    )
)

df_with_sentiment.select(
    "user_id", "movie_id", rating_col,
    "rating_sentiment_score", "rating_sentiment_label"
).show(10, truncate=False)

# ============================================================
# 4. CREATE UNIFIED REVIEW DATASET
# ============================================================
print("\nSTEP 4: Building unified review dataset")

# NOTE: the result keeps the name `df_viewing` because the BigQuery write cell
# refers to it, even though it holds review rows.
df_viewing = df_with_sentiment.select(
    "review_id",
    "user_id",
    "movie_id",
    "review_date",
    "device_type",
    "is_verified_watch",
    "review_text",
    "sentiment",          # text-based sentiment label already present in the table
    "sentiment_score",    # text-based sentiment score already present in the table
    rating_col,           # numeric rating
    "rating_sentiment_score",
    "rating_sentiment_label",
    "helpful_votes",
    "total_votes"
)

print("Unified dataset rows:", df_viewing.count())
df_viewing.show(10, truncate=False)

print("\nDONE: df_viewing is ready.")



Row count in df: 14744
Columns: ['review_id', 'user_id', 'movie_id', 'review_date', 'device_type', 'is_verified_watch', 'review_text', 'sentiment', 'rating', 'helpful_votes', 'total_votes', 'sentiment_score']

STEP 3: Adding rating-based sentiment
+----------+----------+------+----------------------+----------------------+
|user_id   |movie_id  |rating|rating_sentiment_score|rating_sentiment_label|
+----------+----------+------+----------------------+----------------------+
|user_07556|movie_0737|4     |0.5                   |positive              |
|user_02460|movie_0923|5     |1.0                   |positive              |
|user_01153|movie_0692|5     |1.0                   |positive              |
|user_00046|movie_0775|5     |1.0                   |positive              |
|user_02283|movie_0544|4     |0.5                   |positive              |
|user_01360|movie_0459|5     |1.0                   |positive              |
|user_04681|movie_0164|5     |1.0                   |positi

In [17]:
# ------------------------------------------------------------
# 5. Persist the unified dataset to BigQuery
# ------------------------------------------------------------

print("Writing df_viewing to BigQuery...")

# Replace the target table with the contents of df_viewing, using the
# connector's "direct" write method.
(
    df_viewing.write
    .format("bigquery")
    .mode("overwrite")
    .options(
        writeMethod="direct",
        table=f"{project_id}.{bq_dataset}.unified_review_dataset",
    )
    .save()
)

print("DONE: Table written to BigQuery.")


Writing df_viewing to BigQuery...
DONE: Table written to BigQuery.


In [19]:
# Stop the Spark session held in `spark`; cells that use `spark` cannot run
# after this until the session is created again.
spark.stop()