In [None]:
from pyspark.sql import SparkSession
from pyspark import SparkConf
from pyspark.sql.functions import avg, col, count, desc
from pyspark.sql import functions as F

# =========================================
# 0. START SPARK SESSION
# =========================================

# Project-level settings: GCP project, BigQuery dataset and GCS bucket names.
project_id = "dejadsgl"
bq_dataset = "netflix"
temp_bucket = "netflix-group5-temp_gl"
data_bucket = "netflix-group5-data_gl"
gcs_data_bucket = "netflix_data_25"

# Build the Spark configuration as one fluent chain
# (every SparkConf setter returns the conf object itself).
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("SparkIntegrationDataset")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# Obtain the session (an already running one is reused by getOrCreate).
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Cloud Storage bucket the BigQuery connector uses for temporary data.
spark.conf.set('temporaryGcsBucket', temp_bucket)

# Register the GCS connector classes so Hadoop can resolve the gs:// scheme.
conf = spark.sparkContext._jsc.hadoopConfiguration()
gcs_filesystem_settings = {
    "fs.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
    "fs.AbstractFileSystem.gs.impl": "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
}
for hadoop_key, hadoop_impl in gcs_filesystem_settings.items():
    conf.set(hadoop_key, hadoop_impl)

print("Spark session started.")

# =========================================
# 1. LOAD ALL TABLES
# =========================================

# Names of the cleaned BigQuery tables to read.
titles = [
    "movies_cleaned",
    "users_cleaned",
    "watch_history_cleaned",
    "reviews_cleaned"
]

# Maps table name -> DataFrame marked for caching.
tables = {}

for title in titles:
    # Fully qualified BigQuery table id: <project>.<dataset>.<table>
    table_ref = f"{project_id}.{bq_dataset}.{title}"
    df = spark.read.format("bigquery").load(table_ref)

    # cache() returns the same DataFrame, so mark it and register it in one step.
    tables[title] = df.cache()

    print(f"\nLoaded table: {title}")
    df.printSchema()

# NOTE: after the loop, `df` remains bound to the last table in `titles`.
print("\nDONE: Session started and tables are loaded.")

# ============================================================
# 3. SELECT THE REVIEWS DATAFRAME AS df
# ============================================================
# Pick the reviews table explicitly. Relying on the loop variable left over
# from the loading loop only works while "reviews_cleaned" is the last entry
# in the table list; reordering or extending that list would silently switch
# the rest of the pipeline to a different table.
df = tables["reviews_cleaned"]

print("Row count in df:", df.count())
print("Columns:", df.columns)

# ============================================================
# 4. SENTIMENT VIA rating (numeric review score)
# ============================================================
print("\nAdding rating-based sentiment")

rating_col = "rating"   # name of the numeric rating column

# Cast once and reuse, so the score and the label are derived from the same value.
rating_value = F.col(rating_col).cast("double")

df_with_sentiment = (
    df
    # Linear rescale around the midpoint 3: (rating - 3) / 2,
    # e.g. 1 -> -1.0, 3 -> 0.0, 5 -> 1.0. A NULL rating yields a NULL score.
    .withColumn(
        "rating_sentiment_score",
        (rating_value - F.lit(3.0)) / F.lit(2.0)
    )
    .withColumn(
        "rating_sentiment_label",
        # Guard NULL first: comparisons against NULL are never true, so without
        # this branch a missing rating would fall through to "negative" while
        # its score is NULL. Keep the label NULL as well.
        F.when(rating_value.isNull(), F.lit(None).cast("string"))
         .when(rating_value >= 4, "positive")
         .when(rating_value == 3, "neutral")
         .otherwise("negative")
    )
)

# Preview the new columns next to the identifying keys.
df_with_sentiment.select(
    "user_id", "movie_id", rating_col,
    "rating_sentiment_score", "rating_sentiment_label"
).show(10, truncate=False)

print("\nDONE: Rating-based sentiment.")

# ============================================================
# 5. CREATE UNIFIED REVIEW DATASET (instead of viewing)
# ============================================================
# "\n" was intended here; the original "\B" is an invalid escape sequence that
# prints a literal backslash and triggers a warning on newer Python versions.
print("\nBuilding unified review dataset")

# Keep only the review columns plus the derived rating-sentiment columns.
df_viewing = df_with_sentiment.select(
    "review_id",
    "user_id",
    "movie_id",
    "review_date",
    "device_type",
    "is_verified_watch",
    "review_text",
    "sentiment",          # text sentiment
    "sentiment_score",    # text sentiment score
    rating_col,           # numeric rating
    "rating_sentiment_score",
    "rating_sentiment_label",
    "helpful_votes",
    "total_votes"
)

print("Unified dataset rows:", df_viewing.count())
df_viewing.show(10, truncate=False)

print("\nDONE: df_viewing is ready.")


# ============================================================
# 6. WRITING DATASET TO BIGQUERY
# ============================================================

print("Writing df_viewing to BigQuery...")

# Replace the destination table with the unified dataset, using the
# connector's "direct" write method.
(
    df_viewing.write
    .format("bigquery")
    .mode("overwrite")
    .option("writeMethod", "direct")
    .option("table", f"{project_id}.{bq_dataset}.unified_review_dataset")
    .save()
)

print("DONE: Table written to BigQuery.")



In [None]:
# Stop the Spark session once all work in this notebook is finished.
spark.stop()