In [12]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit

In [2]:
# Obtain the SparkSession for this notebook: getOrCreate() hands back the
# session that is already running, or builds one under this app name.
spark = (
    SparkSession.builder
    .appName("Streaming Platform Analyze")
    .getOrCreate()
)

24/11/11 14:00:24 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


In [7]:
# paths to CSVs
# All four raw exports sit under the same bucket URI, so it is declared once
# and each object path is derived from it; the resulting strings are the same
# as the previously hard-coded literals.
RAW_DATA_BUCKET = "gs://raw-platform-data"

csv_apple_data = f"{RAW_DATA_BUCKET}/apple_data.csv"
csv_netflix_data = f"{RAW_DATA_BUCKET}/netflix_data.csv"
csv_hbo_data = f"{RAW_DATA_BUCKET}/hbo_data.csv"
csv_amazon_data = f"{RAW_DATA_BUCKET}/amazon_data.csv"

In [8]:
# helper that reads a single CSV file into a DataFrame
def load_csv_to_df(file_path):
    """Read the CSV at ``file_path`` through the notebook's ``spark`` session.

    The reader is configured with ``header=true`` and ``inferSchema=true``.

    Args:
        file_path: Location of the CSV file to read.

    Returns:
        The loaded DataFrame, or ``None`` when any exception is raised while
        reading (the error is printed rather than propagated).
    """
    try:
        csv_reader = spark.read.format("csv")
        csv_reader = csv_reader.option("header", "true")
        csv_reader = csv_reader.option("inferSchema", "true")
        return csv_reader.load(file_path)
    except Exception as err:
        # Best-effort contract: report the problem and fall through to None.
        print(f"Error reading file {file_path}: {err}")
    return None

In [9]:
# load data to df
df_apple = load_csv_to_df(csv_apple_data)
df_netflix = load_csv_to_df(csv_netflix_data)
df_hbo = load_csv_to_df(csv_hbo_data)
df_amazon = load_csv_to_df(csv_amazon_data)

# load_csv_to_df reports a failed read by printing the error and returning
# None. Stop here with an explicit message naming the affected paths instead
# of letting a None reach later cells, where it would only surface as an
# AttributeError on the first DataFrame method call.
_failed_paths = [
    path
    for path, frame in (
        (csv_apple_data, df_apple),
        (csv_netflix_data, df_netflix),
        (csv_hbo_data, df_hbo),
        (csv_amazon_data, df_amazon),
    )
    if frame is None
]
if _failed_paths:
    raise RuntimeError(f"Failed to load CSV file(s): {', '.join(_failed_paths)}")

                                                                                

In [13]:

# Tag each frame with a constant "platform" column naming its source service.
df_apple, df_netflix, df_hbo, df_amazon = (
    frame.withColumn("platform", lit(label))
    for frame, label in (
        (df_apple, "Apple TV+"),
        (df_netflix, "Netflix"),
        (df_hbo, "HBO Max"),
        (df_amazon, "Amazon Prime"),
    )
)

In [14]:
# Stack the four per-platform frames into one, matching columns by name.
# NOTE(review): each file's schema is inferred independently on load, so the
# same column may come back with different types per source — confirm with
# combined_df.printSchema().
combined_df = (
    df_netflix
    .unionByName(df_apple)
    .unionByName(df_hbo)
    .unionByName(df_amazon)
)

In [15]:
# Preview the first ten rows of the combined frame.
combined_df.show(n=10)

+--------------------+-----+--------------------+-----------+---------+-----------------+------------+--------------------+--------+
|               title| type|              genres|releaseYear|   imdbId|imdbAverageRating|imdbNumVotes|  availableCountries|platform|
+--------------------+-----+--------------------+-----------+---------+-----------------+------------+--------------------+--------+
|        Forrest Gump|movie|      Drama, Romance|       1994|tt0109830|              8.8|   2316975.0|                  MX| Netflix|
|   The Fifth Element|movie|Action, Adventure...|       1997|tt0119116|              7.6|    517225.0|          AT, CH, DE| Netflix|
|   Kill Bill: Vol. 1|movie|Action, Crime, Th...|       2003|tt0266697|              8.2|   1222077.0|AE, AL, AO, AT, A...| Netflix|
|             Jarhead|movie|Biography, Drama,...|       2005|tt0418763|                7|    211593.0|AD, AE, AG, AL, A...| Netflix|
|          Unforgiven|movie|      Drama, Western|       1992|tt010569