In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import lit, trim, col, regexp_replace, array_contains, split, explode, avg

In [2]:
# create the SparkSession for this notebook (getOrCreate reuses one if it already exists)
spark = (
    SparkSession.builder
    .appName("Streaming Platform Analyze")
    .getOrCreate()
)

24/11/13 15:46:56 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.


### Data Preparation

In [3]:
# gs:// locations of the raw catalogue exports, one CSV per streaming platform
csv_netflix_data = "gs://raw-platform-data/netflix_data.csv"
csv_apple_data = "gs://raw-platform-data/apple_data.csv"
csv_hbo_data = "gs://raw-platform-data/hbo_data.csv"
csv_amazon_data = "gs://raw-platform-data/amazon_data.csv"

In [4]:
# function to load a CSV into df
def load_csv_to_df(file_path):
    """Read one CSV file with a header row into a Spark DataFrame.

    Column types are inferred by Spark. The double quote is configured as
    both the quote and the escape character, so a doubled quote inside a
    quoted field is expected to be read as a literal quote.

    Args:
        file_path: Path or URI of the CSV file to read.

    Returns:
        The loaded DataFrame.

    Raises:
        Exception: Whatever the Spark reader raised. The error is printed
            together with the offending path and then re-raised, so a failed
            load stops here instead of returning None and surfacing later as
            an AttributeError on the first DataFrame call.
    """
    try:
        reader = (
            spark.read.format("csv")
            .option("header", "true")
            .option("inferSchema", "true")
            .option("quote", '"')
            .option("escape", '"')
        )
        return reader.load(file_path)
    except Exception as e:
        # keep the path-specific message, but never hand back None
        print(f"Error reading file {file_path}: {e}")
        raise

In [5]:
# read the four platform catalogues, in the same order as the targets on the left
df_apple, df_netflix, df_hbo, df_amazon = (
    load_csv_to_df(csv_path)
    for csv_path in (csv_apple_data, csv_netflix_data, csv_hbo_data, csv_amazon_data)
)

                                                                                

In [6]:
def _tag_platform(frame, platform_name):
    """Return `frame` with a constant `platform` column set to `platform_name`."""
    return frame.withColumn("platform", lit(platform_name))


# label every catalogue with its source so the origin survives the union below
df_apple = _tag_platform(df_apple, "Apple TV+")
df_netflix = _tag_platform(df_netflix, "Netflix")
df_hbo = _tag_platform(df_hbo, "HBO Max")
df_amazon = _tag_platform(df_amazon, "Amazon Prime")

In [7]:
# stack the four catalogues into one frame, matching columns by name
combined_df = (
    df_netflix
    .unionByName(df_apple)
    .unionByName(df_hbo)
    .unionByName(df_amazon)
)

### Data exploration and cleaning

In [8]:
# peek at the first 10 rows of the unioned catalogue
combined_df.show(10)

+--------------------+-----+--------------------+-----------+---------+-----------------+------------+--------------------+--------+
|               title| type|              genres|releaseYear|   imdbId|imdbAverageRating|imdbNumVotes|  availableCountries|platform|
+--------------------+-----+--------------------+-----------+---------+-----------------+------------+--------------------+--------+
|        Forrest Gump|movie|      Drama, Romance|       1994|tt0109830|              8.8|     2316975|                  MX| Netflix|
|   The Fifth Element|movie|Action, Adventure...|       1997|tt0119116|              7.6|      517225|          AT, CH, DE| Netflix|
|   Kill Bill: Vol. 1|movie|Action, Crime, Th...|       2003|tt0266697|              8.2|     1222077|AE, AL, AO, AT, A...| Netflix|
|             Jarhead|movie|Biography, Drama,...|       2005|tt0418763|              7.0|      211593|AD, AE, AG, AL, A...| Netflix|
|          Unforgiven|movie|      Drama, Western|       1992|tt010569

In [9]:
# inspect the column names, types and nullability of the unioned frame
combined_df.printSchema()

root
 |-- title: string (nullable = true)
 |-- type: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- releaseYear: integer (nullable = true)
 |-- imdbId: string (nullable = true)
 |-- imdbAverageRating: double (nullable = true)
 |-- imdbNumVotes: integer (nullable = true)
 |-- availableCountries: string (nullable = true)
 |-- platform: string (nullable = false)



In [10]:
# make sure releaseYear is an int and imdbAverageRating is a double
combined_df = (
    combined_df
    .withColumn("releaseYear", col("releaseYear").cast("int"))
    .withColumn("imdbAverageRating", col("imdbAverageRating").cast("double"))
)

In [11]:
# which content types occur in the data?
combined_df.select(col("type")).distinct().show()



+-----+
| type|
+-----+
|   tv|
|movie|
+-----+



[Stage 11:>                                                         (0 + 1) / 1]                                                                                

In [12]:
# verifying rows whose title contains "'Tis Time for" (one specific title)
# to confirm quoted fields were handled correctly during CSV parsing
combined_df.where(col("title").like("%'Tis Time for%")).show()

                                                                                

+--------------------+----+--------------------+-----------+----------+-----------------+------------+--------------------+------------+
|               title|type|              genres|releaseYear|    imdbId|imdbAverageRating|imdbNumVotes|  availableCountries|    platform|
+--------------------+----+--------------------+-----------+----------+-----------------+------------+--------------------+------------+
|'Tis Time for "To...|  tv|Animation, Comedy...|       2024|tt30643998|              6.4|         221|HK, ID, IN, JP, M...|     Netflix|
|'Tis Time for "To...|  tv|Animation, Comedy...|       2024|tt30643998|              6.4|         221|                  JP|Amazon Prime|
+--------------------+----+--------------------+-----------+----------+-----------------+------------+--------------------+------------+



In [13]:
# replace '&' with the ", " delimiter used for genres.
# The whitespace around '&' is consumed as well: a plain '&' -> ',' swap would turn
# "A & B" into "A , B" (a later split on ", " then yields "A " with a trailing space)
# and "A&B" into "A,B" (which a split on ", " does not separate at all).
combined_df = combined_df.withColumn("genres", regexp_replace("genres", r"\s*&\s*", ", "))
combined_df.select("genres").distinct().show()



+--------------------+
|              genres|
+--------------------+
|Animation, Comedy...|
|    Game-Show, Sport|
|       Family, Short|
|Animation, Docume...|
|Drama, Musical, R...|
|Drama, Romance, T...|
|   Documentary, News|
|      Music, Romance|
|Documentary, Hist...|
|Adventure, Fantas...|
|      Musical, Short|
|Action, Animation...|
|Adventure, Horror...|
|Drama, Fantasy, M...|
|  Family, Reality-TV|
|Adventure, Family...|
|History, Sport, T...|
|Action, Adventure...|
|Comedy, Music, Mu...|
|Romance, Comedy, ...|
+--------------------+
only showing top 20 rows



                                                                                

In [14]:
# validating imdbAverageRating: list rows whose rating falls outside 0..10
invalid_imdb_rating = combined_df.where(
    (col("imdbAverageRating") < 0) | (col("imdbAverageRating") > 10)
)
invalid_imdb_rating.show()

                                                                                

+-----+----+------+-----------+------+-----------------+------------+------------------+--------+
|title|type|genres|releaseYear|imdbId|imdbAverageRating|imdbNumVotes|availableCountries|platform|
+-----+----+------+-----------+------+-----------------+------------+------------------+--------+
+-----+----+------+-----------+------+-----------------+------------+------------------+--------+



[Stage 20:>                                                         (0 + 1) / 1]                                                                                

In [15]:
# validating releaseYear: list rows with a missing year or a year before 1900
# (invalid_year.count() gave 219 according to an earlier run)
invalid_year = combined_df.where(
    col("releaseYear").isNull() | (col("releaseYear") < 1900)
)
invalid_year.show()



+--------------------+-----+--------------------+-----------+------+-----------------+------------+--------------------+---------+
|               title| type|              genres|releaseYear|imdbId|imdbAverageRating|imdbNumVotes|  availableCountries| platform|
+--------------------+-----+--------------------+-----------+------+-----------------+------------+--------------------+---------+
|Rudra: Secret of ...|movie|                NULL|       NULL|  NULL|             NULL|        NULL|AE, AG, AL, AO, A...|  Netflix|
|  Ahead of the Curve|movie|                NULL|       NULL|  NULL|             NULL|        NULL|AU, CA, FJ, GB, G...|  Netflix|
|                NULL|   tv|                NULL|       NULL|  NULL|             NULL|        NULL|          PH, SG, TW|  Netflix|
|                NULL|   tv|Drama, Crime, Mys...|       NULL|  NULL|             NULL|        NULL|                  CA|  Netflix|
|                NULL|   tv|           Animation|       NULL|  NULL|             NU

                                                                                

In [16]:
# drop rows where all specified columns are null, because they don't give us enough information
# columns_to_drop = ["genres", "releaseYear", "imdbId", "imdbAverageRating"]
# cleaned_df = combined_df.dropna(subset=columns_to_drop, how="all")

In [17]:
# separate the catalogue by the 'type' column: one frame for movies, one for TV
movies_df = combined_df.where(col("type") == "movie")
tv_df = combined_df.where(col("type") == "tv")

In [18]:
# 'type' holds a single value within each frame after the split, so remove it
movies_df, tv_df = movies_df.drop("type"), tv_df.drop("type")

In [19]:
# preview the movies frame now that the 'type' column is gone
movies_df.show()

+--------------------+--------------------+-----------+---------+-----------------+------------+--------------------+--------+
|               title|              genres|releaseYear|   imdbId|imdbAverageRating|imdbNumVotes|  availableCountries|platform|
+--------------------+--------------------+-----------+---------+-----------------+------------+--------------------+--------+
|        Forrest Gump|      Drama, Romance|       1994|tt0109830|              8.8|     2316975|                  MX| Netflix|
|   The Fifth Element|Action, Adventure...|       1997|tt0119116|              7.6|      517225|          AT, CH, DE| Netflix|
|   Kill Bill: Vol. 1|Action, Crime, Th...|       2003|tt0266697|              8.2|     1222077|AE, AL, AO, AT, A...| Netflix|
|             Jarhead|Biography, Drama,...|       2005|tt0418763|              7.0|      211593|AD, AE, AG, AL, A...| Netflix|
|          Unforgiven|      Drama, Western|       1992|tt0105695|              8.2|      443878|AU, BA, BG, CZ,

### Data analysis and insights

This analysis focuses on movies and TV shows released in 2024 that are available in the Czech Republic on any of the four platforms (Netflix, Apple TV+, HBO Max, Amazon Prime). The titles are filtered from the combined dataset, and the insights are based on their IMDb ratings.

In [20]:
# keep titles from one release year that are available in one country
def filter_by_year_and_country(df, column_year, column_countries, year, country):
    """Return the rows of `df` released in `year` and available in `country`.

    Args:
        df: DataFrame to filter.
        column_year: Name of the release-year column.
        column_countries: Name of the column holding the available countries
            as a ", "-separated string (e.g. "AT, CH, DE").
        year: Release year to keep.
        country: Value that must appear in the split countries list.

    Returns:
        The filtered DataFrame without the `column_countries` column.
    """
    country_list = split(col(column_countries), ", ")
    released_in_year = col(column_year) == year
    available_in_country = array_contains(country_list, country)
    matching_rows = df.filter(released_in_year & available_in_country)
    return matching_rows.drop(col(column_countries))

In [21]:
# 2024 releases available in Czechia ("CZ"), separately for movies and TV
cz_movies = filter_by_year_and_country(
    movies_df,
    column_year="releaseYear",
    column_countries="availableCountries",
    year=2024,
    country="CZ",
)
cz_tv = filter_by_year_and_country(
    tv_df,
    column_year="releaseYear",
    column_countries="availableCountries",
    year=2024,
    country="CZ",
)

#### Number of movies and TV shows available on each platform in Czechia

In [22]:
# number of movies per platform, largest catalogue first
cz_movies.groupBy("platform").count().orderBy(col("count").desc()).show()



+------------+-----+
|    platform|count|
+------------+-----+
|     Netflix|  269|
|Amazon Prime|   55|
|     HBO Max|   37|
|   Apple TV+|    7|
+------------+-----+



                                                                                

In [23]:
# number of TV shows per platform, largest catalogue first
cz_tv.groupBy("platform").count().orderBy(col("count").desc()).show()



+------------+-----+
|    platform|count|
+------------+-----+
|     Netflix|  215|
|     HBO Max|   70|
|Amazon Prime|   66|
|   Apple TV+|   35|
+------------+-----+



[Stage 29:>                                                         (0 + 1) / 1]                                                                                

#### The most frequent genres of movies and TV series available in Czechia

In [24]:
# one row per (title, genre): split the comma-separated genres string and explode it.
# The string is trimmed and the separator pattern swallows any whitespace around each
# comma, so variants such as "A , B" or "A,B" (e.g. left behind by the '&' -> ','
# replacement) neither yield padded genres like "A " nor stay unsplit.
movies_genres = cz_movies.withColumn("genre", explode(split(trim(col("genres")), r"\s*,\s*")))
tv_genres = cz_tv.withColumn("genre", explode(split(trim(col("genres")), r"\s*,\s*")))

In [25]:
# genre frequency among the selected movies, most common first
print("The most frequently occurring genres in movies:")
movies_genres.groupBy("genre").count().orderBy(col("count").desc()).show()

The most frequently occurring genres in movies:




+-----------+-----+
|      genre|count|
+-----------+-----+
|     Comedy|  130|
|      Drama|  122|
|Documentary|  101|
|     Action|   71|
|      Crime|   57|
|   Thriller|   44|
|    Romance|   42|
|  Adventure|   33|
|     Horror|   23|
|  Animation|   19|
|  Biography|   19|
|      Sport|   17|
|    Mystery|   16|
|      Music|   15|
|    History|   12|
|    Fantasy|   11|
|     Sci-Fi|   11|
|     Family|    6|
| Reality-TV|    3|
|        War|    3|
+-----------+-----+
only showing top 20 rows



                                                                                

In [26]:
# genre frequency among the selected TV shows, most common first
print("The most frequently occurring genres in TV shows:")
tv_genres.groupBy("genre").count().orderBy(col("count").desc()).show()

The most frequently occurring genres in TV shows:




+-----------+-----+
|      genre|count|
+-----------+-----+
|      Drama|  139|
|Documentary|  108|
|     Comedy|   94|
|      Crime|   75|
| Reality-TV|   55|
|     Action|   37|
|  Adventure|   37|
|   Thriller|   36|
|  Animation|   36|
|    Romance|   29|
|    Mystery|   26|
|      Sport|   18|
|  Biography|   18|
|    History|   15|
|  Game-Show|   11|
|    Fantasy|   10|
|     Family|    7|
|     Sci-Fi|    7|
|     Horror|    7|
|      Music|    5|
+-----------+-----+
only showing top 20 rows



                                                                                

#### TOP 10 best-rated genres for TV shows and movies available in Czechia

In [27]:
def _top_rated_genres(genres_df, n=10):
    """Return the `n` genres of `genres_df` with the highest mean imdbAverageRating."""
    mean_rating = avg("imdbAverageRating").alias("avg_rating")
    ranked = genres_df.groupBy("genre").agg(mean_rating).orderBy(col("avg_rating").desc())
    return ranked.limit(n)


top10_genres_movies = _top_rated_genres(movies_genres)

top10_genres_tv = _top_rated_genres(tv_genres)

In [28]:
# display the ten TV genres with the highest average rating
print("TOP10 rated genres for TV series:")
top10_genres_tv.show()

TOP10 rated genres for TV series:




+-----------+------------------+
|      genre|        avg_rating|
+-----------+------------------+
|      Sport| 7.666666666666667|
|      Music|7.5200000000000005|
|    Western|               7.4|
|     Family| 7.266666666666667|
|  Biography|7.2555555555555555|
|    Fantasy|              7.05|
|Documentary| 6.998979591836735|
|    Romance| 6.927586206896552|
|     Action| 6.864864864864865|
|      Crime|6.8069444444444445|
+-----------+------------------+



                                                                                

In [29]:
# display the ten movie genres with the highest average rating
print("TOP10 rated genres for movies:")
top10_genres_movies.show()

TOP10 rated genres for movies:




+-----------+-----------------+
|      genre|       avg_rating|
+-----------+-----------------+
|  Talk-Show|             7.15|
|      Short|              7.1|
|      Music|7.010000000000001|
|Documentary|6.802150537634407|
|        War|              6.8|
|  Biography|6.447368421052633|
|      Sport|6.394117647058823|
|     Family|6.166666666666667|
|  Animation|6.144444444444444|
|    History|6.081818181818182|
+-----------+-----------------+



                                                                                

#### TOP10 rated movies and TV series available in Czechia

In [30]:
# ten highest-rated titles per content type; unrated titles are excluded first
top10_movies = (
    cz_movies
    .where(col("imdbAverageRating").isNotNull())
    .select("title", "imdbAverageRating", "platform")
    .orderBy(col("imdbAverageRating").desc())
    .limit(10)
)

top10_tv = (
    cz_tv
    .where(col("imdbAverageRating").isNotNull())
    .select("title", "imdbAverageRating", "platform")
    .orderBy(col("imdbAverageRating").desc())
    .limit(10)
)

In [31]:
# display the ten best-rated movies with full, untruncated titles
print("TOP10 rated movies:")
top10_movies.show(truncate=False)

TOP10 rated movies:




+-------------------------------+-----------------+------------+
|title                          |imdbAverageRating|platform    |
+-------------------------------+-----------------+------------+
|Modern Masters                 |8.7              |Netflix     |
|Dune: Part Two                 |8.5              |HBO Max     |
|Meiyazhagan                    |8.5              |Netflix     |
|Maharaja                       |8.5              |Netflix     |
|The Remarkable Life of Ibelin  |8.4              |Netflix     |
|Giannis: The Marvelous Journey |8.3              |Amazon Prime|
|Hai, Romania!                  |8.1              |Netflix     |
|Olivia Rodrigo: GUTS World Tour|8.0              |Netflix     |
|Stevie Van Zandt: Disciple     |7.9              |HBO Max     |
|The Greatest Night in Pop      |7.9              |Netflix     |
+-------------------------------+-----------------+------------+





In [32]:
# display the ten best-rated TV series with full, untruncated titles
print("TOP10 rated TV series:")
top10_tv.show(truncate=False)

TOP10 rated TV series:




+----------------------------------------+-----------------+------------+
|title                                   |imdbAverageRating|platform    |
+----------------------------------------+-----------------+------------+
|The Tragically Hip: No Dress Rehearsal  |9.4              |Amazon Prime|
|Game 7                                  |8.9              |Amazon Prime|
|The Penguin                             |8.8              |HBO Max     |
|The Comeback: 2004 Boston Red Sox       |8.7              |Netflix     |
|Botched Bariatrics                      |8.6              |HBO Max     |
|Faceoff: Inside the NHL                 |8.5              |Amazon Prime|
|Culinary Class Wars                     |8.5              |Netflix     |
|Earthsounds                             |8.5              |Apple TV+   |
|Fallout                                 |8.4              |Amazon Prime|
|Turning Point: The Bomb and the Cold War|8.4              |Netflix     |
+-------------------------------------



#### Average ratings by platform for movies and TV series available in Czechia

In [33]:
# mean IMDb rating per platform, best-rated platform first
movies_platform = (
    cz_movies
    .groupBy("platform")
    .agg(avg("imdbAverageRating").alias("avg_rating"))
    .orderBy("avg_rating", ascending=False)
)

tv_platform = (
    cz_tv
    .groupBy("platform")
    .agg(avg("imdbAverageRating").alias("avg_rating"))
    .orderBy("avg_rating", ascending=False)
)

In [34]:
# display the per-platform average rating for movies
print("Average ratings by platform for movies:")
movies_platform.show()

Average ratings by platform for movies:




+------------+-----------------+
|    platform|       avg_rating|
+------------+-----------------+
|   Apple TV+|6.657142857142857|
|     HBO Max|6.441935483870968|
|     Netflix|6.060629921259841|
|Amazon Prime|5.593478260869567|
+------------+-----------------+



                                                                                

In [35]:
# display the per-platform average rating for TV series
print("Average ratings by platform for TV series:")
tv_platform.show()

Average ratings by platform for TV series:




+------------+-----------------+
|    platform|       avg_rating|
+------------+-----------------+
|   Apple TV+|7.014285714285714|
|     HBO Max|6.780645161290322|
|Amazon Prime|6.736206896551724|
|     Netflix|6.712077294685991|
+------------+-----------------+



                                                                                