In [2]:
import os
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable


In [3]:
# Install Java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Install pyspark
!pip install pyspark

# Set up Spark session
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("SparkSQL") \
    .config("spark.executor.memory", "2g") \
    .config("spark.driver.memory", "2g") \
    .getOrCreate()


Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=616bc4f46f1fb52b44bf03530f68eb05c5a825717294ce9494d13a0b9a063ce0
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [4]:
# Import packages
from pyspark.sql import SparkSession
import time
import pandas as pd
from pyspark.sql.functions import col
from pyspark.sql.functions import current_date, expr



# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

In [5]:
from google.colab import drive
import pandas as pd

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
# Path to the CSV files in Google Drive
show_data_path = '/content/drive/My Drive/showData.csv'
movie_data_path = '/content/drive/My Drive/movieData.csv'

In [7]:
# Load the CSV files into DataFrames
show_df = pd.read_csv(show_data_path)
movie_df = pd.read_csv(movie_data_path)


In [8]:
# Display the first few rows of showData.csv
show_df.head()

Unnamed: 0.1,Unnamed: 0,id,title,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_score
0,0,ts300399,Five Came Back: The Reference Films,1945,TV-MA,51,['documentation'],['US'],1.0,
1,1,ts22164,Monty Python's Flying Circus,1969,TV-14,30,"['comedy', 'european']",['GB'],4.0,8.8
2,2,ts45948,Monty Python's Fliegender Zirkus,1972,TV-MA,43,['comedy'],[],1.0,8.1
3,3,ts20681,Seinfeld,1989,TV-PG,24,['comedy'],['US'],9.0,8.9
4,4,ts22082,Knight Rider,1982,TV-PG,51,"['scifi', 'action', 'crime', 'drama']",['US'],4.0,6.9


In [9]:
# Display the first few rows of movieData.csv
movie_df.head()

Unnamed: 0.1,Unnamed: 0,id,title,release_year,age_certification,runtime,genres,production_countries,imdb_score
0,0,tm84618,Taxi Driver,1976,R,114,"['drama', 'crime']",['US'],8.2
1,1,tm154986,Deliverance,1972,R,109,"['drama', 'action', 'thriller', 'european']",['US'],7.7
2,2,tm127384,Monty Python and the Holy Grail,1975,PG,91,"['fantasy', 'action', 'comedy']",['GB'],8.2
3,3,tm120801,The Dirty Dozen,1967,,150,"['war', 'action']","['GB', 'US']",7.7
4,4,tm70993,Life of Brian,1979,R,94,['comedy'],['GB'],8.0


In [10]:
# Replace NaN IMDb scores with None in show_df
show_df['imdb_score'] = show_df['imdb_score'].apply(lambda x: None if pd.isnull(x) else x)

# Replace NaN IMDb scores with None in movie_df
movie_df['imdb_score'] = movie_df['imdb_score'].apply(lambda x: None if pd.isnull(x) else x)

In [11]:
# Convert cleaned Pandas DataFrames to Spark DataFrames
spark_show_df_cleaned = spark.createDataFrame(show_df)
spark_movie_df_cleaned = spark.createDataFrame(movie_df)

# Create temporary views for the cleaned Spark DataFrames
spark_show_df_cleaned.createOrReplaceTempView("showData_cleaned")
spark_movie_df_cleaned.createOrReplaceTempView("movieData_cleaned")

In [12]:
# Display the schema of the Spark DataFrame
spark_show_df_cleaned.printSchema()



root
 |-- Unnamed: 0: long (nullable = true)
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- release_year: long (nullable = true)
 |-- age_certification: string (nullable = true)
 |-- runtime: long (nullable = true)
 |-- genres: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- seasons: double (nullable = true)
 |-- imdb_score: double (nullable = true)



In [13]:
# Display the schema of the Spark DataFrame
spark_movie_df_cleaned.printSchema()

root
 |-- Unnamed: 0: long (nullable = true)
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- release_year: long (nullable = true)
 |-- age_certification: string (nullable = true)
 |-- runtime: long (nullable = true)
 |-- genres: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- imdb_score: double (nullable = true)



# What are the top-rated TV shows and movies based on IMDb scores?

In [14]:
# Finding top rated tv shows and movies, excluding rows with NaN IMDb scores
top_rated = spark.sql("""
    SELECT 'TV Show' AS type, title, imdb_score
    FROM showData_cleaned
    WHERE imdb_score IS NOT NULL AND NOT isnan(imdb_score)
    UNION ALL
    SELECT 'Movie' AS type, title, imdb_score
    FROM movieData_cleaned
    WHERE imdb_score IS NOT NULL AND NOT isnan(imdb_score)
    ORDER BY imdb_score DESC
    LIMIT 10
""")
top_rated.show()


+-------+--------------------+----------+
|   type|               title|imdb_score|
+-------+--------------------+----------+
|TV Show|            #ABtalks|       9.6|
|TV Show|        Breaking Bad|       9.5|
|TV Show|            Khawatir|       9.5|
|TV Show|          Our Planet|       9.3|
|TV Show|Avatar: The Last ...|       9.3|
|TV Show|          Reply 1988|       9.2|
|  Movie|Chhota Bheem & Kr...|       9.1|
|  Movie|               Major|       9.1|
|TV Show|        Kota Factory|       9.1|
|TV Show|           My Mister|       9.1|
+-------+--------------------+----------+



# Which genres are most common among TV shows and movies?

In [15]:
result = spark.sql("""
    SELECT type, trim('\"' FROM genre) AS genre, COUNT(*) AS count
    FROM
      (SELECT 'TV Show' AS type, explode(split(trim('[]' FROM genres), ', ')) AS genre FROM showData_cleaned
       UNION ALL
       SELECT 'Movie' AS type, explode(split(trim('[]' FROM genres), ', ')) AS genre FROM movieData_cleaned)
    WHERE genre != ""
    GROUP BY type, genre
    ORDER BY count DESC
""")
# Show the result
result.show()

+-------+---------------+-----+
|   type|          genre|count|
+-------+---------------+-----+
|  Movie|        'drama'| 1876|
|  Movie|       'comedy'| 1571|
|TV Show|        'drama'| 1092|
|  Movie|     'thriller'|  825|
|TV Show|       'comedy'|  754|
|  Movie|       'action'|  718|
|  Movie|      'romance'|  698|
|  Movie|'documentation'|  611|
|  Movie|        'crime'|  545|
|TV Show|       'action'|  439|
|TV Show|    'animation'|  423|
|TV Show|     'thriller'|  403|
|TV Show|        'crime'|  391|
|TV Show|        'scifi'|  385|
|  Movie|       'family'|  351|
|  Movie|     'european'|  344|
|TV Show|'documentation'|  341|
|TV Show|       'family'|  331|
|TV Show|      'fantasy'|  315|
|  Movie|      'fantasy'|  315|
+-------+---------------+-----+
only showing top 20 rows



# How does the distribution of IMDb scores differ between TV shows and movies?

In [16]:
from pyspark.sql.functions import col

# Calculate the average IMDb score for TV shows, excluding NaN values
tv_show_avg = spark.sql("""
    SELECT AVG(imdb_score) AS avg_imdb_score
    FROM showData_cleaned
    WHERE imdb_score IS NOT NULL AND NOT isnan(imdb_score)
""")
tv_show_avg_score = tv_show_avg.collect()[0]["avg_imdb_score"]

# Calculate the average IMDb score for movies, excluding NaN values
movie_avg = spark.sql("""
    SELECT AVG(imdb_score) AS avg_imdb_score
    FROM movieData_cleaned
    WHERE imdb_score IS NOT NULL AND NOT isnan(imdb_score)
""")
movie_avg_score = movie_avg.collect()[0]["avg_imdb_score"]

# Display the results
print("Average IMDb score for TV shows:", tv_show_avg_score)
print("Average IMDb score for movies:", movie_avg_score)


Average IMDb score for TV shows: 6.977926766374413
Average IMDb score for movies: 6.246748323126284


In [17]:
# Calculate the average IMDb score for TV shows and movies, excluding NaN values
result = spark.sql("""
    SELECT 'All' AS type,
           FORMAT_NUMBER(AVG(imdb_score), 2) AS avg_imdb_score
    FROM
    (
        SELECT imdb_score FROM showData_cleaned WHERE NOT isnan(imdb_score)
        UNION ALL
        SELECT imdb_score FROM movieData_cleaned WHERE NOT isnan(imdb_score)
    )
""")
result.show()


+----+--------------+
|type|avg_imdb_score|
+----+--------------+
| All|          6.51|
+----+--------------+



# What is the average runtime of TV shows and movies?

In [18]:
result = spark.sql("""
    SELECT 'TV Show' AS type, AVG(runtime) AS avg_runtime
    FROM showData_cleaned
    WHERE runtime IS NOT NULL
    UNION ALL
    SELECT 'Movie' AS type, AVG(runtime) AS avg_runtime
    FROM movieData_cleaned
    WHERE runtime IS NOT NULL
    GROUP BY type
""")
result.show()


+-------+------------------+
|   type|       avg_runtime|
+-------+------------------+
|TV Show|38.978157644824314|
|  Movie| 98.21367521367522|
+-------+------------------+



# How has the annual release pattern of TV shows and movies evolved over the span of 10 years, from 2013 to 2022?

In [19]:
# Query to count the number of TV shows and movies released each year
result = spark.sql("""
    SELECT type, release_year, COUNT(*) AS count
    FROM
      (SELECT 'TV Show' AS type, release_year FROM showData_cleaned
       UNION ALL
       SELECT 'Movie' AS type, release_year FROM movieData_cleaned)
    WHERE release_year IS NOT NULL
    GROUP BY type, release_year
    ORDER BY release_year DESC, type DESC
    LIMIT 20
""")

# Show the result
result.show()


+-------+------------+-----+
|   type|release_year|count|
+-------+------------+-----+
|TV Show|        2022|  171|
|  Movie|        2022|  200|
|TV Show|        2021|  314|
|  Movie|        2021|  473|
|TV Show|        2020|  314|
|  Movie|        2020|  500|
|TV Show|        2019|  311|
|  Movie|        2019|  525|
|TV Show|        2018|  300|
|  Movie|        2018|  473|
|TV Show|        2017|  179|
|  Movie|        2017|  384|
|TV Show|        2016|  134|
|  Movie|        2016|  228|
|TV Show|        2015|   89|
|  Movie|        2015|  134|
|TV Show|        2014|   44|
|  Movie|        2014|  109|
|TV Show|        2013|   35|
|  Movie|        2013|  100|
+-------+------------+-----+



# Which countries produce the most TV shows and movies?


In [20]:
# Query to count the number of TV shows and movies produced by each country
result = spark.sql("""
    SELECT production_country,
           SUM(CASE WHEN type = 'TV Show' THEN 1 ELSE 0 END) AS tv_count,
           SUM(CASE WHEN type = 'Movie' THEN 1 ELSE 0 END) AS movie_count,
           COUNT(*) AS total_count
    FROM
      (SELECT 'TV Show' AS type, explode(split(production_countries, ', ')) AS production_country FROM showData_cleaned
       UNION ALL
       SELECT 'Movie' AS type, explode(split(production_countries, ', ')) AS production_country FROM movieData_cleaned)
    WHERE production_country != "" AND production_country NOT LIKE "%[%" AND production_country NOT LIKE "%]%"
    GROUP BY production_country
    ORDER BY total_count DESC
""")

# Show the result
result.show()


+------------------+--------+-----------+-----------+
|production_country|tv_count|movie_count|total_count|
+------------------+--------+-----------+-----------+
|              'US'|       3|         36|         39|
|              'GB'|       0|         34|         34|
|              'FR'|       2|         29|         31|
|              'DE'|       0|         20|         20|
|              'CA'|       0|         11|         11|
|              'DK'|       0|         11|         11|
|              'NL'|       0|          8|          8|
|              'CH'|       0|          7|          7|
|              'CN'|       0|          6|          6|
|              'ES'|       0|          5|          5|
|              'BE'|       0|          5|          5|
|              'SE'|       0|          5|          5|
|              'MX'|       0|          5|          5|
|              'JP'|       1|          3|          4|
|              'AU'|       1|          3|          4|
|              'LB'|       0

# Which age certification (e.g., TV-MA, PG-13) is most common among TV shows and movies?

In [21]:
result = spark.sql("""
    SELECT age_certification, COUNT(*) AS count
    FROM
      (SELECT age_certification FROM showData_cleaned WHERE age_certification IS NOT NULL AND age_certification != '' AND age_certification != 'NaN'
       UNION ALL
       SELECT age_certification FROM movieData_cleaned WHERE age_certification IS NOT NULL AND age_certification != '' AND age_certification != 'NaN')
    GROUP BY age_certification
    ORDER BY count DESC
""")
result.show()


+-----------------+-----+
|age_certification|count|
+-----------------+-----+
|            TV-MA|  883|
|                R|  556|
|            TV-14|  474|
|            PG-13|  451|
|               PG|  233|
|            TV-PG|  188|
|                G|  124|
|            TV-Y7|  120|
|             TV-Y|  107|
|             TV-G|   79|
|            NC-17|   16|
+-----------------+-----+



# What is the average number of seasons for TV shows in each age certification category?

In [22]:
result = spark.sql("""
    SELECT age_certification, AVG(seasons) AS avg_seasons
    FROM showData_cleaned
    WHERE age_certification IS NOT NULL AND age_certification != 'NaN'
    GROUP BY age_certification
    ORDER BY avg_seasons DESC
""")

result.show()


+-----------------+------------------+
|age_certification|       avg_seasons|
+-----------------+------------------+
|             TV-G|3.6202531645569622|
|            TV-Y7|3.3916666666666666|
|             TV-Y|3.0373831775700935|
|            TV-PG| 2.702127659574468|
|            TV-14| 2.413502109704641|
|            TV-MA|1.7066817667044167|
+-----------------+------------------+



# How does the average IMDb score vary by production country for TV shows and movies?

In [23]:
result = spark.sql("""
    SELECT type, production_country, AVG(imdb_score) AS avg_imdb_score
    FROM
      (SELECT 'TV Show' AS type, explode(split(production_countries, ', ')) AS production_country, imdb_score FROM showData_cleaned
       UNION ALL
       SELECT 'Movie' AS type, explode(split(production_countries, ', ')) AS production_country, imdb_score FROM movieData_cleaned)
    WHERE production_country != "" AND production_country NOT LIKE "%[%" AND production_country NOT LIKE "%]%" AND imdb_score IS NOT NULL AND NOT isnan(imdb_score)
    GROUP BY type, production_country
    ORDER BY avg_imdb_score DESC
""")

result.show()


+-------+------------------+------------------+
|   type|production_country|    avg_imdb_score|
+-------+------------------+------------------+
|  Movie|              'EG'|               8.1|
|  Movie|              'BS'|               8.0|
|  Movie|              'MW'|               7.6|
|  Movie|              'NO'|              7.35|
|  Movie|              'LB'|               7.3|
|  Movie|              'BR'|               7.2|
|  Movie|              'AE'|               7.1|
|  Movie|              'UY'|               7.1|
|  Movie|              'ES'|7.0200000000000005|
|  Movie|              'GH'|               7.0|
|  Movie|              'ZA'| 6.949999999999999|
|  Movie|              'CH'| 6.914285714285714|
|  Movie|              'MX'|              6.88|
|TV Show|              'FR'|              6.85|
|  Movie|              'IS'| 6.833333333333333|
|  Movie|              'SE'|              6.82|
|  Movie|              'IR'|               6.8|
|  Movie|              'JO'|            

# Which genres have the highest average IMDb score for TV shows and movies?

In [29]:
# Finding the top genres for TV shows, excluding rows with NaN IMDb scores
top_genre_tv_shows = spark.sql("""
    SELECT 'TV Show' AS type, trim('\"' FROM genre) AS genre, AVG(imdb_score) AS avg_imdb_score
    FROM showData_cleaned
    LATERAL VIEW explode(split(trim('[]' FROM genres), ', ')) AS genre
    WHERE imdb_score IS NOT NULL AND NOT isnan(imdb_score)
    GROUP BY type, genre
    ORDER BY avg_imdb_score DESC
    LIMIT 1
""")

# Finding the top genres for movies, excluding rows with NaN IMDb scores
top_genre_movies = spark.sql("""
    SELECT 'Movie' AS type, trim('\"' FROM genre) AS genre, AVG(imdb_score) AS avg_imdb_score
    FROM movieData_cleaned
    LATERAL VIEW explode(split(trim('[]' FROM genres), ', ')) AS genre
    WHERE imdb_score IS NOT NULL AND NOT isnan(imdb_score)
    GROUP BY type, genre
    ORDER BY avg_imdb_score DESC
    LIMIT 1
""")

# Combining the results
result = top_genre_tv_shows.unionAll(top_genre_movies)
result.show()


+-------+---------+-----------------+
|   type|    genre|   avg_imdb_score|
+-------+---------+-----------------+
|TV Show|'history'|7.477064220183486|
|  Movie|         |              7.1|
+-------+---------+-----------------+



# Are there any trends in the average runtime of TV shows and movies over the years?

In [32]:
result = spark.sql("""
    SELECT type, release_year, AVG(runtime) AS avg_runtime
    FROM
        (SELECT 'TV Show' AS type, release_year, runtime FROM showData_cleaned
         UNION ALL
         SELECT 'Movie' AS type, release_year, runtime FROM movieData_cleaned)
    GROUP BY type, release_year
    ORDER BY release_year, type
""")

result.show()

+-------+------------+------------------+
|   type|release_year|       avg_runtime|
+-------+------------+------------------+
|TV Show|        1945|              51.0|
|  Movie|        1954|             107.5|
|  Movie|        1956|             120.0|
|  Movie|        1958|              77.0|
|  Movie|        1959|             142.0|
|  Movie|        1960|             158.0|
|  Movie|        1961|             158.0|
|  Movie|        1963|             186.0|
|  Movie|        1966|             117.0|
|  Movie|        1967|             130.0|
|  Movie|        1969|             129.0|
|TV Show|        1969|              30.0|
|  Movie|        1971|             102.0|
|  Movie|        1972|             103.0|
|TV Show|        1972|              43.0|
|  Movie|        1973|             131.0|
|  Movie|        1974|             162.0|
|  Movie|        1975|             100.5|
|  Movie|        1976|124.33333333333333|
|  Movie|        1977|             150.0|
+-------+------------+------------

# Which production countries have the highest average runtime for TV shows and movies?

In [33]:
#finding the production countries with the highest average runtime for shows
top_countries_tv_shows = spark.sql("""
    SELECT 'TV Show' AS type, production_countries, AVG(runtime) AS avg_runtime
    FROM showData_cleaned
    WHERE production_countries IS NOT NULL
    GROUP BY type, production_countries
    ORDER BY avg_runtime DESC
    LIMIT 1
""")

#finding the production countries with the highest average runtime for movies
top_countries_movies = spark.sql("""
    SELECT 'Movie' AS type, production_countries, AVG(runtime) AS avg_runtime
    FROM movieData_cleaned
    WHERE production_countries IS NOT NULL
    GROUP BY type, production_countries
    ORDER BY avg_runtime DESC
    LIMIT 1
""")

#combining the results
result= top_countries_tv_shows.unionAll(top_countries_movies)
result.show()


+-------+--------------------+-----------+
|   type|production_countries|avg_runtime|
+-------+--------------------+-----------+
|TV Show|        ['AR', 'ES']|       73.0|
|  Movie|  ['GB', 'US', 'JP']|      208.0|
+-------+--------------------+-----------+



# How does the distribution of runtime differ between TV shows and movies?

In [34]:
result= spark.sql("""
    SELECT type,
           percentile(runtime, 0.25) AS q1,
           percentile(runtime, 0.5) AS median,
           percentile(runtime, 0.75) AS q3,
           avg(runtime) AS mean,
           stddev(runtime) AS stddev
    FROM
        (SELECT 'TV Show' AS type, runtime FROM showData_cleaned
         UNION ALL
         SELECT 'Movie' AS type, runtime FROM movieData_cleaned)
    GROUP BY type
""")

result.show()


+-------+----+------+-----+------------------+------------------+
|   type|  q1|median|   q3|              mean|            stddev|
+-------+----+------+-----+------------------+------------------+
|TV Show|25.0|  41.0| 49.0|38.978157644824314| 17.62635409803705|
|  Movie|85.0|  98.0|115.0| 98.21367521367522|30.640639686236884|
+-------+----+------+-----+------------------+------------------+



# What is the average IMDb score for TV shows and movies released in each year?


In [36]:
result = spark.sql("""
    SELECT type, release_year, AVG(imdb_score) AS avg_imdb_score
    FROM
        (SELECT 'TV Show' AS type, release_year, imdb_score FROM showData_cleaned
         WHERE imdb_score IS NOT NULL AND NOT isnan(imdb_score)
         UNION ALL
         SELECT 'Movie' AS type, release_year, imdb_score FROM movieData_cleaned
         WHERE imdb_score IS NOT NULL AND NOT isnan(imdb_score))
    GROUP BY type, release_year
    ORDER BY release_year, type
""")
result.show()


+-------+------------+-----------------+
|   type|release_year|   avg_imdb_score|
+-------+------------+-----------------+
|  Movie|        1954|             7.45|
|  Movie|        1956|              6.7|
|  Movie|        1958|              7.5|
|  Movie|        1959|              6.6|
|  Movie|        1960|              6.4|
|  Movie|        1961|              7.5|
|  Movie|        1963|              7.6|
|  Movie|        1966|              7.3|
|  Movie|        1967|              7.7|
|  Movie|        1969|              8.1|
|TV Show|        1969|              8.8|
|  Movie|        1971|              7.7|
|  Movie|        1972|             6.95|
|TV Show|        1972|              8.1|
|  Movie|        1973|              5.1|
|  Movie|        1974|              6.5|
|  Movie|        1975|              7.3|
|  Movie|        1976|6.666666666666667|
|  Movie|        1977|              7.5|
|  Movie|        1978|              4.4|
+-------+------------+-----------------+
only showing top

# Are there any genres that have become more or less popular over the years?

In [37]:
result= spark.sql("""
    SELECT type, release_year, genre, COUNT(*) AS genre_count
    FROM
        (SELECT 'TV Show' AS type, release_year, explode(split(trim('[]' FROM genres), ', ')) AS genre FROM showData_cleaned
         UNION ALL
         SELECT 'Movie' AS type, release_year, explode(split(trim('[]' FROM genres), ', ')) AS genre FROM movieData_cleaned)
    WHERE genre != ""
    GROUP BY type, release_year, genre
    ORDER BY release_year, type, genre_count DESC
""")

result.show()

+-------+------------+---------------+-----------+
|   type|release_year|          genre|genre_count|
+-------+------------+---------------+-----------+
|TV Show|        1945|'documentation'|          1|
|  Movie|        1954|      'romance'|          2|
|  Movie|        1954|        'crime'|          1|
|  Movie|        1954|        'drama'|          1|
|  Movie|        1954|       'comedy'|          1|
|  Movie|        1956|       'action'|          1|
|  Movie|        1956|     'thriller'|          1|
|  Movie|        1956|        'drama'|          1|
|  Movie|        1956|      'romance'|          1|
|  Movie|        1958|        'drama'|          1|
|  Movie|        1958|        'crime'|          1|
|  Movie|        1958|       'comedy'|          1|
|  Movie|        1959|      'romance'|          1|
|  Movie|        1959|        'drama'|          1|
|  Movie|        1959|        'crime'|          1|
|  Movie|        1960|        'crime'|          1|
|  Movie|        1960|     'thr

# Which production countries have the most diverse genres in their TV shows and movies?


In [38]:
result = spark.sql("""
    SELECT type, production_countries, COUNT(DISTINCT genre) AS unique_genres_count
    FROM
        (SELECT 'TV Show' AS type, production_countries, explode(split(trim('[]' FROM genres), ', ')) AS genre FROM showData_cleaned
         UNION ALL
         SELECT 'Movie' AS type, production_countries, explode(split(trim('[]' FROM genres), ', ')) AS genre FROM movieData_cleaned)
    WHERE genre != ""
    GROUP BY type, production_countries
    ORDER BY unique_genres_count DESC
""")

result.show()

+-------+--------------------+-------------------+
|   type|production_countries|unique_genres_count|
+-------+--------------------+-------------------+
|  Movie|              ['US']|                 19|
|TV Show|              ['US']|                 19|
|  Movie|              ['IN']|                 18|
|TV Show|              ['JP']|                 18|
|TV Show|              ['ES']|                 18|
|  Movie|              ['ES']|                 18|
|TV Show|              ['GB']|                 18|
|  Movie|              ['DE']|                 17|
|TV Show|              ['FR']|                 17|
|  Movie|        ['GB', 'US']|                 17|
|  Movie|              ['CA']|                 17|
|  Movie|              ['GB']|                 17|
|TV Show|              ['KR']|                 17|
|  Movie|              ['FR']|                 17|
|TV Show|              ['CA']|                 16|
|  Movie|        ['US', 'GB']|                 16|
|TV Show|              ['MX']| 

# Analysis

1. The top-rated TV shows and movies based on IMDb scores showcase a diverse range of compelling content. Topping the list is the TV show "#ABtalks," boasting an impressive IMDb score of 9.6. This series likely features engaging interviews or discussions, captivating audiences with its insightful content. Following closely behind is the iconic TV show "Breaking Bad" with a rating of 9.5, known for its gripping storyline and exceptional acting. Another standout TV show is "Khawatir," also rated 9.5, which may be celebrated for its emotional depth or cultural significance. These top-rated TV shows exemplify the high-quality storytelling and production values that resonate with audiences worldwide, making them must-watch titles for fans of exceptional television.

2. The analysis reveals that drama is the most common genre among both TV shows and movies, with 1,092 TV shows and 1,876 movies falling into this category. Comedy is another popular genre, with 754 TV shows and 1,571 movies classified as comedies. Thriller, action, and romance are also prevalent genres in movies, while animation, sci-fi, and crime are among the more common genres for TV shows. Interestingly, documentaries are more common in movies (611) than in TV shows (341), suggesting a stronger preference for factual storytelling in cinematic formats. This breakdown of genre popularity provides valuable insights into audience preferences and the diversity of content available in the TV and movie landscapes.

3. The analysis shows a notable difference in the distribution of IMDb scores between TV shows and movies. On average, TV shows have a higher IMDb score of approximately 6.98, compared to movies which have an average score of approximately 6.25. This suggests that, on average, TV shows are rated more positively by viewers than movies. When considering all titles together, the average IMDb score across both TV shows and movies is approximately 6.51. This comparison highlights the varying reception of TV shows and movies among viewers, indicating potentially different factors at play in audience evaluation between the two types of content.

4. The average runtime differs significantly between TV shows and movies. TV shows have an average runtime of approximately 38.98 minutes per episode, while movies have an average runtime of approximately 98.21 minutes per movie. This indicates that TV shows tend to be much shorter in duration compared to movies, which typically have longer running times due to their standalone nature.

5. The annual release pattern of TV shows and movies has shown an interesting evolution over the past decade. In 2013, there were 100 movies released compared to 35 TV shows, indicating a higher focus on movies. However, this trend shifted over the years. By 2022, the number of TV show releases surpassed that of movies, with 171 TV shows released compared to 200 movies. This shift suggests a growing emphasis on TV show production and consumption over the years, possibly driven by changing viewer preferences and the rise of streaming platforms offering more TV show content. The data indicates a fluctuating pattern, with some years showing peaks in both TV show and movie releases, while other years have lower numbers, suggesting variations in industry trends and production cycles.

6. The data shows that the United States is the largest producer of both TV shows and movies, with 3 TV shows and 36 movies in the dataset. This is not surprising given Hollywood's dominance in the global entertainment industry. The United Kingdom follows closely behind, with 34 movies produced. France, Germany, and Canada also have notable contributions to movie production, with 29, 20, and 11 movies respectively. When considering TV shows alone, France ranks second with 2 TV shows, followed by the United Kingdom with 0 TV shows. This suggests that while the United Kingdom produces a significant number of movies, France has a stronger presence in TV show production among the countries listed. Overall, these countries represent the major players in the global TV show and movie production landscape.

7. The most common age certification among TV shows and movies in the dataset is "TV-MA," which stands for "Mature Audience." This certification is often used for content that is specifically designed to be viewed by adults and may contain strong or explicit language, violence, or sexual content. "TV-MA" is followed by "R" (Restricted), "TV-14" (Parents Strongly Cautioned - Some material may be inappropriate for children under 14), and "PG-13" (Parents Strongly Cautioned - Some material may be inappropriate for children under 13), indicating a mix of content targeting different age groups. Other certifications such as "PG" (Parental Guidance Suggested), "TV-PG" (Parental Guidance Suggested - Some material may not be suitable for children), and various TV ratings are also present but are less common compared to the top certifications mentioned above.


8. The distribution of IMDb scores between TV shows and movies shows some interesting differences. On average, TV shows tend to have higher IMDb scores compared to movies. The average IMDb score for TV shows in the dataset is approximately 6.98, while for movies, it is around 6.25. This suggests that TV shows in this dataset are generally better rated by viewers than movies. However, when considering all titles (both TV shows and movies) together, the average IMDb score is approximately 6.51, indicating a moderate overall rating across all types of content.

9. The average runtime differs significantly between TV shows and movies. TV shows in the dataset have an average runtime of approximately 38.98 minutes per episode, reflecting the typical duration of a single episode in a TV series. On the other hand, movies have a much longer average runtime of around 98.21 minutes per movie, reflecting the longer format of standalone films. These averages suggest that TV shows tend to be shorter in duration compared to movies, which is expected due to the episodic nature of TV series.

10. The annual release pattern of TV shows and movies over the past decade, from 2013 to 2022, has shown some interesting trends. In 2013, there were 35 TV shows and 100 movies released. The number of releases generally increased over the years, with peaks in 2019 and 2020. In 2019, there were 311 TV shows and 525 movies released, while in 2020, there were 314 TV shows and 500 movies released. The trend seems to have slightly declined in 2022, with 171 TV shows and 200 movies released. Overall, the data suggests a growing trend in the number of TV shows and movies released annually, with some fluctuations in certain years.

11. The analysis of production countries reveals that the United States leads in the production of both TV shows and movies, with a total count of 39 productions. The United Kingdom follows closely behind with 34 productions, predominantly movies. France takes the third spot with 31 productions, including both TV shows and movies. Other countries such as Germany, Canada, and Denmark also contribute to the production, although to a lesser extent. This data highlights the dominance of a few key countries in the global TV show and movie production landscape.

12. In terms of age certification, TV-MA (Mature Audience) emerges as the most common certification among TV shows and movies, with a count of 883. This is followed by R-rated productions with a count of 556. TV-14 and PG-13 certifications are also prevalent, with counts of 474 and 451 respectively. These certifications indicate that a significant portion of TV shows and movies cater to mature audiences, with content that may be unsuitable for younger viewers without parental guidance.

13. The average number of seasons for TV shows varies across different age certification categories. TV-G shows have the highest average number of seasons at approximately 3.62, followed by TV-Y7 with an average of 3.39 seasons. TV-Y shows have an average of 3.04 seasons, while TV-PG shows have an average of 2.70 seasons. TV-14 shows have an average of 2.41 seasons, and TV-MA shows have the lowest average number of seasons at approximately 1.71. This data suggests that TV shows with more mature content tend to have fewer seasons on average compared to shows with more family-friendly content.

14. The average IMDb score varies by production country for both TV shows and movies. For movies, countries like Egypt ('EG'), Bahamas ('BS'), and Malawi ('MW') have some of the highest average IMDb scores at 8.1, 8.0, and 7.6 respectively. For TV shows, France ('FR') leads with an average IMDb score of 6.85. This data indicates that the quality of TV shows and movies, as measured by IMDb scores, can vary significantly depending on the country of production.

15. The genres with the highest average IMDb score for TV shows and movies are 'history' and an empty genre category, both with an average IMDb score of 7.1. This suggests that TV shows and movies in the 'history' genre tend to be highly rated by IMDb users, as do productions without a specified genre.

16. There are noticeable trends in the average runtime of TV shows and movies over the years. For example, in 1945, the average runtime of a TV show was 51 minutes, while in 1954, the average runtime of a movie was 107.5 minutes. Over the years, there have been fluctuations in the average runtime, with some years showing significant increases or decreases. These trends may be influenced by various factors such as audience preferences, production techniques, and industry standards.

17. The production countries with the highest average runtime for TV shows and movies are Argentina and Spain ('AR', 'ES') for TV shows with an average runtime of 73 minutes, and the United Kingdom, United States, and Japan ('GB', 'US', 'JP') for movies with an average runtime of 208 minutes. This data suggests that certain countries tend to produce TV shows and movies with longer runtimes on average compared to others.

18. The distribution of runtime differs between TV shows and movies. For TV shows, the first quartile (q1) is 25 minutes, the median is 41 minutes, and the third quartile (q3) is 49 minutes. The mean runtime is approximately 38.98 minutes, with a standard deviation of approximately 17.63 minutes. For movies, the first quartile (q1) is 85 minutes, the median is 98 minutes, and the third quartile (q3) is 115 minutes. The mean runtime is approximately 98.21 minutes, with a standard deviation of approximately 30.64 minutes. These statistics indicate that TV shows generally have shorter runtimes and less variability in runtime compared to movies.

19. The average IMDb score varies for TV shows and movies released in each year. For example, movies released in 1954 have an average IMDb score of 7.45, while movies released in 1978 have an average score of 4.4. TV shows also show variation, with a notable spike in 1969, where TV shows released that year have an average IMDb score of 8.8. This data suggests that the quality of TV shows and movies, as measured by IMDb scores, can vary significantly from year to year.

20. Examining the genre distribution of TV shows and movies released over the years reveals interesting trends. For example, in 1945, the only genre represented was 'documentation', suggesting a limited range of genres at that time. However, as time progresses, more genres become popular. For instance, in 1954, genres like 'romance', 'crime', 'drama', and 'comedy' start appearing. This trend continues, with new genres being introduced over the years. Some genres may become more popular while others may decline in popularity, reflecting changing audience preferences and industry trends.

21. The diversity of genres in TV shows and movies varies across production countries. For instance, productions from the United States and India exhibit the most diverse range of genres, with 19 different genres represented in their content. This diversity suggests a rich and varied cultural and storytelling landscape in these countries' film and television industries. Other countries like Japan, Spain, and the United Kingdom also show a high diversity of genres in their productions, each boasting 18 different genres. This indicates a global trend of diverse storytelling and audience engagement across various cultural contexts.





