In [35]:
import os
import sys
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable


In [1]:
# Install Java
!apt-get install openjdk-8-jdk-headless -qq > /dev/null

# Install pyspark
!pip install pyspark

# Set up Spark session
from pyspark.sql import SparkSession

spark = SparkSession.builder \
    .master("local[*]") \
    .appName("SparkSQL") \
    .config("spark.executor.memory", "2g") \
    .config("spark.driver.memory", "2g") \
    .getOrCreate()


^C
Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=7dc209afcd266dee877c9cca9ee0e903eb735d98dbbb69fcd7da6f0a6dd4b86c
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.1


In [2]:
# Import packages
from pyspark.sql import SparkSession
import time
import pandas as pd
from pyspark.sql.functions import col
from pyspark.sql.functions import current_date, expr



# Create a SparkSession
spark = SparkSession.builder.appName("SparkSQL").getOrCreate()

In [3]:
from google.colab import drive
import pandas as pd

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
# Path to the CSV files in Google Drive
show_data_path = '/content/drive/My Drive/showData.csv'
movie_data_path = '/content/drive/My Drive/movieData.csv'

In [5]:
# Load the CSV files into DataFrames
show_df = pd.read_csv(show_data_path)
movie_df = pd.read_csv(movie_data_path)


In [6]:
# Display the first few rows of showData.csv
show_df.head()

Unnamed: 0.1,Unnamed: 0,id,title,release_year,release_decade,age_certification,runtime,genres,production_countries,seasons,imdb_score,imdb_votes,actors,directors
0,0,ts22164,Monty Python's Flying Circus,1969,60s,TV-14,30,"['comedy', 'european']",['GB'],4.0,8.8,73424.0,"['Graham Chapman 11472 ACTOR', 'Michael Palin ...",0
1,1,ts45948,Monty Python's Fliegender Zirkus,1972,70s,TV-MA,43,['comedy'],[],1.0,8.1,2151.0,"['Graham Chapman 11472 ACTOR', 'John Cleese 15...",['Ian MacNaughton 16383 DIRECTOR']
2,2,ts20681,Seinfeld,1989,80s,TV-PG,24,['comedy'],['US'],9.0,8.9,308824.0,"['Jerry Seinfeld 32954 ACTOR', 'Jason Alexande...",0
3,3,ts22082,Knight Rider,1982,80s,TV-PG,51,"['scifi', 'action', 'crime', 'drama']",['US'],4.0,6.9,34115.0,"['David Hasselhoff 8464 ACTOR', 'Edward Mulhar...",0
4,4,ts21715,Thomas & Friends,1984,80s,TV-Y,10,"['animation', 'family', 'comedy', 'fantasy', '...",['GB'],24.0,6.5,5104.0,['Rachael Louise Miller 1381431 ACTOR'],0


In [7]:
# Display the first few rows of movieData.csv
movie_df.head()

Unnamed: 0.1,Unnamed: 0,id,title,release_year,release_decade,age_certification,runtime,genres,production_countries,imdb_score,imdb_votes,actors,directors
0,0,tm84618,Taxi Driver,1976,70s,R,114,"['drama', 'crime']",['US'],8.2,808582.0,"['Robert De Niro 3748 ACTOR', 'Jodie Foster 14...",['Martin Scorsese 3308 DIRECTOR']
1,1,tm154986,Deliverance,1972,70s,R,109,"['drama', 'action', 'thriller', 'european']",['US'],7.7,107673.0,"['Jon Voight 10103 ACTOR', 'Burt Reynolds 1302...",['John Boorman 17727 DIRECTOR']
2,2,tm127384,Monty Python and the Holy Grail,1975,70s,PG,91,"['fantasy', 'action', 'comedy']",['GB'],8.2,534486.0,"['Graham Chapman 11472 ACTOR', 'John Cleese 15...","['Terry Jones 11475 DIRECTOR', 'Terry Gilliam ..."
3,3,tm120801,The Dirty Dozen,1967,60s,0,150,"['war', 'action']","['GB', 'US']",7.7,72662.0,"['Lee Marvin 35196 ACTOR', 'Ernest Borgnine 10...",['Robert Aldrich 1063 DIRECTOR']
4,4,tm70993,Life of Brian,1979,70s,R,94,['comedy'],['GB'],8.0,395024.0,"['Graham Chapman 11472 ACTOR', 'John Cleese 15...",['Terry Jones 11475 DIRECTOR']


In [8]:
# Replace NaN IMDb scores with None in show_df
show_df['imdb_score'] = show_df['imdb_score'].apply(lambda x: None if pd.isnull(x) else x)

# Replace NaN IMDb scores with None in movie_df
movie_df['imdb_score'] = movie_df['imdb_score'].apply(lambda x: None if pd.isnull(x) else x)

In [9]:
# Convert cleaned Pandas DataFrames to Spark DataFrames
spark_show_df_cleaned = spark.createDataFrame(show_df)
spark_movie_df_cleaned = spark.createDataFrame(movie_df)

# Create temporary views for the cleaned Spark DataFrames
spark_show_df_cleaned.createOrReplaceTempView("showData_cleaned")
spark_movie_df_cleaned.createOrReplaceTempView("movieData_cleaned")

In [10]:
# Display the schema of the Spark DataFrame
spark_show_df_cleaned.printSchema()



root
 |-- Unnamed: 0: long (nullable = true)
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- release_year: long (nullable = true)
 |-- release_decade: string (nullable = true)
 |-- age_certification: string (nullable = true)
 |-- runtime: long (nullable = true)
 |-- genres: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- seasons: double (nullable = true)
 |-- imdb_score: double (nullable = true)
 |-- imdb_votes: double (nullable = true)
 |-- actors: string (nullable = true)
 |-- directors: string (nullable = true)



In [11]:
# Display the schema of the Spark DataFrame
spark_movie_df_cleaned.printSchema()

root
 |-- Unnamed: 0: long (nullable = true)
 |-- id: string (nullable = true)
 |-- title: string (nullable = true)
 |-- release_year: long (nullable = true)
 |-- release_decade: string (nullable = true)
 |-- age_certification: string (nullable = true)
 |-- runtime: long (nullable = true)
 |-- genres: string (nullable = true)
 |-- production_countries: string (nullable = true)
 |-- imdb_score: double (nullable = true)
 |-- imdb_votes: double (nullable = true)
 |-- actors: string (nullable = true)
 |-- directors: string (nullable = true)



# What are the top-rated TV shows and movies based on IMDb scores?

In [12]:
# Finding top rated tv shows and movies, excluding rows with NaN IMDb scores
top_rated = spark.sql("""
    SELECT 'TV Show' AS type, title, imdb_score
    FROM showData_cleaned
    WHERE imdb_score IS NOT NULL AND NOT isnan(imdb_score)
    UNION ALL
    SELECT 'Movie' AS type, title, imdb_score
    FROM movieData_cleaned
    WHERE imdb_score IS NOT NULL AND NOT isnan(imdb_score)
    ORDER BY imdb_score DESC
    LIMIT 10
""")
top_rated.show()


+-------+--------------------+----------+
|   type|               title|imdb_score|
+-------+--------------------+----------+
|TV Show|            #ABtalks|       9.6|
|TV Show|        Breaking Bad|       9.5|
|TV Show|            Khawatir|       9.5|
|TV Show|          Our Planet|       9.3|
|TV Show|Avatar: The Last ...|       9.3|
|TV Show|          Reply 1988|       9.2|
|  Movie|               Major|       9.1|
|  Movie|Chhota Bheem & Kr...|       9.1|
|TV Show|      The Last Dance|       9.1|
|TV Show|           My Mister|       9.1|
+-------+--------------------+----------+



# Which genres are most common among TV shows and movies?

In [13]:
result = spark.sql("""
    SELECT type, trim('\"' FROM genre) AS genre, COUNT(*) AS count
    FROM
      (SELECT 'TV Show' AS type, explode(split(trim('[]' FROM genres), ', ')) AS genre FROM showData_cleaned
       UNION ALL
       SELECT 'Movie' AS type, explode(split(trim('[]' FROM genres), ', ')) AS genre FROM movieData_cleaned)
    WHERE genre != ""
    GROUP BY type, genre
    ORDER BY count DESC
""")
# Show the result
result.show()

+-------+---------------+-----+
|   type|          genre|count|
+-------+---------------+-----+
|  Movie|        'drama'| 1805|
|  Movie|       'comedy'| 1493|
|TV Show|        'drama'| 1022|
|  Movie|     'thriller'|  792|
|TV Show|       'comedy'|  725|
|  Movie|       'action'|  688|
|  Movie|      'romance'|  683|
|  Movie|'documentation'|  539|
|  Movie|        'crime'|  533|
|TV Show|       'action'|  421|
|TV Show|    'animation'|  401|
|TV Show|     'thriller'|  388|
|TV Show|        'crime'|  376|
|TV Show|        'scifi'|  360|
|  Movie|     'european'|  329|
|  Movie|       'family'|  328|
|TV Show|       'family'|  324|
|TV Show|'documentation'|  320|
|TV Show|      'fantasy'|  315|
|  Movie|      'fantasy'|  304|
+-------+---------------+-----+
only showing top 20 rows



# How does the distribution of IMDb scores differ between TV shows and movies?

In [14]:
from pyspark.sql.functions import col

# Calculate the average IMDb score for TV shows, excluding NaN values
tv_show_avg = spark.sql("""
    SELECT AVG(imdb_score) AS avg_imdb_score
    FROM showData_cleaned
    WHERE imdb_score IS NOT NULL AND NOT isnan(imdb_score)
""")
tv_show_avg_score = tv_show_avg.collect()[0]["avg_imdb_score"]

# Calculate the average IMDb score for movies, excluding NaN values
movie_avg = spark.sql("""
    SELECT AVG(imdb_score) AS avg_imdb_score
    FROM movieData_cleaned
    WHERE imdb_score IS NOT NULL AND NOT isnan(imdb_score)
""")
movie_avg_score = movie_avg.collect()[0]["avg_imdb_score"]

# Display the results
print("Average IMDb score for TV shows:", tv_show_avg_score)
print("Average IMDb score for movies:", movie_avg_score)


Average IMDb score for TV shows: 6.977926766374413
Average IMDb score for movies: 6.2467483231262815


In [15]:
# Calculate the average IMDb score for TV shows and movies, excluding NaN values
result = spark.sql("""
    SELECT 'All' AS type,
           FORMAT_NUMBER(AVG(imdb_score), 2) AS avg_imdb_score
    FROM
    (
        SELECT imdb_score FROM showData_cleaned WHERE NOT isnan(imdb_score)
        UNION ALL
        SELECT imdb_score FROM movieData_cleaned WHERE NOT isnan(imdb_score)
    )
""")
result.show()


+----+--------------+
|type|avg_imdb_score|
+----+--------------+
| All|          6.51|
+----+--------------+



# What is the average runtime of TV shows and movies?

In [16]:
result = spark.sql("""
    SELECT 'TV Show' AS type, AVG(runtime) AS avg_runtime
    FROM showData_cleaned
    WHERE runtime IS NOT NULL
    UNION ALL
    SELECT 'Movie' AS type, AVG(runtime) AS avg_runtime
    FROM movieData_cleaned
    WHERE runtime IS NOT NULL
    GROUP BY type
""")
result.show()


+-------+------------------+
|   type|       avg_runtime|
+-------+------------------+
|TV Show|38.765858690046414|
|  Movie|101.06620005832605|
+-------+------------------+



# How has the annual release pattern of TV shows and movies evolved over the span of 10 years, from 2013 to 2022?

In [17]:
# Query to count the number of TV shows and movies released each year
result = spark.sql("""
    SELECT type, release_year, COUNT(*) AS count
    FROM
      (SELECT 'TV Show' AS type, release_year FROM showData_cleaned
       UNION ALL
       SELECT 'Movie' AS type, release_year FROM movieData_cleaned)
    WHERE release_year IS NOT NULL
    GROUP BY type, release_year
    ORDER BY release_year DESC, type DESC
    LIMIT 20
""")

# Show the result
result.show()


+-------+------------+-----+
|   type|release_year|count|
+-------+------------+-----+
|TV Show|        2022|  156|
|  Movie|        2022|  180|
|TV Show|        2021|  289|
|  Movie|        2021|  429|
|TV Show|        2020|  257|
|  Movie|        2020|  424|
|TV Show|        2019|  283|
|  Movie|        2019|  465|
|TV Show|        2018|  286|
|  Movie|        2018|  446|
|TV Show|        2017|  173|
|  Movie|        2017|  367|
|TV Show|        2016|  130|
|  Movie|        2016|  213|
|TV Show|        2015|   88|
|  Movie|        2015|  124|
|TV Show|        2014|   41|
|  Movie|        2014|  103|
|TV Show|        2013|   35|
|  Movie|        2013|   92|
+-------+------------+-----+



# Which countries produce the most TV shows and movies?


In [18]:
# Query to count the number of TV shows and movies produced by each country
result = spark.sql("""
    SELECT production_country,
           SUM(CASE WHEN type = 'TV Show' THEN 1 ELSE 0 END) AS tv_count,
           SUM(CASE WHEN type = 'Movie' THEN 1 ELSE 0 END) AS movie_count,
           COUNT(*) AS total_count
    FROM
      (SELECT 'TV Show' AS type, explode(split(production_countries, ', ')) AS production_country FROM showData_cleaned
       UNION ALL
       SELECT 'Movie' AS type, explode(split(production_countries, ', ')) AS production_country FROM movieData_cleaned)
    WHERE production_country != "" AND production_country NOT LIKE "%[%" AND production_country NOT LIKE "%]%"
    GROUP BY production_country
    ORDER BY total_count DESC
""")

# Show the result
result.show()


+------------------+--------+-----------+-----------+
|production_country|tv_count|movie_count|total_count|
+------------------+--------+-----------+-----------+
|              'US'|       3|         33|         36|
|              'GB'|       0|         34|         34|
|              'FR'|       2|         27|         29|
|              'DE'|       0|         19|         19|
|              'CA'|       0|         10|         10|
|              'DK'|       0|         10|         10|
|              'NL'|       0|          8|          8|
|              'CH'|       0|          7|          7|
|              'CN'|       0|          6|          6|
|              'ES'|       0|          5|          5|
|              'BE'|       0|          5|          5|
|              'SE'|       0|          5|          5|
|              'MX'|       0|          5|          5|
|              'JP'|       1|          3|          4|
|              'AU'|       1|          3|          4|
|              'PL'|       0

# Which age certification (e.g., TV-MA, PG-13) is most common among TV shows and movies?

In [19]:
result = spark.sql("""
    SELECT age_certification, COUNT(*) AS count
    FROM
      (SELECT age_certification FROM showData_cleaned WHERE age_certification IS NOT NULL AND age_certification != '' AND age_certification != 'NaN'
       UNION ALL
       SELECT age_certification FROM movieData_cleaned WHERE age_certification IS NOT NULL AND age_certification != '' AND age_certification != 'NaN')
    GROUP BY age_certification
    ORDER BY count DESC
""")
result.show()


+-----------------+-----+
|age_certification|count|
+-----------------+-----+
|                0| 2335|
|            TV-MA|  833|
|                R|  530|
|            TV-14|  441|
|            PG-13|  435|
|               PG|  223|
|            TV-PG|  173|
|            TV-Y7|  111|
|                G|   99|
|             TV-Y|   98|
|             TV-G|   75|
|            NC-17|   15|
+-----------------+-----+



# What is the average number of seasons for TV shows in each age certification category?

In [20]:
result = spark.sql("""
    SELECT age_certification, AVG(seasons) AS avg_seasons
    FROM showData_cleaned
    WHERE age_certification IS NOT NULL AND age_certification != 'NaN'
    GROUP BY age_certification
    ORDER BY avg_seasons DESC
""")

result.show()


+-----------------+------------------+
|age_certification|       avg_seasons|
+-----------------+------------------+
|             TV-G|3.5866666666666664|
|            TV-Y7|3.5225225225225225|
|             TV-Y| 3.122448979591837|
|            TV-PG| 2.820809248554913|
|            TV-14| 2.507936507936508|
|            TV-MA|1.7346938775510203|
|                0|1.5528846153846154|
+-----------------+------------------+



# How does the average IMDb score vary by production country for TV shows and movies?

In [21]:
result = spark.sql("""
    SELECT type, production_country, AVG(imdb_score) AS avg_imdb_score
    FROM
      (SELECT 'TV Show' AS type, explode(split(production_countries, ', ')) AS production_country, imdb_score FROM showData_cleaned
       UNION ALL
       SELECT 'Movie' AS type, explode(split(production_countries, ', ')) AS production_country, imdb_score FROM movieData_cleaned)
    WHERE production_country != "" AND production_country NOT LIKE "%[%" AND production_country NOT LIKE "%]%" AND imdb_score IS NOT NULL AND NOT isnan(imdb_score)
    GROUP BY type, production_country
    ORDER BY avg_imdb_score DESC
""")

result.show()


+-------+------------------+------------------+
|   type|production_country|    avg_imdb_score|
+-------+------------------+------------------+
|  Movie|              'EG'|               8.1|
|  Movie|              'BS'|               8.0|
|  Movie|              'MW'|               7.6|
|  Movie|              'NO'|              7.35|
|  Movie|              'LB'|               7.3|
|  Movie|              'BR'|               7.2|
|  Movie|              'AE'|               7.1|
|  Movie|              'UY'|               7.1|
|  Movie|              'ES'|7.0200000000000005|
|  Movie|              'GH'|               7.0|
|  Movie|              'ZA'| 6.949999999999999|
|  Movie|              'CH'| 6.914285714285714|
|  Movie|              'MX'|              6.88|
|TV Show|              'FR'|              6.85|
|  Movie|              'IS'| 6.833333333333333|
|  Movie|              'SE'|              6.82|
|  Movie|              'IR'|               6.8|
|  Movie|              'JO'|            

Which genres have the highest average IMDb score for TV shows and movies?

In [25]:
#finding the top genres for shows
top_genre_tv_shows = spark.sql("""
    SELECT 'TV Show' AS type, trim('\"' FROM genre) AS genre, AVG(imdb_score) AS avg_imdb_score
    FROM showData_cleaned
    LATERAL VIEW explode(split(trim('[]' FROM genres), ', ')) AS genre
    GROUP BY type, genre
    ORDER BY avg_imdb_score DESC
    LIMIT 1
""")

#finding the top genres for movies
top_genre_movies = spark.sql("""
    SELECT 'Movie' AS type, trim('\"' FROM genre) AS genre, AVG(imdb_score) AS avg_imdb_score
    FROM movieData_cleaned
    LATERAL VIEW explode(split(trim('[]' FROM genres), ', ')) AS genre
    GROUP BY type, genre
    ORDER BY avg_imdb_score DESC
    LIMIT 1
""")
#combining the results
result = top_genre_tv_shows.unionAll(top_genre_movies)
result.show()

+-------+---------+-----------------+
|   type|    genre|   avg_imdb_score|
+-------+---------+-----------------+
|TV Show|'history'|7.477064220183486|
|  Movie|         |              7.1|
+-------+---------+-----------------+



Are there any trends in the average runtime of TV shows and movies over the years?

In [26]:
result = spark.sql("""
    SELECT type, release_year, AVG(runtime) AS avg_runtime
    FROM
        (SELECT 'TV Show' AS type, release_year, runtime FROM showData_cleaned
         UNION ALL
         SELECT 'Movie' AS type, release_year, runtime FROM movieData_cleaned)
    GROUP BY type, release_year
    ORDER BY release_year, type
""")

result.show()

+-------+------------+------------------+
|   type|release_year|       avg_runtime|
+-------+------------+------------------+
|  Movie|        1954|             107.5|
|  Movie|        1956|             120.0|
|  Movie|        1958|              77.0|
|  Movie|        1959|             142.0|
|  Movie|        1960|             158.0|
|  Movie|        1961|             158.0|
|  Movie|        1963|             186.0|
|  Movie|        1966|             117.0|
|  Movie|        1967|             130.0|
|  Movie|        1969|             129.0|
|TV Show|        1969|              30.0|
|  Movie|        1971|             102.0|
|  Movie|        1972|             103.0|
|TV Show|        1972|              43.0|
|  Movie|        1973|             131.0|
|  Movie|        1974|             162.0|
|  Movie|        1975|             100.5|
|  Movie|        1976|124.33333333333333|
|  Movie|        1977|             150.0|
|  Movie|        1978|             141.0|
+-------+------------+------------

Which production countries have the highest average runtime for TV shows and movies?

In [30]:
#finding the production countries with the highest average runtime for shows
top_countries_tv_shows = spark.sql("""
    SELECT 'TV Show' AS type, production_countries, AVG(runtime) AS avg_runtime
    FROM showData_cleaned
    WHERE production_countries IS NOT NULL
    GROUP BY type, production_countries
    ORDER BY avg_runtime DESC
    LIMIT 1
""")

#finding the production countries with the highest average runtime for movies
top_countries_movies = spark.sql("""
    SELECT 'Movie' AS type, production_countries, AVG(runtime) AS avg_runtime
    FROM movieData_cleaned
    WHERE production_countries IS NOT NULL
    GROUP BY type, production_countries
    ORDER BY avg_runtime DESC
    LIMIT 1
""")

#combining the results
result= top_countries_tv_shows.unionAll(top_countries_movies)
result.show()


+-------+--------------------+-----------+
|   type|production_countries|avg_runtime|
+-------+--------------------+-----------+
|TV Show|        ['AR', 'ES']|       73.0|
|  Movie|['CA', 'HU', 'MX'...|      164.0|
+-------+--------------------+-----------+



How does the distribution of runtime differ between TV shows and movies?

In [31]:
result= spark.sql("""
    SELECT type,
           percentile(runtime, 0.25) AS q1,
           percentile(runtime, 0.5) AS median,
           percentile(runtime, 0.75) AS q3,
           avg(runtime) AS mean,
           stddev(runtime) AS stddev
    FROM
        (SELECT 'TV Show' AS type, runtime FROM showData_cleaned
         UNION ALL
         SELECT 'Movie' AS type, runtime FROM movieData_cleaned)
    GROUP BY type
""")

result.show()


+-------+----+------+-----+------------------+------------------+
|   type|  q1|median|   q3|              mean|            stddev|
+-------+----+------+-----+------------------+------------------+
|TV Show|25.0|  41.0| 49.0|38.765858690046414|17.207135160629196|
|  Movie|88.0| 100.0|116.0|101.06620005832605|27.864493182803944|
+-------+----+------+-----+------------------+------------------+



What is the average IMDb score for TV shows and movies released in each year?


In [32]:
result= spark.sql("""
    SELECT type, release_year, AVG(imdb_score) AS avg_imdb_score
    FROM
        (SELECT 'TV Show' AS type, release_year, imdb_score FROM showData_cleaned
         UNION ALL
         SELECT 'Movie' AS type, release_year, imdb_score FROM movieData_cleaned)
    GROUP BY type, release_year
    ORDER BY release_year, type
""")
result.show()

+-------+------------+-----------------+
|   type|release_year|   avg_imdb_score|
+-------+------------+-----------------+
|  Movie|        1954|             7.45|
|  Movie|        1956|              6.7|
|  Movie|        1958|              7.5|
|  Movie|        1959|              6.6|
|  Movie|        1960|              6.4|
|  Movie|        1961|              7.5|
|  Movie|        1963|              7.6|
|  Movie|        1966|              7.3|
|  Movie|        1967|              7.7|
|  Movie|        1969|              8.1|
|TV Show|        1969|              8.8|
|  Movie|        1971|              7.7|
|  Movie|        1972|             6.95|
|TV Show|        1972|              8.1|
|  Movie|        1973|              5.1|
|  Movie|        1974|              6.5|
|  Movie|        1975|              7.3|
|  Movie|        1976|6.666666666666667|
|  Movie|        1977|              7.5|
|  Movie|        1978|              4.4|
+-------+------------+-----------------+
only showing top

Are there any genres that have become more or less popular over the years?

In [33]:
result= spark.sql("""
    SELECT type, release_year, genre, COUNT(*) AS genre_count
    FROM
        (SELECT 'TV Show' AS type, release_year, explode(split(trim('[]' FROM genres), ', ')) AS genre FROM showData_cleaned
         UNION ALL
         SELECT 'Movie' AS type, release_year, explode(split(trim('[]' FROM genres), ', ')) AS genre FROM movieData_cleaned)
    WHERE genre != ""
    GROUP BY type, release_year, genre
    ORDER BY release_year, type, genre_count DESC
""")

result.show()

+-----+------------+----------+-----------+
| type|release_year|     genre|genre_count|
+-----+------------+----------+-----------+
|Movie|        1954| 'romance'|          2|
|Movie|        1954|  'comedy'|          1|
|Movie|        1954|   'crime'|          1|
|Movie|        1954|   'drama'|          1|
|Movie|        1956|  'action'|          1|
|Movie|        1956|'thriller'|          1|
|Movie|        1956|   'drama'|          1|
|Movie|        1956| 'romance'|          1|
|Movie|        1958|   'drama'|          1|
|Movie|        1958|   'crime'|          1|
|Movie|        1958|  'comedy'|          1|
|Movie|        1959| 'romance'|          1|
|Movie|        1959|   'drama'|          1|
|Movie|        1959|   'crime'|          1|
|Movie|        1960|   'crime'|          1|
|Movie|        1960|'thriller'|          1|
|Movie|        1960|   'drama'|          1|
|Movie|        1961|   'drama'|          1|
|Movie|        1961|     'war'|          1|
|Movie|        1961|  'action'| 

Which production countries have the most diverse genres in their TV shows and movies?


In [34]:
result = spark.sql("""
    SELECT type, production_countries, COUNT(DISTINCT genre) AS unique_genres_count
    FROM
        (SELECT 'TV Show' AS type, production_countries, explode(split(trim('[]' FROM genres), ', ')) AS genre FROM showData_cleaned
         UNION ALL
         SELECT 'Movie' AS type, production_countries, explode(split(trim('[]' FROM genres), ', ')) AS genre FROM movieData_cleaned)
    WHERE genre != ""
    GROUP BY type, production_countries
    ORDER BY unique_genres_count DESC
""")

result.show()

+-------+--------------------+-------------------+
|   type|production_countries|unique_genres_count|
+-------+--------------------+-------------------+
|  Movie|              ['US']|                 19|
|TV Show|              ['US']|                 19|
|  Movie|              ['IN']|                 18|
|TV Show|              ['JP']|                 18|
|TV Show|              ['ES']|                 18|
|  Movie|              ['ES']|                 18|
|TV Show|              ['GB']|                 18|
|  Movie|        ['GB', 'US']|                 17|
|TV Show|              ['FR']|                 17|
|  Movie|              ['DE']|                 17|
|  Movie|              ['CA']|                 17|
|  Movie|              ['GB']|                 17|
|TV Show|              ['KR']|                 17|
|  Movie|              ['FR']|                 17|
|TV Show|              ['IT']|                 16|
|TV Show|              ['IN']|                 16|
|  Movie|              ['MX']| 