# Importing and exploring the data

In [20]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *

In [21]:
spark = SparkSession \
    .builder \
    .appName("imdb") \
    .getOrCreate()

spark.version

'3.5.1'

In [22]:
df_ratings = spark.read.csv("./data/title_ratings.tsv", header=True, sep='\t', inferSchema=True)
df_ratings.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- averageRating: double (nullable = true)
 |-- numVotes: integer (nullable = true)



In [38]:
df_basics = spark.read.csv("./data/title_basics.tsv", header=True, sep='\t',quote='')
df_basics.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)



In [24]:
df_basics.head(5)

[Row(tconst='tt0000001', titleType='short', primaryTitle='Carmencita', originalTitle='Carmencita', isAdult='0', startYear='1894', endYear='\\N', runtimeMinutes='1', genres='Documentary,Short'),
 Row(tconst='tt0000002', titleType='short', primaryTitle='Le clown et ses chiens', originalTitle='Le clown et ses chiens', isAdult='0', startYear='1892', endYear='\\N', runtimeMinutes='5', genres='Animation,Short'),
 Row(tconst='tt0000003', titleType='short', primaryTitle='Pauvre Pierrot', originalTitle='Pauvre Pierrot', isAdult='0', startYear='1892', endYear='\\N', runtimeMinutes='4', genres='Animation,Comedy,Romance'),
 Row(tconst='tt0000004', titleType='short', primaryTitle='Un bon bock', originalTitle='Un bon bock', isAdult='0', startYear='1892', endYear='\\N', runtimeMinutes='12', genres='Animation,Short'),
 Row(tconst='tt0000005', titleType='short', primaryTitle='Blacksmith Scene', originalTitle='Blacksmith Scene', isAdult='0', startYear='1893', endYear='\\N', runtimeMinutes='1', genres='C

In [25]:
df_basics.select('titleType').distinct().show()

+------------+
|   titleType|
+------------+
|    tvSeries|
|tvMiniSeries|
|     tvMovie|
|   tvEpisode|
|       movie|
|   tvSpecial|
|       video|
|   videoGame|
|     tvShort|
|       short|
|     tvPilot|
| radioSeries|
|radioEpisode|
+------------+



# Start Of questions

In [26]:
amount_movies = df_basics \
                .filter(
                    (col('startYear') == '2015') &
                    (col('titleType').isin('tvMovie','movie'))
                ) \
                .select(col('tconst')) \
                .distinct() \
                .count() 
amount_movies

19987

In [27]:
most_frequent_gender = df_basics \
            .withColumn('genres_splited',explode(split(col('genres'),','))) \
            .groupBy('genres_splited') \
            .count() \
            .orderBy(desc('count')) \
            .show()

+--------------+-------+
|genres_splited|  count|
+--------------+-------+
|         Drama|2247995|
|        Comedy|1653725|
|         Short|1021850|
|     Talk-Show| 900198|
|   Documentary| 764885|
|       Romance| 724729|
|            \N| 643012|
|        Family| 571470|
|          News| 524662|
|    Reality-TV| 423455|
|     Animation| 406284|
|         Music| 394008|
|         Crime| 351447|
|        Action| 334580|
|     Adventure| 324325|
|     Game-Show| 252533|
|         Adult| 242704|
|         Sport| 178594|
|       Fantasy| 174119|
|       Mystery| 162448|
+--------------+-------+
only showing top 20 rows



In [28]:
best_rating_avg = df_ratings \
    .join(df_basics,'tconst','left') \
    .withColumn('genres_splited',explode(split(col('genres'),','))) \
    .groupBy('genres_splited') \
    .agg(mean('averageRating').alias('avg_rating')) \
    .orderBy(desc('avg_rating')) \
    .show()

+--------------+------------------+
|genres_splited|        avg_rating|
+--------------+------------------+
|       History| 7.353780102645078|
|   Documentary|7.2401985355545975|
|     Biography| 7.175531914893612|
|       Mystery| 7.170086406897919|
|         Crime| 7.159842868485965|
|     Adventure|  7.10762970335177|
|       Fantasy|7.0951456508453745|
|     Animation| 7.089381171483263|
|       Western| 7.080683426568716|
|        Family| 7.070054926034488|
|         Drama| 7.040979155040174|
|           War| 7.009115134414925|
|        Action|7.0070981387478986|
|         Sport| 6.966792418526435|
|        Comedy|6.9600165509183896|
|         Music| 6.927469624015719|
|    Reality-TV| 6.892611170895959|
|     Game-Show| 6.876828101904184|
|       Romance|  6.86401616470397|
|         Short| 6.791292438368536|
+--------------+------------------+
only showing top 20 rows



In [35]:
adventure_game_best_rating = df_ratings \
    .join(df_basics,'tconst','left') \
    .withColumn('genres_splited',explode(split(col('genres'),','))) \
    .filter((col('genres_splited') == 'Adventure') & (col('titleType') == 'videoGame') & (col('startYear')=='2020')) \
    .groupBy('primaryTitle') \
    .agg(mean('averageRating').alias('avg_rating')) \
    .orderBy(desc('avg_rating')) \
    .show()

+--------------------+----------+
|        primaryTitle|avg_rating|
+--------------------+----------+
|     Half-Life: Alyx|       9.5|
|   Ghost of Tsushima|       9.3|
|               Omori|       9.2|
|Final Fantasy VII...|       9.1|
|Ori and the Will ...|       9.1|
|Mega Man Zero/ZX ...|       8.9|
|There Is No Game:...|       8.9|
|Yakuza: Like a Dr...|       8.8|
|Xenoblade Chronic...|       8.8|
|       Demon's Souls|       8.8|
|   Pixel Ripped 1995|       8.7|
|        Doom Eternal|       8.7|
|Spider-Man: Miles...|       8.6|
|    Astro's Playroom|       8.6|
|Call of Duty: Mod...|       8.5|
|               Haven|       8.5|
|Animal Crossing: ...|       8.4|
|      Desperados III|       8.4|
|        Pumpkin Jack|       8.4|
|Assassin's Creed ...|       8.3|
+--------------------+----------+
only showing top 20 rows



In [None]:
different_movie_titles = df_basics \
                        .select('primaryTitle') \
                        .distinct() \
                        .count()

different_movie_titles

3931670

In [40]:
adult_content_duration_avg = df_basics \
                            .filter(col('isAdult')==1) \
                            .describe() \
                            .show()         

+-------+---------+---------+--------------------+--------------------+--------------------+------------------+------------------+-----------------+------+
|summary|   tconst|titleType|        primaryTitle|       originalTitle|             isAdult|         startYear|           endYear|   runtimeMinutes|genres|
+-------+---------+---------+--------------------+--------------------+--------------------+------------------+------------------+-----------------+------+
|  count|   250127|   250127|              250127|              250127|              250127|            250127|            250127|           250127|250127|
|   mean|     NULL|     NULL|            Infinity|            Infinity|                 1.0|2009.9655151651386|2013.4869358669835|92.79938555059914|  NULL|
| stddev|     NULL|     NULL|                 NaN|                 NaN|8.419114939288556...|  9.11649111677599|6.1985363378455185|57.18982244754774|  NULL|
|    min|tt0060313|    movie|"A Different Side...|"A Different S

In [None]:
movies_w_diff_title = df_basics \
                    .filter(col('primaryTitle') != col('originalTitle')) \
                    .distinct() \
                    .count()
movies_w_diff_title

125056

In [None]:
most_long_title_name = df_basics \
                        .orderBy(length(col("primaryTitle")).desc()) \
                        .show()

"""
If we consider the filter "movie" consider in the question n12, then we have tconst = 'tt8682654', which is not available in the possible answers.
df_basics \
                        .filter(col('titleType').isin('movie','tvMovie')) \
                        .orderBy(length(col("primaryTitle")).desc()) \
                        .show()
"""

+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt12985206|    video|Otoko wa chi _ ko...|Otoko wa chi _ ko...|      1|     2020|     \N|            99|               Adult|
|tt13007080|    video|Creampie Voluntee...|Creampie Voluntee...|      1|     2015|     \N|           238|               Adult|
| tt7727908|    video|Yufukuna kurashi ...|Yufukuna kurashi ...|      1|     2013|     \N|           129|         Adult,Drama|
|tt12870546|    video|An Ultra K-Cup Ti...|An Ultra K-Cup Ti...|      1|     2020|     \N|           118|               Adult|
|tt12866892|    video|I'm A Dirty Old M...|I'm A Dirty Old M...|      1|     2016|     \N|            \N|      

'\nIf we consider the filter "movie" consider in the question n12, then we have tconst = \'tt8682654\', which is not available in the possible answers.\ndf_basics                         .filter(col(\'titleType\').isin(\'movie\',\'tvMovie\'))                         .orderBy(length(col("primaryTitle")).desc())                         .show()\n'

In [None]:
most_voted_movie = df_ratings.orderBy(desc('numVotes')).show()

+---------+-------------+--------+
|   tconst|averageRating|numVotes|
+---------+-------------+--------+
|tt0111161|          9.3| 2449517|
|tt0468569|          9.0| 2405191|
|tt1375666|          8.8| 2157649|
|tt0137523|          8.8| 1930108|
|tt0110912|          8.9| 1898801|
|tt0109830|          8.8| 1893438|
|tt0944947|          9.2| 1861842|
|tt0133093|          8.7| 1743487|
|tt0120737|          8.8| 1723007|
|tt0167260|          8.9| 1701824|
|tt0068646|          9.2| 1695582|
|tt0816692|          8.6| 1602096|
|tt1345836|          8.4| 1570542|
|tt0903747|          9.4| 1567066|
|tt0167261|          8.7| 1538629|
|tt0114369|          8.6| 1507762|
|tt1853728|          8.4| 1418048|
|tt0172495|          8.5| 1391926|
|tt0372784|          8.2| 1354062|
|tt0361748|          8.3| 1324955|
+---------+-------------+--------+
only showing top 20 rows



In [None]:
worst_note_movie = df_ratings.orderBy(asc('averageRating')).show()

+----------+-------------+--------+
|    tconst|averageRating|numVotes|
+----------+-------------+--------+
| tt5778264|          1.0|      27|
| tt1990974|          1.0|      13|
| tt1143217|          1.0|      40|
| tt2139143|          1.0|      36|
| tt0545951|          1.0|      14|
| tt1995289|          1.0|     450|
|tt11442214|          1.0|      19|
| tt1964233|          1.0|      10|
| tt0175201|          1.0|      19|
| tt1998956|          1.0|      34|
|tt11253082|          1.0|      78|
| tt1934806|          1.0|    1665|
| tt0588498|          1.0|     118|
| tt1998958|          1.0|      33|
|tt11450002|          1.0|       9|
| tt1936801|          1.0|      68|
| tt5964314|          1.0|      10|
| tt2011220|          1.0|      17|
| tt1085265|          1.0|      28|
| tt1936803|          1.0|      38|
+----------+-------------+--------+
only showing top 20 rows

