In [1]:
import os
import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, length

# Required for Spark to find Python executable
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [33]:
# spark.stop()
spark = SparkSession.builder.appName("imdb-practice").getOrCreate()

Carrega arquivos

In [3]:
df_titles = spark.read.csv('title_basics.tsv', header=True, inferSchema=True, sep='\t')

In [4]:
df_ratings = spark.read.csv('title_ratings.tsv', header=True, inferSchema=True, sep='\t')

In [11]:
# df_titles.head()
# df_titles.describe(['tconst']).show()
df_titles.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)



In [44]:
(df_titles
 # .filter(df_titles.genres.contains('Adventure'))
 .show(2))

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|           genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|  Animation,Short|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+
only showing top 2 rows



In [22]:
df_ratings.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- averageRating: double (nullable = true)
 |-- numVotes: integer (nullable = true)



Quantos filmes (incluindo os da televisão) foram lançados no ano de 2015?

In [16]:
df_titles.select(['titleType']).distinct().show()

+------------+
|   titleType|
+------------+
|    tvSeries|
|tvMiniSeries|
|     tvMovie|
|   tvEpisode|
|       movie|
|   tvSpecial|
|       video|
|   videoGame|
|     tvShort|
|       short|
| radioSeries|
|radioEpisode|
|     tvPilot|
+------------+



In [17]:
df_titles.filter((df_titles.startYear.isNotNull()) &
                 (df_titles.startYear == 2015) &
                 (df_titles.titleType.isin("movie", "tvMovie"))).count()


19987

Qual o gênero de títulos mais frequente?

In [19]:
(df_titles
 .groupBy('genres')
 .count()
 .sort(col("count").desc())
 .show(3))

+------+------+
|genres| count|
+------+------+
| Drama|880649|
|    \N|643012|
|Comedy|568956|
+------+------+
only showing top 3 rows



Qual o gênero com a melhor nota média de títulos?

In [21]:
df_join = df_titles.join(df_ratings, ['tconst'])
df_join.cache()

DataFrame[tconst: string, titleType: string, primaryTitle: string, originalTitle: string, isAdult: string, startYear: string, endYear: string, runtimeMinutes: string, genres: string, averageRating: double, numVotes: int]

In [22]:
df_join.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)
 |-- averageRating: double (nullable = true)
 |-- numVotes: integer (nullable = true)



In [23]:
(df_join
 .groupBy(df_join.genres)
 .avg('averageRating')
 .sort(col("avg(averageRating)").desc())
 .show(2))

+--------------------+------------------+
|              genres|avg(averageRating)|
+--------------------+------------------+
|Comedy,History,We...|               9.8|
|Biography,Comedy,...|               9.8|
+--------------------+------------------+
only showing top 2 rows



Qual o vídeo game do gênero aventura mais bem avaliado em 2020?

In [36]:
df_titles.select('genres').distinct().show()

+--------------------+
|              genres|
+--------------------+
|        Comedy,Sport|
|  Action,War,Western|
|Action,Adventure,...|
|Documentary,Drama...|
|Adult,Comedy,Musical|
|  Crime,Horror,Short|
|Animation,Sci-Fi,War|
|Documentary,News,...|
| Adult,Horror,Sci-Fi|
|Fantasy,Horror,Mu...|
| Music,Musical,Short|
| Documentary,Western|
|Adventure,Family,...|
|Comedy,Drama,Western|
|Game-Show,Reality-TV|
|Biography,Reality...|
|Film-Noir,Horror,...|
|Fantasy,Mystery,T...|
|Adventure,Crime,M...|
|Documentary,Myste...|
+--------------------+
only showing top 20 rows



In [49]:
(df_join.filter((col('genres').contains('Adventure')) &
                (df_join.titleType == 'videoGame') &
                (df_join.startYear == '2020'))
 .sort(df_ratings.averageRating.desc())
 .show(1)
 )

+----------+---------+---------------+---------------+-------+---------+-------+--------------+--------------------+----------+-------------+--------+
|    tconst|titleType|   primaryTitle|  originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|    tconst|averageRating|numVotes|
+----------+---------+---------------+---------------+-------+---------+-------+--------------+--------------------+----------+-------------+--------+
|tt11321196|videoGame|Half-Life: Alyx|Half-Life: Alyx|      0|     2020|     \N|            \N|Action,Adventure,...|tt11321196|          9.5|     506|
+----------+---------+---------------+---------------+-------+---------+-------+--------------+--------------------+----------+-------------+--------+
only showing top 1 row



Quantos títulos de filmes diferentes existem? Use df_titles.select('primaryTitle').distinct().count().

In [5]:
df_titles.select('primaryTitle').distinct().count()

3931670

Qual a duração média dos filmes com conteúdo adulto? Use uma combinação de filter() e describe().

In [7]:
df_titles.filter(df_titles.isAdult == 1).describe().show()

+-------+---------+---------+--------------------+--------------------+-------+------------------+------------------+-----------------+------+
|summary|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|         startYear|           endYear|   runtimeMinutes|genres|
+-------+---------+---------+--------------------+--------------------+-------+------------------+------------------+-----------------+------+
|  count|   250127|   250127|              250127|              250127| 250127|            250127|            250127|           250127|250127|
|   mean|     null|     null|            Infinity|            Infinity|    1.0|2009.9655151651386|2013.4869358669835|92.79938555059914|  null|
| stddev|     null|     null|                 NaN|                 NaN|    0.0| 9.116491116782225| 6.198536337845558|57.18982244754777|  null|
|    min|tt0060313|    movie|"A Different Side...|"A Different Side...|      1|              1901|              1980|                1|Action|

Quantos filmes têm o título atual (“primary”) diferente do título original? Use uma combinação de filter, e count().

In [8]:
df_titles.filter(df_titles.primaryTitle != df_titles.originalTitle).count()

125056

Qual o filme que tem o nome mais longo? Dica: consulte https://sparkbyexamples.com/spark/spark-using-length-size-of-a-dataframe-column/ Links to an external site.e use algo como df_titles.orderBy(length(col("primaryTitle")).desc()).

In [9]:
df_titles.orderBy(length(col("primaryTitle")).desc()).show(1)

+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+------+
|    tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|genres|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+------+
|tt12985206|    video|Otoko wa chi _ ko...|Otoko wa chi _ ko...|      1|     2020|     \N|            99| Adult|
+----------+---------+--------------------+--------------------+-------+---------+-------+--------------+------+
only showing top 1 row



Qual filme tem a maior quantidade de votos? Dica: Use describe().

In [29]:
(df_join.
 filter(df_join.titleType.isin("movie", "tvMovie"))
 # .describe(['numVotes'])
 .describe(['tconst','numVotes'])
 .show())

+-------+---------+------------------+
|summary|   tconst|          numVotes|
+-------+---------+------------------+
|  count|   267542|            267542|
|   mean|     null| 3413.902878800338|
| stddev|     null|32827.997305655605|
|    min|tt0000502|                 5|
|    max|tt9916538|           2449517|
+-------+---------+------------------+



In [31]:
df_join.filter(df_join.numVotes == '2449517').show()

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+------+-------------+--------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|genres|averageRating|numVotes|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+------+-------------+--------+
|tt0111161|    movie|The Shawshank Red...|The Shawshank Red...|      0|     1994|     \N|           142| Drama|          9.3| 2449517|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+------+-------------+--------+



Qual é a menor nota média de um filme? Use describe().

In [32]:
df_join.describe('averageRating').show()

+-------+------------------+
|summary|     averageRating|
+-------+------------------+
|  count|           1182639|
|   mean|  6.91702835776597|
| stddev|1.3974964575775894|
|    min|               1.0|
|    max|              10.0|
+-------+------------------+

