In [77]:
import os
import sys

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col

# Required for Spark to find Python executable
os.environ['PYSPARK_PYTHON'] = sys.executable
os.environ['PYSPARK_DRIVER_PYTHON'] = sys.executable

In [81]:
# spark.stop()
spark = (SparkSession
         .builder
         .config("spark.executor.memory", "4g")
         .config("spark.driver.memory", "4g")
         .appName("imdb-practice").getOrCreate())

Carrega arquivos

In [82]:
df_titles = spark.read.csv('title_basics.tsv', header=True, inferSchema=True, sep='\t')

In [60]:
df_ratings = spark.read.csv('title_ratings.tsv', header=True, inferSchema=True, sep='\t')

In [11]:
# df_titles.head()
# df_titles.describe(['tconst']).show()
df_titles.printSchema()

root
 |-- tconst: string (nullable = true)
 |-- titleType: string (nullable = true)
 |-- primaryTitle: string (nullable = true)
 |-- originalTitle: string (nullable = true)
 |-- isAdult: string (nullable = true)
 |-- startYear: string (nullable = true)
 |-- endYear: string (nullable = true)
 |-- runtimeMinutes: string (nullable = true)
 |-- genres: string (nullable = true)



In [44]:
(df_titles
 # .filter(df_titles.genres.contains('Adventure'))
 .show(2))

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|           genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|  Animation,Short|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+-----------------+
only showing top 2 rows



In [None]:
df_ratings.printSchema()

Monta o cache com os filmes que tem data de lancamento setada

In [83]:
df_titles_with_start_year = (df_titles
                             .filter(df_titles.startYear != '\\N'))
df_titles_with_start_year.cache()

DataFrame[tconst: string, titleType: string, primaryTitle: string, originalTitle: string, isAdult: string, startYear: string, endYear: string, runtimeMinutes: string, genres: string]

Qual o primeiro filme lancado?

Qual Ãºltimo filme lancado?

In [50]:
(df_titles_with_start_year
 .sort(df_titles.startYear.desc())
 .show(1))

+----------+---------+------------+-------------+-------+---------+-------+--------------+-------------------+
|    tconst|titleType|primaryTitle|originalTitle|isAdult|startYear|endYear|runtimeMinutes|             genres|
+----------+---------+------------+-------------+-------+---------+-------+--------------+-------------------+
|tt12072406|tvEpisode|       Pilot|        Pilot|      0|     2028|     \N|            \N|Action,Comedy,Drama|
+----------+---------+------------+-------------+-------+---------+-------+--------------+-------------------+
only showing top 1 row



Qual primeiro e ultimo filme lancados juntos?

In [62]:
df_first = (df_titles_with_start_year
            .sort(df_titles.startYear.asc())
            .limit(1)
            )

df_last = (df_titles_with_start_year
           .sort(df_titles.startYear.desc())
           .limit(1)
           )

df_first_last = df_first.union(df_last)

df_first_last.show()

+----------+---------+----------------+----------------+-------+---------+-------+--------------+-------------------+
|    tconst|titleType|    primaryTitle|   originalTitle|isAdult|startYear|endYear|runtimeMinutes|             genres|
+----------+---------+----------------+----------------+-------+---------+-------+--------------+-------------------+
| tt3155794|    short|Passage de Venus|Passage de Venus|      0|     1874|     \N|             1|  Documentary,Short|
|tt12072406|tvEpisode|           Pilot|           Pilot|      0|     2028|     \N|            \N|Action,Comedy,Drama|
+----------+---------+----------------+----------------+-------+---------+-------+--------------+-------------------+



Quantos filmes foram lancados por ano?

In [65]:
(df_titles_with_start_year
 .filter(df_titles_with_start_year.startYear >= 2010)
 .groupBy(df_titles_with_start_year.startYear)
 .count()
 .sort('startYear')
 .show())

+---------+------+
|startYear| count|
+---------+------+
|     2010|236565|
|     2011|268829|
|     2012|299152|
|     2013|320186|
|     2014|340980|
|     2015|358054|
|     2016|378041|
|     2017|398751|
|     2018|402244|
|     2019|384053|
|     2020|350984|
|     2021|220823|
|     2022|  6981|
|     2023|   465|
|     2024|    83|
|     2025|    17|
|     2026|    10|
|     2027|     7|
|     2028|     3|
+---------+------+



Quebre os generos em mais de uma linha

In [74]:
# Documentary,Short

(df_titles_with_start_year
 .select('tconst', split(df_titles_with_start_year.genres, ',').alias('genres'))
 .withColumn('genres', explode('genres'))
 .show(10))

+---------+-----------+
|   tconst|     genres|
+---------+-----------+
|tt0000001|Documentary|
|tt0000001|      Short|
|tt0000002|  Animation|
|tt0000002|      Short|
|tt0000003|  Animation|
|tt0000003|     Comedy|
|tt0000003|    Romance|
|tt0000004|  Animation|
|tt0000004|      Short|
|tt0000005|     Comedy|
+---------+-----------+
only showing top 10 rows



Qual o genero que mais possui filmes?

In [89]:
(df_titles_with_start_year
 .select('tconst',
         split(df_titles_with_start_year.genres, ',').alias('genres'))  # transforma genres em array
 .withColumn('genres', explode('genres'))  # quebra o array em varias linhas duplicando os filmes quando necessario
 .groupBy('genres')  # agrupa pelo genero
 .count()  # adicionado a coluna com o count
 .sort('count', ascending=False)  # ordena o count de maneira decrescente
 .show(10))  # print os primeiros registros

+-----------+-------+
|     genres|  count|
+-----------+-------+
|      Drama|1870032|
|     Comedy|1503149|
|      Short| 997854|
|  Talk-Show| 769095|
|Documentary| 727559|
|    Romance| 632349|
|         \N| 572955|
|     Family| 499757|
|       News| 454613|
| Reality-TV| 388890|
+-----------+-------+
only showing top 10 rows



In [85]:
# Valida o resultado acima
df_titles_with_start_year.filter(col('genres').contains('Drama')).count()

1870032