# Trabalho Prático - Módulo II Big Data

## Imports

In [167]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col
import pyspark.sql.functions as f
import pyspark.sql.types as t
from pyspark.sql.functions import desc #desc_nulls_last
from pyspark.sql.window import Window

In [28]:
import findspark
findspark.init()

In [29]:
spark = SparkSession.builder.config('spark.executor.memory', '8G').getOrCreate()

### Caminho dos arquivos

In [30]:
imdb_path = 'C:\\Users\\barba\\Documents\\desafio\\data\\imdb\\'

In [31]:
imdb_path

'C:\\Users\\barba\\Documents\\desafio\\data\\imdb\\'

In [32]:
df_titles = spark.read.csv(imdb_path + 'title_basics', header=True, sep='\t')


In [33]:
df_titles.limit(5).show()

+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|   tconst|titleType|        primaryTitle|       originalTitle|isAdult|startYear|endYear|runtimeMinutes|              genres|
+---------+---------+--------------------+--------------------+-------+---------+-------+--------------+--------------------+
|tt0000001|    short|          Carmencita|          Carmencita|      0|     1894|     \N|             1|   Documentary,Short|
|tt0000002|    short|Le clown et ses c...|Le clown et ses c...|      0|     1892|     \N|             5|     Animation,Short|
|tt0000003|    short|      Pauvre Pierrot|      Pauvre Pierrot|      0|     1892|     \N|             4|Animation,Comedy,...|
|tt0000004|    short|         Un bon bock|         Un bon bock|      0|     1892|     \N|            12|     Animation,Short|
|tt0000005|    short|    Blacksmith Scene|    Blacksmith Scene|      0|     1893|     \N|             1|        Comedy

In [34]:
df_ratings = spark.read.csv(imdb_path + 'title_ratings', header=True, sep='\t')

In [35]:
df_ratings.limit(5).toPandas()

Unnamed: 0,tconst,averageRating,numVotes
0,tt0000001,5.7,1826
1,tt0000002,6.0,234
2,tt0000003,6.5,1585
3,tt0000004,6.1,152
4,tt0000005,6.2,2405


### Quantos filmes (incluindo os da televisão) foram lançados no ano de 2015?

In [36]:
(
    df_titles
    .withColumn('startYear', f.col('startYear').cast('int'))
    .filter('startYear == 2015')
    .groupby('titleType')
    .agg(f.count(f.col('startYear')).alias('ano_2015'))
    .toPandas()
)


Unnamed: 0,titleType,ano_2015
0,tvSeries,10173
1,tvMiniSeries,2300
2,tvMovie,3563
3,tvEpisode,262588
4,movie,16471
5,tvSpecial,1280
6,video,13548
7,videoGame,1203
8,tvShort,642
9,short,46086


In [119]:
(
    df_titles
    .withColumn('startYear', f.col('startYear').cast('int'))
    .filter('(titleType IN("tvMovie", "movie")) AND startYear == 2015')
#     .groupby('titleType')
    .agg(f.count(f.col('startYear')).alias('filmes_lancados_2015'),)
    .toPandas()
)


Unnamed: 0,filmes_lancados_2015
0,20034


Qual o gênero de títulos mais frequente?
Dica: Utilize as funções split e explode.

In [38]:
  ( 
    df_titles
    .withColumn('genres_array', f.split(f.col('genres'), ','))
    .withColumn('genres_unico', f.explode(f.col('genres_array')))
    .groupby('genres_unico')
    .agg(f.count(f.col('genres_unico')).alias('total'),)
    .orderBy(desc('total'))
    .toPandas()
)

Unnamed: 0,genres_unico,total
0,Drama,2299982
1,Comedy,1676614
2,Short,1031687
3,Talk-Show,883421
4,Documentary,772857
5,Romance,733744
6,\N,640745
7,Family,585333
8,News,547293
9,Reality-TV,432424


Qual o gênero com a melhor nota média de títulos?

In [39]:
(
    df_titles
    .join(df_ratings, 'tconst', 'left')
    .withColumn('genres_array', f.split(f.col('genres'), ','))
    .withColumn('genres_unico', f.explode(f.col('genres_array')))
    .limit(5)
    .toPandas()
)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,averageRating,numVotes,genres_array,genres_unico
0,tt0000658,short,The Puppet's Nightmare,Le cauchemar de Fantoche,0,1908,\N,2,"Animation,Short",6.5,215.0,"[Animation, Short]",Animation
1,tt0000658,short,The Puppet's Nightmare,Le cauchemar de Fantoche,0,1908,\N,2,"Animation,Short",6.5,215.0,"[Animation, Short]",Short
2,tt0000839,short,The Curse of Money,The Curse of Money,0,1909,\N,\N,"Drama,Short",,,"[Drama, Short]",Drama
3,tt0000839,short,The Curse of Money,The Curse of Money,0,1909,\N,\N,"Drama,Short",,,"[Drama, Short]",Short
4,tt0001170,short,A Cowboy's Vindication,A Cowboy's Vindication,0,1910,\N,\N,"Short,Western",,,"[Short, Western]",Short


In [40]:
 (
    df_titles
    .join(df_ratings, 'tconst', 'left')
    .withColumn('genres_array', f.split(f.col('genres'), ','))
    .withColumn('averageRating', f.col('averageRating').cast('float'))
    .withColumn('genres_unico', f.explode(f.col('genres_array')))
    .groupby('genres_unico')
    .agg(f.avg(f.col('averageRating')).alias('maior_media'),)
    .orderBy(desc('maior_media'))
    .toPandas()
)

Unnamed: 0,genres_unico,maior_media
0,History,7.354228
1,Documentary,7.242024
2,Biography,7.170921
3,Mystery,7.162031
4,Crime,7.160669
5,Adventure,7.112291
6,Fantasy,7.101233
7,Animation,7.095853
8,Family,7.081125
9,Western,7.065772


Qual o vídeo game do gênero aventura mais bem avaliado em 2020?

In [44]:
 (
    df_titles
    .join(df_ratings, 'tconst', 'left')
    .filter('titleType == "videoGame"')
    .withColumn('averageRating', f.col('averageRating').cast('float'))
    .withColumn('genres_array', f.split(f.col('genres'), ','))
    .withColumn('genres_unico', f.explode(f.col('genres_array')))
    .withColumn('primaryTitle', f.col('primaryTitle'))
    .withColumn('originalTitle', f.col('originalTitle'))
    .groupby('genres_unico')
    .agg(f.avg(f.col('averageRating')).alias('maior_media'),)
    .orderBy(desc('maior_media'))
    .toPandas()
)

Unnamed: 0,genres_unico,maior_media
0,Drama,7.614069
1,History,7.514634
2,Mystery,7.426744
3,Biography,7.35
4,Crime,7.348819
5,Fantasy,7.288354
6,Adventure,7.286398
7,Horror,7.233473
8,Comedy,7.193514
9,War,7.169014


In [52]:
 (
    df_titles
    .join(df_ratings, 'tconst', 'left')
    .filter('titleType == "videoGame" and startYear == 2020')
    .withColumn('genres_array', f.split(f.col('genres'), ','))
    .withColumn('genres_unico', f.explode(f.col('genres_array')))
    .withColumn('primaryTitle', f.col('primaryTitle'))
    .withColumn('originalTitle', f.col('originalTitle'))
    .filter('genres_unico == "Adventure"')
    .groupby('originalTitle')
    .agg(f.avg(f.col('averageRating')).alias('maior_media'),)
    .orderBy(desc('maior_media'))
    .toPandas()
)

Unnamed: 0,originalTitle,maior_media
0,Half-Life: Alyx,9.5
1,Ghost of Tsushima,9.3
2,Omori,9.2
3,Final Fantasy VII Remake,9.1
4,Ori and the Will of the Wisps,9.1
...,...,...
296,For the Love of Nachos,
297,Tintin Match,
298,Gigycube,
299,Kazakh Drive,


Qual seria a forma mais adequada de preencher dados nulos da coluna “col1” com o valor da coluna “col2”?

In [72]:
(
    df_titles
    .replace('\\N', None, subset=['startYear', 'endYear', 'runtimeMinutes'])
    .filter("startYear is null and runtimeMinutes is not null")
#     .na.fill('Não se sabe', subset=['startYear'])
#     .orderBy(f.asc_nulls_first('endYear'))
#     .na.drop(subset=['startYear'])
    .withColumn('coalesce_test', f.coalesce(f.col("startYear"), f.col("runtimeMinutes"), f.lit('Sem ano')))
    .limit(5)
    .toPandas()
)

Unnamed: 0,tconst,titleType,primaryTitle,originalTitle,isAdult,startYear,endYear,runtimeMinutes,genres,coalesce_test
0,tt0067098,tvEpisode,Willi Forst,Willi Forst,0,,,55,\N,55
1,tt0073399,movie,Atlantic City Jackpot,The Money,0,,,88,"Action,Drama",88
2,tt0090238,movie,Unternehmen Geigenkasten,Unternehmen Geigenkasten,0,,,82,"Crime,Family",82
3,tt0098828,tvSeries,Idol Angel Yohkoso Yohko,Idol Angel Yohkoso Yohko,0,,,23,"Animation,Comedy",23
4,tt0102172,tvEpisode,Episode #1.2,Episode #1.2,0,,,81,"Drama,History",81


Qual o percentual de títulos do gênero comédia lançados em 2018 em relação ao total de títulos lançados nesse ano?

Dica: Utilize as funções Split, explode e uma window function.

In [151]:
 
    w = Window.partitionBy('genres_unico').orderBy('startYear').rowsBetween(Window.unboundedPreceding, Window.currentRow)
    (
    df_titles
    .withColumn('startYear', f.col('startYear').cast('int'))
    .withColumn('genres_array', f.split(f.col('genres'), ','))
    .withColumn('genres_unico', f.explode(f.col('genres_array')))
    .withColumn('primaryTitle', f.col('primaryTitle'))
    .withColumn('originalTitle', f.col('originalTitle'))
    .filter('(startYear == 2018 OR startYear == 2021) AND genres_unico == "Comedy"')
    .groupby('genres_unico','startYear')
    .agg(f.expr('avg(originalTitle) as total_filmes'))
    .withColumn('variacao % 2021 - 2018', f.round(f.mean('total_filmes').over(w), 3))
    .toPandas()
)
 

Unnamed: 0,genres_unico,startYear,total_filmes,variacao % 2021 - 2018
0,Comedy,2018,5316.888679,5316.889
1,Comedy,2021,3875.542735,4596.216


In [158]:
from unidecode import unidecode
from pyspark.sql.types import StringType


In [169]:
unidecode('àáâçéõü')
def unidecode_function(string):
    if not string:
        return None
    else:
        return unidecode(string)

unidecode_udf = f.udf(unidecode_function, returnType=StringType())


In [178]:

def sqr_divide(value):

    return (value**2)/2

sqr_divide_udf = f.udf(sqr_divide)

In [179]:
(
    df_ratings
    .withColumn('averageRating', f.col('averageRating').cast('double'))
    .select(sqr_divide_udf('averageRating').alias('averageRating'))
    .agg(f.mean('averageRating').alias('averageRating'))
    .show()
)

+-----------------+
|    averageRating|
+-----------------+
|24.93777977614497|
+-----------------+

