<a href="https://colab.research.google.com/github/Erike-Simon/CESAR-AED/blob/main/ProcDados_spark_movies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the dataset files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install --upgrade pyspark

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=41e14419df3fd6a4171181186600143076495858cf71b421405ad005e68c269c
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [None]:
import os
import pandas as pd
import pyspark.sql.functions as F

from pyspark.sql import SparkSession

# Launch options for the JVM that PySpark starts; must be set before the session is created.
# The value is assembled from a list so that no source indentation leaks into the
# environment variable (a backslash-continued string literal keeps every leading space).
# NOTE(review): with master "local[*]" everything presumably runs inside the driver JVM,
# so --executor-memory likely has no effect there - confirm against the Spark docs.
os.environ['PYSPARK_SUBMIT_ARGS'] = ' '.join([
    '--driver-memory 4G',
    '--executor-memory 4G',
    'pyspark-shell',
])

# Use plotly instead of matplotlib as the plotting engine for pandas .plot() calls
pd.options.plotting.backend = "plotly"

In [None]:
# Start (or reuse) a local Spark session. "local[*]" runs Spark in-process with
# as many worker threads as there are CPU cores available.
spark = (
    SparkSession.builder
    .master("local[*]")
    .getOrCreate()
)
spark

In [None]:
# Uncomment to stop the Spark session and release its resources
# spark.stop()

## Explorando os datasets

In [None]:
# Absolute path under the Drive mount point ('/content/drive') so that reads do not
# depend on the notebook's current working directory.
ROOT_DATA_PATH = "/content/drive/MyDrive/data/ml-25m"

In [None]:
# Load the movie catalogue: the first CSV line is the header and column types are inferred.
movies_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(f'{ROOT_DATA_PATH}/movies.csv')
)
movies_df.show(n=5)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [None]:
# Confirm that the reader returned a Spark DataFrame
type(movies_df)

pyspark.sql.dataframe.DataFrame

In [None]:
# Print the column names and their types
movies_df.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- genres: string (nullable = true)



In [None]:
# Load the user-assigned tags with header and inferred column types.
# NOTE(review): 'timestamp' is inferred as string here, which suggests at least one
# non-numeric value in that column - possibly rows whose tag contains quotes being
# misparsed; consider reader option escape='"' - confirm against the raw file.
tags_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(f'{ROOT_DATA_PATH}/tags.csv')
)
tags_df.show(n=5)

+------+-------+----------------+----------+
|userId|movieId|             tag| timestamp|
+------+-------+----------------+----------+
|     3|    260|         classic|1439472355|
|     3|    260|          sci-fi|1439472256|
|     4|   1732|     dark comedy|1573943598|
|     4|   1732|  great dialogue|1573943604|
|     4|   7569|so bad it's good|1573943455|
+------+-------+----------------+----------+
only showing top 5 rows



In [None]:
# Inspect the inferred column types of the tags data
tags_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- tag: string (nullable = true)
 |-- timestamp: string (nullable = true)



O trecho de código abaixo converte os valores da coluna *'timestamp'* de `tags_df`, que estão em formato Unix time (segundos desde 1970-01-01 UTC) e foram inferidos como `string` no schema acima, para o tipo Timestamp do Spark, facilitando assim a manipulação e a análise de datas e horários no DataFrame.

'F' refere-se ao módulo functions do PySpark, que é comumente importado e renomeado como F para facilitar o uso. `F.to_timestamp()` e `F.from_unixtime()` são funções desse módulo que são usadas para manipular valores de data/hora em colunas do DataFrame. Ao usar o alias F, podemos acessar essas funções de forma mais concisa, tornando o código mais legível e compacto.

In [None]:
# Replace the epoch-seconds 'timestamp' column with a real timestamp column:
# from_unixtime() renders the epoch as a date-time string, to_timestamp() parses it back.
# NOTE(review): the rendered wall-clock time presumably follows the Spark session
# time zone - confirm if the notebook is run outside Colab.
epoch_as_text = F.from_unixtime('timestamp')
tags_df = tags_df.withColumn('timestamp', F.to_timestamp(epoch_as_text))
tags_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- tag: string (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [None]:
# Show the first rows again, now with the converted timestamp column
tags_df.show(5)

+------+-------+----------------+-------------------+
|userId|movieId|             tag|          timestamp|
+------+-------+----------------+-------------------+
|     3|    260|         classic|2015-08-13 13:25:55|
|     3|    260|          sci-fi|2015-08-13 13:24:16|
|     4|   1732|     dark comedy|2019-11-16 22:33:18|
|     4|   1732|  great dialogue|2019-11-16 22:33:24|
|     4|   7569|so bad it's good|2019-11-16 22:30:55|
+------+-------+----------------+-------------------+
only showing top 5 rows



In [None]:
# Load the ratings with an explicit schema instead of inferSchema=True: inference
# needs an additional pass over the whole file just to guess the column types, and
# this is by far the largest table used here. The resulting schema is the same as the
# inferred one once 'timestamp' has been converted (int, int, double, timestamp).
ratings_schema = 'userId INT, movieId INT, rating DOUBLE, timestamp LONG'

ratings_df = (
    spark.read
    .csv(f'{ROOT_DATA_PATH}/ratings.csv', header=True, schema=ratings_schema)
    # epoch seconds -> timestamp column, same conversion as applied to tags_df
    .withColumn('timestamp', F.to_timestamp(F.from_unixtime('timestamp')))
)
ratings_df.show(5)

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|    296|   5.0|2006-05-17 15:34:04|
|     1|    306|   3.5|2006-05-17 12:26:57|
|     1|    307|   5.0|2006-05-17 12:27:08|
|     1|    665|   5.0|2006-05-17 15:13:40|
|     1|    899|   3.5|2006-05-17 12:21:50|
+------+-------+------+-------------------+
only showing top 5 rows



In [None]:
# Inspect the column types of the ratings data after the timestamp conversion
ratings_df.printSchema()

root
 |-- userId: integer (nullable = true)
 |-- movieId: integer (nullable = true)
 |-- rating: double (nullable = true)
 |-- timestamp: timestamp (nullable = true)



In [None]:
# Load the movieId -> external id (IMDb / TMDB) mapping with header and inferred types.
links_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(f'{ROOT_DATA_PATH}/links.csv')
)
links_df.show(n=5)

+-------+------+------+
|movieId|imdbId|tmdbId|
+-------+------+------+
|      1|114709|   862|
|      2|113497|  8844|
|      3|113228| 15602|
|      4|114885| 31357|
|      5|113041| 11862|
+-------+------+------+
only showing top 5 rows



In [None]:
# Inspect the inferred column types of the links data
links_df.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- imdbId: integer (nullable = true)
 |-- tmdbId: integer (nullable = true)



In [None]:
# Load the genome tag dictionary (tagId -> tag text) with header and inferred types.
gtags_df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv(f'{ROOT_DATA_PATH}/genome-tags.csv')
)
gtags_df.show(n=5)

+-----+------------+
|tagId|         tag|
+-----+------------+
|    1|         007|
|    2|007 (series)|
|    3|18th century|
|    4|       1920s|
|    5|       1930s|
+-----+------------+
only showing top 5 rows



In [None]:
# Inspect the inferred column types of the genome tags
gtags_df.printSchema()

root
 |-- tagId: integer (nullable = true)
 |-- tag: string (nullable = true)



In [None]:
# Load the genome relevance scores with an explicit schema instead of inferSchema=True,
# which would require an extra pass over the file only to guess the types. The declared
# types match what inference produces for this file (int, int, double).
gscores_schema = 'movieId INT, tagId INT, relevance DOUBLE'

gscores_df = spark.read.csv(
    f'{ROOT_DATA_PATH}/genome-scores.csv',
    header=True,
    schema=gscores_schema,
)
gscores_df.show(5)

+-------+-----+--------------------+
|movieId|tagId|           relevance|
+-------+-----+--------------------+
|      1|    1|0.028749999999999998|
|      1|    2|0.023749999999999993|
|      1|    3|              0.0625|
|      1|    4| 0.07574999999999998|
|      1|    5|             0.14075|
+-------+-----+--------------------+
only showing top 5 rows



In [None]:
# Inspect the column types of the genome scores
gscores_df.printSchema()

root
 |-- movieId: integer (nullable = true)
 |-- tagId: integer (nullable = true)
 |-- relevance: double (nullable = true)



## Executando merge de dados

In [None]:
# count() is the PySpark method that returns the number of rows of a DataFrame
movies_df.count()

62423

In [None]:
# Row count of the links table, to compare with the movies table before joining
links_df.count()

62423

In [None]:
# Attach the external ids from links_df to every movie and mark the result for caching.
# NOTE(review): this rebinds movies_df to its own join result, so re-running the cell
# without reloading movies_df would presumably add the link columns a second time.
movies_with_links = movies_df.join(links_df, on='movieId', how='inner')
movies_df = movies_with_links.cache()
movies_df.show(n=5)

+-------+--------------------+--------------------+------+------+
|movieId|               title|              genres|imdbId|tmdbId|
+-------+--------------------+--------------------+------+------+
|      1|    Toy Story (1995)|Adventure|Animati...|114709|   862|
|      2|      Jumanji (1995)|Adventure|Childre...|113497|  8844|
|      3|Grumpier Old Men ...|      Comedy|Romance|113228| 15602|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|114885| 31357|
|      5|Father of the Bri...|              Comedy|113041| 11862|
+-------+--------------------+--------------------+------+------+
only showing top 5 rows



In [None]:
# Row count after the inner join, to check that no movie was lost or duplicated
movies_df.count()

62423

O método `.cache()` é utilizado em Spark para armazenar em cache um DataFrame na memória ou em disco, dependendo da configuração do ambiente. Quando um DataFrame é armazenado em cache, ele é mantido na memória ou em disco para acesso rápido em operações subsequentes.

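Ao chamar `.cache()` em um DataFrame, o Spark tenta armazenar o DataFrame em cache na memória, se houver espaço disponível. Se não houver memória suficiente para armazenar todos os dados, o Spark armazenará parte dos dados em cache na memória e o restante será armazenado em disco. Importante: `.cache()` é uma operação *lazy* — ela apenas marca o DataFrame para ser armazenado; os dados só são de fato calculados e guardados quando a primeira ação (por exemplo, `.show()` ou `.count()`) é executada. Por isso a primeira ação continua lenta e apenas as seguintes ficam rápidas.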

## Quais são os top 10 filmes mais avaliados?

In [None]:
# Quick look at the ratings data used in this section
ratings_df.show(5)

+------+-------+------+-------------------+
|userId|movieId|rating|          timestamp|
+------+-------+------+-------------------+
|     1|    296|   5.0|2006-05-17 15:34:04|
|     1|    306|   3.5|2006-05-17 12:26:57|
|     1|    307|   5.0|2006-05-17 12:27:08|
|     1|    665|   5.0|2006-05-17 15:13:40|
|     1|    899|   3.5|2006-05-17 12:21:50|
+------+-------+------+-------------------+
only showing top 5 rows



In [None]:
%%time

# Time a full row count over the ratings data
ratings_df.count()

CPU times: user 122 ms, sys: 13 ms, total: 135 ms
Wall time: 18.1 s


25000095

In [None]:
%%time

df = ratings_df.groupBy('movieId')\
    .count()\
    .withColumnRenamed('count', 'ratings_count')\
    .orderBy(F.desc('ratings_count'))\
    .limit(10)\
    .cache() # Para salvar o df computado na memória RAM dos workers do cluster. Sai imediatamente.

CPU times: user 7.68 ms, sys: 0 ns, total: 7.68 ms
Wall time: 152 ms


In [None]:
%%time
# First action on df: the aggregation is actually computed here and the cache gets populated
df.show()

+-------+-------------+
|movieId|ratings_count|
+-------+-------------+
|    356|        81491|
|    318|        81482|
|    296|        79672|
|    593|        74127|
|   2571|        72674|
|    260|        68717|
|    480|        64144|
|    527|        60411|
|    110|        59184|
|   2959|        58773|
+-------+-------------+

CPU times: user 189 ms, sys: 25.7 ms, total: 214 ms
Wall time: 36.2 s


In [None]:
%%time
# Second action on df: the result is now served from the cache
df.show()

+-------+-------------+
|movieId|ratings_count|
+-------+-------------+
|    356|        81491|
|    318|        81482|
|    296|        79672|
|    593|        74127|
|   2571|        72674|
|    260|        68717|
|    480|        64144|
|    527|        60411|
|    110|        59184|
|   2959|        58773|
+-------+-------------+

CPU times: user 2.49 ms, sys: 918 µs, total: 3.41 ms
Wall time: 285 ms


In [None]:
# Convert the Spark DataFrame into a pandas DataFrame
df.toPandas()

Unnamed: 0,movieId,ratings_count
0,356,81491
1,318,81482
2,296,79672
3,593,74127
4,2571,72674
5,260,68717
6,480,64144
7,527,60411
8,110,59184
9,2959,58773


In [None]:
# df itself is still a Spark DataFrame
type(df)

pyspark.sql.dataframe.DataFrame

In [None]:
# toPandas() returns a pandas DataFrame
type(df.toPandas())

pandas.core.frame.DataFrame

In [None]:
# Merge done in pandas: every movie title is pulled into local memory first, which
# risks running out of memory when the dataset contains a very large number of movies.
top_rated_pdf = df.toPandas()
titles_pdf = movies_df.select('movieId', 'title').toPandas()

top_rated_pdf.merge(titles_pdf, on='movieId', how='inner')

Unnamed: 0,movieId,ratings_count,title
0,356,81491,Forrest Gump (1994)
1,318,81482,"Shawshank Redemption, The (1994)"
2,296,79672,Pulp Fiction (1994)
3,593,74127,"Silence of the Lambs, The (1991)"
4,2571,72674,"Matrix, The (1999)"
5,260,68717,Star Wars: Episode IV - A New Hope (1977)
6,480,64144,Jurassic Park (1993)
7,527,60411,Schindler's List (1993)
8,110,59184,Braveheart (1995)
9,2959,58773,Fight Club (1999)


In [None]:
# Better: do the join in the cluster and bring only the final rows into pandas.
# NOTE(review): df is rebound to a join that reads df itself, so re-running this cell
# without recomputing df would presumably join the title column in a second time.
movie_titles = movies_df.select('movieId', 'title')
df = (
    movie_titles
    .join(df, on='movieId', how='inner')
    .orderBy(F.desc('ratings_count'))
)
df.toPandas()

Unnamed: 0,movieId,title,ratings_count
0,356,Forrest Gump (1994),81491
1,318,"Shawshank Redemption, The (1994)",81482
2,296,Pulp Fiction (1994),79672
3,593,"Silence of the Lambs, The (1991)",74127
4,2571,"Matrix, The (1999)",72674
5,260,Star Wars: Episode IV - A New Hope (1977),68717
6,480,Jurassic Park (1993),64144
7,527,Schindler's List (1993),60411
8,110,Braveheart (1995),59184
9,2959,Fight Club (1999),58773


## Atividade Turma

### Quais são os top 10 filmes com maior total da soma das avaliações?

In [None]:
%%time

df_task = ratings_df.groupBy('movieId')\
    .sum('rating')\
    .withColumnRenamed('sum(rating)', 'ratings_sum')\
    .orderBy(F.desc('ratings_sum'))\
    .limit(10)\
    .cache()
df_task.toPandas()

CPU times: user 13.6 ms, sys: 0 ns, total: 13.6 ms
Wall time: 127 ms


Unnamed: 0,movieId,ratings_sum
0,318,359627.0
1,296,333739.0
2,356,329876.5
3,593,307726.5
4,2571,301895.0
5,260,283127.0
6,527,256600.5
7,2959,248510.5
8,1196,237711.0
9,50,237207.5


In [None]:
# Add the movie titles to the top-10-by-rating-sum result, keeping the descending order.
movie_titles = movies_df.select('movieId', 'title')
df_task_result = (
    movie_titles
    .join(df_task, on='movieId', how='inner')
    .orderBy(F.desc('ratings_sum'))
)
df_task_result.toPandas()

Unnamed: 0,movieId,title,ratings_sum
0,318,"Shawshank Redemption, The (1994)",359627.0
1,296,Pulp Fiction (1994),333739.0
2,356,Forrest Gump (1994),329876.5
3,593,"Silence of the Lambs, The (1991)",307726.5
4,2571,"Matrix, The (1999)",301895.0
5,260,Star Wars: Episode IV - A New Hope (1977),283127.0
6,527,Schindler's List (1993),256600.5
7,2959,Fight Club (1999),248510.5
8,1196,Star Wars: Episode V - The Empire Strikes Back...,237711.0
9,50,"Usual Suspects, The (1995)",237207.5


## Qual é a quantidade de avaliações de cada valor de avaliação dos top 10 filmes mais avaliados?

In [None]:
# Peek at the first 5 ratings as a pandas DataFrame
ratings_df.limit(5).toPandas()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,296,5.0,2006-05-17 15:34:04
1,1,306,3.5,2006-05-17 12:26:57
2,1,307,5.0,2006-05-17 12:27:08
3,1,665,5.0,2006-05-17 15:13:40
4,1,899,3.5,2006-05-17 12:21:50


In [None]:
# df currently holds the top-10 most-rated movies together with their titles
df.show()

+-------+--------------------+-------------+
|movieId|               title|ratings_count|
+-------+--------------------+-------------+
|    356| Forrest Gump (1994)|        81491|
|    318|Shawshank Redempt...|        81482|
|    296| Pulp Fiction (1994)|        79672|
|    593|Silence of the La...|        74127|
|   2571|  Matrix, The (1999)|        72674|
|    260|Star Wars: Episod...|        68717|
|    480|Jurassic Park (1993)|        64144|
|    527|Schindler's List ...|        60411|
|    110|   Braveheart (1995)|        59184|
|   2959|   Fight Club (1999)|        58773|
+-------+--------------------+-------------+



In [None]:
# Collect the ids row by row so the Python list keeps df's ratings_count ordering;
# collect_list() gives no guarantee about the order of the collected elements.
# In the cells visible here the list is only used inside isin() filters.
top_10_movieIds = [row['movieId'] for row in df.select('movieId').collect()]
top_10_movieIds

[356, 318, 296, 593, 2571, 260, 480, 527, 110, 2959]

In [None]:
# Total number of ratings, as a baseline for the filtered count that follows
ratings_df.count()

25000095

In [None]:
# Number of ratings that belong to the top-10 most-rated movies
in_top_10 = F.col('movieId').isin(top_10_movieIds)
ratings_df.filter(in_top_10).count()

700675

In [None]:
%%time

df = ratings_df.where(F.col('movieId').isin(top_10_movieIds))\
    .groupBy('movieId', 'rating')\
    .count()\
    .withColumnRenamed('count', 'ratings_value_count')\
    .join(df, on='movieId', how='inner')\
    .orderBy(F.desc('ratings_count'), F.desc('ratings_value_count'))\
    .cache()
df.where('movieId = 356').toPandas()

CPU times: user 216 ms, sys: 21.7 ms, total: 238 ms
Wall time: 38.2 s


Unnamed: 0,movieId,rating,ratings_value_count,title,ratings_count
0,356,5.0,25918,Forrest Gump (1994),81491
1,356,4.0,23348,Forrest Gump (1994),81491
2,356,3.0,10380,Forrest Gump (1994),81491
3,356,4.5,9609,Forrest Gump (1994),81491
4,356,3.5,6185,Forrest Gump (1994),81491
5,356,2.0,2449,Forrest Gump (1994),81491
6,356,2.5,1569,Forrest Gump (1994),81491
7,356,1.0,1159,Forrest Gump (1994),81491
8,356,1.5,450,Forrest Gump (1994),81491
9,356,0.5,424,Forrest Gump (1994),81491


In [None]:
# Number of (movie, rating value) rows in the breakdown
df.count()

100

## Atividade Turma

### Quais são os top 10 filmes com maior quantidade de avaliações com valor 5?

In [None]:
%%time

df_task = ratings_df.where(F.col('rating') == 5)\
    .groupBy('movieId')\
    .count()\
    .withColumnRenamed('count', 'count_rating_5')\
    .orderBy(F.desc('count_rating_5'))\
    .limit(10)\
    .cache()
df_task.toPandas()

CPU times: user 218 ms, sys: 22 ms, total: 240 ms
Wall time: 40.3 s


Unnamed: 0,movieId,count_rating_5
0,318,39553
1,296,32169
2,356,25918
3,260,25804
4,2571,25482
5,527,24853
6,593,24801
7,858,24418
8,50,21585
9,2959,21486


In [None]:
%%time

df_task = movies_df.select('movieId', 'title')\
    .join(df_task, on='movieId', how='inner')\
    .orderBy(F.desc('count_rating_5'))
df_task.toPandas()

CPU times: user 16.6 ms, sys: 2.31 ms, total: 18.9 ms
Wall time: 869 ms


Unnamed: 0,movieId,title,count_rating_5
0,318,"Shawshank Redemption, The (1994)",39553
1,296,Pulp Fiction (1994),32169
2,356,Forrest Gump (1994),25918
3,260,Star Wars: Episode IV - A New Hope (1977),25804
4,2571,"Matrix, The (1999)",25482
5,527,Schindler's List (1993),24853
6,593,"Silence of the Lambs, The (1991)",24801
7,858,"Godfather, The (1972)",24418
8,50,"Usual Suspects, The (1995)",21585
9,2959,Fight Club (1999),21486


## Quais são os top 10 filmes mais avaliados que são do gênero "Children"?

In [None]:
# Peek at the movies table; 'genres' is a single pipe-delimited string per movie
movies_df.limit(5).toPandas()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357
4,5,Father of the Bride Part II (1995),Comedy,113041,11862


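No método `.like()`, o caractere `%` é um curinga que corresponde a qualquer sequência de caracteres (inclusive vazia). Colocá-lo no começo e no final do padrão, como em `'%Children%'`, significa que o texto procurado pode aparecer no começo, no meio ou no final da string.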

In [None]:
# Keep only the movies whose genres string contains "Children"
is_children = F.col('genres').like('%Children%')
movies_df.filter(is_children).limit(5).toPandas()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844
2,8,Tom and Huck (1995),Adventure|Children,112302,45325
3,13,Balto (1995),Adventure|Animation|Children,112453,21032
4,27,Now and Then (1995),Children|Drama,114011,9263


In [None]:
%%time

movies_df.where(F.col('genres').like('%Children%'))\
    .join(ratings_df, on='movieId', how='inner')\
    .groupBy('title')\
    .count()\
    .orderBy(F.desc('count'))\
    .limit(10)\
    .toPandas()

CPU times: user 172 ms, sys: 26.9 ms, total: 199 ms
Wall time: 32.2 s


Unnamed: 0,title,count
0,Toy Story (1995),57309
1,Aladdin (1992),43387
2,"Lion King, The (1994)",42745
3,Shrek (2001),42303
4,Beauty and the Beast (1991),35723
5,Finding Nemo (2003),34712
6,E.T. the Extra-Terrestrial (1982),34602
7,"Monsters, Inc. (2001)",34572
8,Babe (1995),31456
9,"Incredibles, The (2004)",30562


## Quantos filmes cada gênero possui?

In [None]:
# Peek at the movies table again before splitting the genres column
movies_df.limit(5).toPandas()

Unnamed: 0,movieId,title,genres,imdbId,tmdbId
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357
4,5,Father of the Bride Part II (1995),Comedy,113041,11862


In [None]:
# Split the genres string into an array; '[|]' is a regex character class matching a literal '|'
movies_df.select(F.split('genres', '[|]')).limit(5).toPandas()

Unnamed: 0,"split(genres, [|], -1)"
0,"[Adventure, Animation, Children, Comedy, Fantasy]"
1,"[Adventure, Children, Fantasy]"
2,"[Comedy, Romance]"
3,"[Comedy, Drama, Romance]"
4,[Comedy]


In [None]:
# explode() turns each element of the genres array into its own row
genres_array = F.split('genres', '[|]')
movies_df.select(F.explode(genres_array)).limit(8).toPandas()

Unnamed: 0,col
0,Adventure
1,Animation
2,Children
3,Comedy
4,Fantasy
5,Adventure
6,Children
7,Fantasy


In [None]:
%%time

movies_df.select(F.explode(F.split('genres', '[|]')))\
    .groupBy('col')\
    .count()\
    .orderBy(F.desc('count'))\
    .withColumnRenamed('col', 'genre')\
    .toPandas()\
    .set_index('genre')\
    .plot(kind='bar')

CPU times: user 894 ms, sys: 151 ms, total: 1.05 s
Wall time: 2.39 s


## Atividade Turma

### Quais são as médias de avaliações dos 10 filmes mais avaliados do gênero Drama? Ordene crescente pelas médias.

In [None]:
%%time
# Filtragem dos filmes do gênero Drama
drama_movies_df = movies_df.where(F.col('genres').like('%Drama%'))
drama_movies_df.count()

CPU times: user 3 ms, sys: 137 µs, total: 3.14 ms
Wall time: 324 ms


25606

In [None]:
%%time

drama_movies_ratings_df = ratings_df.join(drama_movies_df, on='movieId', how='inner').cache()
drama_movies_ratings_df.count()

CPU times: user 732 ms, sys: 89.6 ms, total: 821 ms
Wall time: 2min 26s


10962833

In [None]:
%%time
# Top 10 mais avaliados
df_task = drama_movies_ratings_df.groupBy('movieId')\
    .count()\
    .withColumnRenamed('count', 'ratings_count')\
    .orderBy(F.desc('ratings_count'))\
    .limit(10)\
    .cache()
df_task.toPandas()

CPU times: user 13.2 ms, sys: 273 µs, total: 13.4 ms
Wall time: 157 ms


Unnamed: 0,movieId,ratings_count
0,356,81491
1,318,81482
2,296,79672
3,527,60411
4,110,59184
5,2959,58773
6,2858,53689
7,858,52498
8,7153,50797
9,150,48377


In [None]:
# Pull the ten movie ids into a plain Python list for use in an isin() filter
top_10_drama_pdf = df_task.toPandas()
top_10_drama = top_10_drama_pdf['movieId'].tolist()
top_10_drama

[356, 318, 296, 527, 110, 2959, 2858, 858, 7153, 150]

In [None]:
%%time

df_task = drama_movies_ratings_df.where(F.col('movieId').isin(top_10_drama))\
    .groupBy('title')\
    .mean('rating')\
    .withColumnRenamed('avg(rating)', 'ratings_mean')\
    .orderBy(F.asc('ratings_mean'))\
    .cache()
df_task.toPandas()

CPU times: user 15.8 ms, sys: 0 ns, total: 15.8 ms
Wall time: 236 ms


Unnamed: 0,title,ratings_mean
0,Apollo 13 (1995),3.873556
1,Braveheart (1995),4.002273
2,Forrest Gump (1994),4.048011
3,"Lord of the Rings: The Return of the King, The...",4.09034
4,American Beauty (1999),4.10734
5,Pulp Fiction (1994),4.188912
6,Fight Club (1999),4.228311
7,Schindler's List (1993),4.247579
8,"Godfather, The (1972)",4.324336
9,"Shawshank Redemption, The (1994)",4.413576
