In [4]:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark.sql.window import Window

In [5]:
# Start (or reuse) a Spark session running on a local master for this notebook.
# The parentheses already allow the chain to span lines, so no backslashes are needed.
spark = (
    SparkSession.builder
    .master("local")
    .appName("Spark")
    .getOrCreate()
)

In [6]:
# Read both CSV files with the same options: first row is the header and
# Spark infers each column's type from the data.
csv_options = dict(header=True, inferSchema=True)
df = spark.read.csv('movies.csv', **csv_options)
rating = spark.read.csv('ratings.csv', **csv_options)

In [63]:
# Count missing values per column of the ratings table.
# toPandas() brings the whole Spark DataFrame into a pandas DataFrame first.
rating_pdf = rating.toPandas()
rating_pdf.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [80]:
# List the column names currently on df.
# NOTE(review): the saved output includes userId/rating/timestamp, which only
# exist on df once the join with `rating` has run - this cell was executed
# out of file order.
df.columns

['movieId', 'title', 'genres', 'userId', 'rating', 'timestamp']

In [7]:
# Join the two DataFrames on movieId with a left join from the movies side.
# NOTE(review): df is overwritten with the joined frame, so re-running this
# cell without reloading df joins the ratings a second time.
df = df.join(rating, on='movieId', how='Left')

In [83]:
# Convert the rating column to a float column (replaces the column in place).
rating_as_float = col('rating').cast(FloatType())
df = df.withColumn('rating', rating_as_float)

In [None]:
# Print the column names and types of df to confirm the schema.
df.printSchema()

In [85]:
# Remove the columns that are not needed for the genre analysis.
# Column names are passed as strings: DataFrame.drop only accepts several
# Column objects on PySpark 3.4+, and raises TypeError on older versions,
# whereas string names work on every version.
# NOTE(review): cells further down still filter on userId/timestamp; they only
# work if they run before this cell.
df = df.drop('userId', 'timestamp')

In [87]:
# Remove duplicate rows; with no subset given, all columns are compared.
# NOTE(review): once userId/timestamp are dropped, ratings of the same value
# given to the same movie by different users become identical rows and are
# collapsed here - confirm this is intended for the statistics below.
df = df.dropDuplicates(subset=None)

In [88]:
# Create the "ano" (year) column from the movie title, where the year is
# written as four digits inside parentheses, e.g. "Toy Story (1995)".
year_pattern = r"\((\d{4})\)"
release_year = regexp_extract(col("title"), year_pattern, 1).cast("int")
df = df.withColumn("ano", release_year)

In [None]:
# Count the number of movies per genre combination, most frequent first.
# df holds one row per (movie, rating value) after the join and de-duplication,
# so counting rows would count a movie several times; counting distinct
# movieId values gives the actual number of movies.
genres = (
    df.groupBy('genres')
    .agg(countDistinct('movieId').cast(IntegerType()).alias('total_movies'))
    .orderBy('total_movies', ascending=False)
)
genres.show()

In [None]:
# Keep only the genre combinations whose total_movies exceeds 900.
has_many_movies = col('total_movies') > 900
top_genres = genres.where(has_many_movies)
top_genres.show()

In [132]:
# Per genre combination, compute the percentage of rows whose rating is 5.
# Each row is flagged 1 (rating == 5) or 0 in a helper column named '%';
# the percentage is 100 * (sum of flags / number of flags).
is_top_rating = when(col('rating') == 5, 1).otherwise(0)
flagged = df.withColumn('%', is_top_rating)
share_of_top = (100 * (sum('%') / count('%'))).alias('best_gen')
percent = flagged.groupBy('genres').agg(share_of_top)

In [133]:
# Round best_gen to two decimals and render it with exactly two decimal places.
# NOTE(review): format_number produces a formatted string, so best_gen is no
# longer numeric after this cell.
rounded_share = round(col('best_gen'), 2)
percent = percent.withColumn('best_gen', format_number(rounded_share, 2))

In [None]:
# Display the first rows of the per-genre percentage table.
percent.show()

In [144]:
# Attach the 5-star percentage to each of the top genre combinations.
# The left join keeps every row of top_genres.
top_genres_percent = top_genres.join(percent, on='genres', how='left')

In [None]:
# Show the top genre combinations ordered by total_movies, smallest first.
ranked_top_genres = top_genres_percent.orderBy('total_movies')
ranked_top_genres.show()

In [None]:
# Show the rows of the "Comedy|Drama|Romance" genre combination with rating 5,
# newest year ("ano") first, without truncating long cell values.
top_rated_cdr = (
    df.filter('genres == "Comedy|Drama|Romance"')
    .where('rating == 5')
)
top_rated_cdr.orderBy('ano', ascending=False).show(truncate=False)

In [None]:
# Show up to 900 genre combinations ranked by their 5-star percentage, highest first.
# best_gen holds text produced by format_number, and sorting text would rank
# "9.09" above "50.00"; casting to double makes the ordering numeric.
percent.orderBy(col('best_gen').cast('double'), ascending=False).show(900, truncate=False)

In [18]:
# Look up one specific rating: genre "Comedy", user 608, at one exact timestamp.
# All three conditions must hold, since each where() narrows the previous result.
# NOTE(review): this relies on the userId and timestamp columns, which another
# cell in this notebook drops from df; run top to bottom, this cell comes after
# that drop.
single_rating = (
    df.filter('genres = "Comedy"')
    .where('userID = 608')
    .where('timestamp = 1117408069')
)
single_rating.orderBy('timestamp').show()

+-------+--------------------+------+------+------+----------+
|movieId|               title|genres|userId|rating| timestamp|
+-------+--------------------+------+------+------+----------+
|    441|Dazed and Confuse...|Comedy|   608|   4.5|1117408069|
+-------+--------------------+------+------+------+----------+



In [19]:
# Same lookup written with Column expressions. `&` binds tighter than `|`, so
# the condition is (genre is Comedy AND user is 608) OR (timestamp matches);
# the grouping is spelled out with named sub-conditions.
# NOTE(review): because of the OR, this returns every Comedy rating by user 608,
# not just the single row of the string-based filter - confirm `|` is intended.
comedy_by_user_608 = (col('genres') == 'Comedy') & (col('userId') == 608)
at_timestamp = col('timestamp') == 1117408069
df.filter(comedy_by_user_608 | at_timestamp).orderBy('timestamp').show()

+-------+--------------------+------+------+------+----------+
|movieId|               title|genres|userId|rating| timestamp|
+-------+--------------------+------+------+------+----------+
|    471|Hudsucker Proxy, ...|Comedy|   608|   1.5|1117161794|
|   3948|Meet the Parents ...|Comedy|   608|   4.0|1117162040|
|   2423|Christmas Vacatio...|Comedy|   608|   2.0|1117162130|
|   6807|Monty Python's Th...|Comedy|   608|   2.0|1117336709|
|   6188|   Old School (2003)|Comedy|   608|   3.5|1117336762|
|   3253|Wayne's World (1992)|Comedy|   608|   3.0|1117337026|
|    441|Dazed and Confuse...|Comedy|   608|   4.5|1117408069|
|    514|     Ref, The (1994)|Comedy|   608|   2.5|1117408127|
|   1485|    Liar Liar (1997)|Comedy|   608|   3.0|1117408328|
|   1080|Monty Python's Li...|Comedy|   608|   2.0|1117408365|
|    785|      Kingpin (1996)|Comedy|   608|   2.5|1117408389|
|    333|    Tommy Boy (1995)|Comedy|   608|   4.0|1117408457|
|   4816|    Zoolander (2001)|Comedy|   608|   3.0|1117

In [8]:
# Display the first rows of df without truncating long cell values.
# NOTE(review): the saved output still contains userId and timestamp, so it was
# produced before those columns were dropped.
df.show(truncate=False)

+-------+----------------+-------------------------------------------+------+------+----------+
|movieId|title           |genres                                     |userId|rating|timestamp |
+-------+----------------+-------------------------------------------+------+------+----------+
|1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|610   |5.0   |1479542900|
|1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|609   |3.0   |847221025 |
|1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|608   |2.5   |1117408267|
|1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|607   |4.0   |964744033 |
|1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|606   |2.5   |1349082950|
|1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|605   |4.0   |1277097561|
|1      |Toy Story (1995)|Adventure|Animation|Children|Comedy|Fantasy|604   |3.0   |832079851 |
|1      |Toy Story (1995)|Adventure|Anim