# 1. Import necessary libraries

In [11]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_extract, regexp_replace, split, from_unixtime, collect_list
import pyspark.sql.functions as F

# 2. Creating a Spark Session

In [2]:
spark = SparkSession.builder.appName('data-exploration').getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/28 12:14:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# 3. Define CSV file paths

In [3]:
import os

# Get the current directory
current_dir = os.getcwd()

# Calculate the base directory (one level up)
base_dir = os.path.abspath(os.path.join(current_dir, ".."))

# Construct the file path in a parameterized way:
links_csv_path = os.path.join(base_dir, "data", "raw", "ml-latest-small", "links.csv")
movies_csv_path = os.path.join(base_dir, "data", "raw", "ml-latest-small", "movies.csv")
ratings_csv_path = os.path.join(base_dir, "data", "raw", "ml-latest-small", "ratings.csv")
tags_csv_path = os.path.join(base_dir, "data", "raw", "ml-latest-small", "tags.csv")

print("File paths are :")
print(f" {links_csv_path} \n {movies_csv_path} \n {ratings_csv_path} \n {tags_csv_path}")

File paths are :
 /Users/test/PycharmProjects/spark-data-ingestion-cleaning/data/raw/ml-latest-small/links.csv 
 /Users/test/PycharmProjects/spark-data-ingestion-cleaning/data/raw/ml-latest-small/movies.csv 
 /Users/test/PycharmProjects/spark-data-ingestion-cleaning/data/raw/ml-latest-small/ratings.csv 
 /Users/test/PycharmProjects/spark-data-ingestion-cleaning/data/raw/ml-latest-small/tags.csv


# 4. Analysing Movies CSV file

## 4.1 Read and display sample & count

In [4]:
movies_df = spark.read.csv(movies_csv_path, header="true")

movies_df.show(10, False)

print(f"Movies count : {movies_df.count()}")

+-------+----------------------------------+-------------------------------------------+
|movieId|title                             |genres                                     |
+-------+----------------------------------+-------------------------------------------+
|1      |Toy Story (1995)                  |Adventure|Animation|Children|Comedy|Fantasy|
|2      |Jumanji (1995)                    |Adventure|Children|Fantasy                 |
|3      |Grumpier Old Men (1995)           |Comedy|Romance                             |
|4      |Waiting to Exhale (1995)          |Comedy|Drama|Romance                       |
|5      |Father of the Bride Part II (1995)|Comedy                                     |
|6      |Heat (1995)                       |Action|Crime|Thriller                      |
|7      |Sabrina (1995)                    |Comedy|Romance                             |
|8      |Tom and Huck (1995)               |Adventure|Children                         |
|9      |Sudden Death

## 4.2 Verify distinct matches count

In [15]:
print(f"Movies distinct count : {movies_df.distinct().count()}")

print(f"Movies 'movieId' distinct count : {movies_df.select('movieId').distinct().count()}")

print(f"Movies 'title' distinct count : {movies_df.select('title').distinct().count()} (---different)")

print(f"Movies 'genres' distinct count : {movies_df.select('genres').distinct().count()} (--can have duplicates)")

Movies distinct count : 9742
Movies 'movieId' distinct count : 9742
Movies 'title' distinct count : 9737 (---different)
Movies 'genres' distinct count : 951 (--can have duplicates)


## 4.3 Lets analyse duplicate titles

In [16]:
print("Titles that are duplicates are ")

duplicate_titles_df = movies_df.groupBy('title').count().orderBy('count', ascending=False)

duplicate_titles_df.show(10, False)

Titles that are duplicates are 
+--------------------------------------+-----+
|title                                 |count|
+--------------------------------------+-----+
|Eros (2004)                           |2    |
|Confessions of a Dangerous Mind (2002)|2    |
|Emma (1996)                           |2    |
|Saturn 3 (1980)                       |2    |
|War of the Worlds (2005)              |2    |
|Fair Game (1995)                      |1    |
|If Lucy Fell (1996)                   |1    |
|Three Wishes (1995)                   |1    |
|Heavenly Creatures (1994)             |1    |
|Snow White and the Seven Dwarfs (1937)|1    |
+--------------------------------------+-----+
only showing top 10 rows



In [17]:
# Lets see the duplicate title rows
duplicate_titles = duplicate_titles_df.filter(col('count') > 1).select('title').collect()

duplicate_titles = [row['title'] for row in duplicate_titles]

movies_df.filter(col('title').isin(duplicate_titles)).orderBy('title').show(20, False)

print('Movies have same titles but different slight different genres')

+-------+--------------------------------------+-----------------------------------+
|movieId|title                                 |genres                             |
+-------+--------------------------------------+-----------------------------------+
|6003   |Confessions of a Dangerous Mind (2002)|Comedy|Crime|Drama|Thriller        |
|144606 |Confessions of a Dangerous Mind (2002)|Comedy|Crime|Drama|Romance|Thriller|
|838    |Emma (1996)                           |Comedy|Drama|Romance               |
|26958  |Emma (1996)                           |Romance                            |
|32600  |Eros (2004)                           |Drama                              |
|147002 |Eros (2004)                           |Drama|Romance                      |
|2851   |Saturn 3 (1980)                       |Adventure|Sci-Fi|Thriller          |
|168358 |Saturn 3 (1980)                       |Sci-Fi|Thriller                    |
|34048  |War of the Worlds (2005)              |Action|Adventure|

## 4.4 Top 20 Genres

In [18]:
print("Top 20 genres are ")

movies_df.groupBy('genres').count().orderBy('count', ascending=False).show(20, False)

Top 20 genres are 
+-----------------------+-----+
|genres                 |count|
+-----------------------+-----+
|Drama                  |1053 |
|Comedy                 |946  |
|Comedy|Drama           |435  |
|Comedy|Romance         |363  |
|Drama|Romance          |349  |
|Documentary            |339  |
|Comedy|Drama|Romance   |276  |
|Drama|Thriller         |168  |
|Horror                 |167  |
|Horror|Thriller        |135  |
|Crime|Drama            |134  |
|Crime|Drama|Thriller   |125  |
|Drama|War              |114  |
|Comedy|Crime           |101  |
|Action|Comedy          |92   |
|Thriller               |84   |
|Children|Comedy        |74   |
|Comedy|Horror          |69   |
|Action|Crime|Thriller  |66   |
|Action|Adventure|Sci-Fi|66   |
+-----------------------+-----+
only showing top 20 rows



## 4.5 Extract year from title

In [19]:
movies_df = movies_df.withColumn("year", regexp_extract("title", r"\((\d{4})\)", 1).cast('int'))
movies_df.show()
movies_df.printSchema()

+-------+--------------------+--------------------+----+
|movieId|               title|              genres|year|
+-------+--------------------+--------------------+----+
|      1|    Toy Story (1995)|Adventure|Animati...|1995|
|      2|      Jumanji (1995)|Adventure|Childre...|1995|
|      3|Grumpier Old Men ...|      Comedy|Romance|1995|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|1995|
|      5|Father of the Bri...|              Comedy|1995|
|      6|         Heat (1995)|Action|Crime|Thri...|1995|
|      7|      Sabrina (1995)|      Comedy|Romance|1995|
|      8| Tom and Huck (1995)|  Adventure|Children|1995|
|      9| Sudden Death (1995)|              Action|1995|
|     10|    GoldenEye (1995)|Action|Adventure|...|1995|
|     11|American Presiden...|Comedy|Drama|Romance|1995|
|     12|Dracula: Dead and...|       Comedy|Horror|1995|
|     13|        Balto (1995)|Adventure|Animati...|1995|
|     14|        Nixon (1995)|               Drama|1995|
|     15|Cutthroat Island ...|A

## 4.6 Update title

In [20]:
movies_df = movies_df.withColumn("title", regexp_replace("title", r"\s*\(\d{4}\)", ""))
movies_df.show(truncate=False)
movies_df.printSchema()

+-------+------------------------------+-------------------------------------------+----+
|movieId|title                         |genres                                     |year|
+-------+------------------------------+-------------------------------------------+----+
|1      |Toy Story                     |Adventure|Animation|Children|Comedy|Fantasy|1995|
|2      |Jumanji                       |Adventure|Children|Fantasy                 |1995|
|3      |Grumpier Old Men              |Comedy|Romance                             |1995|
|4      |Waiting to Exhale             |Comedy|Drama|Romance                       |1995|
|5      |Father of the Bride Part II   |Comedy                                     |1995|
|6      |Heat                          |Action|Crime|Thriller                      |1995|
|7      |Sabrina                       |Comedy|Romance                             |1995|
|8      |Tom and Huck                  |Adventure|Children                         |1995|
|9      |S

## 4.7 Update genres to array of string field

In [21]:
movies_df = movies_df.withColumn("genres", split("genres", "\|"))

movies_df.show(truncate=False)
movies_df.printSchema()

+-------+------------------------------+-------------------------------------------------+----+
|movieId|title                         |genres                                           |year|
+-------+------------------------------+-------------------------------------------------+----+
|1      |Toy Story                     |[Adventure, Animation, Children, Comedy, Fantasy]|1995|
|2      |Jumanji                       |[Adventure, Children, Fantasy]                   |1995|
|3      |Grumpier Old Men              |[Comedy, Romance]                                |1995|
|4      |Waiting to Exhale             |[Comedy, Drama, Romance]                         |1995|
|5      |Father of the Bride Part II   |[Comedy]                                         |1995|
|6      |Heat                          |[Action, Crime, Thriller]                        |1995|
|7      |Sabrina                       |[Comedy, Romance]                                |1995|
|8      |Tom and Huck                  |

  movies_df = movies_df.withColumn("genres", split("genres", "\|"))


# 5. Analysing Ratings CSV file

## 5.1 Read and Display rows and count

In [26]:
ratings_df = spark.read.csv(ratings_csv_path, header="true")

ratings_df.show(10, False)

print(f"Ratings count : {ratings_df.count()}")

ratings_df.printSchema()

+------+-------+------+---------+
|userId|movieId|rating|timestamp|
+------+-------+------+---------+
|1     |1      |4.0   |964982703|
|1     |3      |4.0   |964981247|
|1     |6      |4.0   |964982224|
|1     |47     |5.0   |964983815|
|1     |50     |5.0   |964982931|
|1     |70     |3.0   |964982400|
|1     |101    |5.0   |964980868|
|1     |110    |4.0   |964982176|
|1     |151    |5.0   |964984041|
|1     |157    |5.0   |964984100|
+------+-------+------+---------+
only showing top 10 rows

Ratings count : 100836
root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- rating: string (nullable = true)
 |-- timestamp: string (nullable = true)



## 5.2 Unique users and movies

In [27]:
print(f"Unique User ids: {ratings_df.select('userId').distinct().count()}")

Unique User ids: 610


In [28]:
print(f"Unique Movie ids (=equals movies count from above): {ratings_df.select('movieId').distinct().count()}")

Unique Movie ids (=equals movies count from above): 9724


In [29]:
print(f"Unique ratings: {ratings_df.select('rating').distinct().count()}")

Unique ratings: 10


## 5.3 Process timestamp to convert it

In [30]:
ratings_df = ratings_df.withColumn("timestamp", from_unixtime(col("timestamp").cast("long")))

ratings_df.show(20, False)

+------+-------+------+-------------------+
|userId|movieId|rating|timestamp          |
+------+-------+------+-------------------+
|1     |1      |4.0   |2000-07-31 00:15:03|
|1     |3      |4.0   |2000-07-30 23:50:47|
|1     |6      |4.0   |2000-07-31 00:07:04|
|1     |47     |5.0   |2000-07-31 00:33:35|
|1     |50     |5.0   |2000-07-31 00:18:51|
|1     |70     |3.0   |2000-07-31 00:10:00|
|1     |101    |5.0   |2000-07-30 23:44:28|
|1     |110    |4.0   |2000-07-31 00:06:16|
|1     |151    |5.0   |2000-07-31 00:37:21|
|1     |157    |5.0   |2000-07-31 00:38:20|
|1     |163    |5.0   |2000-07-31 00:30:50|
|1     |216    |5.0   |2000-07-30 23:50:08|
|1     |223    |3.0   |2000-07-30 23:46:25|
|1     |231    |5.0   |2000-07-30 23:49:39|
|1     |235    |4.0   |2000-07-30 23:45:08|
|1     |260    |5.0   |2000-07-30 23:58:00|
|1     |296    |3.0   |2000-07-31 00:19:27|
|1     |316    |3.0   |2000-07-31 00:08:30|
|1     |333    |5.0   |2000-07-30 23:49:39|
|1     |349    |4.0   |2000-07-3

## 5.4 Ratings per movie (count)

In [31]:
ratings_per_movie_df = ratings_df.groupBy('movieId').count().orderBy('count', ascending=False)

ratings_per_movie_df.show(20, False)

+-------+-----+
|movieId|count|
+-------+-----+
|356    |329  |
|318    |317  |
|296    |307  |
|593    |279  |
|2571   |278  |
|260    |251  |
|480    |238  |
|110    |237  |
|589    |224  |
|527    |220  |
|2959   |218  |
|1      |215  |
|1196   |211  |
|2858   |204  |
|50     |204  |
|47     |203  |
|780    |202  |
|150    |201  |
|1198   |200  |
|4993   |198  |
+-------+-----+
only showing top 20 rows



## 5.5 Map ratings to movies

In [32]:
ratings_movie_df = ratings_per_movie_df.join(movies_df, on='movieId', how='inner')

# ratings_movie_df.orderBy('count', ascending=False).show(20, False)

ratings_movie_df.select('movieId', 'count', 'title', 'year').orderBy('count', ascending=False).show(20, False)

+-------+-----+-----------------------------------------------------------------------+----+
|movieId|count|title                                                                  |year|
+-------+-----+-----------------------------------------------------------------------+----+
|356    |329  |Forrest Gump                                                           |1994|
|318    |317  |Shawshank Redemption, The                                              |1994|
|296    |307  |Pulp Fiction                                                           |1994|
|593    |279  |Silence of the Lambs, The                                              |1991|
|2571   |278  |Matrix, The                                                            |1999|
|260    |251  |Star Wars: Episode IV - A New Hope                                     |1977|
|480    |238  |Jurassic Park                                                          |1993|
|110    |237  |Braveheart                                             

## 5.6 Sum of all ratings per movie

In [33]:
ratings_sum_per_movie_df = ratings_df.groupBy('movieId').agg(F.sum(col("rating")).alias('sum_of_all_ratings'))

ratings_sum_per_movie_df.orderBy('sum_of_all_ratings', ascending=False).show(20, False)

+-------+------------------+
|movieId|sum_of_all_ratings|
+-------+------------------+
|318    |1404.0            |
|356    |1370.0            |
|296    |1288.5            |
|2571   |1165.5            |
|593    |1161.0            |
|260    |1062.0            |
|110    |955.5             |
|2959   |931.5             |
|527    |929.5             |
|480    |892.5             |
|1196   |889.5             |
|589    |889.5             |
|50     |864.5             |
|1      |843.0             |
|1198   |841.5             |
|2858   |827.5             |
|858    |823.5             |
|4993   |813.0             |
|1210   |811.0             |
|47     |807.0             |
+-------+------------------+
only showing top 20 rows



## 5.7 Top 30 movies by ratings sum

In [34]:
ratings_sum_movie_df = ratings_sum_per_movie_df.join(movies_df, on='movieId', how='inner')

# ratings_sum_movie_df.orderBy('sum_of_all_ratings', ascending=False).show(20, False)

ratings_sum_movie_df.select('movieId', 'sum_of_all_ratings', 'title', 'year').orderBy('sum_of_all_ratings', ascending=False).show(30, False)

+-------+------------------+-----------------------------------------------------------------------+----+
|movieId|sum_of_all_ratings|title                                                                  |year|
+-------+------------------+-----------------------------------------------------------------------+----+
|318    |1404.0            |Shawshank Redemption, The                                              |1994|
|356    |1370.0            |Forrest Gump                                                           |1994|
|296    |1288.5            |Pulp Fiction                                                           |1994|
|2571   |1165.5            |Matrix, The                                                            |1999|
|593    |1161.0            |Silence of the Lambs, The                                              |1991|
|260    |1062.0            |Star Wars: Episode IV - A New Hope                                     |1977|
|110    |955.5             |Braveheart        

## 5.8 Worst 30 movies by ratings sum

In [35]:
ratings_sum_movie_df.select('movieId', 'sum_of_all_ratings', 'title', 'year').orderBy('sum_of_all_ratings', ascending=True).show(30, False)

+-------+------------------+-----------------------------------------------------------------------------------------+----+
|movieId|sum_of_all_ratings|title                                                                                    |year|
+-------+------------------+-----------------------------------------------------------------------------------------+----+
|54274  |0.5               |I Know Who Killed Me                                                                     |2007|
|4051   |0.5               |Horrors of Spider Island (Ein Toter Hing im Netz)                                        |1960|
|86068  |0.5               |Films to Keep You Awake: The Christmas Tale (Películas para no dormir: Cuento de navidad)|2005|
|95796  |0.5               |Anaconda: The Offspring                                                                  |2008|
|7312   |0.5               |Follow Me, Boys!                                                                         |1966|
|110773 

## 5.9 Top 5 star rating movies

In [75]:
ten_star_movie_df = ratings_df.withColumn('rating', col('rating').cast('float'))

ten_star_movie_df = ten_star_movie_df.filter(col('rating') == 5.0) \
                                     .groupBy('movieId') \
                                     .count()

ten_star_movie_df.printSchema()
ten_star_movie_df.orderBy('count', ascending=False).show(20, False)

root
 |-- movieId: string (nullable = true)
 |-- count: long (nullable = false)

+-------+-----+
|movieId|count|
+-------+-----+
|318    |153  |
|296    |123  |
|356    |116  |
|2571   |109  |
|260    |104  |
|527    |92   |
|593    |92   |
|858    |88   |
|2959   |81   |
|1196   |80   |
|110    |80   |
|50     |71   |
|1198   |69   |
|2858   |65   |
|1210   |65   |
|589    |64   |
|4993   |62   |
|2028   |61   |
|608    |57   |
|1197   |55   |
+-------+-----+
only showing top 20 rows



In [76]:
ten_star_movie_join_df = ten_star_movie_df.join(movies_df, on='movieId', how='inner')

ten_star_movie_join_df.select('movieId', 'count', 'title', 'year').orderBy('count', ascending=False).show(20, False)

+-------+-----+-----------------------------------------------------------------------+----+
|movieId|count|title                                                                  |year|
+-------+-----+-----------------------------------------------------------------------+----+
|318    |153  |Shawshank Redemption, The                                              |1994|
|296    |123  |Pulp Fiction                                                           |1994|
|356    |116  |Forrest Gump                                                           |1994|
|2571   |109  |Matrix, The                                                            |1999|
|260    |104  |Star Wars: Episode IV - A New Hope                                     |1977|
|527    |92   |Schindler's List                                                       |1993|
|593    |92   |Silence of the Lambs, The                                              |1991|
|858    |88   |Godfather, The                                         

# 6. Analyse Links CSV File

## 6.1 Read, Display, Count, PrintSchema()

In [6]:
links_df = spark.read.csv(links_csv_path, header="true")

links_df.show(10, False)

print(f"Links count : {links_df.count()}")

links_df.printSchema()

+-------+-------+------+
|movieId|imdbId |tmdbId|
+-------+-------+------+
|1      |0114709|862   |
|2      |0113497|8844  |
|3      |0113228|15602 |
|4      |0114885|31357 |
|5      |0113041|11862 |
|6      |0113277|949   |
|7      |0114319|11860 |
|8      |0112302|45325 |
|9      |0114576|9091  |
|10     |0113189|710   |
+-------+-------+------+
only showing top 10 rows

Links count : 9742
root
 |-- movieId: string (nullable = true)
 |-- imdbId: string (nullable = true)
 |-- tmdbId: string (nullable = true)



## 6.2 Distinct count 

In [7]:
print(f"Movies ID distinct : {links_df.select('movieId').distinct().count()}")

print(f"IMDB ID distinct : {links_df.select('imdbId').distinct().count()}")

print(f"TMDBID ID distinct : {links_df.select('tmdbId').distinct().count()}")

Movies ID distinct : 9742
IMDB ID distinct : 9742
TMDBID ID distinct : 9734


# 7 Analysing Tags CSV File

## 7.1 Read, Display, Count, PrintSchema()

In [8]:
tags_df = spark.read.csv(tags_csv_path, header="true")

tags_df.show(10, False)

print(f"Tags count : {tags_df.count()}")

tags_df.printSchema()

+------+-------+-----------------+----------+
|userId|movieId|tag              |timestamp |
+------+-------+-----------------+----------+
|2     |60756  |funny            |1445714994|
|2     |60756  |Highly quotable  |1445714996|
|2     |60756  |will ferrell     |1445714992|
|2     |89774  |Boxing story     |1445715207|
|2     |89774  |MMA              |1445715200|
|2     |89774  |Tom Hardy        |1445715205|
|2     |106782 |drugs            |1445715054|
|2     |106782 |Leonardo DiCaprio|1445715051|
|2     |106782 |Martin Scorsese  |1445715056|
|7     |48516  |way too long     |1169687325|
+------+-------+-----------------+----------+
only showing top 10 rows

Tags count : 3683
root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- tag: string (nullable = true)
 |-- timestamp: string (nullable = true)



## 7.2 Distinct movie count

In [9]:
tags_df.select("movieId").distinct().count()

1572

## 7.3 Change datatype of timestamp

In [10]:
tags_df = tags_df.withColumn("timestamp", from_unixtime(col("timestamp").cast("long")))

tags_df.show(10, False)
tags_df.printSchema()

+------+-------+-----------------+-------------------+
|userId|movieId|tag              |timestamp          |
+------+-------+-----------------+-------------------+
|2     |60756  |funny            |2015-10-25 00:59:54|
|2     |60756  |Highly quotable  |2015-10-25 00:59:56|
|2     |60756  |will ferrell     |2015-10-25 00:59:52|
|2     |89774  |Boxing story     |2015-10-25 01:03:27|
|2     |89774  |MMA              |2015-10-25 01:03:20|
|2     |89774  |Tom Hardy        |2015-10-25 01:03:25|
|2     |106782 |drugs            |2015-10-25 01:00:54|
|2     |106782 |Leonardo DiCaprio|2015-10-25 01:00:51|
|2     |106782 |Martin Scorsese  |2015-10-25 01:00:56|
|7     |48516  |way too long     |2007-01-25 06:38:45|
+------+-------+-----------------+-------------------+
only showing top 10 rows

root
 |-- userId: string (nullable = true)
 |-- movieId: string (nullable = true)
 |-- tag: string (nullable = true)
 |-- timestamp: string (nullable = true)



## 7.4 Aggregate all tags for movie and join with movies_df to view the details

In [12]:
tags_aggregate_df = tags_df.groupBy("movieId").agg(collect_list(col("tag")).alias("tags"))

tags_aggregate_df.show(20, False)
tags_aggregate_df.printSchema()

+-------+------------------------------------------------------------+
|movieId|tags                                                        |
+-------+------------------------------------------------------------+
|1      |[pixar, pixar, fun]                                         |
|100083 |[embarassing scenes, offensive, R language, sarcasm, satire]|
|1006   |[death penalty, John Grisham]                               |
|101    |[crime, off-beat comedy, quirky]                            |
|1010   |[Disney, race]                                              |
|101142 |[animation, funny, stone age]                               |
|1013   |[twins]                                                     |
|102007 |[animation]                                                 |
|1022   |[Disney]                                                    |
|1025   |[Disney, King Arthur]                                       |
|1028   |[Disney, nanny]                                             |
|1029 

In [24]:
tags_movie_df = tags_aggregate_df.join(movies_df, on='movieId', how='left')

# tags_movie_df.show(20, False)
tags_movie_df.select('title', 'genres', 'tags').show(20, False)

+------------------------+--------------------------------------------------+------------------------------------------------------------+
|title                   |genres                                            |tags                                                        |
+------------------------+--------------------------------------------------+------------------------------------------------------------+
|Toy Story               |[Adventure, Animation, Children, Comedy, Fantasy] |[pixar, pixar, fun]                                         |
|Movie 43                |[Comedy]                                          |[embarassing scenes, offensive, R language, sarcasm, satire]|
|Chamber, The            |[Drama]                                           |[death penalty, John Grisham]                               |
|Bottle Rocket           |[Adventure, Comedy, Crime, Romance]               |[crime, off-beat comedy, quirky]                            |
|Love Bug, The           |[

## 7.5 Lets combine with the ratings sum and see what people say about top movies

In [43]:
tags_ratings_df = ratings_sum_movie_df.join(tags_movie_df.select('movieId','tags'), on='movieId', how='inner')

# tags_ratings_df.show(20, False)

tags_ratings_df.select('title', 'sum_of_all_ratings', 'genres', 'tags', 'year').orderBy('sum_of_all_ratings', ascending=False).show(20,False)

+-----------------------------------------------------------------------+------------------+-------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [44]:
tags_ratings_df.select('title', 'sum_of_all_ratings', 'genres', 'tags', 'year').orderBy('sum_of_all_ratings', ascending=True).show(20,False)

+----------------------------------------------+------------------+------------------------------+-------------------------------+----+
|title                                         |sum_of_all_ratings|genres                        |tags                           |year|
+----------------------------------------------+------------------+------------------------------+-------------------------------+----+
|Begotten                                      |0.5               |[Drama, Horror]               |[boring, psychedelic, symbolic]|1990|
|Losin' It                                     |1.0               |[Comedy]                      |[virginity]                    |1983|
|Oh, God! Book II                              |1.5               |[Comedy]                      |[religion]                     |1980|
|My Demon Lover                                |2.0               |[Comedy, Horror]              |[demons]                       |1987|
|Brandon Teena Story, The                      |