In [2]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.2.4/spark-3.2.4-bin-hadoop3.2.tgz
!tar xf spark-3.2.4-bin-hadoop3.2.tgz
!pip install -q findspark

In [3]:
import os

# Export the Java and Spark locations in one step. The paths are the expected
# results of the install cell (apt's OpenJDK 8 directory and the unpacked Spark
# tarball); adjust them if the notebook runs outside /content.
os.environ.update(
    {
        "JAVA_HOME": "/usr/lib/jvm/java-8-openjdk-amd64",
        "SPARK_HOME": "/content/spark-3.2.4-bin-hadoop3.2",
    }
)

In [4]:
import findspark
# Locate the Spark installation and make its bundled pyspark package importable
# (findspark's documented purpose); must run before any `pyspark` import.
findspark.init()
# Trailing expression: displays the Spark home directory findspark resolved,
# which should match the SPARK_HOME value exported earlier.
findspark.find()

'/content/spark-3.2.4-bin-hadoop3.2'

In [10]:
# Importing libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg

In [11]:
# Create the Spark session for this notebook, or reuse the one already running
# in this kernel (getOrCreate), under a descriptive application name.
session_builder = SparkSession.builder.appName("Movie Tags Analysis")
spark = session_builder.getOrCreate()

In [12]:
def _read_headered_csv(path):
    """Read a CSV whose first row holds column names, letting Spark infer column types."""
    return spark.read.csv(path, header=True, inferSchema=True)


# One DataFrame per input file; every file is read with the same options.
links_df = _read_headered_csv("/content/links.csv")
tags_df = _read_headered_csv("/content/tags.csv")
ratings_df = _read_headered_csv("/content/ratings.csv")
movies_df = _read_headered_csv("/content/movies.csv")

In [13]:
# Attach ratings, then movie metadata, to every tag row. Both joins are left
# joins keyed on movieId, so a tag row is kept even when nothing matches, and
# it is repeated once per matching rating row of the same movie.
# NOTE(review): the join key is movieId only; if tags.csv and ratings.csv share
# other column names (e.g. userId / timestamp), joined_df will contain them
# twice -- confirm before selecting those columns downstream.
tags_with_ratings_df = tags_df.join(ratings_df, on=["movieId"], how="left")
joined_df = tags_with_ratings_df.join(movies_df, on=["movieId"], how="left_outer")


In [14]:
# Mean of the `rating` column for each distinct `tag` value, exposed as `avg_rating`.
mean_rating_col = avg("rating").alias("avg_rating")
avg_ratings = joined_df.groupBy("tag").agg(mean_rating_col)

In [15]:
# Rank the tags from highest to lowest mean rating.
sorted_avg_ratings = avg_ratings.orderBy(["avg_rating"], ascending=[False])

# Preview the leading rows; n=20 and truncate=True are show()'s defaults,
# written out here so the row limit and the "..." shortening are explicit.
sorted_avg_ratings.show(n=20, truncate=True)

+--------------------+-----------------+
|                 tag|       avg_rating|
+--------------------+-----------------+
|          procedural|              5.0|
|          creativity|              5.0|
|    free to download|              5.0|
|        human rights|              5.0|
|         no dialogue|              5.0|
|             parrots|             4.75|
|   thought provoking|             4.75|
|            jon hamm|             4.75|
|            Dystopia|             4.75|
| movies about movies|4.666666666666667|
|           prejudice|4.545454545454546|
|interracial marriage|4.545454545454546|
|        Metaphorical|              4.5|
|             freedom|              4.5|
|political right v...|              4.5|
|        good writing|              4.5|
|               crazy|              4.5|
|       individualism|              4.5|
|     black-and-white|              4.5|
|the catholic chur...|              4.5|
+--------------------+-----------------+
only showing top 20 rows

Check that the number of unique tags is the same in tags_df and sorted_avg_ratings.

In [16]:
def _distinct_tag_count(frame):
    """Return how many different values the `tag` column of `frame` holds."""
    return frame.select("tag").distinct().count()


# Distinct tags before (raw tags table) and after (aggregated, sorted result).
unique_tags_original = _distinct_tag_count(tags_df)
unique_tags_sorted = _distinct_tag_count(sorted_avg_ratings)

# Report whether the join/aggregation pipeline kept the same number of tags.
if unique_tags_original != unique_tags_sorted:
    print("The number of unique tags differs between the DataFrames.")
    print(f"Original: {unique_tags_original}, Sorted: {unique_tags_sorted}")
else:
    print("The number of unique tags is the same in both DataFrames.")


The number of unique tags is the same in both DataFrames.


In [17]:
# Display the distinct-tag count computed from tags_df.
unique_tags_original

1589

In [18]:
# Display the distinct-tag count computed from sorted_avg_ratings.
unique_tags_sorted

1589