In [1]:
import pyspark
from pyspark import SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, count, desc, row_number, col, length
from pyspark.sql.window import Window
from tabulate import tabulate

In [2]:
# Display the installed PySpark version so the run environment is recorded.
pyspark.__version__

'3.3.2'

In [3]:
# Development-mode session: getOrCreate() hands back an already-active
# SparkSession when there is one, otherwise it starts a new one with the
# options set on the builder below.
spark = (
    SparkSession.builder
    # Run Spark in-process on this machine rather than against a cluster.
    .master("local[*]")
    .appName("MovieLens Analysis dev-mode")
    # Use 8 partitions for shuffles (joins / aggregations).
    .config("spark.sql.shuffle.partitions", "8")
    # Turn on adaptive query execution.
    .config("spark.sql.adaptive.enabled", "true")
    .getOrCreate()
)

24/04/16 17:50:50 WARN Utils: Your hostname, codespaces-f652b0 resolves to a loopback address: 127.0.0.1; using 172.16.5.4 instead (on interface eth0)
24/04/16 17:50:50 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


24/04/16 17:50:51 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Print the default parallelism reported by the active SparkContext.
print(spark.sparkContext.defaultParallelism)

4


In [5]:
# Read ratings.csv, tags.csv, and movies.csv.
# No schema is supplied or inferred, so every column is loaded as a string.
# Standard CSV writes a double quote inside a quoted field as "" (e.g. a
# title or tag that itself contains quotes). Spark's CSV reader uses a
# backslash as its escape character by default, so escape='"' is set to
# make such fields parse correctly instead of being garbled.
ratings_df = spark.read.csv("../input_data/ratings.csv", header=True, escape='"')
tags_df = spark.read.csv("../input_data/tags.csv", header=True, escape='"')
movies_df = spark.read.csv("../input_data/movies.csv", header=True, escape='"')

In [6]:
# Function to print null value counts for a DataFrame
def print_null_info(df, name):
    """
    Print a table with the number of null values in each column of a DataFrame.

    All counts are computed in a single aggregation (one pass over the data)
    instead of launching a separate Spark job per column.

    Args:
    - df: Spark DataFrame to inspect
    - name: label used in the printed heading

    Returns:
    - None; the table is written to stdout
    """
    column_names = df.columns

    # count("*") counts every row, while count(<column>) skips nulls, so the
    # difference between the two is the number of nulls in that column.
    # A global aggregation always yields exactly one row, even for empty input.
    counts_row = df.agg(
        count("*"),
        *[count(col(col_name)) for col_name in column_names]
    ).first()

    total_rows = counts_row[0]
    null_info = [
        (col_name, total_rows - non_null_count)
        for col_name, non_null_count in zip(column_names, counts_row[1:])
    ]

    print(f"Null values in {name} DataFrame:")
    print(tabulate(null_info, headers=["Column", "Null Count"], tablefmt="pretty"))

In [7]:
# Report per-column null counts for the ratings data.
print_null_info(ratings_df, "ratings")

[Stage 12:>                                                         (0 + 4) / 4]

Null values in ratings DataFrame:
+-----------+------------+
|  Column   | Null Count |
+-----------+------------+
|  userId   |     0      |
|  movieId  |     0      |
|  rating   |     0      |
| timestamp |     0      |
+-----------+------------+


                                                                                

In [61]:
# Report per-column null counts for the tags data.
print_null_info(tags_df, "tags")

Null values in tags DataFrame:
+-----------+------------+
|  Column   | Null Count |
+-----------+------------+
|  userId   |     0      |
|  movieId  |     0      |
|    tag    |     0      |
| timestamp |     0      |
+-----------+------------+


In [62]:
# Report per-column null counts for the movies data.
print_null_info(movies_df, "movies")

Null values in movies DataFrame:
+---------+------------+
| Column  | Null Count |
+---------+------------+
| movieId |     0      |
|  title  |     0      |
| genres  |     0      |
+---------+------------+


In [9]:
# Preview the first five ratings rows.
ratings_df.show(5)

+------+-------+------+----------+
|userId|movieId|rating| timestamp|
+------+-------+------+----------+
|     1|      2|   3.5|1112486027|
|     1|     29|   3.5|1112484676|
|     1|     32|   3.5|1112484819|
|     1|     47|   3.5|1112484727|
|     1|     50|   3.5|1112484580|
+------+-------+------+----------+
only showing top 5 rows



In [7]:
# Preview the first five tags rows.
tags_df.show(5)

+------+-------+-------------+----------+
|userId|movieId|          tag| timestamp|
+------+-------+-------------+----------+
|    18|   4141|  Mark Waters|1240597180|
|    65|    208|    dark hero|1368150078|
|    65|    353|    dark hero|1368150079|
|    65|    521|noir thriller|1368149983|
|    65|    592|    dark hero|1368150078|
+------+-------+-------------+----------+
only showing top 5 rows



In [8]:
# Preview the first five movies rows (long values are truncated in the display).
movies_df.show(5)

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
+-------+--------------------+--------------------+
only showing top 5 rows



In [10]:
def most_common_tag_for_movie_title(tags_df, movies_df):
    """
    Find the most common tag for each movie title.

    Args:
    - tags_df: DataFrame containing tags data (needs movieId and tag columns)
    - movies_df: DataFrame containing movies data (needs movieId and title columns)

    Returns:
    - DataFrame with columns title, tag, tag_count: one row per title holding
      its most frequent tag, sorted by tag_count descending and then by title.
      Titles that were never tagged are kept with a null tag and tag_count 0.
    """
    # Left join keeps movies that have no tags at all; their tag is null.
    movie_tag_df = movies_df.join(tags_df, "movieId", "left")

    # count("tag") skips nulls, so an untagged movie gets tag_count 0 rather
    # than the misleading 1 that count("*") would report for its padded row.
    # NOTE(review): grouping is by title, so distinct movieIds that share an
    # identical title are merged into one entry.
    tag_counts_df = (
        movie_tag_df
        .groupBy("title", "tag")
        .agg(count("tag").alias("tag_count"))
    )

    # Get the row with the highest tag count for each movie.
    # A title can have several tags tied for first place (e.g. Memento (2000):
    # nonlinear, twist ending). row_number() on tag_count alone would pick one
    # of them arbitrarily and could differ between runs, so the tag itself is
    # used as a secondary sort key to make the choice deterministic.
    window_spec = Window.partitionBy("title").orderBy(desc("tag_count"), "tag")
    most_common_tag_df = (
        tag_counts_df
        .withColumn("rank", row_number().over(window_spec))
        .filter(col("rank") == 1).drop("rank")
        # Secondary key keeps the output order stable among equal counts.
        .orderBy(desc("tag_count"), "title")
    )

    return most_common_tag_df

In [11]:
# Build the per-title most-common-tag DataFrame from the tags and movies data.
most_common_tag_df = most_common_tag_for_movie_title(tags_df, movies_df)

In [12]:
# Show the first ten result rows with full, untruncated column values.
most_common_tag_df.show(10, truncate=False)

+--------------------------------------------+-----------------+---------+
|title                                       |tag              |tag_count|
+--------------------------------------------+-----------------+---------+
|Pulp Fiction (1994)                         |Quentin Tarantino|185      |
|Fight Club (1999)                           |twist ending     |150      |
|Memento (2000)                              |nonlinear        |145      |
|Usual Suspects, The (1995)                  |twist ending     |139      |
|Inception (2010)                            |alternate reality|128      |
|Eternal Sunshine of the Spotless Mind (2004)|surreal          |127      |
|Matrix, The (1999)                          |sci-fi           |120      |
|Silence of the Lambs, The (1991)            |serial killer    |113      |
|Twelve Monkeys (a.k.a. 12 Monkeys) (1995)   |time travel      |109      |
|Inglourious Basterds (2009)                 |Quentin Tarantino|108      |
+--------------------------------------------+-----------------+---------+

In [7]:
def most_common_genre_rated_by_user(ratings_df, movies_df):
    """
    Find the genre each user has rated most often.

    Args:
    - ratings_df: DataFrame containing ratings data (needs userId and movieId columns)
    - movies_df: DataFrame containing movies data (needs movieId and a
      pipe-separated genres column)

    Returns:
    - DataFrame with columns userId, genre, genre_count: one row per user
      holding that user's most frequently rated genre, sorted by genre_count
      descending and then by userId.
    """
    user_rating_df = ratings_df.join(movies_df, "movieId", "left")

    # A movie contributes one row per genre it lists: genres is split on the
    # literal "|" and exploded. explode() emits nothing for a null array, so
    # ratings whose movieId had no match in movies_df drop out here.
    genre_counts_df = (
        user_rating_df
        .withColumn("genre", explode(split("genres", "\\|")))
        .groupBy("userId", "genre")
        .agg(count("*").alias("genre_count"))
    )

    # Get the row with the most common genre for each user.
    # A user can have several genres tied for first place (e.g. userId 104).
    # row_number() on genre_count alone would pick one arbitrarily and could
    # differ between evaluations of this DataFrame, so the genre name is used
    # as a secondary sort key to make the choice deterministic.
    window_spec = Window.partitionBy("userId").orderBy(desc("genre_count"), "genre")
    most_common_genre_df = (
        genre_counts_df
        .withColumn("rank", row_number().over(window_spec))
        .filter(col("rank") == 1).drop("rank")
        # Secondary key keeps the output order stable among equal counts.
        .orderBy(desc("genre_count"), "userId")
    )
    return most_common_genre_df

In [8]:
# Build the per-user most-common-genre DataFrame from the ratings and movies data.
most_common_genre_df = most_common_genre_rated_by_user(ratings_df, movies_df)

In [9]:
# Persist the per-user result as CSV with a header row, overwriting any
# existing output at this path.
most_common_genre_df.write.csv("../output_data/most_common_genre", header=True, mode="overwrite")

                                                                                

In [10]:
# Show the first twenty result rows with full, untruncated column values.
most_common_genre_df.show(20, truncate=False)

                                                                                

+------+------+-----------+
|userId|genre |genre_count|
+------+------+-----------+
|8405  |Drama |3684       |
|8963  |Drama |3240       |
|9544  |Drama |1702       |
|2261  |Drama |1479       |
|9034  |Drama |1449       |
|3907  |Drama |1407       |
|10303 |Drama |1371       |
|7201  |Drama |1359       |
|903   |Comedy|1178       |
|3318  |Comedy|1168       |
|3797  |Drama |1164       |
|4222  |Drama |1137       |
|6636  |Drama |1110       |
|6373  |Drama |1110       |
|6719  |Drama |1086       |
|4358  |Comedy|1080       |
|9087  |Drama |1051       |
|9545  |Drama |1005       |
|8647  |Comedy|976        |
|3625  |Drama |971        |
+------+------+-----------+
only showing top 20 rows



### Done