In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import *
from pyspark.ml.feature import CountVectorizer, StringIndexer, OneHotEncoder, VectorAssembler
from pyspark.ml.linalg import Vectors, SparseVector
from pyspark.ml import Pipeline
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Initialize Spark Session
spark = SparkSession.builder \
    .appName("ContentBasedMovieRecommender") \
    .master("local[*]") \
    .config("spark.driver.memory", "4g") \
    .getOrCreate()

In [None]:
# Load data
movies_df = spark.read.csv("movies.csv", header=True, inferSchema=True)
tags_df = spark.read.csv("tags.csv", header=True, inferSchema=True)

In [4]:
movies_df.show()

+-------+--------------------+--------------------+
|movieId|               title|              genres|
+-------+--------------------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|
|      2|      Jumanji (1995)|Adventure|Childre...|
|      3|Grumpier Old Men ...|      Comedy|Romance|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|
|      5|Father of the Bri...|              Comedy|
|      6|         Heat (1995)|Action|Crime|Thri...|
|      7|      Sabrina (1995)|      Comedy|Romance|
|      8| Tom and Huck (1995)|  Adventure|Children|
|      9| Sudden Death (1995)|              Action|
|     10|    GoldenEye (1995)|Action|Adventure|...|
|     11|American Presiden...|Comedy|Drama|Romance|
|     12|Dracula: Dead and...|       Comedy|Horror|
|     13|        Balto (1995)|Adventure|Animati...|
|     14|        Nixon (1995)|               Drama|
|     15|Cutthroat Island ...|Action|Adventure|...|
|     16|       Casino (1995)|         Crime|Drama|
|     17|Sen

In [5]:
# Clean and transform the movies data

# Extract year from title
extract_year = F.regexp_extract(F.col("title"), r"\((\d{4})\)$", 1).cast("integer")

movies_df = movies_df.withColumn("year", extract_year)



In [6]:
movies_df.show()

+-------+--------------------+--------------------+----+
|movieId|               title|              genres|year|
+-------+--------------------+--------------------+----+
|      1|    Toy Story (1995)|Adventure|Animati...|1995|
|      2|      Jumanji (1995)|Adventure|Childre...|1995|
|      3|Grumpier Old Men ...|      Comedy|Romance|1995|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|1995|
|      5|Father of the Bri...|              Comedy|1995|
|      6|         Heat (1995)|Action|Crime|Thri...|1995|
|      7|      Sabrina (1995)|      Comedy|Romance|1995|
|      8| Tom and Huck (1995)|  Adventure|Children|1995|
|      9| Sudden Death (1995)|              Action|1995|
|     10|    GoldenEye (1995)|Action|Adventure|...|1995|
|     11|American Presiden...|Comedy|Drama|Romance|1995|
|     12|Dracula: Dead and...|       Comedy|Horror|1995|
|     13|        Balto (1995)|Adventure|Animati...|1995|
|     14|        Nixon (1995)|               Drama|1995|
|     15|Cutthroat Island ...|A

In [7]:
# Count unique genres
# First, explode the genres (split by |) to get individual genres
genre_list = movies_df.select("movieId", F.explode(F.split(F.col("genres"), "\\|")).alias("genre"))

# Count occurrences of each genre
genre_list = genre_list.groupBy("genre").count().orderBy(F.desc("count"))

# Show the results
genre_list.show(30, truncate=False)


+------------------+-----+
|genre             |count|
+------------------+-----+
|Drama             |4361 |
|Comedy            |3756 |
|Thriller          |1894 |
|Action            |1828 |
|Romance           |1596 |
|Adventure         |1263 |
|Crime             |1199 |
|Sci-Fi            |980  |
|Horror            |978  |
|Fantasy           |779  |
|Children          |664  |
|Animation         |611  |
|Mystery           |573  |
|Documentary       |440  |
|War               |382  |
|Musical           |334  |
|Western           |167  |
|IMAX              |158  |
|Film-Noir         |87   |
|(no genres listed)|34   |
+------------------+-----+



In [8]:
# Filter out movies with no genres
movies_df = movies_df.filter(F.col("genres") != "(no genres listed)")

In [26]:
# Convert genres to array
movies_df = movies_df.withColumn("genres_array", F.split(F.col("genres"), "\\|"))

# Show some stats and sample data
print(f"Number of movies: {movies_df.count()}")
movies_df.select("movieId", "title", "year", "genres_array").show(5, truncate=False)

Number of movies: 9708
+-------+----------------------------------+----+-------------------------------------------------+
|movieId|title                             |year|genres_array                                     |
+-------+----------------------------------+----+-------------------------------------------------+
|1      |Toy Story (1995)                  |1995|[Adventure, Animation, Children, Comedy, Fantasy]|
|2      |Jumanji (1995)                    |1995|[Adventure, Children, Fantasy]                   |
|3      |Grumpier Old Men (1995)           |1995|[Comedy, Romance]                                |
|4      |Waiting to Exhale (1995)          |1995|[Comedy, Drama, Romance]                         |
|5      |Father of the Bride Part II (1995)|1995|[Comedy]                                         |
+-------+----------------------------------+----+-------------------------------------------------+
only showing top 5 rows



In [10]:
movies_df = movies_df.withColumn("decade", 
                                 (F.floor(F.col("year") / 10) * 10).cast("integer"))

In [27]:
movies_df.show()

+-------+--------------------+--------------------+----+--------------------+------+
|movieId|               title|              genres|year|        genres_array|decade|
+-------+--------------------+--------------------+----+--------------------+------+
|      1|    Toy Story (1995)|Adventure|Animati...|1995|[Adventure, Anima...|  1990|
|      2|      Jumanji (1995)|Adventure|Childre...|1995|[Adventure, Child...|  1990|
|      3|Grumpier Old Men ...|      Comedy|Romance|1995|   [Comedy, Romance]|  1990|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|1995|[Comedy, Drama, R...|  1990|
|      5|Father of the Bri...|              Comedy|1995|            [Comedy]|  1990|
|      6|         Heat (1995)|Action|Crime|Thri...|1995|[Action, Crime, T...|  1990|
|      7|      Sabrina (1995)|      Comedy|Romance|1995|   [Comedy, Romance]|  1990|
|      8| Tom and Huck (1995)|  Adventure|Children|1995|[Adventure, Child...|  1990|
|      9| Sudden Death (1995)|              Action|1995|         

In [None]:
# Process tags data

# Clean tags: lowercase, remove punctuation, etc...
tags_df = tags_df.withColumn("clean_tag", 
                            F.lower(F.regexp_replace(F.col("tag"), r"[^\w\s]", "")))

# Aggregate tags per movie and filter out less common tags
movie_tags = tags_df.groupBy("movieId") \
                   .agg(F.collect_list("clean_tag").alias("tags"))



In [None]:
# Get the most common tags (we can change limit as needed, for now, I put it to 200, increasing it will include more tags)
common_tags = tags_df.groupBy("clean_tag") \
                    .count() \
                    .orderBy(F.col("count").desc()) \
                    .limit(200)
        

In [14]:
common_tags.show()

+------------------+-----+
|         clean_tag|count|
+------------------+-----+
|  in netflix queue|  131|
|       atmospheric|   41|
|             scifi|   24|
|           surreal|   24|
|         superhero|   24|
|  thoughtprovoking|   24|
|             funny|   24|
|            disney|   23|
|          religion|   22|
|            quirky|   22|
|        psychology|   21|
|          suspense|   21|
|       dark comedy|   21|
|      twist ending|   20|
|visually appealing|   20|
|             crime|   19|
|          politics|   19|
|            comedy|   19|
|             music|   17|
|       time travel|   16|
+------------------+-----+
only showing top 20 rows



In [15]:
movies_with_tags = movies_df.join(movie_tags, on="movieId", how="left")

# Fill null tag lists with empty list
movies_with_tags = movies_with_tags.withColumn(
    "tags", F.when(F.col("tags").isNull(), F.array()).otherwise(F.col("tags"))
)

In [16]:
movies_with_tags.show()

+-------+--------------------+--------------------+----+--------------------+------+--------------------+
|movieId|               title|              genres|year|        genres_array|decade|                tags|
+-------+--------------------+--------------------+----+--------------------+------+--------------------+
|      1|    Toy Story (1995)|Adventure|Animati...|1995|[Adventure, Anima...|  1990| [pixar, pixar, fun]|
|      2|      Jumanji (1995)|Adventure|Childre...|1995|[Adventure, Child...|  1990|[game, fantasy, m...|
|      3|Grumpier Old Men ...|      Comedy|Romance|1995|   [Comedy, Romance]|  1990|        [moldy, old]|
|      4|Waiting to Exhale...|Comedy|Drama|Romance|1995|[Comedy, Drama, R...|  1990|                  []|
|      5|Father of the Bri...|              Comedy|1995|            [Comedy]|  1990| [pregnancy, remake]|
|      6|         Heat (1995)|Action|Crime|Thri...|1995|[Action, Crime, T...|  1990|                  []|
|      7|      Sabrina (1995)|      Comedy|Rom

In [None]:
# One hot encoding for genres and tags and decades

# Get genres as a list
top_genres = [row["genre"] for row in genre_list.collect()]

movies_features = movies_with_tags

# Add a column for each genre (1 if present, 0 if not)
for genre in top_genres:
    movies_features = movies_features.withColumn(
        f"genre_{genre}",
        F.when(F.array_contains(F.col("genres_array"), genre), 1).otherwise(0)
    )

# Create decade features (one-hot encoded)
decades = [1990, 2000, 2010]
for decade in decades:
    movies_features = movies_features.withColumn(
        f"decade_{decade}", 
        F.when((F.col("decade") == decade), 1).otherwise(0)
    )

# Add columns for the most common tags
top_tags = [row["clean_tag"] for row in common_tags.collect()]
for tag in top_tags:
    movies_features = movies_features.withColumn(
        f"tag_{tag}",
        F.when(F.array_contains(F.col("tags"), tag), 1).otherwise(0)
    )

In [18]:
movies_features.show()

+-------+--------------------+--------------------+----+--------------------+------+--------------------+-----------+------------+--------------+------------+-------------+---------------+-----------+------------+------------+-------------+--------------+---------------+-------------+-----------------+---------+-------------+-------------+----------+---------------+------------------------+-----------+-----------+-----------+--------------------+---------------+---------+-----------+-------------+--------------------+---------+----------+------------+----------+------------+--------------+---------------+----------------+----------------------+---------+------------+----------+---------+----------+--------+---------------+---------------+------------------+----------+------------+---------+----------------+-------------+---------+----------------+--------------+--------------+-------------+---------------+---------+-------------+---------+-------------+-----------+----------+---------+

In [19]:
# Create list of all feature columns
feature_cols = []
feature_cols.extend([f"genre_{genre}" for genre in top_genres])
feature_cols.extend([f"decade_{decade}" for decade in decades])
feature_cols.extend([f"tag_{tag}" for tag in top_tags])

# Create vector assembler
assembler = VectorAssembler(inputCols=feature_cols, outputCol="features")
movie_vectors = assembler.transform(movies_features)

In [20]:
# Compute similarity matrix

# Collect the movie data and feature vectors
movie_data = movie_vectors.select("movieId", "title", "features").collect()

# Create a dictionary mapping movie IDs to their indices in the list
movie_id_to_idx = {movie.movieId: i for i, movie in enumerate(movie_data)}
movie_idx_to_id = {i: movie.movieId for i, movie in enumerate(movie_data)}
movie_id_to_title = {movie.movieId: movie.title for movie in movie_data}

# Convert feature vectors to numpy arrays
feature_matrix = np.array([movie.features.toArray() for movie in movie_data])

# Compute cosine similarity
similarity_matrix = cosine_similarity(feature_matrix)

In [None]:
# Define function to find similar movies
def find_similar_movies(movie_title, top_n=5):

    # Find the movie ID for the given title
    title_lower = movie_title.lower()
    
    
    matching_movies = [
        (id, title) for id, title in movie_id_to_title.items() 
        if title_lower in title.lower()
    ]
    
    if not matching_movies:
        print(f"Movie '{movie_title}' not found.")
        return []
    
    
    movie_id, movie_title = matching_movies[0]
    
    # Get the movie's index in our matrix
    idx = movie_id_to_idx[movie_id]
    
    # Get similarity scores for this movie with all others
    sim_scores = list(enumerate(similarity_matrix[idx]))
    
    # Sort by similarity score (descending)
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get top N most similar movies (excluding itself)
    sim_scores = sim_scores[1:top_n+1]
    
    # Get the movie IDs and scores
    similar_movie_indices = [i for i, score in sim_scores]
    similar_movie_scores = [score for i, score in sim_scores]
    
    # Get the movie IDs and titles
    similar_movie_ids = [movie_idx_to_id[idx] for idx in similar_movie_indices]
    similar_movie_titles = [movie_id_to_title[id] for id in similar_movie_ids]
    
    
    results = []
    for i in range(len(similar_movie_titles)):
        results.append({
            "title": similar_movie_titles[i],
            "movieId": similar_movie_ids[i],
            "similarity_score": similar_movie_scores[i]
        })
    
    return results

In [33]:
# Example usage
similar_movies = find_similar_movies("Casino (1995)", top_n=5)
for i, movie in enumerate(similar_movies, 1):
    print(f"{i}. {movie['title']} (ID: {movie['movieId']}) - Similarity: {movie['similarity_score']:.4f}")

1. Goodfellas (1990) (ID: 1213) - Similarity: 1.0000
2. Donnie Brasco (1997) (ID: 1466) - Similarity: 1.0000
3. Carlito's Way (1993) (ID: 431) - Similarity: 0.8944
4. Shanghai Triad (Yao a yao yao dao waipo qiao) (1995) (ID: 30) - Similarity: 0.8660
5. Hate (Haine, La) (1995) (ID: 97) - Similarity: 0.8660
