In [1]:
import sys
import os
import pandas as pd
import numpy as np

# Add the parent of the current working directory to sys.path so that the
# helper_functions import below can be resolved.
# NOTE(review): presumably helper_functions.py lives in that parent directory — confirm.
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parent_dir)

from helper_functions import create_ratings_df, get_movie_id_from_title, get_top_ten_similar_movies

# Data import

In [2]:
# Load the movie metadata (movieId, title, genres) and preview the first rows.
movies = pd.read_csv('../../data/movies.csv')
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
# Load the user ratings (userId, movieId, rating, timestamp) and preview the first rows.
ratings = pd.read_csv('../../data/ratings.csv')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# Item-based collaborative filtering
We select a movie the user has just looked at, or recently rated highly, and suggest similar movies.
<br>
We decide whether two movies are similar by comparing how other users have rated them.
<br>
Two movies will be declared similar if they were given a high (or low) rating by the same users.
<br>
<br>
Users A, B and C liked Fight Club and The Shawshank Redemption and disliked Love Actually.
<br>
User D liked Fight Club so we recommend The Shawshank Redemption, not Love Actually.

## Cosine Similarity
The **Cosine Similarity** measures the similarity between two vectors based on the cosine of the angle between them. It evaluates whether two vectors point in the same direction in a high-dimensional space, making it a commonly used metric for comparing user or item behavior in recommendation systems.

$$
\text{Cosine Similarity} = \frac{\sum_{i=1}^n A_i \cdot B_i}{\sqrt{\sum_{i=1}^n A_i^2} \cdot \sqrt{\sum_{i=1}^n B_i^2}}
$$

Where:
- $A$ and $B$: Two vectors (e.g., user ratings or movie feature vectors).
- $A_i$, $B_i$: Individual components of the vectors.
- The numerator is the dot product of $A$ and $B$, measuring overlap.
- The denominator is the product of the magnitudes (lengths) of $A$ and $B$, normalizing for vector size.

**How It Works**
1. Cosine similarity measures the **orientation**, not the magnitude, of the vectors.
2. It ranges from **-1** to **1**:
   - **1**: Perfect similarity (vectors point in the same direction).
   - **0**: No similarity (vectors are orthogonal).
   - **-1**: Perfect dissimilarity (vectors point in opposite directions; this rarely arises in recommendation contexts because ratings are non-negative).

**Key Notes**
- Cosine similarity works well with **sparse data** (e.g., user ratings matrices with many missing values).
- It ignores differences in **magnitude** (e.g., a user who rates all movies 5 stars is treated similarly to one who rates them all 1 star, as long as the relative pattern is the same).
- Missing values are often treated as 0 (no interaction), but a 0 can then be misread as a very low rating rather than as an absent one, so alternative handling is sometimes necessary.


In our case we should not fill the missing values with 0, as this would wrongly imply that users gave those movies bad ratings.
<br>
Also, due to the extreme sparsity of the data, I do not believe imputing missing values from average ratings is a good idea.
<br>
Therefore, rather than use the typical scikit-learn method, `sklearn.metrics.pairwise.cosine_similarity`, I will create a mask that marks which ratings are actually present (filtering out the missing values) and calculate the cosine similarity between movies directly.

In [4]:
def calculate_cosine_similarity(ratings):
    """Compute the pairwise cosine similarity between the columns of ``ratings``.

    Parameters
    ----------
    ratings : pandas.DataFrame
        Ratings table with one row per user and one column per movie.
        Missing ratings may be NaN; they contribute nothing to the result.

    Returns
    -------
    pandas.DataFrame
        Square frame whose index and columns are the columns of ``ratings``.
        Entry ``[i, j]`` is the cosine similarity between columns ``i`` and
        ``j``. A column with no non-zero ratings has magnitude 0, so its
        entries come out as NaN (0 / 0).

    Notes
    -----
    Earlier versions ignored the ``ratings`` argument and read a global
    ``ratings_df`` instead; the argument is now the only input.

    NOTE(review): after ``fillna(0)`` the masked entries are already 0, so
    multiplying by the mask does not change any value. The result is therefore
    the same as the cosine similarity of the zero-filled columns.
    """
    # Fill NaN with 0 so the matrix product below is defined everywhere
    ratings_filled = ratings.fillna(0)
    # 1 where a rating is present, 0 where it is missing
    mask = ratings.notna().astype(int)
    masked_ratings = ratings_filled * mask
    # Numerator: dot product of every pair of columns. A user only contributes
    # to a pair when both ratings are present (otherwise one factor is 0).
    numerator = masked_ratings.T @ masked_ratings
    # Magnitude of each column: root of the sum of its squared, present ratings
    magnitudes = np.sqrt((ratings_filled ** 2 * mask).sum(axis=0))
    # Outer product of the magnitudes via broadcasting -> (n_movies, n_movies)
    denominator = magnitudes.values[:, None] * magnitudes.values[None, :]
    # Element-wise division keeps the row/column labels of the numerator
    cosine_similarity_matrix = numerator / denominator

    return cosine_similarity_matrix

def return_cosine_similar_movies(movie_id, cosine_similarity_matrix):
    """Return the similarity of every other movie to ``movie_id``.

    The result is a one-column DataFrame taken from the ``movie_id`` column of
    ``cosine_similarity_matrix``. The column keeps ``movie_id`` as its label,
    and the row for the movie itself is removed.
    """
    # Pull out this movie's column of scores and wrap it in a DataFrame
    scores = pd.DataFrame(cosine_similarity_matrix[movie_id])
    # Keep every row except the movie's own (its self-similarity)
    is_other_movie = scores.index != movie_id
    return scores[is_other_movie]

# Build the ratings table that the similarity computation works on.
# NOTE(review): fill_value=0 presumably replaces missing ratings with 0 inside
# create_ratings_df; if so, the table has no NaNs left and the NaN mask in
# calculate_cosine_similarity has nothing to mask — confirm against helper_functions.
ratings_df = create_ratings_df(ratings, fill_value=0)
# Movie-by-movie similarity matrix, computed once and reused by the lookups below.
cosine_similarity_matrix = calculate_cosine_similarity(ratings_df)

In [12]:
# Title of the movie we want recommendations for.
movie_title = 'fight club'

# Resolve the title to its movieId using the movies table.
movie_id = get_movie_id_from_title(movie_title, movies)
# Similarity of every other movie to the chosen one.
cosine_similar_movies = return_cosine_similar_movies(movie_id, cosine_similarity_matrix)
# presumably keeps the ten highest scores and attaches the titles — see helper_functions.
top_ten = get_top_ten_similar_movies(cosine_similar_movies, movies, movie_id)

# The score column is still labelled with the movie id; give it a readable name.
top_ten = top_ten.rename(columns={movie_id: movie_title +' - similarity'})
top_ten

Unnamed: 0_level_0,fight club - similarity,title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
2571,0.713937,"Matrix, The (1999)"
4226,0.669593,Memento (2000)
2329,0.649054,American History X (1998)
6874,0.639738,Kill Bill: Vol. 1 (2003)
4993,0.635744,"Lord of the Rings: The Fellowship of the Ring,..."
2858,0.625549,American Beauty (1999)
296,0.62322,Pulp Fiction (1994)
7153,0.622016,"Lord of the Rings: The Return of the King, The..."
79132,0.615417,Inception (2010)
7438,0.614344,Kill Bill: Vol. 2 (2004)
