In [1]:
import sys
import os
import pandas as pd
import numpy as np

# Add the parent directory to sys.path
parent_dir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
sys.path.append(parent_dir)

from helper_functions import create_user_item_matrix, get_movie_id_from_title, get_top_ten_similar_movies

# Data import

In [2]:
movies_df = pd.read_csv('../../data/movies.csv')
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
ratings_df = pd.read_csv('../../data/ratings.csv')
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


# Item-based collaborative filtering
We select a movie the user has just looked at, or recently rated highly, and suggest similar movies.
<br>
We define a movie as being similar by comparing the behaviour of other users.
<br>
Two movies will be declared similar if they were given a high (or low) rating by the same users.
<br>
<br>
Users A, B and C liked Fight Club and The Shawshank Redemption and disliked Love Actually.
<br>
User D liked Fight Club so we recommend The Shawshank Redemption, not Love Actually.

## Cosine Similarity
The **Cosine Similarity** measures the similarity between two vectors based on the cosine of the angle between them. It evaluates whether two vectors point in the same direction in a high-dimensional space, making it a commonly used metric for comparing user or item behavior in recommendation systems.

$$
\text{Cosine Similarity} = \frac{\sum_{i=1}^n A_i \cdot B_i}{\sqrt{\sum_{i=1}^n A_i^2} \cdot \sqrt{\sum_{i=1}^n B_i^2}}
$$

Where:
- $A$ and $B$: Two vectors (e.g., user ratings or movie feature vectors).
- $A_i$, $B_i$: Individual components of the vectors.
- The numerator is the dot product of $A$ and $B$, measuring overlap.
- The denominator is the product of the magnitudes (lengths) of $A$ and $B$, normalizing for vector size.

**How It Works**
1. Cosine similarity measures the **orientation**, not the magnitude, of the vectors.
2. It ranges from **-1** to **1**:
   - **1**: Perfect similarity (vectors point in the same direction).
   - **0**: No similarity (vectors are orthogonal).
   - **-1**: Perfect dissimilarity (vectors point in opposite directions, rarely used in recommendation contexts).

**Key Notes**
- Cosine similarity works well with **sparse data** (e.g., user ratings matrices with many missing values).
- It ignores differences in **magnitude** (e.g., a user who rates all movies 5 stars is treated similarly to one who rates them all 1 star, as long as the relative pattern is the same).
- Missing values are often treated as 0 (no interaction), but this may imply no preference, so alternative handling is sometimes necessary.


In our case we cannot fill the missing values with 0 as this will infer incorrect bad ratings from users.
<br>
Also, due to the extreme sparsity of the data, I do not believe imputing missing values from average ratings is a good idea.
<br>
Therefore, rather than use the typical scikit-learn method, sklearn.metrics.pairwise.cosine_similarity, I will create a mask to filter out sparse values and calculate the cosine similarity for each movie individually.

### Method 1: Apply a mask to the dataframe

In [5]:
def calculate_cosine_similarity(ratings_matrix):
    
    # Fill NaN with 0 for temporary compatibility
    ratings_filled = ratings_matrix.fillna(0)
    # Create a mask for shared ratings
    mask = (~ratings_matrix.isna()).astype(int)
    # Compute the numerator (dot product of shared ratings)
    numerator = (ratings_filled * mask).T @ (ratings_filled * mask)
    # Square the ratings, apply the mask and sum over all users who rated each movie
    magnitudes = np.sqrt((ratings_filled ** 2 * mask).sum(axis=0))
    # Reshape into numpy arrays and perform element-wise multiplication
    denominator = magnitudes.values[:, None] * magnitudes.values[None, :]
    # Compute cosine similarity
    cosine_similarity_matrix = numerator / denominator
    
    return cosine_similarity_matrix

def return_cosine_similar_movies(movie_id, cosine_similarity_matrix):
    # Extract the cosine similarity values for the movie
    similar_movies = pd.DataFrame(cosine_similarity_matrix[movie_id])
    # Remove the similarity to the movie itself
    similar_movies = similar_movies[similar_movies.index != movie_id]
    # Rename the column as it will be merged with other scores later.
    #similar_movies.rename(columns={movie_id: 'Cosine_similarity'}, inplace=True)
    return similar_movies

user_item_matrix = create_user_item_matrix(ratings_df, fill_value=0)
cosine_similarity_matrix = calculate_cosine_similarity(user_item_matrix)

### Check popular movies

In [6]:
movie_title = 'fight club'

movie_id = get_movie_id_from_title(movie_title, movies_df)
cosine_similar_movies = return_cosine_similar_movies(movie_id, cosine_similarity_matrix)
top_ten = get_top_ten_similar_movies(cosine_similar_movies, movies_df, movie_id)

top_ten = top_ten.rename(columns={movie_id: movie_title +' - similarity'})
top_ten

Unnamed: 0_level_0,fight club - similarity,title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
2571,0.713937,"Matrix, The (1999)"
4226,0.669593,Memento (2000)
2329,0.649054,American History X (1998)
6874,0.639738,Kill Bill: Vol. 1 (2003)
4993,0.635744,"Lord of the Rings: The Fellowship of the Ring,..."
2858,0.625549,American Beauty (1999)
296,0.62322,Pulp Fiction (1994)
7153,0.622016,"Lord of the Rings: The Return of the King, The..."
79132,0.615417,Inception (2010)
7438,0.614344,Kill Bill: Vol. 2 (2004)


In [7]:
movie_title = 'shawshank'

movie_id = get_movie_id_from_title(movie_title, movies_df)
cosine_similar_movies = return_cosine_similar_movies(movie_id, cosine_similarity_matrix)
top_ten = get_top_ten_similar_movies(cosine_similar_movies, movies_df, movie_id)

top_ten = top_ten.rename(columns={movie_id: movie_title +' - similarity'})
top_ten

Unnamed: 0_level_0,shawshank - similarity,title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
356,0.712993,Forrest Gump (1994)
296,0.702366,Pulp Fiction (1994)
593,0.647066,"Silence of the Lambs, The (1991)"
50,0.631787,"Usual Suspects, The (1995)"
527,0.629145,Schindler's List (1993)
2959,0.606644,Fight Club (1999)
110,0.597317,Braveheart (1995)
2571,0.589155,"Matrix, The (1999)"
150,0.578835,Apollo 13 (1995)
47,0.564136,Seven (a.k.a. Se7en) (1995)


In [8]:
movie_title = 'Godfather, The'

movie_id = get_movie_id_from_title(movie_title, movies_df)
cosine_similar_movies = return_cosine_similar_movies(movie_id, cosine_similarity_matrix)
top_ten = get_top_ten_similar_movies(cosine_similar_movies, movies_df, movie_id)

top_ten = top_ten.rename(columns={movie_id: movie_title +' - similarity'})
top_ten

Unnamed: 0_level_0,"Godfather, The - similarity",title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1221,0.821773,"Godfather: Part II, The (1974)"
1213,0.664841,Goodfellas (1990)
1193,0.620536,One Flew Over the Cuckoo's Nest (1975)
260,0.595317,Star Wars: Episode IV - A New Hope (1977)
608,0.588614,Fargo (1996)
1196,0.58603,Star Wars: Episode V - The Empire Strikes Back...
2959,0.581279,Fight Club (1999)
1089,0.579059,Reservoir Dogs (1992)
296,0.57527,Pulp Fiction (1994)
2858,0.575012,American Beauty (1999)


These predictions look pretty good. Let's identify some less popular movies and see if their predictions are also good.

### Check unpopular movies

In [35]:
# Get the movies with the least ratings
ratings_per_movie = ratings_df['movieId'].value_counts()
unpopular_movies = ratings_per_movie.sort_values().head(100).index.values
for movie_id in unpopular_movies:
    movie_title = movies_df[movies_df['movieId']==movie_id].title.values[0]
    print(movie_id, movie_title)

163981 31 (2016)
81681 I Shot Jesse James (1949)
71438 Still Walking (Aruitemo aruitemo) (2008)
70015 Polytechnique (2009)
100226 Why Stop Now (2012)
57502 Cat Soup (Nekojiru-so) (2001)
51573 Meshes of the Afternoon (1943)
40491 Match Factory Girl, The (Tulitikkutehtaan tyttö) (1990)
26743 Only Yesterday (Omohide poro poro) (1991)
26717 Begotten (1990)
25782 Boudu Saved From Drowning (Boudu sauvé des eaux) (1932)
82684 Trash Humpers (2009)
8335 Make Way for Tomorrow (1937)
5028 What Time Is It There? (Ni neibian jidian) (2001)
61071 Sisterhood of the Traveling Pants 2, The (2008)
50942 Wake Up, Ron Burgundy (2004)
128832 The Last Five Years (2014)
115667 Love, Rosie (2014)
100527 Safe Haven (2013)
96975 LOL (2012)
95199 What to Expect When You're Expecting (2012)
94325 Lucky One, The (2012)
90524 Abduction (2011)
8189 Zazie dans le métro (1960)
62718 Angus, Thongs and Perfect Snogging (2008)
47516 Material Girls (2006)
3663 Puppet Master 4 (1993)
148238 A Very Murray Christmas (2015)
1

It took a lot of unpopular movies to identify some which would be useful because I need to actually recognise them and their genre to know if the recommendations are good.

1985 Halloween 4: The Return of Michael Myers (1988) - Horror

125914 Mortdecai (2015) - Johnny Depp

159849 Bo Burnham: Make Happy (2016) - Stand-up comedy/Music

50942 Wake Up, Ron Burgundy (2004) - Comedy

61071 Sisterhood of the Traveling Pants 2, The (2008) - RomCom?


In [36]:
movie_title = 'Halloween 4'

movie_id = get_movie_id_from_title(movie_title, movies_df)
cosine_similar_movies = return_cosine_similar_movies(movie_id, cosine_similarity_matrix)
top_ten = get_top_ten_similar_movies(cosine_similar_movies, movies_df, movie_id)

top_ten = top_ten.rename(columns={movie_id: movie_title +' - similarity'})
top_ten

Unnamed: 0_level_0,Halloween 4 - similarity,title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
3574,1.0,Carnosaur 3: Primal Species (1996)
3691,1.0,Private School (1983)
2862,1.0,Caligula (1979)
2451,1.0,"Gate, The (1987)"
790,1.0,"Unforgettable Summer, An (Un été inoubliable) ..."
633,1.0,Denise Calls Up (1995)
1336,1.0,Body Parts (1991)
3661,1.0,Puppet Master II (1991)
3678,1.0,"Man with the Golden Arm, The (1955)"
3663,1.0,Puppet Master 4 (1993)


In [37]:
movie_title = 'Mortdecai'

movie_id = get_movie_id_from_title(movie_title, movies_df)
cosine_similar_movies = return_cosine_similar_movies(movie_id, cosine_similarity_matrix)
top_ten = get_top_ten_similar_movies(cosine_similar_movies, movies_df, movie_id)

top_ten = top_ten.rename(columns={movie_id: movie_title +' - similarity'})
top_ten

Unnamed: 0_level_0,Mortdecai - similarity,title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
120637,0.83205,Blackhat (2015)
127096,0.648204,Project Almanac (2015)
80831,0.478913,Let Me In (2010)
103688,0.446767,"Conjuring, The (2013)"
57274,0.427569,[REC] (2007)
3727,0.40522,Near Dark (1987)
61240,0.302199,Let the Right One In (Låt den rätte komma in) ...
83134,0.293737,Tucker & Dale vs Evil (2010)
4105,0.272772,"Evil Dead, The (1981)"
54995,0.270666,Planet Terror (2007)


In [38]:
movie_title = 'Bo Burnham: Make Happy'

movie_id = get_movie_id_from_title(movie_title, movies_df)
cosine_similar_movies = return_cosine_similar_movies(movie_id, cosine_similarity_matrix)
top_ten = get_top_ten_similar_movies(cosine_similar_movies, movies_df, movie_id)

top_ten = top_ten.rename(columns={movie_id: movie_title +' - similarity'})
top_ten

Unnamed: 0_level_0,Bo Burnham: Make Happy - similarity,title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
134248,1.0,Hot Girls Wanted (2015)
106889,1.0,Tim's Vermeer (2013)
164540,1.0,Amanda Knox (2016)
120807,1.0,John Mulaney: New In Town (2012)
96150,1.0,"Queen of Versailles, The (2012)"
148667,1.0,John Mulaney: The Comeback Kid (2015)
163653,1.0,David Cross: Making America Great Again (2016)
91869,1.0,Being Elmo: A Puppeteer's Journey (2011)
146682,1.0,Twinsters (2015)
8264,1.0,Grey Gardens (1975)


In [39]:
movie_title = 'Wake Up, Ron Burgundy'

movie_id = get_movie_id_from_title(movie_title, movies_df)
cosine_similar_movies = return_cosine_similar_movies(movie_id, cosine_similarity_matrix)
top_ten = get_top_ten_similar_movies(cosine_similar_movies, movies_df, movie_id)

top_ten = top_ten.rename(columns={movie_id: movie_title +' - similarity'})
top_ten

Unnamed: 0_level_0,"Wake Up, Ron Burgundy - similarity",title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
151315,0.929981,Ride Along 2 (2016)
58655,0.928477,Drillbit Taylor (2008)
109372,0.894427,About Last Night (2014)
94323,0.83205,Think Like a Man (2012)
143257,0.813733,Ashby (2015)
70641,0.786334,Miss March (2009)
88672,0.759257,Our Idiot Brother (2011)
106330,0.707107,Last Vegas (2013)
118924,0.707107,Top Five (2014)
103883,0.646162,2 Guns (2013)


In [40]:
movie_title = 'Sisterhood of the Traveling Pants 2'

movie_id = get_movie_id_from_title(movie_title, movies_df)
cosine_similar_movies = return_cosine_similar_movies(movie_id, cosine_similarity_matrix)
top_ten = get_top_ten_similar_movies(cosine_similar_movies, movies_df, movie_id)

top_ten = top_ten.rename(columns={movie_id: movie_title +' - similarity'})
top_ten

Unnamed: 0_level_0,Sisterhood of the Traveling Pants 2 - similarity,title
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
128832,1.0,The Last Five Years (2014)
43930,1.0,Just My Luck (2006)
70932,1.0,My Life in Ruins (2009)
47516,1.0,Material Girls (2006)
82152,1.0,Beastly (2011)
80858,1.0,You Again (2010)
62718,1.0,"Angus, Thongs and Perfect Snogging (2008)"
90524,1.0,Abduction (2011)
100527,1.0,Safe Haven (2013)
78264,1.0,"Back-up Plan, The (2010)"


The results are not perfect but they still look good for such unpopular movies. 

There are movies of similar genre in each top ten.

### Method 2: 

In [12]:
from surprise import KNNBasic, Dataset, Reader
from surprise.model_selection import KFold

# Load the data into a Surprise Dataset object
reader = Reader(rating_scale=(0.5, 5.0))
ratings_data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Define the algorithm
sim_options = {'name': 'cosine', 'user_based': False}
algo = KNNBasic(sim_options=sim_options)

# Set up k-fold cross-validation
kf = KFold(n_splits=5)

# Collect predictions from all folds
all_predictions = []

for trainset, testset in kf.split(ratings_data):
    # Train on the current fold
    algo.fit(trainset)
    # Predict on the test set
    predictions = algo.test(testset)
    # Append predictions to the list
    all_predictions.extend(predictions)

# Convert predictions to a DataFrame for further analysis
predictions_df = pd.DataFrame([(pred.uid, pred.iid, pred.r_ui, pred.est, pred.details['was_impossible']) 
                               for pred in all_predictions],
                              columns=['userId', 'movieId', 'actual_rating', 'predicted_rating', 'was_impossible'])

# Display the first few rows of the predictions DataFrame
print(predictions_df.head())

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
   userId  movieId  actual_rating  predicted_rating  was_impossible
0     132    31658            4.0          2.912077           False
1     448      303            3.0          2.712500           False
2     560   111384            4.0          3.501252            True
3     474     7086            3.5          3.501252            True
4     555     3591            3.0          3.175000           False


In [41]:
movie_title = 'fight club'

movie_id = get_movie_id_from_title(movie_title, movies_df)
predictions_df[predictions_df.movieId==movie_id].sort_values('predicted_rating', ascending=False).head(10)

Unnamed: 0,userId,movieId,actual_rating,predicted_rating,was_impossible
66930,515,2959,5.0,4.849518,False
50802,251,2959,5.0,4.843956,False
37289,348,2959,5.0,4.713373,False
26293,523,2959,4.5,4.67609,False
98800,122,2959,5.0,4.611642,False
8658,400,2959,5.0,4.49215,False
18800,553,2959,5.0,4.488265,False
7254,601,2959,5.0,4.487819,False
82830,417,2959,5.0,4.474373,False
64428,239,2959,5.0,4.45036,False


In [42]:
movie_title = 'fight club'

movie_id = get_movie_id_from_title(movie_title, movies_df)
cosine_similar_movies = return_cosine_similar_movies(movie_id, cosine_similarity_scores_2)
top_ten = get_top_ten_similar_movies(cosine_similar_movies, movies_df, movie_id)

top_ten = top_ten.rename(columns={movie_id: movie_title +' - similarity'})
top_ten

NameError: name 'cosine_similarity_scores_2' is not defined