# Content Based Filtering Using K-Nearest Neighbors: an NLP Approach

In [34]:
import plotly.io as pio
pio.renderers.default = 'notebook'
import plotly.express as px

import pandas as pd
import ast
import re
import tracemalloc
import json


import matplotlib.pyplot as plt
%matplotlib inline

import numpy as np
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import umap
from sklearn.preprocessing import MinMaxScaler
from sklearn.neighbors import NearestNeighbors



from memory_profiler import profile
%load_ext memory_profiler


from mpl_toolkits.mplot3d import Axes3D
from matplotlib import animation



The memory_profiler extension is already loaded. To reload it, use:
  %reload_ext memory_profiler


In [2]:

# only needs to be run once, makes 'movies_combined.csv', 
# however movies-database dir must be loaded first
!jupyter nbconvert --to notebook --execute 01.InitialEDA.ipynb --inplace


[NbConvertApp] Converting notebook 01.InitialEDA.ipynb to notebook
[NbConvertApp] Writing 65658 bytes to 01.InitialEDA.ipynb


In [3]:
# Load the dataset
df = pd.read_csv('movies_combined.csv')  # ensure the path is correct
display(df.head())

Unnamed: 0,movieId,title,genres,imdbId,tmdbId,imdb_id_str,ratings,tags
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,114709,862.0,tt0114709,"{""2"": 3.5, ""3"": 4.0, ""4"": 3.0, ""5"": 4.0, ""8"": ...","[""Owned"", ""imdb top 250"", ""Pixar"", ""Pixar"", ""t..."
1,2,Jumanji (1995),Adventure|Children|Fantasy,113497,8844.0,tt0113497,"{""9"": 5.0, ""12"": 2.0, ""19"": 3.5, ""20"": 2.5, ""3...","[""Robin Williams"", ""time travel"", ""fantasy"", ""..."
2,3,Grumpier Old Men (1995),Comedy|Romance,113228,15602.0,tt0113228,"{""8"": 4.0, ""12"": 2.0, ""18"": 1.5, ""23"": 5.0, ""4...","[""funny"", ""best friend"", ""duringcreditsstinger..."
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,114885,31357.0,tt0114885,"{""141"": 3.0, ""175"": 3.0, ""230"": 3.0, ""236"": 4....","[""based on novel or book"", ""chick flick"", ""div..."
4,5,Father of the Bride Part II (1995),Comedy,113041,11862.0,tt0113041,"{""18"": 4.0, ""48"": 3.0, ""61"": 3.0, ""75"": 4.0, ""...","[""aging"", ""baby"", ""confidence"", ""contraception..."


In [4]:
# Check for missing values in key columns
print(df[['title', 'genres', 'tags', 'ratings']].isnull().sum())

title          0
genres         0
tags       17172
ratings     3376
dtype: int64


In [5]:
# Define a helper function to clean and parse genre/tags fields
def clean_text_field(val):
    if pd.isnull(val):
        return ""  # if value is NaN, return empty string
    text = str(val)
    # If the text looks like a list of dicts (e.g. "[{'id': ..., 'name': 'Action'}, ...]"), parse and extract 'name'
    if text.startswith('[') and ('name' in text or 'id' in text):
        try:
            data = ast.literal_eval(text)
            if isinstance(data, list):
                # join the 'name' field of each dict (if present) into one string
                names = [d['name'] for d in data if isinstance(d, dict) and 'name' in d]
                text = " ".join(names)
        except Exception as e:
            # If parsing fails, we'll fall back to raw text
            text = str(val)

    text = text.replace(',', ' ')
    text = text.replace('|', ' ')
    
    # Remove any special characters or digits, keep only letters and spaces
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Convert to lowercase
    text = text.lower().strip()
    return text

In [6]:
# Apply cleaning to 'genre' and 'tags' columns
df['genres'] = df['genres'].apply(clean_text_field)
df['tags'] = df['tags'].apply(clean_text_field)
# Also lowercase and clean title (though title usually doesn't need heavy cleaning)
df['title'] = df['title'].fillna('').astype(str).apply(lambda x: re.sub(r'[^a-zA-Z0-9\s]', '', x).lower().strip())

# Combine title, genre, and tags into a single text corpus for each movie
df['combined_text'] = df['title'] + " " + df['genres'] + " " + df['tags']

# Inspect the combined text for a few movies
display(df[['title', 'genres', 'tags', 'combined_text']].head(3))

Unnamed: 0,title,genres,tags,combined_text
0,toy story 1995,adventure animation children comedy fantasy,,toy story 1995 adventure animation children co...
1,jumanji 1995,adventure children fantasy,,jumanji 1995 adventure children fantasy
2,grumpier old men 1995,comedy romance,funny best friend duringcreditsstinger fish...,grumpier old men 1995 comedy romance funny be...


In [7]:
df.loc[2,['title', 'genres', 'tags', 'combined_text']]

title                                        grumpier old men 1995
genres                                              comedy romance
tags             funny  best friend  duringcreditsstinger  fish...
combined_text    grumpier old men 1995 comedy romance funny  be...
Name: 2, dtype: object

In [8]:
# max_features=50000 is a hyperparameter!!!!
%memit tfidf_matrix = TfidfVectorizer(max_features=50000).fit_transform(df['combined_text'])

peak memory: 635.90 MiB, increment: 27.07 MiB


In [9]:
print("TF-IDF matrix shape:", tfidf_matrix.shape)
print("Sample TF-IDF features for first movie:\n", tfidf_matrix[0][:10])


TF-IDF matrix shape: (62423, 50000)
Sample TF-IDF features for first movie:
 <Compressed Sparse Row sparse matrix of dtype 'float64'
	with 8 stored elements and shape (1, 50000)>
  Coords	Values
  (0, 46088)	0.5896130118223643
  (0, 43255)	0.34737101142937155
  (0, 150)	0.40498751674983496
  (0, 517)	0.271668886609274
  (0, 1324)	0.29803534452081776
  (0, 7708)	0.2931339402065138
  (0, 8676)	0.16905914262912594
  (0, 14675)	0.30089439483387154


In [10]:
df['ratings'].head()

0    {"2": 3.5, "3": 4.0, "4": 3.0, "5": 4.0, "8": ...
1    {"9": 5.0, "12": 2.0, "19": 3.5, "20": 2.5, "3...
2    {"8": 4.0, "12": 2.0, "18": 1.5, "23": 5.0, "4...
3    {"141": 3.0, "175": 3.0, "230": 3.0, "236": 4....
4    {"18": 4.0, "48": 3.0, "61": 3.0, "75": 4.0, "...
Name: ratings, dtype: object

In [11]:
def extract_mean_rating_fast(rating_str):
    if pd.isnull(rating_str):
        return np.nan
    try:
        rating_dict = json.loads(rating_str)
        if isinstance(rating_dict, dict) and rating_dict:
            return sum(rating_dict.values()) / len(rating_dict)
    except Exception:
        return np.nan

# Apply faster version
df['mean_rating'] = df['ratings'].apply(extract_mean_rating_fast)

# Fill missing with global mean
df['mean_rating'] = df['mean_rating'].fillna(df['mean_rating'].mean())

# Preview result
print(df[['ratings', 'mean_rating']].head())

                                             ratings  mean_rating
0  {"2": 3.5, "3": 4.0, "4": 3.0, "5": 4.0, "8": ...     3.893708
1  {"9": 5.0, "12": 2.0, "19": 3.5, "20": 2.5, "3...     3.251527
2  {"8": 4.0, "12": 2.0, "18": 1.5, "23": 5.0, "4...     3.142028
3  {"141": 3.0, "175": 3.0, "230": 3.0, "236": 4....     2.853547
4  {"18": 4.0, "48": 3.0, "61": 3.0, "75": 4.0, "...     3.058434


In [12]:
# Initialize a scaler to normalize ratings to [0, 1]
scaler = MinMaxScaler()
df['rating_scaled'] = scaler.fit_transform(df[['mean_rating']])
print("Original mean_rating sample:", df['mean_rating'].head(3).tolist())
print("Scaled mean_rating_scaled sample:", df['rating_scaled'].head(3).tolist())


Original mean_rating sample: [3.893707794587238, 3.2515271586594023, 3.142028126058963]
Scaled mean_rating_scaled sample: [0.7541572876860527, 0.6114504797020894, 0.5871173613464362]


In [13]:
# Convert the rating array to a sparse matrix with the same number of rows
rating_sparse = sp.csr_matrix(df['rating_scaled'].values.reshape(-1, 1))
# Concatenate TF-IDF matrix with the rating column
hybrid_features = sp.hstack([tfidf_matrix, rating_sparse])
print("Hybrid feature matrix shape:", hybrid_features.shape)


Hybrid feature matrix shape: (62423, 50001)


In [14]:
i = 0  # movie index
movie_vector_sparse = hybrid_features[i]

# Convert to dense NumPy array
movie_vector_dense = movie_vector_sparse.toarray().flatten()

print(f"Feature vector for movie {i} has length: {len(movie_vector_dense)}")
print("First 20 values of the vector:", movie_vector_dense[:20])
print("Last 20 values of the vector:", movie_vector_dense[-20:]) # it's ok that there are so many 0s!

Feature vector for movie 0 has length: 50001
First 20 values of the vector: [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
Last 20 values of the vector: [0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.
 0.         0.75415729]


In [15]:
# Fit the Nearest Neighbors model on the hybrid feature matrix
model = NearestNeighbors(metric='cosine', algorithm='auto')
model.fit(hybrid_features)

In [16]:
# Function to get similar movies
def recommend_movies(movie_title, k=5):
    # Find the index of the given movie title
    # We do a case-insensitive match for convenience
    matches = df[df['title'].str.lower() == movie_title.lower()]
    if matches.empty:
        print("Movie " + movie_title + " not found in the dataset.")
        return []
    idx = matches.index[0]
    # Get the feature vector for the movie
    movie_vector = hybrid_features[idx]
    # Find k+1 neighbors (k neighbors plus the movie itself)
    distances, indices = model.kneighbors(movie_vector, n_neighbors=k+1)
    indices = indices.flatten()
    distances = distances.flatten()
    # The first neighbor will be the movie itself (distance 0). Exclude it.
    indices = indices[1:]
    distances = distances[1:]
    # Get titles of the similar movies
    recommended_titles = df['title'].iloc[indices].values
    recommended_genres = df['genres'].iloc[indices].values
    recommended_ratings = df['mean_rating'].iloc[indices].values
    print(f"Because you liked **{movie_title}**, you might also enjoy:")
    for i, (title, genre, rating, dist) in enumerate(zip(recommended_titles, 
                                                         recommended_genres, 
                                                         recommended_ratings, 
                                                         distances), 
                                                     start=1):
        # Print recommendation with some details. Distance is the cosine 
        #   distance (0 = identical, closer to 0 is more similar)
        similarity = 1 - dist # convert distance to similarity for interpretation
        print(f"{i}. {title} - Genre: {genre} - Rating: {rating:.1f}) (Similarity: {similarity:.2f})")
    return recommended_titles


In [17]:
df['title'].head(10)

0                      toy story 1995
1                        jumanji 1995
2               grumpier old men 1995
3              waiting to exhale 1995
4    father of the bride part ii 1995
5                           heat 1995
6                        sabrina 1995
7                   tom and huck 1995
8                   sudden death 1995
9                      goldeneye 1995
Name: title, dtype: object

In [18]:
# Example: get 5 recommendations similar to a given movie
recommend_movies("goldeneye 1995", k=5)

Because you liked **goldeneye 1995**, you might also enjoy:
1. goldeneye 1989 - Genre: drama - Rating: 2.0) (Similarity: 0.74)
2. waterworld 1995 - Genre: action adventure scifi - Rating: 2.9) (Similarity: 0.61)
3. aasai 1995 - Genre: action romance - Rating: 3.8) (Similarity: 0.61)
4. hackers 1995 - Genre: action adventure crime thriller - Rating: 3.2) (Similarity: 0.55)
5. heat 1995 - Genre: action crime thriller - Rating: 3.9) (Similarity: 0.54)


array(['goldeneye 1989', 'waterworld 1995', 'aasai 1995', 'hackers 1995',
       'heat 1995'], dtype=object)

In [43]:
recommend_movies("jumanji 1995", k=5)

Because you liked **jumanji 1995**, you might also enjoy:
1. jumanji welcome to the jungle 2017 - Genre: action adventure children - Rating: 3.6) (Similarity: 0.72)
2. abulele 2015 - Genre: adventure children fantasy - Rating: 4.0) (Similarity: 0.60)
3. toy story 1995 - Genre: adventure animation children comedy fantasy - Rating: 3.9) (Similarity: 0.59)
4. prehysteria 3 1995 - Genre: children comedy drama fantasy scifi - Rating: 4.0) (Similarity: 0.53)
5. windstorm 2 2015 - Genre: adventure children drama romance - Rating: 5.0) (Similarity: 0.53)


array(['jumanji welcome to the jungle 2017', 'abulele 2015',
       'toy story 1995', 'prehysteria 3 1995', 'windstorm 2 2015'],
      dtype=object)

In [19]:
def recommend_from_multiple(movie_titles, k=5):
    # Normalize movie titles for case-insensitive matching
    titles_lower = [t.lower() for t in movie_titles]
    
    # Find indices of all valid movies
    matched = df[df['title'].str.lower().isin(titles_lower)]
    
    if matched.empty:
        print("None of the input movies were found in the dataset.")
        return []
    
    matched_indices = matched.index.tolist()
    print(f"Found {len(matched_indices)} out of {len(movie_titles)} titles in the dataset.")
    
    # Average their feature vectors
    movie_vectors = hybrid_features[matched_indices]
    avg_vector = sp.csr_matrix(movie_vectors.mean(axis=0))
    
    # Find nearest neighbors to the averaged vector
    distances, indices = model.kneighbors(avg_vector, n_neighbors=k + len(matched_indices))
    indices = indices.flatten()
    distances = distances.flatten()
    
    # Exclude the seed movies from results
    filtered = [(idx, dist) for idx, dist in zip(indices, distances) if idx not in matched_indices]
    filtered = filtered[:k]  # top k not in the seed list
    
    # Display results
    print("Because you liked **" + ", ".join(movie_titles) + "**, you might also enjoy:")
    for i, (idx, dist) in enumerate(filtered, start=1):
        title = df.loc[idx, 'title']
        genre = df.loc[idx, 'genres']
        rating = df.loc[idx, 'mean_rating']
        similarity = 1 - dist
        print(f"{i}. {title} - Genre: {genre} - Rating: {rating:.1f} (Similarity: {similarity:.2f})")
    
    return df.loc[[idx for idx, _ in filtered], 'title'].values


In [20]:
recommend_from_multiple(["goldeneye 1995", "heat 1995", "hackers 1995"], k=5)


Found 3 out of 3 titles in the dataset.
Because you liked **goldeneye 1995, heat 1995, hackers 1995**, you might also enjoy:
1. aasai 1995 - Genre: action romance - Rating: 3.8 (Similarity: 0.76)
2. waterworld 1995 - Genre: action adventure scifi - Rating: 2.9 (Similarity: 0.72)
3. assassins 1995 - Genre: action crime thriller - Rating: 3.1 (Similarity: 0.68)
4. bad boys 1995 - Genre: action comedy crime drama thriller - Rating: 3.3 (Similarity: 0.67)
5. 1995 - Genre: drama romance - Rating: 3.5 (Similarity: 0.67)


array(['aasai 1995', 'waterworld 1995', 'assassins 1995', 'bad boys 1995',
       '1995'], dtype=object)

### Visualisation

In [22]:
svd_3d = TruncatedSVD(n_components=3, random_state=42)
points_3d = svd_3d.fit_transform(hybrid_features)

print("Reduced shape:", points_3d.shape)

Reduced shape: (62423, 3)


In [26]:
print("Explained variance per component:", svd_3d.explained_variance_ratio_)
print("Total explained variance:", svd_3d.explained_variance_ratio_.sum())


Explained variance per component: [0.02451182 0.01088892 0.00633331]
Total explained variance: 0.041734045519892855


In [28]:
umap_3d = umap.UMAP(n_components=3, random_state=42)
points_umap = umap_3d.fit_transform(hybrid_features)

print("UMAP shape:", points_umap.shape)

  warn(


UMAP shape: (62423, 3)


In [37]:
import pandas as pd
import plotly.express as px
import numpy as np
from sklearn.decomposition import TruncatedSVD
import umap

def plot_3d_movies(embedding="umap", color_by="genres", sample_size=3000, random_state=42):
    """
    Plot a 3D scatter of movies using UMAP or SVD reduction and color by genre or mean rating.

    Parameters:
        embedding (str): "umap" or "svd"
        color_by (str): "genres" or "mean_rating"
        sample_size (int): number of points to plot
        random_state (int): for reproducibility
    """

    assert embedding in ["umap", "svd"], "embedding must be 'umap' or 'svd'"
    assert color_by in ["genres", "mean_rating"], "color_by must be 'genres' or 'mean_rating'"

    # Dimensionality reduction
    if embedding == "umap":
        reducer = umap.UMAP(n_components=3, random_state=random_state)
    else:  # SVD
        reducer = TruncatedSVD(n_components=3, random_state=random_state)
    
    print(f"Computing 3D embedding with {embedding.upper()}...")
    points_3d = reducer.fit_transform(hybrid_features)

    # Sample for faster plotting
    sample_idx = np.random.choice(points_3d.shape[0], sample_size, replace=False)
    points_sample = points_3d[sample_idx]

    # Build DataFrame for Plotly
    df_plot = pd.DataFrame(points_sample, columns=['x', 'y', 'z'])
    df_plot['title'] = df['title'].iloc[sample_idx].values
    df_plot['genres'] = df['genres'].iloc[sample_idx].values
    df_plot['mean_rating'] = df['mean_rating'].iloc[sample_idx].values

    # Choose color scale
    color_args = dict()
    if color_by == "mean_rating":
        color_args['color_continuous_scale'] = 'Viridis'
        color_args['color'] = df_plot['mean_rating']
    else:
        color_args['color'] = df_plot['genres']

    # Create plot
    fig = px.scatter_3d(
        df_plot,
        x='x', y='y', z='z',
        hover_name='title',
        opacity=0.7,
        **color_args
    )

    fig.update_traces(marker=dict(size=2))
    fig.update_layout(
        title=f"3D Movie Projection ({embedding.upper()}) Colored by {color_by.replace('_', ' ').title()}",
        scene=dict(xaxis_title='Component 1', yaxis_title='Component 2', zaxis_title='Component 3'),
        margin=dict(l=0, r=0, b=0, t=40)
    )

    fig.show()


In [38]:
# UMAP, color by genre
plot_3d_movies(embedding="umap", color_by="genres")

Computing 3D embedding with UMAP...



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [41]:
# UMAP, color by genre
plot_3d_movies(embedding="umap", color_by="mean_rating")

Computing 3D embedding with UMAP...



'force_all_finite' was renamed to 'ensure_all_finite' in 1.6 and will be removed in 1.8.


n_jobs value 1 overridden to 1 by setting random_state. Use no seed for parallelism.



In [39]:
# SVD, color by rating
plot_3d_movies(embedding="svd", color_by="mean_rating")

Computing 3D embedding with SVD...


In [40]:
# SVD, color by genre
plot_3d_movies(embedding="svd", color_by="genres")

Computing 3D embedding with SVD...
