In [1]:
from google.colab import drive

drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
import zipfile

!mkdir -p ~/.kaggle
!cp /content/drive/MyDrive/match_prediction/kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

!kaggle datasets download -d rounakbanik/the-movies-dataset --force
# !kaggle datasets download -d excel4soccer/espn-soccer-data --force
zip_path = "the-movies-dataset.zip"
extract_path = "movies_data"

with zipfile.ZipFile(zip_path, "r") as zip_ref:
  zip_ref.extractall(extract_path)

print("Extraction completed.")





Dataset URL: https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset
License(s): CC0-1.0
Downloading the-movies-dataset.zip to /content
 56% 128M/228M [00:00<00:00, 1.33GB/s]
100% 228M/228M [00:00<00:00, 709MB/s] 
Extraction completed.


In [3]:
import pandas as pd
import numpy as np

metadata = pd.read_csv("movies_data/movies_metadata.csv")

  metadata = pd.read_csv("movies_data/movies_metadata.csv")


In [4]:
# Print plot overviews of the first 5 movies

metadata["overview"].head()

Unnamed: 0,overview
0,"Led by Woody, Andy's toys live happily in his ..."
1,When siblings Judy and Peter discover an encha...
2,A family wedding reignites the ancient feud be...
3,"Cheated on, mistreated and stepped on, the wom..."
4,Just when George Banks has recovered from his ...


In [5]:
metadata.shape

(45466, 24)

This is a NLP problem and TFIDF(Term Frequency-Inverse Document Frequency) and Cosine similarity will be use in handling this problem.

# Steps:
- Import the Tfidf module using scikit-learn;
- Remove stop words like 'the', 'an', etc. since they do not give any useful information about the topic;
- Replace not-a-number values with a blank string;
- Finally, construct the TF-IDF matrix on the data.

In [6]:
from sklearn.feature_extraction.text  import TfidfVectorizer

# Define a TF-IDF Vectorizer object. remove all english stop words
tfidf = TfidfVectorizer(stop_words='english')

# Replace NaN with empty string
metadata["overview"] = metadata["overview"].fillna("")

# Construct the required TF-IDF matrix by fitting and transforming
tfidf_matrix = tfidf.fit_transform(metadata["overview"])


In [7]:
tfidf_matrix.shape

(45466, 75827)

In [8]:
# Array mapping from feature integer indices to feature name.
tfidf.get_feature_names_out()[3000 : 3010]

array(['anabolic', 'anachronistic', 'anacleto', 'anaconda', 'anacondas',
       'anadolu', 'anaheim', 'anahí', 'anais', 'anakata'], dtype=object)

- i will use the idea of Cosine similarity to compute the similarity score b/w movies

- since i have used TF-IDF vec. i will use sklearn's linear_kernel() instead of cosine_similarities() since it is faster.

In [9]:
"""from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp

# convert to sparse to
if not sp.issparse(tfidf_matrix):
  tfidf_matrix = sp.csr_matrix(tfidf_matrix) # this saves Ram


# now operation use efficient sparse algorithm
cosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)"""

'from sklearn.metrics.pairwise import cosine_similarity\nimport scipy.sparse as sp\n\n# convert to sparse to\nif not sp.issparse(tfidf_matrix):\n  tfidf_matrix = sp.csr_matrix(tfidf_matrix) # this saves Ram\n\n\n# now operation use efficient sparse algorithm\ncosine_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)'

In [9]:
from scipy.sparse import lil_matrix
from sklearn.metrics.pairwise import cosine_similarity


def cosine_similarity_sparse(matrix, chunk_size=500, threshold=0.1):
    n = matrix.shape[0]
    # Use sparse matrix for output
    similarity_matrix = lil_matrix((n, n))

    for i in range(0, n, chunk_size):
        end_i = min(i + chunk_size, n)
        for j in range(i, n, chunk_size):  # Only compute upper triangle
            end_j = min(j + chunk_size, n)

            chunk_sim = cosine_similarity(matrix[i:end_i], matrix[j:end_j])

            # Only store similarities above threshold
            rows, cols = np.where(chunk_sim > threshold)
            for row, col in zip(rows, cols):
                similarity_matrix[i + row, j + col] = chunk_sim[row, col]
                if i != j:  # Mirror to lower triangle
                    similarity_matrix[j + col, i + row] = chunk_sim[row, col]

    return similarity_matrix.tocsr()

# Only store similarities > 0.1
cosine_sim_sparse = cosine_similarity_sparse(tfidf_matrix, threshold=0.1)

In [10]:
cosine_sim_sparse.shape

(45466, 45466)

In [11]:
cosine_sim_sparse[1]

<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 89 stored elements and shape (1, 45466)>

I am going to define a function that takes in a movie title as an input and outputs a list of the 10 most similar movies. Firstly, for this, i need a reverse mapping of movie titles and DataFrame indices. In other words, i will need a mechanism to identify the index of a movie in the metadata DataFrame, given its title.

In [12]:
indices = pd.Series(metadata.index, index=metadata["title"]).drop_duplicates()


In [13]:
indices[: 10]

Unnamed: 0_level_0,0
title,Unnamed: 1_level_1
Toy Story,0
Jumanji,1
Grumpier Old Men,2
Waiting to Exhale,3
Father of the Bride Part II,4
Heat,5
Sabrina,6
Tom and Huck,7
Sudden Death,8
GoldenEye,9


In [14]:
len(indices)

45466

# We can now define the recommendation function

Steps:

- Get the index of the movie given its title.

- Get the list of cosine similarity scores for that particular movie with all movies. Convert it into a list of tuples where the first element is its position, and the second is the similarity score.

- Sort the aforementioned list of tuples based on the similarity scores; that is, the second element.

- Get the top 10 elements of this list. Ignore the first element as it refers to self (the movie most similar to a particular movie is the movie itself).

- Return the titles corresponding to the indices of the top elements.

In [15]:
# Function that takes in movie title as input and outputs most similar movies

def get_recommendation(title, cosine_sim=cosine_sim_sparse):
  # Get the index of the movie that matches the title
  idx = indices[title]

  # Gets the similarity scores between the target movie and ALL other movies.
  sim_scores = cosine_sim[idx]

  # Convert sparse row to (index, score) pairs (only non-zero values)
  sim_scores = list(zip(sim_scores.indices, sim_scores.data))

  # Sort the movies based on the similarity scores
  sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)

  # Get the scores of the 10 most similar movies
  sim_scores = sim_scores[1 : 11] # the 0 index will be the movie itself

  # Get the movie indices
  movie_indices = [i[0] for i in  sim_scores]

  return metadata["title"].iloc[movie_indices]


In [16]:
get_recommendation("The Dark Knight Rises")

Unnamed: 0,title
12481,The Dark Knight
150,Batman Forever
1328,Batman Returns
15511,Batman: Under the Red Hood
585,Batman
21194,Batman Unmasked: The Psychology of the Dark Kn...
9230,Batman Beyond: Return of the Joker
18035,Batman: Year One
19792,"Batman: The Dark Knight Returns, Part 1"
3095,Batman: Mask of the Phantasm
