In [35]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [34]:
# Load the title catalogue that the rest of the notebook works on.
# Source of the dataset from which 450 entries were sampled: https://www.kaggle.com/datasets/victorsoeiro/netflix-tv-shows-and-movies?resource=download
# NOTE(review): the preview output shows an "Unnamed: 0" column, which suggests the CSV was
# written together with its index; passing index_col=0 would avoid it - confirm nothing relies on it.
df = pd.read_csv('titles.csv')
# Preview the first rows to check the available columns.
df.head()

Unnamed: 0,id,title,type,description,release_year,age_certification,runtime,genres,production_countries,seasons,imdb_id,imdb_score,imdb_votes,tmdb_popularity,tmdb_score
0,ts38805,Beat Bugs,SHOW,"Jay, Kumi, Crick, Buzz, and Walter are best fr...",2016,TV-Y,15,"['family', 'animation', 'comedy', 'fantasy']",['AU'],3.0,tt4716268,7.6,524.0,6.682,6.6
1,tm1077038,Chhota Bheem: Journey to Petra,MOVIE,"The king has been away to meet an old friend, ...",2011,G,67,['animation'],['IN'],,tt6543194,,,,
2,tm1064065,Black Crab,MOVIE,"In a post-apocalyptic world, six soldiers on a...",2022,,114,"['scifi', 'thriller', 'war', 'action', 'drama']",['SE'],,tt6708668,5.6,19159.0,207.948,6.3
3,tm204541,Dark Waters,MOVIE,"Ragab, a poor sailor, returns home to Alexandr...",1956,,120,"['action', 'drama', 'romance', 'thriller']",['EG'],,tt0049761,6.7,610.0,0.849,5.9
4,tm176111,The Great Magician,MOVIE,"In the years after the Revolution, China broke...",2011,,128,"['thriller', 'drama', 'romance', 'comedy']","['CN', 'HK']",,tt1869721,5.9,1774.0,6.144,6.1


In [10]:
# Replace NaN descriptions with an empty string so every value handed to the TF-IDF
# vectorizer is a string (prevents errors while using TF-IDF).
# Note: this overwrites the 'description' column of df in place.
df['description'] = df['description'].fillna('')

In [11]:
# Create the TF-IDF vectorizer; stop_words='english' uses scikit-learn's built-in
# English stop-word list.
tfidf = TfidfVectorizer(stop_words='english')
# Fit the vectorizer on the descriptions and build the TF-IDF matrix (one row per entry of
# df['description']); getSimilarity compares user queries against this matrix.
tfidf_matrix = tfidf.fit_transform(df['description'])

In [26]:
def getSimilarity(description, n=5):
  """
  Finds the titles whose descriptions are most similar to a free-text description.

  The query text is transformed with the already fitted ``tfidf`` vectorizer and
  compared against every row of ``tfidf_matrix`` (built from ``df['description']``)
  using cosine similarity.

  Args:
    description (str): Free-text description to compare against the dataset's
      descriptions. This is the query text itself, not an item ID.
    n (int): Maximum number of similar items to return. Defaults to 5.
      ``None`` keeps every matching row.

  Returns:
    list: Up to n tuples ``(id, title, description, score)``, ordered from the
    highest to the lowest similarity score. Rows whose score is not positive
    are left out, so the list is empty when the query shares no term with any
    description (for example an empty string) or when n is zero or negative.
  """
  # A non-positive n asks for nothing. Without this guard a negative n would be
  # used as a slice end and return almost the whole dataset.
  if n is not None and n <= 0:
    return []

  # Transforms the input description into the fitted TF-IDF space
  input_vector = tfidf.transform([description])

  # Computes cosine similarity between the query and every description
  cosine_sim = cosine_similarity(input_vector, tfidf_matrix).flatten()

  # Gets the indices of the top n similar items (descending score)
  sim_indices = cosine_sim.argsort()[::-1][:n]

  # Drops rows without any overlap with the query: a score of 0 means "not
  # similar at all", so returning them would present arbitrary rows as matches.
  sim_indices = sim_indices[cosine_sim[sim_indices] > 0]

  # Gets the similarity scores
  sim_scores = cosine_sim[sim_indices]

  # Selects the matched rows once (positional lookup, aligned with the rows of
  # tfidf_matrix) instead of building a row Series three times per item.
  matched_rows = df[['id', 'title', 'description']].iloc[sim_indices]

  # Creates a list of similar items with their similarity scores and descriptions
  similar_items = [(item_id, title, text, score)
                    for (item_id, title, text), score
                    in zip(matched_rows.itertuples(index=False, name=None), sim_scores)]

  return similar_items


In [39]:
# Ask the user for a free-text description of what they would like to watch
input_description = input("Enter the description: ")

# Look up the closest matches for the text that was entered
similar_items = getSimilarity(input_description)

print(f"\n Top 5 Similar Items to: {input_description}\n")

# Show every match as one block: position, ID, title, description and score
for i, (id_, title_, desc_, score_) in enumerate(similar_items):
    print(
        f"\n*Item {i+1}:*",
        f"ID: {id_}",
        f"Title: {title_}",
        "Description:",
        desc_,
        f"Similarity Score: **{score_:.4f}**",
        sep="\n",
    )

Enter the description: I love thrilling action movies set in space, with a comedic twist.

 Top 5 Similar Items to: I love thrilling action movies set in space, with a comedic twist.


*Item 1:*
ID: ts258321
Title: Ada Twist, Scientist
Description:
Ada Twist, a young scientist who will explore helping people through scientific discovery, collaboration and friendship.
Similarity Score: **0.1322**

*Item 2:*
ID: ts252856
Title: Challenger: The Final Flight
Description:
Engineers, officials and the crew members' families provide their perspective on the 1986 Space Shuttle Challenger disaster and its aftermath.
Similarity Score: **0.1318**

*Item 3:*
ID: tm1039571
Title: The Upshaws - The Afterparty
Description:
The comedic dream team from "The Upshaws" discuss the hit series, share some anecdotes from their own upbringings and play a little swag-centric game.
Similarity Score: **0.1260**

*Item 4:*
ID: ts253685
Title: Secret of Success
Description:
An aspiring actor uses a magical book to