In [1]:
import kagglehub
from kagglehub import KaggleDatasetAdapter
import os
import shutil
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem import WordNetLemmatizer


# Download dataset from Kaggle
kagglehub.dataset_download('jrobischon/wikipedia-movie-plots')

# Move dataset to working directory
source_path = "/root/.cache/kagglehub/datasets/jrobischon/wikipedia-movie-plots/versions/1/wiki_movie_plots_deduped.csv"
destination_path = os.path.join(os.getcwd(), "wiki_movie_plots_deduped.csv")

if os.path.exists(source_path):
    try:
        shutil.move(source_path, destination_path)
        print(f"File moved successfully to: {destination_path}")
    except Exception as e:
        print(f"Error moving file: {e}")
else:
    print(f"Source file not found: {source_path}")

Downloading from https://www.kaggle.com/api/v1/datasets/download/jrobischon/wikipedia-movie-plots?dataset_version_number=1...


100%|██████████| 29.9M/29.9M [00:00<00:00, 113MB/s] 

Extracting files...





File moved successfully to: /content/wiki_movie_plots_deduped.csv


In [2]:


# Load the dataset
df = pd.read_csv("wiki_movie_plots_deduped.csv")

# Select relevant columns
df = df[['Title', 'Plot']]
df.dropna(inplace=True)

# Download required NLTK data
nltk.download('wordnet')
nltk.download('omw-1.4')
lemmatizer = WordNetLemmatizer()

# Define cleaning and lemmatization function
def clean_and_lemmatize(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'\d+', '', text)  # Remove numbers
    text = re.sub(r'[^\w\s]', '', text)  # Remove special characters
    tokens = text.split()
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(tokens)

# Apply text cleaning and lemmatization
df["Plot"] = df["Plot"].apply(clean_and_lemmatize)

print("Dataset Loaded. Shape:", df.shape)
print(df.head())


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.dropna(inplace=True)
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Dataset Loaded. Shape: (34886, 2)
                              Title  \
0            Kansas Saloon Smashers   
1     Love by the Light of the Moon   
2           The Martyred Presidents   
3  Terrible Teddy, the Grizzly King   
4            Jack and the Beanstalk   

                                                Plot  
0  a bartender is working at a saloon serving dri...  
1  the moon painted with a smiling face hang over...  
2  the film just over a minute long is composed o...  
3  lasting just second and consisting of two shot...  
4  the earliest known adaptation of the classic f...  


In [3]:
# Convert the movie plots to a TF-IDF matrix
vectorizer = TfidfVectorizer(stop_words="english",
                             max_df=0.85,  # Reduce dominance of overly common words
                             min_df=3,     # Ignore words appearing in very few plots
                             ngram_range=(1,2),  # Capture single words + bigrams
                             sublinear_tf=True)  # Scale frequent words for better balance


tfidf_matrix = vectorizer.fit_transform(df["Plot"])

print("TF-IDF Matrix Shape:", tfidf_matrix.shape)




TF-IDF Matrix Shape: (34886, 362984)


In [4]:


# Define the recommendation function
def get_recommendations(user_query, top_n=5):
    # Clean and lemmatize the query just like the dataset
    user_query_cleaned = clean_and_lemmatize(user_query)
    query_vec = vectorizer.transform([user_query_cleaned])

    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Normalize the similarity scores
    normalized_similarities = similarities / similarities.max()

    top_indices = normalized_similarities.argsort()[::-1][:top_n]
    recommendations = df.iloc[top_indices].copy()
    recommendations["similarity_score"] = normalized_similarities[top_indices]
    return recommendations



In [5]:
# Example query
user_input = "I love thrilling action movies set in space, with a comedic twist."
recommendations = get_recommendations(user_input, top_n=5)

# Print results
print("User Query:", user_input, "\n")
print("Top Recommendations:")
for i, row in recommendations.iterrows():
    print(f"{row['Title']} (Score: {row['similarity_score']:.3f})")
    print(f"Plot: {row['Plot'][:]}...\n")


User Query: I love thrilling action movies set in space, with a comedic twist. 

Top Recommendations:
Native (Score: 1.000)
Plot: pilot in space are drawn towards a distant music...

Nine Lives Are Not Enough (Score: 0.955)
Plot: a reporter try to solve a series of boardinghouse murder the dramatic main plot murder action is intermixed with farce and slapstick comedic element...

Power (Score: 0.924)
Plot: the plot is simple a man who wa the lookalike of an honest police officer in his previous life is eager to join the police he get the opportunityand what follows next is a thrilling ride of comedy with kharaj and action along with a flashback to his previous life which wa thought to be another person altogether...

Rasta (Score: 0.828)
Plot: a fastpaced thriller with mithun providing the twist...

Spy (Score: 0.827)
Plot: the movie is set in the year month before the german invasion of russia the two protagonist nkvd officer dorin and oktyabrsky are hunting a german spy in moscow the