In [3]:
import pandas as pd

# Read the titles data from the CSV file into a DataFrame
df = pd.read_csv("titles.csv")

# Preview the leading rows (five, spelled out explicitly) to inspect the columns
print(df.head(n=5))


         id                                title   type  \
0  ts300399  Five Came Back: The Reference Films   SHOW   
1   tm84618                          Taxi Driver  MOVIE   
2  tm154986                          Deliverance  MOVIE   
3  tm127384      Monty Python and the Holy Grail  MOVIE   
4  tm120801                      The Dirty Dozen  MOVIE   

                                         description  release_year  \
0  This collection includes 12 World War II-era p...          1945   
1  A mentally unstable Vietnam War veteran works ...          1976   
2  Intent on seeing the Cahulawassee River before...          1972   
3  King Arthur, accompanied by his squire, recrui...          1975   
4  12 American military prisoners in World War II...          1967   

  age_certification  runtime                                       genres  \
0             TV-MA       51                            ['documentation']   
1                 R      114                           ['drama', 'crim

In [5]:
def preprocess_text(text):
    """Normalise a free-text value before it is vectorised.

    The text is lower-cased and every character that is not an ASCII letter,
    an ASCII digit or whitespace is removed (it is deleted, not replaced by a
    space, so "sci-fi" becomes "scifi").

    Args:
        text: Value to clean. Normally a ``str``; any other object
            (for example ``None`` or a float NaN) is accepted.

    Returns:
        str: The cleaned text, or ``""`` when ``text`` is not a string.
    """
    # Imported locally so the function also works on a fresh kernel: no
    # visible cell imports `re`, which would otherwise raise NameError here.
    import re

    # Non-string values are treated as an empty document
    if not isinstance(text, str):
        return ""

    # Lowercase first, then drop punctuation and any other symbol
    return re.sub(r'[^a-zA-Z0-9\s]', '', text.lower())

# Clean each description element-wise and store the result in a new column
df['processed_description'] = df['description'].map(preprocess_text)


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer configured with scikit-learn's built-in English stop-word list
vectorizer = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from the cleaned descriptions and encode them in one call
tfidf_matrix = vectorizer.fit_transform(raw_documents=df['processed_description'])

# The user query must go through the same cleaning and the same fitted
# vectorizer as the descriptions so both live in the same feature space.
def vectorize_user_query(query):
    """Clean a query with `preprocess_text` and encode it with the fitted `vectorizer`.

    Args:
        query: Free-text query supplied by the user.

    Returns:
        The result of `vectorizer.transform` for the single cleaned query.
    """
    cleaned_query = preprocess_text(query)
    # transform() takes a collection of documents, so wrap the one query in a list
    return vectorizer.transform([cleaned_query])


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

def get_top_recommendations(query, top_n=5):
    """Return the titles whose descriptions are most similar to a query.

    The query is vectorised with `vectorize_user_query`, compared against
    every row of `tfidf_matrix` using cosine similarity, and the best-scoring
    positions are looked up in `df` (the matrix was fitted on a column of
    `df`, so row positions correspond).

    Args:
        query: Free-text description of what the user wants to watch.
        top_n: Maximum number of recommendations to return (default 5).
            Zero or a negative value yields an empty list.

    Returns:
        list[tuple]: ``(title, similarity)`` pairs ordered from the most to
        the least similar; at most ``top_n`` entries.
    """
    # Guard: for top_n == 0 the slice below would be [-0:] == [0:], returning
    # every title instead of none; negative values are equally meaningless.
    if top_n <= 0:
        return []

    # Vectorize the user query
    query_vec = vectorize_user_query(query)

    # One similarity score per row of the TF-IDF matrix, flattened to 1-D
    similarities = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # argsort is ascending: keep the last top_n positions and reverse them
    top_indices = similarities.argsort()[-top_n:][::-1]

    # Select the title column once instead of building a full row per hit
    titles = df['title']
    return [(titles.iloc[i], similarities[i]) for i in top_indices]


In [8]:
# Sample free-text request used to demonstrate the recommender
query = "I love thrilling action movies set in space, with a comedic twist."

# Rank all titles against the query and keep the five best matches
recommendations = get_top_recommendations(query, 5)

# One line per match: the title and its similarity score to four decimals
for title, score in recommendations:
    print(f"Title: {title}, Similarity Score: {score:.4f}")


Title: Fukrey Boyzzz: Space Mein Fukrapanti, Similarity Score: 0.1698
Title: A StoryBots Space Adventure, Similarity Score: 0.1696
Title: The Wonderful: Stories from the Space Station, Similarity Score: 0.1580
Title: Gattaca, Similarity Score: 0.1540
Title: Marco Luque - We are together, Similarity Score: 0.1450
