In [1]:
# Install required libraries
!pip install pandas numpy scikit-learn



In [2]:
import pandas as pd
import json
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# ===========================
# Part 1: Loading the Dataset
# ===========================

def load_dataset(csv_path):
    """
    Load the movie dataset CSV and clean its 'overview' column.

    Missing overviews are replaced with empty strings, and the generic terms
    'movie', 'movies' and 'film' are removed (whole words, case-insensitive).
    Surrounding whitespace is left untouched, so a removed word may leave a
    double space behind.

    Parameters:
      csv_path (str): Path to the CSV file (passed straight to pandas.read_csv).

    Returns:
      DataFrame: Loaded pandas DataFrame whose 'overview' column contains only
      cleaned strings.

    Raises:
      KeyError: If the CSV has no 'overview' column.
    """
    # Compiled once per call instead of being re-parsed for every row.
    # \b ensures we only match whole words (e.g. 'Films' is kept).
    generic_terms = re.compile(r'\b(movie|movies|film)\b', flags=re.IGNORECASE)

    def remove_generic_terms(text):
        """
        Remove generic terms like 'movie', 'movies', and 'film' from the text.
        """
        return generic_terms.sub('', text)

    df = pd.read_csv(csv_path)
    # A blank CSV cell is read as NaN (a float), which the regex cannot
    # process; normalise every overview to a string before cleaning.
    overviews = df['overview'].fillna('').astype(str)
    df['overview'] = overviews.apply(remove_generic_terms)
    return df


# Load the movie CSV from the working directory; `df` is reused by the
# vectorising and recommendation steps below.
csv_path = 'sampled_movies.csv'
df = load_dataset(csv_path)
# Sanity check: report (rows, columns) of the loaded data.
print("Dataset loaded, shape:", df.shape)


# ===========================
# Part 2: Vectorising the Text
# ===========================

def vectorize_content(df, text_field='overview'):
    """
    Create TF-IDF vectors from the text content in the specified column.

    Missing values in the column are treated as empty documents so that a
    single blank cell does not abort the whole fit. The input DataFrame is
    not modified.

    Parameters:
      df (DataFrame): The movies DataFrame.
      text_field (str): The column name containing text to vectorise.

    Returns:
      vectorizer: The fitted TfidfVectorizer (English stop words removed).
      tfidf_matrix: TF-IDF feature matrix for the text, one row per row of df.

    Raises:
      KeyError: If text_field is not a column of df.
    """
    # TfidfVectorizer rejects NaN documents, so blank out missing values and
    # coerce everything to str before fitting.
    documents = df[text_field].fillna('').astype(str)
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(documents)
    return vectorizer, tfidf_matrix


# Fit TF-IDF on the cleaned overviews; both results are reused for querying below.
vectorizer, tfidf_matrix = vectorize_content(df, text_field='overview')
# Sanity check: report the dimensions of the fitted matrix.
print("TF-IDF matrix shape:", tfidf_matrix.shape)


# =========================================
# Part 3: Recommendation with Cosine Similarity
# =========================================

def get_recommendations(query, vectorizer, tfidf_matrix, df, top_n=5):
    """
    Given a user query, compute cosine similarity between the query vector and all movie vectors,
    and return the top_n movie recommendations.

    Rows are ordered by descending similarity; movies with equal scores keep
    their original relative order in df, so results are deterministic.

    Parameters:
      query (str): The user's text input.
      vectorizer: Fitted TfidfVectorizer.
      tfidf_matrix: TF-IDF feature matrix for the movies (rows aligned with df).
      df (DataFrame): The movies DataFrame.
      top_n (int): Number of top recommendations to return. Negative values
        are treated as 0; None returns every movie.

    Returns:
      DataFrame: Copy of the top recommended rows of df with an added
      'similarity_score' column.
    """
    # A negative top_n would slice from the wrong end and return almost the
    # whole dataset; treat it as a request for no results instead.
    if top_n is not None and top_n < 0:
        top_n = 0

    query_vec = vectorizer.transform([query])
    cosine_sim = cosine_similarity(query_vec, tfidf_matrix).flatten()

    # Stable sort on the negated scores gives descending similarity while
    # breaking ties by original row position (many movies share a score of 0).
    top_indices = np.argsort(-cosine_sim, kind='stable')[:top_n]

    # Copy so that adding the score column never touches the caller's df.
    recommendations = df.iloc[top_indices].copy()
    recommendations['similarity_score'] = cosine_sim[top_indices]
    return recommendations

# Example free-text query describing the kind of movie wanted.
user_query = "I like action movies set in space"
recommendations = get_recommendations(
    user_query,
    vectorizer,
    tfidf_matrix,
    df,
    top_n=5,
)

# Lift the column-width limit so long text such as the overview is not truncated.
pd.set_option('display.max_colwidth', None)
print("\nTop Recommendations:")
print(
    recommendations[
        ['movie_name', 'release_date', 'all_genres', 'overview']
    ]
)


Dataset loaded, shape: (500, 5)
TF-IDF matrix shape: (500, 15672)

Top Recommendations:
                      movie_name release_date  \
371             Making Mr. Right   1987-04-03   
230      Battle Beyond the Stars   1980-09-08   
266  The Valley of Vanishing Men   1942-12-12   
227                         Pulp      1972-11   
358      Monster from Green Hell      1957-05   

                                                                                          all_genres  \
371                                           Romantic comedy, Science Fiction, Romance Film, Comedy   
230                                         Parody, Science Fiction, Indie, Space western, Adventure   
266                                                                       Adventure, Black-and-white   
227  Parody, Thriller, Mystery, Crime Fiction, Comedy, Crime Thriller, Drama, Black comedy, Suspense   
358             Natural horror films, Science Fiction, Horror, Sci-Fi Horror, Creature Film, Mons