In [None]:
url='https://www.kaggle.com/datasets/rounakbanik/the-movies-dataset'

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import ast

# Download NLTK data (run once)
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:


def load_and_filter_data(filepath):
    """Load and filter the dataset to include only necessary and enhanced columns."""
    # Load the dataset
    df = pd.read_csv(filepath)
    # Select relevant columns
    columns = ['title', 'overview', 'genres', 'tagline', 'popularity', 'vote_average', 'revenue']
    df_filtered = df[columns]

    # Drop rows with missing values in 'title' or 'overview'
    df_filtered = df_filtered.dropna(subset=['title', 'overview'])

    # Fill missing values in other columns
    df_filtered['tagline'] = df_filtered['tagline'].fillna('')
    df_filtered['genres'] = df_filtered['genres'].fillna('[]')
    df_filtered['popularity'] = df_filtered['popularity'].fillna(0)
    df_filtered['vote_average'] = df_filtered['vote_average'].fillna(0)
    df_filtered['revenue'] = df_filtered['revenue'].fillna(0)

    # Convert genres from string to list of dictionaries
    df_filtered['genres'] = df_filtered['genres'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else [])

    # Extract genre names
    df_filtered['genres'] = df_filtered['genres'].apply(lambda x: ', '.join([g['name'] for g in x]))

    # Combine 'overview', 'tagline', and 'genres' into a single text feature
    df_filtered['combined_text'] = df_filtered['overview'] + ' ' + df_filtered['tagline'] + ' ' + df_filtered['genres']

    # Preprocess the combined text
    df_filtered['combined_text'] = df_filtered['combined_text'].apply(preprocess_text)

    return df_filtered



def preprocess_text(text):
    """Preprocess text by lowercasing, removing special characters, and lemmatizing."""
    # Lowercase
    text = text.lower()
    # Remove special characters and numbers
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    text = ' '.join([word for word in text.split() if word not in stop_words])
    # Lemmatize
    lemmatizer = WordNetLemmatizer()
    text = ' '.join([lemmatizer.lemmatize(word) for word in text.split()])
    return text

def vectorize_text(df):
    """Convert combined text into TF-IDF vectors."""
    tfidf = TfidfVectorizer(stop_words='english', max_features=5000, ngram_range=(1, 2))
    tfidf_matrix = tfidf.fit_transform(df['combined_text'])
    return tfidf, tfidf_matrix

def recommend_movies(query, df, tfidf, tfidf_matrix, top_n=5):
    """Recommend movies based on user query."""
    # Preprocess query
    query = preprocess_text(query)
    # Transform query into TF-IDF vector
    query_vector = tfidf.transform([query])

    # Compute cosine similarity
    similarity_scores = cosine_similarity(query_vector, tfidf_matrix).flatten()

    # Get top N recommendations
    top_indices = similarity_scores.argsort()[-top_n:][::-1]
    recommendations = df.iloc[top_indices].copy()
    recommendations.loc[:, 'similarity'] = similarity_scores[top_indices]  # Fix SettingWithCopyWarning

    return recommendations

if __name__ == "__main__":
    # Load and filter data
    df = load_and_filter_data('/content/drive/MyDrive/SEM-3/movies_metadata.csv')


    # Vectorize text
    tfidf, tfidf_matrix = vectorize_text(df)

    # Ask the user for input
    query = input("What kind of movies do you like? (e.g., 'I love thrilling action movies set in space'): ")

    # Get recommendations
    recommendations = recommend_movies(query, df, tfidf, tfidf_matrix)

    # Print results
    print("\nTop 5 Recommended Movies:")
    for i, row in recommendations.iterrows():
        print(f"{row['title']} (Similarity: {row['similarity']:.2f})")

  df = pd.read_csv(filepath)


What kind of movies do you like? (e.g., 'I love thrilling action movies set in space'): comedy

Top 5 Recommended Movies:
Money Is Not Everything (Similarity: 1.00)
Afstiros katallilo (Similarity: 1.00)
Job, czyli ostatnia szara komórka (Similarity: 1.00)
Cabbages and Kings (Similarity: 1.00)
Mr. Kuka's Advice (Similarity: 1.00)
