In [4]:
import pandas as pd

# Load dataset
file_path = "/kaggle/input/movie-dataset/cleaned_hollywood_movies_1980_2024 (1).csv"
df = pd.read_csv(file_path)

# Drop rows with missing key info
df = df.dropna(subset=['title', 'genre', 'language', 'directors'])

# Combine columns into a single feature for similarity
df['combined'] = (
    df['title'].fillna('') + ' ' +
    df['genre'].fillna('') + ' ' +
    df['language'].fillna('') + ' ' +
    df['directors'].fillna('')
).str.lower()


In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import NearestNeighbors

# Vectorize using TF-IDF
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['combined'])

# NearestNeighbors model (more memory-efficient than cosine similarity matrix)
nn = NearestNeighbors(n_neighbors=6, metric='cosine', algorithm='brute')
nn.fit(tfidf_matrix)

# Map titles to indices
indices = pd.Series(df.index, index=df['title'].str.lower()).drop_duplicates()


In [6]:
def recommend(title, num=5):
    title = title.lower()
    if title not in indices:
        return "❌ Movie not found in dataset."
    
    idx = indices[title]
    distances, neighbors = nn.kneighbors(tfidf_matrix[idx], n_neighbors=num+1)
    
    # Skip the first match (it's the movie itself)
    recommended_indices = neighbors.flatten()[1:]
    return df['title'].iloc[recommended_indices].tolist()


In [7]:
recommend("Inception")


['Alina idässä', 'D&Co', 'Twenty-five', 'D.E.A.', 'Becoming Us']