In [1]:
import numpy as np
import pandas as pd
import ast
import re
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

print('Movies rows:', len(movies))
print('Credits rows:', len(credits))

Movies rows: 4803
Credits rows: 4803


In [2]:
df = movies.merge(credits, on='title', how='inner')
df = df[['id', 'title', 'genres', 'keywords', 'overview', 'cast', 'crew']].copy()

for col in ['genres', 'keywords', 'overview', 'cast', 'crew']:
    df[col] = df[col].fillna('[]')

print('Merged dataframe shape:', df.shape)
df.head(2)

Merged dataframe shape: (4809, 7)


Unnamed: 0,id,title,genres,keywords,overview,cast,crew
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","In the 22nd century, a paraplegic Marine is di...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","Captain Barbossa, long believed to be dead, ha...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [3]:
def parse_names(column_value):
    try:
        data = ast.literal_eval(column_value)
    except Exception:
        return []
    names = []
    for item in data:
        if isinstance(item, dict):
            if 'name' in item and item['name']:
                names.append(item['name'])
            elif 'job' in item and item['job']:
                names.append(item['job'])
    return names

def get_top_cast(cast_str, top_n=3):
    items = parse_names(cast_str)
    return items[:top_n]

def get_director(crew_str):
    try:
        crew = ast.literal_eval(crew_str)
    except Exception:
        return []
    for person in crew:
        if isinstance(person, dict) and person.get('job') == 'Director':
            return [person.get('name')]
    return []

In [4]:
def clean_list(items):
    cleaned = []
    for it in items:
        if not isinstance(it, str):
            continue
        s = it.lower().replace(' ', '')  
        cleaned.append(s)
    return cleaned

df['genres_list'] = df['genres'].apply(parse_names)
df['keywords_list'] = df['keywords'].apply(parse_names)
df['cast_list'] = df['cast'].apply(get_top_cast)
df['director_list'] = df['crew'].apply(get_director)
df['overview_text'] = df['overview'].apply(lambda x: x if isinstance(x, str) else '')

df['tags'] = df.apply(lambda row: ' '.join(
    clean_list(row['genres_list']) +
    clean_list(row['keywords_list']) +
    clean_list(row['cast_list']) +
    clean_list(row['director_list']) +
    re.sub(r'[^a-zA-Z0-9 ]', ' ', row['overview_text']).lower().split()
), axis=1)

print('Sample tags for first movie:')
print(df.iloc[0]['title'])
print(df.iloc[0]['tags'][:300])

Sample tags for first movie:
Avatar
action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron in the 22nd century 


In [5]:
ps = PorterStemmer()

def stem_text(text):
    return ' '.join([ps.stem(word) for word in text.split()])

df['tags'] = df['tags'].apply(stem_text)
df['tags'] = df['tags'].astype(str)
print('Tags after stemming sample:')
print(df['tags'].iloc[0][:200])
ps

Tags after stemming sample:
action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindan


<PorterStemmer>

In [6]:
tfidf = TfidfVectorizer(max_features=5000, stop_words='english')
vectors = tfidf.fit_transform(df['tags']).toarray()

print('Vector shape:', vectors.shape)

similarity = cosine_similarity(vectors)
print('Computed cosine similarity matrix with shape', similarity.shape)

Vector shape: (4809, 5000)
Computed cosine similarity matrix with shape (4809, 4809)


In [7]:
title_to_index = pd.Series(df.index, index=df['title'].str.lower()).to_dict()

def recommend(movie_name, top_n=10):
    if not isinstance(movie_name, str):
        raise ValueError('movie_name must be a string')
    key = movie_name.lower().strip()
    if key not in title_to_index:
        matches = [t for t in title_to_index.keys() if key in t]
        if not matches:
            raise ValueError(f'Movie \"{movie_name}\" not found in the dataset.')
        idx = title_to_index[matches[0]]
    else:
        idx = title_to_index[key]

    sim_scores = list(enumerate(similarity[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    recommended = []
    for i, score in sim_scores[1: top_n+1]:  
        recommended.append((df.iloc[i]['title'], float(score)))
    return recommended

sample_title = df['title'].iloc[0]
print('Recommendations for:', sample_title)
print(recommend(sample_title, top_n=5))

Recommendations for: Avatar
[('Aliens', 0.2469980227167981), ('Falcon Rising', 0.21160673945757763), ('Battle: Los Angeles', 0.1886776410463586), ('Apollo 18', 0.17170993498785955), ('Star Trek Into Darkness', 0.16870880199066648)]
