# Movie Recommender System
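A content-based movie recommender: each movie's overview, genres, keywords, top cast and director are merged into a single text tag, the tags are vectorized as word counts, and recommendations are ranked by cosine similarity between those vectors.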

## 1. Import Libraries

In [1]:
import ast
import re
import unicodedata

import numpy as np
import pandas as pd
from IPython.display import display

## 2. Load Data

In [2]:
# Read the movie and credits CSV files from the working directory.
# Both are parsed with pandas' default comma delimiter.
movies, credits = (
    pd.read_csv(csv_name)
    for csv_name in ('tmdb_5000_movies.csv', 'tmdb_5000_credits.csv')
)

In [3]:
# Inner-join the credits onto the movies through their shared 'title' column.
# NOTE(review): titles are not guaranteed to be unique, so a title-keyed join
# can pair a movie with another film's credits and inflate the row count. If
# both files expose a movie id, joining on it would be safer — TODO confirm.
movies = pd.merge(movies, credits, on='title')
print(f"Total movies: {movies.shape[0]}")

Total movies: 4809


## 3. Data Preprocessing

In [4]:
# Keep only the columns used downstream. The explicit .copy() makes `movies`
# an independent frame, so the column assignments and in-place cleaning done
# in later cells never operate on a derived slice of the merged frame (which
# can trigger pandas' SettingWithCopyWarning).
RELEVANT_COLUMNS = [
    'id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew',
    'original_language', 'release_date', 'runtime',
]
movies = movies[RELEVANT_COLUMNS].copy()

# Truncate overview to first 2 sentences for better processing
def truncate_overview(text, max_sentences=2):
    """Shorten an overview to its first `max_sentences` sentences.

    Sentences are delimited by the literal separator '. ' (period + space).

    Parameters:
    - text: overview string; any non-string value (e.g. NaN/None) is
      returned unchanged so missing values survive for later cleaning.
    - max_sentences: number of leading sentences to keep (default: 2,
      must be >= 1).

    Returns:
    - The original text when it has `max_sentences` sentences or fewer,
      otherwise the leading sentences re-joined and closed with a period.

    Raises:
    - ValueError: if `max_sentences` is smaller than 1.
    """
    if max_sentences < 1:
        raise ValueError("max_sentences must be at least 1")
    if not isinstance(text, str):
        return text

    sentences = text.split('. ')
    # Nothing to cut: return the text untouched. Appending '.' here (as the
    # previous version did for two-sentence texts) produced a doubled period,
    # because the last piece still carries its own terminator.
    if len(sentences) <= max_sentences:
        return text

    # The split consumed the period of the last kept sentence, so restore it.
    return '. '.join(sentences[:max_sentences]) + '.'

# Replace each overview with its shortened form, then preview the frame.
movies['overview'] = movies['overview'].map(truncate_overview)
movies.head(n=3)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,original_language,release_date,runtime
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...",en,2009-12-10,162.0
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de...",en,2007-05-19,169.0
2,206647,Spectre,A cryptic message from Bond’s past sends him o...,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...","[{""cast_id"": 1, ""character"": ""James Bond"", ""cr...","[{""credit_id"": ""54805967c3a36829b5002c41"", ""de...",en,2015-10-26,148.0


In [5]:
# Report the per-column null counts before cleaning
print("Missing values before cleaning:")
print(movies.isna().sum())

# Discard every row that has at least one missing field. The surviving rows
# keep their original labels, so the index has gaps after this step.
movies = movies.dropna()

print(f"\nMovies after cleaning: {movies.shape[0]}")
print("\nMissing values after cleaning:")
print(movies.isna().sum())

Missing values before cleaning:
id                   0
title                0
overview             3
genres               0
keywords             0
cast                 0
crew                 0
original_language    0
release_date         1
runtime              2
dtype: int64

Movies after cleaning: 4805

Missing values after cleaning:
id                   0
title                0
overview             0
genres               0
keywords             0
cast                 0
crew                 0
original_language    0
release_date         0
runtime              0
dtype: int64


In [6]:
# Count rows that exactly repeat an earlier row across all columns
duplicate_count = movies.duplicated().sum()
print(f"Duplicate rows: {duplicate_count}")

Duplicate rows: 0


## 4. Feature Extraction

In [7]:
# Helper functions that pull names out of stringified lists of dictionaries
def convert(obj):
    """Parse a stringified list of dicts and return every entry's 'name', in order."""
    return [entry['name'] for entry in ast.literal_eval(obj)]

def convert_cast(obj, top_n=3):
    """Extract the names of the first `top_n` cast members.

    Parameters:
    - obj: stringified list of dictionaries, each carrying a 'name' key.
    - top_n: maximum number of leading entries to keep (default: 3, the
      value that used to be hard-coded). A value <= 0 yields an empty list.

    Returns:
    - List with at most `top_n` names, in the order they appear in `obj`.
    """
    # zip() stops as soon as range(top_n) is exhausted, so entries beyond the
    # limit are never touched.
    return [member['name'] for _, member in zip(range(top_n), ast.literal_eval(obj))]

def fetch_director(obj):
    """Return a one-element list holding the first crew member whose job is
    'Director', or an empty list when the crew has no such entry."""
    for member in ast.literal_eval(obj):
        if member['job'] == 'Director':
            # Only the first director is kept; later ones are ignored.
            return [member['name']]
    return []

In [8]:
# Replace each raw stringified column with the parsed list of names.
# NOTE: the columns are overwritten, so this cell can only run once per
# freshly loaded `movies` frame (a second run would pass lists, not strings,
# to ast.literal_eval).
extractors = {
    'genres': convert,
    'keywords': convert,
    'cast': convert_cast,
    'crew': fetch_director,
}
for column, extractor in extractors.items():
    movies[column] = movies[column].apply(extractor)

print("Sample extracted features:")
movies[['title', 'genres', 'cast', 'crew']].head(2)

Sample extracted features:


Unnamed: 0,title,genres,cast,crew
0,Avatar,"[Action, Adventure, Fantasy, Science Fiction]","[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
1,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action]","[Johnny Depp, Orlando Bloom, Keira Knightley]",[Gore Verbinski]


## 5. Text Processing

In [9]:
# Tokenise each overview on whitespace
movies['overview'] = movies['overview'].apply(lambda overview: overview.split())


def _squash_spaces(names):
    """Drop the spaces inside every name so a multi-word name becomes one
    token (e.g., "Science Fiction" -> "ScienceFiction")."""
    return [name.replace(" ", "") for name in names]


# The same clean-up applies to every list-valued feature column
for column in ('genres', 'keywords', 'cast', 'crew'):
    movies[column] = movies[column].apply(_squash_spaces)

print("Sample processed features:")
movies[['title', 'overview', 'genres']].head(1)

Sample processed features:


Unnamed: 0,title,overview,genres
0,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, ScienceFiction]"


In [10]:
# Concatenate the per-movie token lists into a single 'tag' list
movies['tag'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Keep only what the recommender needs. reset_index(drop=True) returns a new
# frame whose rows are renumbered 0..n-1: rows were dropped during cleaning,
# and the similarity matrix later built from this frame is addressed by
# position, so row labels must coincide with row positions.
newMovieDf = movies[['id', 'title', 'tag']].reset_index(drop=True)

# Flatten each tag list into one lowercase, space-separated string
newMovieDf['tag'] = newMovieDf['tag'].apply(lambda tokens: " ".join(tokens).lower())

print(f"\nFinal dataset shape: {newMovieDf.shape}")
# .iloc[0] reads the first row by position instead of relying on a label 0
print(f"\nSample tag:\n{newMovieDf['tag'].iloc[0][:200]}...")


Final dataset shape: (4805, 3)

Sample tag:
in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy...


## 6. Text Stemming (NLTK)

In [11]:
# Import NLTK for stemming
import nltk
from nltk.stem.porter import PorterStemmer

# Single shared Porter stemmer instance, reused by the stem() helper below
ps = PorterStemmer()

In [12]:
# Download NLTK data packages and report the real outcome: nltk.download()
# returns a boolean, so a failed download (e.g. no network) is no longer
# followed by an unconditional success message.
# NOTE(review): only PorterStemmer is used in the cells shown here, and it
# does not appear to need any of these packages — confirm before trimming.
NLTK_PACKAGES = ('punkt', 'stopwords', 'wordnet', 'omw-1.4')
failed_packages = [
    package for package in NLTK_PACKAGES
    if not nltk.download(package, quiet=True)
]

if failed_packages:
    print(f"NLTK data download failed for: {', '.join(failed_packages)}")
else:
    print("NLTK data downloaded successfully!")

NLTK data downloaded successfully!


In [13]:
# Define stemming function
def stem(text):
    """Stem every single-space-delimited token of `text` with the shared
    Porter stemmer and join the results back with single spaces."""
    return " ".join(ps.stem(token) for token in text.split(" "))

# Quick sanity check of the stemmer on a two-word sample
sample_phrase = 'loved danced'
print("Stemming test:")
print(f"'{sample_phrase}' -> '{stem(sample_phrase)}'")

Stemming test:
'loved danced' -> 'love danc'


In [14]:
# Stem every tag string (overwrites the 'tag' column with its stemmed form)
print("Applying stemming to all movie tags...")
newMovieDf['tag'] = newMovieDf['tag'].apply(stem)
print("Stemming complete!")
# Read the first row by position: a label lookup (['tag'][0]) raises KeyError
# whenever the row labelled 0 is not present in the frame.
print(f"\nSample stemmed tag:\n{newMovieDf['tag'].iloc[0][:200]}...")

Applying stemming to all movie tags...
Stemming complete!

Sample stemmed tag:
in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cul...


## 7. Vectorization & Similarity Computation

In [15]:
# Import scikit-learn libraries
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [16]:
# Bag-of-words model capped at 5000 features, with English stop words removed
cv = CountVectorizer(stop_words='english', max_features=5000)

# Learn the vocabulary and encode every tag as a dense count vector
vector = cv.fit_transform(newMovieDf['tag']).toarray()

n_movies, n_features = vector.shape
print(f"Vector shape: {vector.shape}")
print(f"Number of movies: {n_movies}")
print(f"Number of features: {n_features}")

Vector shape: (4805, 5000)
Number of movies: 4805
Number of features: 5000


In [17]:
# Pairwise cosine similarity between all movie vectors; row i holds the
# scores of movie i against every movie, in the row order of `vector`.
similarity = cosine_similarity(vector)

print(f"Similarity matrix shape: {similarity.shape}")
print("\nSample similarities for first movie:")
print(similarity[0, :10])

Similarity matrix shape: (4805, 4805)

Sample similarities for first movie:
[1.         0.08134892 0.08385255 0.05795006 0.18450624 0.10660036
 0.03922323 0.14301939 0.05679618 0.0942809 ]


## 8. Title Normalization for Robust Lookup

In [18]:
# Create normalized title column for flexible matching
def normalize_title(s: str) -> str:
    """Build a lookup key from a title.

    The value is converted to a string, lowercased, decomposed with Unicode
    NFKD, and stripped of every character that is not a letter or digit.
    Decomposition splits accented letters into a base letter plus combining
    marks; the marks are not alphanumeric and are dropped, so "Amélie" and
    "Amelie" share the key "amelie". Letters of non-Latin scripts are kept,
    so such titles no longer collapse to an empty key.

    Parameters:
    - s: title or user query (non-string values are passed through str()).

    Returns:
    - The normalized key; for plain ASCII titles this is the same result as
      keeping only the characters a-z and 0-9 of the lowercased text.
    """
    decomposed = unicodedata.normalize("NFKD", str(s).lower())
    return "".join(ch for ch in decomposed if ch.isalnum())

# Store the lookup key next to the display title
newMovieDf['title_norm'] = newMovieDf['title'].map(normalize_title)

print("Sample normalized titles:")
print(newMovieDf[['title', 'title_norm']].head())

Sample normalized titles:
                                      title                        title_norm
0                                    Avatar                            avatar
1  Pirates of the Caribbean: At World's End  piratesofthecaribbeanatworldsend
2                                   Spectre                           spectre
3                     The Dark Knight Rises                thedarkknightrises
4                               John Carter                        johncarter


## 9. Recommendation Function

In [19]:
from difflib import get_close_matches

def recommend(title_query: str, k: int = 10):
    """
    Recommend k similar movies based on the input title.

    The query is normalized with normalize_title() and compared against the
    'title_norm' column of newMovieDf. On an exact match the k highest-scoring
    rows of the similarity matrix are printed; otherwise up to five close
    titles are suggested.

    Parameters:
    - title_query: Movie title (case-insensitive, flexible spelling)
    - k: Number of recommendations to return (default: 10); values <= 0
      print the header only

    Returns:
    - None. Prints the list of recommended movie titles (or suggestions /
      a not-found message).
    """
    # Normalize query
    q = normalize_title(title_query)
    title_keys = newMovieDf['title_norm']

    # An empty key (blank or punctuation-only query) must not match titles
    # whose own key happens to be empty.
    if not q:
        print(f"Title not found: '{title_query}'. Try another title or spelling.")
        return

    # Find exact matches as row POSITIONS. `similarity` is a plain array
    # addressed by position, while newMovieDf's labels may have gaps (rows are
    # dropped during cleaning), so a label taken from .index would select the
    # wrong similarity row.
    hit_positions = np.flatnonzero((title_keys == q).to_numpy())

    if hit_positions.size == 0:
        # No exact match - suggest similar titles (keys de-duplicated so the
        # same suggestion is not printed twice)
        candidates = list(dict.fromkeys(title_keys.tolist()))
        close = get_close_matches(q, candidates, n=5, cutoff=0.6)

        if close:
            print(f"Title not found: '{title_query}'. Did you mean:")
            for c in close:
                print(' •', newMovieDf.loc[title_keys == c, 'title'].iloc[0])
        else:
            print(f"Title not found: '{title_query}'. Try another title or spelling.")
        return

    # Position of the first matching movie
    movie_pos = int(hit_positions[0])

    # Rank every OTHER movie by similarity. The query movie is excluded by
    # position rather than by dropping the first sorted item, which would
    # remove the wrong entry when another movie ties with a score of 1.0.
    ranked = sorted(
        ((pos, score) for pos, score in enumerate(similarity[movie_pos]) if pos != movie_pos),
        key=lambda pair: pair[1],
        reverse=True,
    )
    top = ranked[:max(k, 0)]

    titles = newMovieDf['title']
    print(f"Top {k} recommendations for '{titles.iloc[movie_pos]}':\n")
    for rank, (pos, score) in enumerate(top, 1):
        print(f"{rank}. {titles.iloc[pos]} (similarity: {score:.4f})")

## 10. Test Recommendations

In [20]:
# Query containing punctuation: the hyphen is removed by title normalization
# before the lookup, so 'Spider-Man' is matched through the key 'spiderman'.
recommend('Spider-Man', k=5)

Top 5 recommendations for 'Spider-Man':

1. Spider-Man 3 (similarity: 0.4760)
2. Spider-Man 2 (similarity: 0.4397)
3. The Amazing Spider-Man 2 (similarity: 0.3462)
4. Arachnophobia (similarity: 0.2669)
5. The Amazing Spider-Man (similarity: 0.2288)


In [21]:
# Single-word title; Avatar is the first row of the dataset.
recommend('Avatar', k=5)

Top 5 recommendations for 'Avatar':

1. Aliens vs Predator: Requiem (similarity: 0.2697)
2. Aliens (similarity: 0.2565)
3. Falcon Rising (similarity: 0.2539)
4. Titan A.E. (similarity: 0.2440)
5. Battle: Los Angeles (similarity: 0.2339)


In [22]:
# All-lowercase query: the lookup is case-insensitive because stored titles
# and the query go through the same normalize_title() function.
recommend('the dark knight', k=5)

Top 5 recommendations for 'The Dark Knight':

1. The Dark Knight Rises (similarity: 0.4543)
2. Batman Begins (similarity: 0.4518)
3. Batman & Robin (similarity: 0.3643)
4. Amidst the Devil's Wings (similarity: 0.3381)
5. Batman Returns (similarity: 0.3313)


## 11. Explore Dataset

In [23]:
# Show a sample of titles to try with recommend(). random_state pins the draw
# so a fresh Restart & Run All reproduces the same ten titles; change or
# remove it to see a different sample.
print("Random sample of movies in dataset:")
newMovieDf['title'].sample(10, random_state=42).to_numpy()

Random sample of movies in dataset:


array(['Facing the Giants', 'Pocahontas',
       'The Oogieloves in the Big Balloon Adventure', 'Legal Eagles',
       'Paul Blart: Mall Cop 2', 'The Wolf of Wall Street', 'Jaws',
       'Monsoon Wedding', 'Son of the Mask', 'Godsend'], dtype=object)

In [24]:
import pickle


In [25]:
# Persist the processed movie table for reuse outside the notebook. The
# context manager closes the handle, so the file is flushed to disk even if
# pickling raises.
with open('movies.pkl', 'wb') as movies_file:
    pickle.dump(newMovieDf, movies_file)

In [27]:
# Persist the similarity matrix; its rows and columns follow the row order of
# newMovieDf. The context manager closes the handle so the file is flushed to
# disk even if pickling raises.
with open('similarity_list.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)