<a href="https://colab.research.google.com/github/Charan01729/-Movie-Recommendation-System-Using-Content-Based-Filtering/blob/main/Movie_Recommendation_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Movie Recommendation System Using Content-Based Filtering

**Technologies used**: Python, Pandas, Scikit-Learn.

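*   Built a content-based recommendation model using bag-of-words vectorization and cosine similarity, achieving Precision@5: 0.9154, Recall@5: 0.0027, MAP@5: 0.8890, and NDCG@5: 0.9185 on a 500-movie sample, with genre overlap used as a proxy for relevance (Recall@5 is low by construction: each movie shares a genre with a very large number of other titles, of which only 5 can be recommended).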
*   Analyzed movie metadata (overview, genres, keywords, top-billed cast, director) to generate personalized recommendations.
*   Enabled users to discover similar movies based on selected input titles using text-based similarity scores.

In [None]:
!pip install scikit-learn pandas numpy



In [None]:
# Cell 1 - imports
import pandas as pd
import numpy as np
import ast
import time
import tracemalloc
from difflib import get_close_matches
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import NearestNeighbors
import pickle

In [None]:
# Colab-only: prompts for a file upload into the working directory.
# The recorded run of this cell uploaded tmdb_5000_credits.csv.
from google.colab import files
uploaded = files.upload()

Saving tmdb_5000_credits.csv to tmdb_5000_credits.csv


In [None]:
# Colab-only: prompts for a file upload into the working directory.
# The recorded run of this cell uploaded tmdb_5000_movies.csv.
from google.colab import files
uploaded = files.upload()

Saving tmdb_5000_movies.csv to tmdb_5000_movies.csv


In [None]:
# Cell 2 - load files (assumes tmdb_5000_movies.csv and tmdb_5000_credits.csv are in working dir)
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# unify the id column name on both frames (some files use 'id', others 'movie_id')
if 'movie_id' not in movies.columns and 'id' in movies.columns:
    movies = movies.rename(columns={'id': 'movie_id'})
if 'movie_id' not in credits.columns and 'id' in credits.columns:
    credits = credits.rename(columns={'id': 'movie_id'})

if 'movie_id' in movies.columns and 'movie_id' in credits.columns:
    # Merge on the id rather than the title: titles are not unique, so a
    # title merge pairs every movie with the credits of every other movie
    # that shares its title and produces duplicate / mismatched rows.
    # Columns present in both frames (e.g. 'title') are taken from `movies`
    # only, which avoids the _x/_y suffixes.
    shared_cols = [c for c in credits.columns if c in movies.columns and c != 'movie_id']
    movies = movies.merge(credits.drop(columns=shared_cols), on='movie_id')
else:
    # fallback when no id column is available on both sides
    movies = movies.merge(credits, on='title')

# select only relevant columns
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

# drop rows with missing values in these columns
# (reassign instead of inplace=True so we never mutate a slice of the merged frame)
movies = movies.dropna()

In [None]:
# Cell 3 - parsing helpers and building tags
def safe_literal_eval(x):
    """Parse a stringified Python list literal, returning [] on any failure.

    Parameters
    ----------
    x : object
        Usually a string such as "[{'id': 1, 'name': 'Action'}]".

    Returns
    -------
    list
        The parsed list. An empty list is returned when `x` cannot be parsed
        (malformed string, None, NaN, ...) or when it parses to something
        that is not a list, so callers can always iterate/slice the result.
    """
    try:
        parsed = ast.literal_eval(x)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # the errors ast.literal_eval is documented to raise on bad input
        return []
    # A valid literal that is not a list (e.g. "42" or a dict) would break
    # callers that iterate over the result, so treat it as "no data".
    return parsed if isinstance(parsed, list) else []

def get_names(text):
    """Return the non-empty 'name' values of the dict entries encoded in `text`.

    `text` is parsed with safe_literal_eval; entries that are not dicts or
    that have a missing/empty 'name' are skipped.
    """
    names = []
    for entry in safe_literal_eval(text):
        if not isinstance(entry, dict):
            continue
        name = entry.get('name')
        if name:
            names.append(name)
    return names

def get_director(text):
    """Return the names of crew entries whose 'job' is 'Director'.

    `text` is parsed with safe_literal_eval. Entries that are not dicts are
    skipped, and so are director entries without a usable 'name' (same rule
    as get_names); otherwise a None name would later be stringified into a
    literal "None" token.
    """
    L = safe_literal_eval(text)
    return [
        d.get('name')
        for d in L
        if isinstance(d, dict) and d.get('job') == 'Director' and d.get('name')
    ]

def collapse_no_space(list_in):
    """Stringify every item of `list_in` and strip all space characters from it."""
    collapsed = []
    for item in list_in:
        collapsed.append(str(item).replace(' ', ''))
    return collapsed

# apply conversions
# NOTE: these assignments overwrite the raw string columns of `movies`, so this
# cell is not idempotent - re-run Cell 2 before re-running it, otherwise the
# already-parsed lists are parsed a second time.
movies['genres']   = movies['genres'].apply(get_names)      # keep all genres
movies['keywords'] = movies['keywords'].apply(get_names)
# top-3 billed cast; get_names skips non-dict entries and missing names,
# so no None ends up as a "None" token in the tags
movies['cast']     = movies['cast'].apply(lambda x: get_names(x)[:3])
movies['crew']     = movies['crew'].apply(get_director)

# remove spaces inside tokens (e.g., "Tom Hanks" -> "TomHanks") to keep tokens single-word
for col in ['genres','keywords','cast','crew']:
    movies[col] = movies[col].apply(collapse_no_space)

# overview -> token list
movies['overview'] = movies['overview'].apply(lambda x: str(x).split())

# build tags (list concatenation, row by row)
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# final dataframe used for model
new = movies[['movie_id','title','tags','genres']].copy()
new['tags'] = new['tags'].apply(lambda x: " ".join(x))

# reset index so positional indices (0..n-1) match NN indices
new = new.reset_index(drop=True)

# quick check
print("Dataset size:", len(new))
print(new[['title','tags']].head(3))

Dataset size: 4806
                                      title  \
0                                    Avatar   
1  Pirates of the Caribbean: At World's End   
2                                   Spectre   

                                                tags  
0  In the 22nd century, a paraplegic Marine is di...  
1  Captain Barbossa, long believed to be dead, ha...  
2  A cryptic message from Bondâ€™s past sends him o...  


In [None]:
# Cell 4 - vectorize (sparse) and build NN index
MAX_FEATURES = 5000          # vocabulary size kept by the CountVectorizer
MAX_RECOMMENDATIONS = 10     # largest k that the precomputed neighbour table can serve

cv = CountVectorizer(max_features=MAX_FEATURES, stop_words='english')
vector = cv.fit_transform(new['tags'])         # sparse matrix (n_samples x n_features)

# NearestNeighbors with cosine distance. We ask for MAX_RECOMMENDATIONS + 1
# neighbours because each item is normally returned as its own nearest
# neighbour; clamp to the number of rows so a tiny dataset does not make
# kneighbors() raise.
n_neighbors = min(MAX_RECOMMENDATIONS + 1, vector.shape[0])
nn = NearestNeighbors(n_neighbors=n_neighbors, metric='cosine', algorithm='brute')
nn.fit(vector)

# precompute neighbors for all items (fast lookup at query time)
distances, indices = nn.kneighbors(vector)    # both have shape (n_samples, n_neighbors)

print("Vector shape:", vector.shape)

Vector shape: (4806, 5000)


In [None]:
# Cell 5 - recommend + fuzzy title match (uses difflib.get_close_matches)
titles = new['title'].tolist()

def find_title(query):
    """Resolve `query` to a title present in `titles`.

    An exact match is returned unchanged. Otherwise up to five fuzzy
    candidates (difflib, cutoff 0.4) are printed and the best one is
    returned. Returns None when there is no candidate at all.
    """
    if query not in titles:
        candidates = get_close_matches(query, titles, n=5, cutoff=0.4)
        if not candidates:
            return None
        print("No exact match. Closest matches found (using fuzzy matching):")
        for rank, candidate in enumerate(candidates, 1):
            print(f"{rank}. {candidate}")
        best = candidates[0]
        print("Using the first suggestion:", best)
        return best
    return query

def recommend(movie_query, k=5):
    """Return up to `k` titles most similar to `movie_query`.

    Parameters
    ----------
    movie_query : str
        Movie title; resolved through find_title (exact, then fuzzy match).
    k : int, default 5
        Number of recommendations wanted. Non-positive values give [].

    Returns
    -------
    list of str
        Recommended titles, nearest first. When the title cannot be resolved
        a single-element list holding an explanatory message is returned.
    """
    title = find_title(movie_query)
    if title is None:
        return ["Movie not found in dataset (and no close match)"]
    if k <= 0:
        return []
    idx = new.index[new['title'] == title][0]    # integer positional index

    if k + 1 > indices.shape[1]:
        # The precomputed table is too narrow for this k: query the fitted
        # model directly instead of silently returning fewer titles.
        n_query = min(k + 1, len(new))
        _, neighbour_rows = nn.kneighbors(vector[idx], n_neighbors=n_query)
        neighbours = neighbour_rows[0]
    else:
        neighbours = indices[idx]

    # Drop the query item by index rather than by position: when other rows
    # have an identical vector, the query is not guaranteed to come first.
    rec_pos = [int(i) for i in neighbours if i != idx][:k]
    return new.iloc[rec_pos]['title'].tolist()

In [None]:
# Cell 6 - interactive user input
# ask for the query title
movie_name = input("Enter movie name: ").strip()

# ask for k; anything that is not a plain non-negative integer falls back to 5
k = input("How many recommendations? (enter integer, default 5): ").strip()
if k.isdigit():
    k = int(k)
else:
    k = 5

# look up and list the recommendations, numbered from 1
recs = recommend(movie_name, k=k)
print(f"\nTop {k} recommendations for '{movie_name}':")
for i, r in enumerate(recs, start=1):
    print(f"{i}. {r}")

Enter movie name: Gandhi
How many recommendations? (enter integer, default 5): 5

Top 5 recommendations for 'Gandhi':
1. Gandhi, My Father
2. The Wind That Shakes the Barley
3. A Passage to India
4. Guiana 1838
5. Ramanujan


In [None]:
# Cell 7 - metrics
import math

def precision_at_k(predicted, actual, k):
    """Fraction of the k slots filled by distinct relevant items.

    Returns 0.0 when k is 0. Duplicate predictions count once because the
    top-k is reduced to a set before intersecting with `actual`.
    """
    if k == 0:
        return 0.0
    top_k = set(predicted[:k])
    hits = top_k.intersection(actual)
    return len(hits) / k

def recall_at_k(predicted, actual, k):
    """Share of `actual` recovered by the distinct items in the top-k predictions.

    Returns 0.0 when `actual` is empty. The denominator is len(actual).
    """
    if actual:
        found = set(predicted[:k]).intersection(actual)
        return len(found) / len(actual)
    return 0.0

def apk(predicted, actual, k):
    """Average precision at k for a single query.

    Parameters
    ----------
    predicted : sequence
        Ranked predictions, best first. A repeated item is only credited
        at its first occurrence.
    actual : collection
        Relevant (hashable) items; duplicates are ignored.
    k : int
        Cut-off rank.

    Returns
    -------
    float
        Sum of precision@i over the ranks i that hold a new relevant item,
        divided by min(number of distinct relevant items, k). Returns 0.0
        when `actual` is empty or k <= 0 (the latter used to divide by zero).
    """
    if not actual or k <= 0:
        return 0.0
    relevant = set(actual)   # O(1) membership instead of scanning a list
    seen = set()             # items already ranked, replaces predicted[:i-1] scans
    score = 0.0
    hits = 0
    for i, p in enumerate(predicted[:k], start=1):
        if p in relevant and p not in seen:
            hits += 1
            score += hits / i
        seen.add(p)
    return score / min(len(relevant), k)

def ndcg_at_k(predicted, actual, k):
    """Normalised discounted cumulative gain at k with binary relevance.

    Parameters
    ----------
    predicted : sequence
        Ranked predictions, best first.
    actual : collection
        Relevant (hashable) items; duplicates are ignored.
    k : int
        Cut-off rank.

    Returns
    -------
    float
        DCG of the top-k divided by the ideal DCG, in [0, 1]. Returns 0.0
        when `actual` is empty or k <= 0.
    """
    if not actual or k <= 0:
        return 0.0
    relevant = set(actual)
    seen = set()
    dcg = 0.0
    for i, p in enumerate(predicted[:k], start=1):
        # Credit each relevant item once: counting a repeated prediction
        # again would let DCG exceed the ideal DCG (score > 1).
        if p in relevant and p not in seen:
            dcg += 1.0 / math.log2(i + 1)
        seen.add(p)
    # ideal ranking: every distinct relevant item (at most k) at the top
    idcg = sum(1.0 / math.log2(i + 1) for i in range(1, min(len(relevant), k) + 1))
    return dcg / idcg if idcg > 0 else 0.0

In [None]:
# Cell 8 - evaluate on a sample (genre-overlap as proxy relevance)
def evaluate_recommender(k=5, sample_size=500):
    """Print Precision/Recall/MAP/NDCG@k averaged over a random sample of queries.

    Relevance is a proxy: a title counts as relevant to a query when the two
    movies share at least one genre (the query itself is excluded). Because
    that ground-truth set is large compared with k, Recall@k is small by
    construction.

    Parameters
    ----------
    k : int, default 5
        Number of recommendations scored per query.
    sample_size : int, default 500
        Number of query movies drawn from `new` (capped at len(new));
        sampling uses random_state=42.

    Queries whose movie has no genres are skipped. Nothing is returned; the
    averaged metrics are printed.
    """
    n = len(new)
    sample = new.sample(n=min(sample_size, n), random_state=42)

    # Hoisted out of the query loop: one set of genres and one title per row.
    genre_sets = [set(g) for g in new['genres']]
    all_titles = new['title'].tolist()

    precisions, recalls, maps, ndcgs = [], [], [], []

    # `new` has a 0..n-1 index, so the sampled labels are positions in `new`.
    # Using them directly (instead of looking the row up again by title)
    # evaluates the row that was actually sampled even when titles repeat.
    for full_idx in sample.index:
        # ground truth = titles that share at least one genre (exclude self)
        query_genres = genre_sets[full_idx]
        if len(query_genres) == 0:
            continue
        actual_titles = [
            all_titles[i]
            for i in range(n)
            if i != full_idx and genre_sets[i] & query_genres
        ]

        # predicted using precomputed indices
        pred_idxs = indices[full_idx, 1:k+1]
        predicted_titles = [all_titles[i] for i in pred_idxs]

        precisions.append(precision_at_k(predicted_titles, actual_titles, k))
        recalls.append(recall_at_k(predicted_titles, actual_titles, k))
        maps.append(apk(predicted_titles, actual_titles, k))
        ndcgs.append(ndcg_at_k(predicted_titles, actual_titles, k))

    print(f"Evaluated on {len(precisions)} queries (sample_size={len(sample)})")
    if not precisions:
        # np.mean([]) would emit a warning and print nan
        print("No query could be evaluated; metrics are undefined.")
        return
    print(f"Precision@{k}: {np.mean(precisions):.4f}")
    print(f"Recall@{k}:    {np.mean(recalls):.4f}")
    print(f"MAP@{k}:       {np.mean(maps):.4f}")
    print(f"NDCG@{k}:      {np.mean(ndcgs):.4f}")

# run evaluation (example)
evaluate_recommender(k=5, sample_size=500)

Evaluated on 499 queries (sample_size=500)
Precision@5: 0.9154
Recall@5:    0.0027
MAP@5:       0.8890
NDCG@5:      0.9185


In [None]:
# Cell 9 - save model artifacts and quick latency check
# Persist the fitted vectorizer, the precomputed neighbour tables and the
# movie frame. `with` closes (and flushes) each file even if dump() raises;
# the bare open() calls used before never closed their handles.
with open('cv_countvec.pkl', 'wb') as fh:
    pickle.dump(cv, fh)
with open('knn_indices_distances.pkl', 'wb') as fh:
    pickle.dump((indices, distances), fh)
with open('movie_list_df.pkl', 'wb') as fh:
    pickle.dump(new, fh)

# measure latency for one query (transform + kneighbors)
query_title = new.iloc[0]['title']
start = time.time()
qvec = cv.transform([new.iloc[0]['tags']])
d, idxs = nn.kneighbors(qvec, n_neighbors=11)
latency = time.time() - start
print("Sample query:", query_title)
print("Latency (transform + knn): {:.4f} sec".format(latency))

Sample query: Avatar
Latency (transform + knn): 0.0118 sec
