# Movie Recommendation System
### This notebook contains the implementation of a content-based movie recommendation system.

## Import Libraries
Import necessary libraries like pandas, numpy, and scikit-learn.

In [2]:
import pandas as pd  # For data manipulation
import numpy as np  # For numerical operations
from sklearn.feature_extraction.text import CountVectorizer  # For text vectorization
from sklearn.metrics.pairwise import cosine_similarity  # For computing similarity

## Load and Merge Datasets
Load the movie and credit datasets, and merge them based on the title column.


In [5]:
# Load datasets
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

# Merge datasets on the title column
movies = movies.merge(credits, on='title')


In [6]:
movies.head(2)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,movie_id,cast,crew
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,19995,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,285,"[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


## Data Preprocessing
### Clean and preprocess the data to prepare it for feature extraction.

In [7]:
movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']] # Select Relevant Columns

In [8]:
movies.head(2)

Unnamed: 0,movie_id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
1,285,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...","[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...","[{""cast_id"": 4, ""character"": ""Captain Jack Spa...","[{""credit_id"": ""52fe4232c3a36847f800b579"", ""de..."


In [10]:
movies.dropna(inplace=True) # Drop missing values

In [11]:
movies.drop_duplicates(inplace=True) # Check and drop duplicates

In [12]:
movies.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [14]:
print(movies.dtypes)


movie_id     int64
title       object
overview    object
genres      object
keywords    object
cast        object
crew        object
dtype: object


## Feature Engineering
Extract meaningful features and combine them into a single column for processing.


In [17]:
import ast

In [18]:
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)


In [21]:
#Merge overview, genres, keywords, cast, and crew into a single 'tags' column.
# Convert all columns to strings
movies['overview'] = movies['overview'].apply(lambda x: "" if isinstance(x, float) else x)  # Handle NaN
movies['genres'] = movies['genres'].apply(lambda x: " ".join(x) if isinstance(x, list) else "")
movies['keywords'] = movies['keywords'].apply(lambda x: " ".join(x) if isinstance(x, list) else "")
movies['cast'] = movies['cast'].apply(lambda x: " ".join(x) if isinstance(x, list) else "")
movies['crew'] = movies['crew'].apply(lambda x: " ".join(x) if isinstance(x, list) else "")

# Combine all columns into a single 'tags' column
movies['tags'] = (
    movies['overview'] + " " +
    movies['genres'] + " " +
    movies['keywords'] + " " +
    movies['cast'] + " " +
    movies['crew']
)

In [23]:
#Convert the 'tags' column into numerical vectors using CountVectorizer.
# Text Vectorization 
from sklearn.feature_extraction.text import CountVectorizer 
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(movies['tags']).toarray()

In [24]:
# Compute Similarity - Calculate cosine similarity between movie vectors.
from sklearn.metrics.pairwise import cosine_similarity
similarity = cosine_similarity(vectors)


In [25]:
# Recommendation Function - Define a function to recommend movies based on cosine similarity.
def recommend(movie):
    movie_index = movies[movies['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    for i in movies_list:
        print(movies.iloc[i[0]].title)


In [26]:
# Save Processed Data - Save the preprocessed data and similarity matrix as pickle files.
import pickle
pickle.dump(movies, open('movies.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))

## TESTING

In [32]:
def recommend(movie, movies, similarity):
    movie_index = movies[movies['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]
    recommended = [movies.iloc[i[0]].title for i in movies_list]
    return recommended



In [33]:
# Example Prediction
user_input = "Avatar"  # A movie the user likes
predicted_movies = recommend(user_input, movies, similarity)
print(f"Recommended Movies for '{user_input}': {predicted_movies}")

Recommended Movies for 'Avatar': ['Beowulf', 'Apollo 18', 'Tears of the Sun', 'The American', 'Aliens vs Predator: Requiem']


In [35]:
def precision_recall_f1(ground_truth, predicted):
    """
    Calculate precision, recall, and F1-score.
    
    Parameters:
        ground_truth (list): List of actual movies the user liked.
        predicted (list): List of movies recommended by the system.
    
    Returns:
        tuple: Precision, recall, and F1-score.
    """
    true_positives = len(set(ground_truth) & set(predicted))  # Common items in both lists
    precision = true_positives / len(predicted) if len(predicted) > 0 else 0
    recall = true_positives / len(ground_truth) if len(ground_truth) > 0 else 0
    f1_score = (2 * precision * recall) / (precision + recall) if (precision + recall) > 0 else 0

    return precision, recall, f1_score


In [37]:
# Step 1: Predict Recommendations
user_input = "Avatar"
predicted_movies = recommend(user_input, movies, similarity)
print(f"Predicted Movies: {predicted_movies}")

# Step 2: Ground Truth (User's actual preferences)
ground_truth = ["Apollo 18", "Beowulf", "Tears of the Sun", "Interstellar"]

#Step 3: Evaluate Predictions
precision, recall, f1_score = precision_recall_f1(ground_truth, predicted_movies)
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1_score}")


Predicted Movies: ['Beowulf', 'Apollo 18', 'Tears of the Sun', 'The American', 'Aliens vs Predator: Requiem']
Precision: 0.6
Recall: 0.75
F1-Score: 0.6666666666666665
