
# Movie Recommendation System

This notebook demonstrates the process of building a movie recommendation system by loading, cleaning, merging, and transforming movie datasets using Python. We will perform the following steps:
1. Import necessary libraries.
2. Load datasets.
3. Select relevant columns.
4. Handle missing values.
5. Merge datasets.
6. Convert JSON-like strings to lists.
7. Extract specific information (e.g., director, cast names).
8. Build the recommendation system using cosine similarity.
9. Display the processed data and results.

Let's get started!


In [None]:

# Step 1: Import necessary libraries
import numpy as np
import pandas as pd
import ast
import nltk
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle


In [None]:

# Step 2: Load datasets
# Read the credits, keywords, and metadata CSV files from the working directory.
# `usecols` restricts each read to the columns this notebook actually uses, so
# the remaining columns of each file are never loaded into the DataFrames.
credits_df = pd.read_csv("credits.csv", usecols=['id', 'cast', 'crew'])
keywords_df = pd.read_csv("keywords.csv", usecols=['id', 'keywords'])
movies_metadata_df = pd.read_csv(
    "movies_metadata.csv",
    usecols=['id', 'genres', 'original_title', 'overview', 'production_companies'],
)


In [None]:

# Step 3: Select relevant columns
# Keep only the columns needed for the recommender.
# The metadata selection is taken as an explicit copy so that later cells can
# modify it without pandas emitting a SettingWithCopyWarning (assigning to a
# column of a frame obtained by selection is what triggers that warning).
movies_metadata_df = movies_metadata_df[
    ['id', 'genres', 'original_title', 'overview', 'production_companies']
].copy()
credits_df = credits_df[['id', 'cast', 'crew']]
keywords_df = keywords_df[['id', 'keywords']]
print(movies_metadata_df.shape)


In [None]:

# Step 4: Handle missing values
# Report the number of null values per column in each dataset
print(movies_metadata_df.isnull().sum())
print(credits_df.isnull().sum())
print(keywords_df.isnull().sum())

# Normalise the metadata 'id' column so it can serve as a merge key.
# Coercion must come first: pd.to_numeric(errors='coerce') turns every
# unparseable id into NaN, so filling missing values *before* it would still
# leave NaNs behind. Rows without a usable id are dropped instead of being
# given a placeholder such as 0, which could match an unrelated row in a merge.
# The surviving ids are cast to integers.
# NOTE(review): assumes the ids in the other datasets are integers as well - confirm.
# Building a new frame (instead of assigning into the existing one) keeps this
# cell safe to re-run.
movies_metadata_df = (
    movies_metadata_df
    .assign(id=pd.to_numeric(movies_metadata_df['id'], errors='coerce'))
    .dropna(subset=['id'])
    .astype({'id': 'int64'})
)


In [None]:

# Step 5: Merge datasets
# Combine keywords, metadata and credits into one DataFrame with inner joins on 'id'.
# Each input is de-duplicated on 'id' first (keeping the first occurrence):
# a merge pairs every matching row on the left with every matching row on the
# right, so an id repeated in several inputs would be multiplied into
# duplicate movie rows.
merged_df = (
    keywords_df.drop_duplicates(subset='id')
    .merge(movies_metadata_df.drop_duplicates(subset='id'), on='id')
    .merge(credits_df.drop_duplicates(subset='id'), on='id')
)

# Replace missing values in the JSON-like columns with the string '[]' so that
# ast.literal_eval parses them as empty lists in the following steps.
# 'cast' and 'crew' are included because they are parsed the same way as
# 'genres' and 'keywords'.
merged_df = merged_df.fillna(
    dict.fromkeys(['genres', 'keywords', 'production_companies', 'cast', 'crew'], '[]')
)


In [None]:

# Step 6: Convert JSON-like strings to lists
def convert_to_list(text):
    """Return the 'name' of every entry in a JSON-like list string.

    Parameters
    ----------
    text : str
        String holding a Python-literal list of dicts, for example
        "[{'id': 18, 'name': 'Drama'}]". ``None`` and NaN are treated as
        missing values.

    Returns
    -------
    list
        The 'name' values in their original order; an empty list when
        ``text`` is missing.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when ``text`` is not a valid
        Python literal.
    KeyError
        When an entry has no 'name' key.
    """
    # NaN is the only float that is not equal to itself. Handling missing
    # cells here keeps ast.literal_eval from raising ValueError on them.
    if text is None or (isinstance(text, float) and text != text):
        return []
    return [element['name'] for element in ast.literal_eval(text)]

# Parse the JSON-like strings of the genre and keyword columns into Python lists of names
merged_df['genres'] = merged_df['genres'].map(convert_to_list)
merged_df['keywords'] = merged_df['keywords'].map(convert_to_list)


In [None]:

# Step 7: Extract specific information
def extract_director(crew_data):
    """Return the name of the first crew member whose job is 'Director'.

    Parameters
    ----------
    crew_data : str
        String holding a Python-literal list of dicts, each with at least the
        keys 'job' and 'name'. ``None`` and NaN are treated as missing values.

    Returns
    -------
    list
        A one-element list with the director's name, or an empty list when no
        entry has the job 'Director' or ``crew_data`` is missing. A list is
        returned (rather than a bare string) so the result can be concatenated
        with the other list-valued feature columns.

    Raises
    ------
    ValueError, SyntaxError
        Propagated from ``ast.literal_eval`` when ``crew_data`` is not a valid
        Python literal.
    KeyError
        When an inspected entry lacks the 'job' or 'name' key.
    """
    # NaN is the only float that is not equal to itself. Handling missing
    # cells here keeps ast.literal_eval from raising ValueError on them.
    if crew_data is None or (isinstance(crew_data, float) and crew_data != crew_data):
        return []
    for member in ast.literal_eval(crew_data):
        if member['job'] == 'Director':
            # Only the first director is kept.
            return [member['name']]
    return []

# Replace each raw crew string with a list holding its director's name (if any)
merged_df['crew'] = merged_df['crew'].map(extract_director)

def extract_cast_names(cast_data, max_names=5):
    """Return the names of the first ``max_names`` cast members.

    Parameters
    ----------
    cast_data : str
        String holding a Python-literal list of dicts, each with at least a
        'name' key. ``None`` and NaN are treated as missing values.
    max_names : int, optional
        Maximum number of names to return, taken from the start of the list.
        Must be non-negative. Defaults to 5.

    Returns
    -------
    list
        Up to ``max_names`` names in their original order; an empty list when
        ``cast_data`` is missing.

    Raises
    ------
    ValueError
        When ``max_names`` is negative, or (from ``ast.literal_eval``) when
        ``cast_data`` is not a valid Python literal.
    SyntaxError
        Propagated from ``ast.literal_eval`` for syntactically invalid input.
    KeyError
        When one of the selected entries has no 'name' key.
    """
    if max_names < 0:
        raise ValueError("max_names must be non-negative")
    # NaN is the only float that is not equal to itself. Handling missing
    # cells here keeps ast.literal_eval from raising ValueError on them.
    if cast_data is None or (isinstance(cast_data, float) and cast_data != cast_data):
        return []
    # Slice before reading 'name' so entries beyond the limit are never inspected.
    return [member['name'] for member in ast.literal_eval(cast_data)[:max_names]]

# Replace each raw cast string with the list of its leading cast names
merged_df['cast'] = merged_df['cast'].map(extract_cast_names)


In [None]:

# Step 8: Build the recommendation system using cosine similarity
def _as_single_token(name):
    """Drop every character that is not a letter or digit so `name` forms one token."""
    return ''.join(char for char in name if char.isalnum())


# Combine genres, keywords, cast and director into one text field per movie.
# Each name is collapsed into a single token first. CountVectorizer's default
# token pattern extracts runs of word characters, so a space-joined
# "Tom Hanks" would become the independent tokens "tom" and "hanks", and any
# two movies featuring *different* people called Tom would look similar.
merged_df['combined_features'] = (
    merged_df['genres'] + merged_df['keywords'] + merged_df['cast'] + merged_df['crew']
).apply(lambda names: ' '.join(_as_single_token(name) for name in names))

# Vectorizing the combined features column (one row of token counts per movie)
vectorizer = CountVectorizer()
count_matrix = vectorizer.fit_transform(merged_df['combined_features'])

# Pairwise cosine similarity between all movies.
# With default arguments the result is a dense (n_movies x n_movies) array,
# so memory use grows quadratically with the number of rows in merged_df.
cosine_sim_matrix = cosine_similarity(count_matrix, count_matrix)

# Function to get movie recommendations based on cosine similarity
def get_movie_recommendations(title, cosine_sim=cosine_sim_matrix, top_n=10):
    """Return the titles of the movies most similar to `title`.

    Parameters
    ----------
    title : str
        Value to look up in ``merged_df['original_title']`` (exact match).
        When several rows share the title, the first one is used.
    cosine_sim : array-like, optional
        Square similarity matrix whose rows and columns follow the row order
        of ``merged_df``. The default is bound when this function is defined,
        so re-run this cell after recomputing ``cosine_sim_matrix``.
    top_n : int, optional
        Number of recommendations to return. Must be non-negative.
        Defaults to 10.

    Returns
    -------
    pandas.Series
        Up to ``top_n`` values of ``merged_df['original_title']``, ordered
        from most to least similar. The queried row itself is never included.

    Raises
    ------
    IndexError
        When no row of ``merged_df`` has the given title.
    ValueError
        When ``top_n`` is negative.
    """
    if top_n < 0:
        raise ValueError("top_n must be non-negative")

    # Work with the row *position* rather than the index label, so the lookup
    # into the similarity matrix stays correct whatever index merged_df has.
    positions = np.flatnonzero((merged_df['original_title'] == title).to_numpy())
    if positions.size == 0:
        raise IndexError(f"Movie title not found: {title!r}")
    idx = int(positions[0])

    scores = np.asarray(cosine_sim[idx]).ravel()
    # A stable sort on the negated scores ranks from most to least similar
    # while keeping tied movies in their original row order.
    ranked = np.argsort(-scores, kind='stable')
    # Remove the queried movie explicitly: it is not guaranteed to rank first
    # (another movie can tie with it, and a movie without any features scores
    # 0 against itself), so dropping "the first result" could drop the wrong row.
    ranked = ranked[ranked != idx][:top_n]
    return merged_df['original_title'].iloc[ranked]

# Example: print the recommendations for one title
print(get_movie_recommendations(title='The Dark Knight'))


In [None]:

# Step 9: Display the processed data
# Preview the first five rows of the processed DataFrame to check the transformations
merged_df.head(n=5)
