In [1]:
import pandas as pd
import numpy as nd
import ast

In [3]:
# Read the two TMDB 5000 CSV exports from the current working directory.
credits = pd.read_csv(filepath_or_buffer='tmdb_5000_credits.csv')
movies = pd.read_csv(filepath_or_buffer='tmdb_5000_movies.csv')

In [4]:
# Inner-join the movie table with the credits table on the shared 'title' column.
# NOTE(review): a join on title yields one row per matching pair, so titles that
# are not unique in both tables produce extra rows — confirm this is acceptable.
movies = pd.merge(
    left=movies,
    right=credits,
    how='inner',
    left_on='title',
    right_on='title',
)

In [5]:
# Keep only the columns used to build the recommender features.
# The explicit .copy() makes the result an independent frame, so the column
# assignments in the following cells do not raise SettingWithCopyWarning.
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']].copy()

In [6]:
def convert(obj):
  """Extract the 'name' value of every entry in a stringified list of dicts.

  Args:
    obj: Text holding a Python-literal list of dicts, each with a 'name' key.

  Returns:
    A list with the 'name' value of each dict, in the original order.
  """
  return [entry['name'] for entry in ast.literal_eval(obj)]

In [7]:
# Replace each stringified genre list with a plain list of genre names.
movies['genres'] = movies['genres'].map(convert)

In [8]:
# Replace each stringified keyword list with a plain list of keyword names.
movies['keywords'] = movies['keywords'].map(convert)

In [9]:
# Keep the names of the first three cast entries only.
movies['cast'] = movies['cast'].map(
    lambda raw: [member['name'] for member in ast.literal_eval(raw)[:3]]
)

In [10]:
# Keep only the names of crew entries whose job is exactly 'Director'.
movies['crew'] = movies['crew'].map(
    lambda raw: [
        member['name']
        for member in ast.literal_eval(raw)
        if member['job'] == 'Director'
    ]
)

In [11]:
# Row-wise list concatenation: genres, then keywords, then cast, then directors.
movies['tags'] = (
    movies['genres']
    + movies['keywords']
    + movies['cast']
    + movies['crew']
)

In [12]:
# Reduce the frame to the columns needed downstream.
# The explicit .copy() makes the result an independent frame, so the later
# assignment to movies['tags'] does not raise SettingWithCopyWarning.
movies = movies[['movie_id', 'title', 'tags', 'overview']].copy()

In [13]:
# Show the first five rows as a sanity check of the assembled frame.
movies.head(n=5)

Unnamed: 0,movie_id,title,tags,overview
0,19995,Avatar,"[Action, Adventure, Fantasy, Science Fiction, ...","In the 22nd century, a paraplegic Marine is di..."
1,285,Pirates of the Caribbean: At World's End,"[Adventure, Fantasy, Action, ocean, drug abuse...","Captain Barbossa, long believed to be dead, ha..."
2,206647,Spectre,"[Action, Adventure, Crime, spy, based on novel...",A cryptic message from Bond’s past sends him o...
3,49026,The Dark Knight Rises,"[Action, Crime, Drama, Thriller, dc comics, cr...",Following the death of District Attorney Harve...
4,49529,John Carter,"[Action, Adventure, Science Fiction, based on ...","John Carter is a war-weary, former military ca..."


In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn every tags entry into text: missing values become '' and everything
# else is converted with str().
movies['tags'] = movies['tags'].fillna(value='').astype(dtype=str)

# Build TF-IDF features from the tag text, ignoring English stop words.
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(raw_documents=movies['tags'])

In [15]:

# Block 1: TF-IDF Vectorization
# This section converts a collection of raw documents into a matrix of TF-IDF features.
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer that drops common English stop words before weighting terms.
tfidf = TfidfVectorizer(stop_words='english')

# Learn the vocabulary from movies['tags'] and encode it in one step.
# Each row of the resulting matrix corresponds to one movie, each column to one term.
tfidf_matrix = tfidf.fit_transform(raw_documents=movies['tags'])

# Block 2: Cosine Similarity
# This part calculates the cosine similarity between all movies based on their TF-IDF vectors.
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity of every movie against every other movie.
# With a single argument the matrix is compared against itself, so entry
# (i, j) is the similarity between movie i and movie j.
cosine_sim = cosine_similarity(tfidf_matrix)

# Block 3: Recommendation Function
# This function takes a movie title and the cosine similarity matrix to recommend similar movies.
def get_recommendations(title, cosine_sim=cosine_sim, top_n=10):
    """Return the titles of the movies most similar to ``title``.

    Args:
        title: Exact title to look up in ``movies['title']``. When several
            rows share the title, the first one is used.
        cosine_sim: Similarity matrix indexed by row position; row ``i`` is
            expected to hold the scores of ``movies`` row ``i`` against every
            row. Defaults to the module-level ``cosine_sim``.
        top_n: Number of recommendations to return (default 10).

    Returns:
        A pandas Series of recommended titles, best match first.

    Raises:
        IndexError: If no row in ``movies`` has the given title.
    """
    # Work with row *positions*, not index labels: both cosine_sim and .iloc
    # below are positional, so this stays correct for any DataFrame index.
    positions = (movies['title'] == title).to_numpy().nonzero()[0]
    if len(positions) == 0:
        raise IndexError(f"no movie titled {title!r} found in movies['title']")
    idx = int(positions[0])

    # Pair each row position with its score, leaving the query movie out
    # explicitly. Dropping "the first sorted entry" instead is wrong whenever
    # another row ties with the query (e.g. a duplicate row) or the query's
    # self-similarity is not the maximum (e.g. an all-zero tag vector).
    sim_scores = [
        (pos, score)
        for pos, score in enumerate(cosine_sim[idx])
        if pos != idx
    ]

    # Highest similarity first; the sort is stable, so ties keep row order.
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    movie_indices = [pos for pos, _ in sim_scores[:top_n]]

    return movies['title'].iloc[movie_indices]


In [16]:
import pickle
# Persist the movie table together with the similarity matrix as one tuple.
with open('movie_data.pkl', mode='wb') as file:
    pickle.Pickler(file).dump((movies, cosine_sim))