In [2]:
import numpy as np
import pandas as pd
import ast
import nltk
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

# Load the datasets
movies = pd.read_csv('data/tmdb_5000_movies.csv')
credits = pd.read_csv('data/tmdb_5000_credits.csv')

# Merge movies and credits on the 'title' column
movies = movies.merge(credits, on='title')

# Keep only the important columns
movies = movies[['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']]

# Drop rows with missing values
movies.dropna(inplace=True)

# Remove duplicates if any
movies.drop_duplicates(inplace=True)

# Convert stringified lists into actual lists using 'ast.literal_eval'
def convert(text):
    return [i['name'] for i in ast.literal_eval(text)]

# Apply the conversion function to the 'genres' and 'keywords' columns
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

# Handle 'cast' column by keeping only the top 3 cast members
def convert_cast(text):
    return [i['name'] for i in ast.literal_eval(text)[:3]]

movies['cast'] = movies['cast'].apply(convert_cast)

# Extract the director's name from the 'crew' column
def fetch_director(text):
    for i in ast.literal_eval(text):
        if i['job'] == 'Director':
            return [i['name']]
    return []

movies['crew'] = movies['crew'].apply(fetch_director)

# Convert 'overview' column into a list of words
movies['overview'] = movies['overview'].apply(lambda x: x.split())

# Remove spaces from names in 'cast', 'crew', 'genres', and 'keywords'
def remove_space(L):
    return [i.replace(" ", "") for i in L]

movies['cast'] = movies['cast'].apply(remove_space)
movies['crew'] = movies['crew'].apply(remove_space)
movies['genres'] = movies['genres'].apply(remove_space)
movies['keywords'] = movies['keywords'].apply(remove_space)

# Concatenate all relevant columns into a single 'tags' column
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

# Create a new dataframe with 'movie_id', 'title', and 'tags' columns
new_df = movies[['movie_id', 'title', 'tags']]

# Convert the list of tags into a single string for each movie
new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))

# Convert all tags to lowercase
new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())

# Initialize the Porter Stemmer for stemming
ps = PorterStemmer()

# Define a function to apply stemming to the 'tags' column
def stems(text):
    return " ".join([ps.stem(word) for word in text.split()])

# Apply stemming to the 'tags' column
new_df['tags'] = new_df['tags'].apply(stems)

# Vectorize the 'tags' column using CountVectorizer
cv = CountVectorizer(max_features=5000, stop_words='english')
vector = cv.fit_transform(new_df['tags']).toarray()

# Compute the cosine similarity between the vectors
similarity = cosine_similarity(vector)

# Function to recommend movies based on similarity
def recommend(movie):
    index = new_df[new_df['title'] == movie].index[0]
    distances = sorted(list(enumerate(similarity[index])), reverse=True, key=lambda x: x[1])
    for i in distances[1:6]:
        print(new_df.iloc[i[0]].title)

# Save the data and model using pickle
pickle.dump(new_df, open('artifacts/movie_list.pkl', 'wb'))
pickle.dump(similarity, open('artifacts/similarity.pkl', 'wb'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: " ".join(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x: x.lower())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stems)


In [3]:
# Example usage
recommend('Spider-Man 2')

# Save the data and model using pickle
pickle.dump(new_df, open('artifacts/movie_list.pkl', 'wb'))
pickle.dump(similarity, open('artifacts/similarity.pkl', 'wb'))

Spider-Man 3
Spider-Man
The Amazing Spider-Man
Iron Man 2
Superman


██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████