In [2]:
# Import libraries
import pandas as pd
import numpy as np 
import ast
import pickle

from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Import databases
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [4]:
# Join tables

movies = movies.merge(credits, on='title')

In [5]:
# As we want to create a recommended movie system, we have choice the columns that make sense
'''
genre 
id 
keywords 
title
overview
cast 
crew
'''

cols = ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']
movies = movies[cols]

In [6]:
# Remove Na and duplicates
movies.isna().sum()
movies.dropna()
movies.duplicated().sum()
movies.drop_duplicates(inplace=True)

In [7]:
# we will process the data 
# Starting with the genres, for example: 
# For the movie Avatar we have the following: 
'''
'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'''
# We want it to look like this: [Action, Adventure, Fantasy, Science Fiction]
# we to do the same for keywords

def convert(obj): 
    genre = []
    for i in ast.literal_eval(obj): 
        genre.append(i['name'])
    return genre

# For the cast we will take the first three artists

def take3(obj):
    count = 0
    artists = []
    for i in ast.literal_eval(obj):
        artists.append(i['name'])
        count += 1
        if count >= 3: 
            break
    return artists

# For the crew we only want the director

def take_director(obj): 
    director = []
    for i in ast.literal_eval(obj): 
        if i['job'] == 'Director': 
            director.append(i['name'])
    return director

In [8]:
movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)
movies['cast'] = movies['cast'].apply(take3)
movies['crew'] = movies['crew'].apply(take_director)
# For an overview, we have to break it down word by word
movies['overview'] = movies['overview'].apply(lambda x: str(x).split())

In [9]:
# We also need to remove spaces between words
movies['genres'] = movies['genres'].apply(lambda x: [i.replace(' ', '') for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(' ', '') for i in x])
movies['overview'] = movies['overview'].apply(lambda x: [i.replace(' ', '') for i in x])
movies['cast'] = movies['cast'].apply(lambda x: [i.replace(' ', '') for i in x])
movies['crew'] = movies['crew'].apply(lambda x: [i.replace(' ', '') for i in x])

In [10]:
# let's create a tag with the columns genres, keywords, cast, crew
movies['tag'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']
df = movies[['movie_id', 'title', 'tag']]

In [11]:
# Let's remove the commas
df['tag'] = df['tag'].apply(lambda x: " ".join(str(i) for i in x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tag'] = df['tag'].apply(lambda x: " ".join(str(i) for i in x))


In [12]:
# Let's lower the tag
df['tag'] = df['tag'].apply(lambda x: x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tag'] = df['tag'].apply(lambda x: x.lower())


In [13]:
# Before vectorizing the tag, there are very similar words, such as beloved, amorous and love, which they replace with love
ps = PorterStemmer()

def stem(text): 
    word = []
    for i in text.split(): 
        word.append(ps.stem(i))
    return " ".join(word)

In [14]:
df['tag'] = df['tag'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tag'] = df['tag'].apply(stem)


In [15]:
# vectorizing the tag so we can calculate the similarity
cv = CountVectorizer(max_features=5000, stop_words='english')
vectors = cv.fit_transform(df['tag']).toarray()

In [16]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [17]:
# Now we go calculate the similarity between movies using the cosine similarity
similarity = cosine_similarity(vectors)

In [18]:
def recomended(movie): 
    movie_index = df[df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movie_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:6]

    movie_recommended_list = []
    for i in movie_list: 
        movie_recommended_list.append(df.iloc[i[0]].title)
    return movie_recommended_list

In [19]:
recomended('Avatar')

['Aliens vs Predator: Requiem',
 'Aliens',
 'Falcon Rising',
 'Independence Day',
 'Titan A.E.']

In [20]:
pickle.dump(df, open('movies.pkl', 'wb'))

In [22]:
import gzip

with gzip.open('similarity.pkl.gz', 'wb') as file: 
    pickle.dump(similarity, file)