In [3]:
import numpy as np
import pandas as pd
import ast 
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity
import pickle

In [4]:
#Load the movies and credits tables from their CSV files
MOVIES_CSV = 'tmdb_5000_movies.csv'
CREDITS_CSV = 'tmdb_5000_credits.csv'

movies = pd.read_csv(MOVIES_CSV)
credits = pd.read_csv(CREDITS_CSV)

In [5]:
#Join every movie row with the credits rows that carry the same title
#NOTE(review): a title that occurs more than once is paired with every
#credits row of that title - confirm titles are unique or join on an id column
movies = pd.merge(movies, credits, on='title')

In [6]:
#Drop every column the recommender does not use
#Only the identifier, the title and the descriptive columns are kept
KEPT_COLUMNS = ['id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']

movies = movies[KEPT_COLUMNS]

In [7]:
#Remove movies with null or duplicate values
#Reassign instead of using inplace=True: movies is a column slice of the
#merged frame, and mutating a slice in place can raise SettingWithCopyWarning
#drop_duplicates() removes rows that are identical in every column
#reset_index(drop=True) renumbers the rows 0..n-1 so that index labels match
#row positions in arrays built later from this frame
movies = movies.dropna().drop_duplicates().reset_index(drop=True)

In [8]:
#Flatten a stringified list of dictionaries into a plain list of names
#Applied on the genres and keywords columns
def convert_genres_keywords(Object):
    """Return the 'name' value of every dictionary encoded in ``Object``.

    ``Object`` is a string holding a Python literal (a sequence of
    dictionaries); it is parsed with ``ast.literal_eval``.
    """
    return [entry['name'] for entry in ast.literal_eval(Object)]

In [9]:
#Flatten a stringified list of dictionaries into a plain list of names
#Applied on the cast column, keeping only the first 4 actors
def convert_cast(Object):
    """Return the 'name' values of at most the first four encoded dictionaries.

    ``Object`` is a string holding a Python literal (a sequence of
    dictionaries); it is parsed with ``ast.literal_eval``.
    """
    members = ast.literal_eval(Object)
    # zip stops as soon as range(4) is exhausted, which caps the result at 4
    return [member['name'] for _, member in zip(range(4), members)]

In [10]:
#Pick the director out of a stringified list of crew dictionaries
#Applied on the crew column
def convert_crew(Object):
    """Return a one-element list with the first 'Director' name, or ``[]``.

    ``Object`` is a string holding a Python literal (a sequence of
    dictionaries with 'job' and 'name' keys); it is parsed with
    ``ast.literal_eval``. Only the first entry whose job is 'Director' is used.
    """
    for member in ast.literal_eval(Object):
        if member['job'] == 'Director':
            return [member['name']]
    return []

In [11]:
#Pair each list-of-dictionaries column with the converter that flattens it
column_converters = {
    "genres": convert_genres_keywords,
    "keywords": convert_genres_keywords,
    "cast": convert_cast,
    "crew": convert_crew,
}
for column_name in movies.columns:
    converter = column_converters.get(column_name)
    # columns without a registered converter are left untouched
    if converter is not None:
        movies[column_name] = movies[column_name].apply(converter)

In [12]:
#Turn each overview string into a list of words so that it can be
#concatenated with the other list columns when the TAGS are built
movies['overview'] = movies['overview'].map(str.split)

In [13]:
#Preview the first rows after all columns have been converted to lists
movies.head(n=5)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron]
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d...","[Adventure, Fantasy, Action]","[ocean, drug abuse, exotic island, east india ...","[Johnny Depp, Orlando Bloom, Keira Knightley, ...",[Gore Verbinski]
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send...","[Action, Adventure, Crime]","[spy, based on novel, secret agent, sequel, mi...","[Daniel Craig, Christoph Waltz, Léa Seydoux, R...",[Sam Mendes]
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney...","[Action, Crime, Drama, Thriller]","[dc comics, crime fighter, terrorist, secret i...","[Christian Bale, Michael Caine, Gary Oldman, A...",[Christopher Nolan]
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili...","[Action, Adventure, Science Fiction]","[based on novel, mars, medallion, space travel...","[Taylor Kitsch, Lynn Collins, Samantha Morton,...",[Andrew Stanton]


In [14]:
#Strip the spaces inside every genre/keyword/cast/crew entry so that each
#one becomes a single token; two different people who share a first name
#are then no longer treated as the same word by the recommendation system
for column_name in ('genres', 'keywords', 'cast', 'crew'):
    movies[column_name] = movies[column_name].apply(
        lambda entries: [entry.replace(" ", "") for entry in entries]
    )

In [15]:
#Build the TAGS column by concatenating, in this order, the overview,
#genres, keywords, cast and crew lists of every movie
#All of these columns are therefore considered while making recommendations
tag_columns = ['overview', 'genres', 'keywords', 'cast', 'crew']
combined_tags = movies[tag_columns[0]]
for tag_column in tag_columns[1:]:
    combined_tags = combined_tags + movies[tag_column]
movies['tags'] = combined_tags

In [19]:
#Build a slimmer dataframe: the list columns already folded into the tags
#are dropped, every remaining column is kept
new = movies.drop(['overview', 'genres', 'keywords', 'cast', 'crew'], axis='columns')

In [26]:
#Collapse each list of tags back into one space-separated string
new['tags'] = new['tags'].map(" ".join)

In [27]:
#Initialise the count vectorizer
#max_features caps the vocabulary size; stop_words="english" makes the
#vectorizer ignore common English words
vectorizer_options = {"max_features": 6000, "stop_words": "english"}
cv = CountVectorizer(**vectorizer_options)

In [30]:
#Fit the vectorizer on the tags, then expand the sparse result to a dense array
tag_counts = cv.fit_transform(new['tags'])
vectors = tag_counts.toarray()

In [31]:
#Stemming helper built on a single shared PorterStemmer instance
ps = PorterStemmer()
def stem_text(text):
    """Return ``text`` with every whitespace-separated word replaced by its stem.

    The stemmed words are joined back together with single spaces.
    """
    return " ".join(ps.stem(word) for word in text.split())

In [32]:
#Replace every tags string with its stemmed version
new['tags'] = new['tags'].map(stem_text)

In [33]:
#The vectors computed earlier were built from the tags BEFORE stemming was
#applied, so the stemming step had no effect on the similarity scores
#Vectorize the (now stemmed) tags again so that the similarity matrix
#actually reflects the stemmed text
vectors = cv.fit_transform(new['tags']).toarray()

#Pairwise cosine similarity between the tag vectors of all movies
#similarity[i][j] is the score between the movies at row positions i and j
similarity = cosine_similarity(vectors)

In [44]:
#Recommend function based on tags, which prints the most similar movies
#(10 by default) together with their ids
def recommend(movie, top_n=10):
    """Print the title and id of the ``top_n`` movies most similar to ``movie``.

    Parameters
    ----------
    movie : str
        Exact title to look up in ``new['title']``. When several rows share
        the title, the first one is used.
    top_n : int, default 10
        Number of recommendations to print.

    Raises
    ------
    IndexError
        If no row of ``new`` has the requested title.
    """
    # The similarity matrix is addressed by row POSITION. The index labels of
    # `new` may have gaps (rows were dropped earlier), so resolve the title
    # to a position instead of an index label.
    matches = np.flatnonzero((new['title'] == movie).to_numpy())
    if matches.size == 0:
        raise IndexError(f"movie {movie!r} was not found in the dataset")
    position = int(matches[0])

    # Enumerate before sorting so every score keeps the row position it
    # belongs to; that position is needed to look the movie up afterwards.
    ranked = sorted(enumerate(similarity[position]), key=lambda pair: pair[1], reverse=True)

    # Exclude the queried row explicitly rather than assuming it sorts first:
    # another row with an identical vector could otherwise take its place.
    neighbours = [row for row, _ in ranked if row != position][:top_n]

    for row in neighbours:
        print(new['title'].iloc[row])
        print(new['id'].iloc[row])

In [45]:
#Convert the movies dataframe to a dictionary and store it as a binary file
#to transfer it to the streamlit website
#The with-block closes the file handle, so the data is flushed to disk even
#if pickling fails part-way
with open('movies_list.pkl', 'wb') as movies_file:
    pickle.dump(movies.to_dict(), movies_file)

In [46]:
#Dump the similarity matrix as a binary file to be used on the streamlit website
#The with-block closes the file handle, so the data is flushed to disk even
#if pickling fails part-way
with open('similarity.pkl', 'wb') as similarity_file:
    pickle.dump(similarity, similarity_file)