### DATA CLEANING AND PREPROCESSING

In [1]:
# importing tmdb movies dataset both credits and movies

import pandas as pd

movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

print(movies.shape)
print(credits.shape)

(4803, 20)
(4803, 4)


In [2]:
# merging both the dataframes

movies = movies.merge(credits, on='title')

print(movies.shape) # run only one time because it will change the shape of the dataframe

(4809, 23)


In [3]:
# columns which does not need to dropped
# movie_id, title, overview, genres, keywords, cast, crew

needed = ['movie_id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew']

not_needed = [col for col in movies.columns if col not in needed]

movies.drop(columns=not_needed, inplace=True)


In [4]:
# checking null values

print("Number of null values in each column:\n", movies.isnull().sum())

Number of null values in each column:
 genres      0
keywords    0
overview    3
title       0
movie_id    0
cast        0
crew        0
dtype: int64


In [5]:
# removing the null value

movies = movies.dropna()

print(movies.shape)

# checking duplicated values

movies.duplicated().sum()

(4806, 7)


0

In [6]:
# creating convert function for genre column
import ast

def convert(genres):
    genre = []
    for i in ast.literal_eval(genres): # ast.literal_eval() will convert string list to list
        genre.append(i['name'])
    return genre

# creating convert_cast function for cast column

def convert_cast(casts):
    cast = []
    counter = 0 # we only need first 3 cast, don't need all
    for i in ast.literal_eval(casts):
        if counter < 3:
            cast.append(i['name'])
            counter += 1
        else:
            break
    return cast

# creating convert_crew function for crew column

def convert_crew(crews):
    crew = []
    for i in ast.literal_eval(crews):
        if i['job'] == 'Director': # we only need director name
            crew.append(i['name'])
    return crew

# creating convert_keywords function for keywords column

def convert_keywords(keywords):
    keyword = []
    for i in ast.literal_eval(keywords):
        keyword.append(i['name'])
    return keyword


In [7]:
# changin genres column of dataframe to list of genres

movies['genres'] = movies['genres'].apply(convert)

# changing the cast column of dataframe to list of cast

movies['cast'] = movies['cast'].apply(convert_cast)

# changin the crew column of dataframe to list of crew

movies['crew'] = movies['crew'].apply(convert_crew)

# changing the keywords column of dataframe to list of keywords

movies['keywords'] = movies['keywords'].apply(convert_keywords)

# converting the overview column of dataframe to list of overview

movies['overview'] = movies['overview'].apply(lambda x: x.split())

In [8]:
# removing in between spaces from each values in columns

movies['genres'] = movies['genres'].apply(lambda x: [i.replace(" ", "") for i in x])

movies['keywords'] = movies['keywords'].apply(lambda x: [i.replace(" ", "") for i in x])

movies['cast'] = movies['cast'].apply(lambda x: [i.replace(" ", "") for i in x])

movies['crew'] = movies['crew'].apply(lambda x: [i.replace(" ", "") for i in x])


In [9]:
# creating new column tags in addition of all crew , cast, overview, genres, keywords

movies['tags'] = movies['cast'] + movies['crew'] + movies['genres'] + movies['keywords'] + movies['overview']

In [10]:
# creating newframe only title, movie_id, tags

newframe = movies[['movie_id', 'title', 'tags']]

# converting tags column from list to string

newframe['tags'] = newframe['tags'].apply(lambda x: (" ".join(x)).lower())

newframe.head(10)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newframe['tags'] = newframe['tags'].apply(lambda x: (" ".join(x)).lower())


Unnamed: 0,movie_id,title,tags
0,19995,Avatar,samworthington zoesaldana sigourneyweaver jame...
1,285,Pirates of the Caribbean: At World's End,johnnydepp orlandobloom keiraknightley gorever...
2,206647,Spectre,danielcraig christophwaltz léaseydoux sammende...
3,49026,The Dark Knight Rises,christianbale michaelcaine garyoldman christop...
4,49529,John Carter,taylorkitsch lynncollins samanthamorton andrew...
5,559,Spider-Man 3,tobeymaguire kirstendunst jamesfranco samraimi...
6,38757,Tangled,zacharylevi mandymoore donnamurphy byronhoward...
7,99861,Avengers: Age of Ultron,robertdowneyjr. chrishemsworth markruffalo jos...
8,767,Harry Potter and the Half-Blood Prince,danielradcliffe rupertgrint emmawatson davidya...
9,209112,Batman v Superman: Dawn of Justice,benaffleck henrycavill galgadot zacksnyder act...


In [11]:
# first we need to use stemmer to stem the words

from nltk.stem import PorterStemmer

stemmer = PorterStemmer()

def stem_text(text):
    return " ".join([stemmer.stem(word) for word in text.split()])

newframe['tags'] = newframe['tags'].apply(stem_text)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newframe['tags'] = newframe['tags'].apply(stem_text)


In [12]:
# using CountVectorizer to convert the text collection to a matrix

from sklearn.feature_extraction.text import CountVectorizer

# create a CountVectorizer object with English stop words

count = CountVectorizer(max_features=5000, stop_words='english')

vectors = count.fit_transform(newframe['tags']).toarray()

In [13]:
# calculating cosine similarity between movies

from sklearn.metrics.pairwise import cosine_similarity

cosine_sim = cosine_similarity(vectors)

print(cosine_sim)

[[1.         0.08346223 0.0860309  ... 0.04499213 0.         0.        ]
 [0.08346223 1.         0.06063391 ... 0.02378257 0.         0.02615329]
 [0.0860309  0.06063391 1.         ... 0.02451452 0.         0.        ]
 ...
 [0.04499213 0.02378257 0.02451452 ... 1.         0.03962144 0.04229549]
 [0.         0.         0.         ... 0.03962144 1.         0.08714204]
 [0.         0.02615329 0.         ... 0.04229549 0.08714204 1.        ]]


  ret = a @ b


In [14]:
# creating the top 10 movie recommendation system using the cosine similarity

# here we have used movies frame instead of newframe , but its not a problem.

def similar_movies(movie_name):
    movie_name = movie_name.lower()
    newframe['title'] = newframe['title'].str.lower()
    
    if movie_name not in newframe['title'].unique():
        return('Movie not in Database')
    
    else:
        # getting the index of the movie
        movie_index = movies.loc[newframe['title'] == movie_name].index[0]
        
        # getting the similarity scores of all movies with that movie
        # enumerate() is used to combine the index of the movie along with the similarity score
        similar_movies = list(enumerate(cosine_sim[movie_index]))
        
        # sorting the list of similar movies in descending order
        sorted_similar_movies = sorted(similar_movies, key = lambda x:x[1], reverse = True)
        
        # getting the top 10 similar movies
        top_10_similar_movies = sorted_similar_movies[1:11]
        
        # printing the top 10 similar movies
        print('Top 10 similar movies to ' + movie_name + ' are:\n')
        for i in range(len(top_10_similar_movies)):
            print(i+1, newframe['title'][top_10_similar_movies[i][0]], sep = ': ')
            
        return

In [15]:
similar_movies('Avatar')

Top 10 similar movies to avatar are:

1: aliens vs predator: requiem
2: aliens
3: anne of green gables
4: independence day
5: titan a.e.
6: battle: los angeles
7: predators
8: small soldiers
9: meet dave
10: this is england


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  newframe['title'] = newframe['title'].str.lower()


In [16]:
import pickle

# Save the movies dataset as a pickle binary form file to use on app.py
with open('movies.pkl', 'wb') as file:
    pickle.dump(newframe, file)

In [19]:
# dumping similarity matrix into a pickle binary form file to use on app.py

pickle.dump(cosine_sim, open('similarity.pkl', 'wb'))