In [40]:
# importing neseccesary libaries
import numpy as np
import pandas as pd

In [41]:
# Data is loading from CSV files into  data structures
movies = pd.read_csv('tmdb_5000_movies.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')

In [42]:
# merging both tables into singal table
movies = movies.merge(credits,on = 'title')

In [43]:
#genres
#id
#keywords
#title
#overview
#cast
#crew
#these columns that are necesseary to get tags for movie recommendation, only these columns are used

movies = movies[['movie_id','title','overview','genres','keywords','cast','crew']]

In [45]:
# to find if there are any null values
movies.isnull().sum()

movie_id    0
title       0
overview    3
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [46]:
# drops rows with empty cells
movies.dropna(inplace=True)

In [47]:
# rechecking for empty values
movies.isnull().sum()

movie_id    0
title       0
overview    0
genres      0
keywords    0
cast        0
crew        0
dtype: int64

In [48]:
# to find duplicated rows
movies.duplicated().sum()

np.int64(0)

In [49]:
# literal_eval takes a string that contains a Python literal expression and parses it into a corresponding Python object
import ast
ast.literal_eval

<function ast.literal_eval(node_or_string)>

In [56]:
movies['genres'][0]

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [51]:
#function to convert dictionaries into array
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [57]:
#converts values of genres and keywords from dictionaries to arrays
movies['genres'] = movies['genres'].apply(convert)

In [58]:
movies['keywords'] = movies['keywords'].apply(convert)

In [59]:
movies['genres'][0]

['Action', 'Adventure', 'Fantasy', 'Science Fiction']

In [60]:
#function to convert dictionaries into array and take first 3 values ( since main 3 actors of flim are important )
def convert3(obj):
    L = []
    counter = 0
    for i in ast.literal_eval(obj):
        if counter != 3 :
            L.append(i['name'])
            counter +=1
        else:
            break
    return L     

In [61]:
#applies above function to whole cast column
movies['cast'] = movies['cast'].apply(convert3)

In [62]:
movies['cast'][0]

['Sam Worthington', 'Zoe Saldana', 'Sigourney Weaver']

In [63]:
#function to take director name from crew column
def fetch_director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == 'Director':
            L.append(i['name'])
            break
    return L  

In [64]:
#applies above function to whole cast column
movies['crew'] = movies['crew'].apply(fetch_director)

In [65]:
#converts overview column string into array
movies['overview'] = movies['overview'].apply(lambda x:x.split())

In [66]:
#removes spaces from all columns
movies['genres'] = movies['genres'].apply(lambda x:[i.replace(" ","") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x:[i.replace(" ","") for i in x])
movies['cast'] = movies['cast'].apply(lambda x:[i.replace(" ","") for i in x])
movies['crew'] = movies['crew'].apply(lambda x:[i.replace(" ","") for i in x])

In [67]:
#tags column is made by combining 4 other columns
movies['tags'] = movies['overview'] + movies['genres'] + movies['keywords'] + movies['cast'] + movies['crew']

In [68]:
#making new table
new_df = movies[['movie_id','title','tags']]

In [69]:
new_df.head()

Unnamed: 0,movie_id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."


In [70]:
# removing space from tags cloumn
new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:" ".join(x))


In [71]:
# turning letters into lowercase in tags cloumn
new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(lambda x:x.lower())


In [72]:
new_df['tags'][0]

'in the 22nd century, a paraplegic marine is dispatched to the moon pandora on a unique mission, but becomes torn between following orders and protecting an alien civilization. action adventure fantasy sciencefiction cultureclash future spacewar spacecolony society spacetravel futuristic romance space alien tribe alienplanet cgi marine soldier battle loveaffair antiwar powerrelations mindandsoul 3d samworthington zoesaldana sigourneyweaver jamescameron'

In [73]:
# importing NLTK (Natural Language Toolkit) to remove similar words and get root words using PorterStemmer 
# and a function to apply it for whole column
import nltk

In [74]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

In [75]:
def stem(text):
    y = []
    for i in text.split():
        y.append(ps.stem(i))
    return " ".join(y) 

In [77]:
new_df['tags'] = new_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  new_df['tags'] = new_df['tags'].apply(stem)


In [78]:
new_df['tags'][0]

'in the 22nd century, a parapleg marin is dispatch to the moon pandora on a uniqu mission, but becom torn between follow order and protect an alien civilization. action adventur fantasi sciencefict cultureclash futur spacewar spacecoloni societi spacetravel futurist romanc space alien tribe alienplanet cgi marin soldier battl loveaffair antiwar powerrel mindandsoul 3d samworthington zoesaldana sigourneyweav jamescameron'

In [79]:
#vectorisation is used turn large strings into vectors to make program run faster
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features=5000,stop_words='english')

In [80]:
vectors = cv.fit_transform(new_df['tags']).toarray()

In [81]:
cv.get_feature_names_out()

array(['000', '007', '10', ..., 'zone', 'zoo', 'zooeydeschanel'],
      dtype=object)

In [82]:
#using cosine similarity function to find similarity between each movie with every other movie
from sklearn.metrics.pairwise import cosine_similarity

In [83]:
similarity = cosine_similarity(vectors)

In [84]:
similarity

array([[1.        , 0.08346223, 0.0860309 , ..., 0.04499213, 0.        ,
        0.        ],
       [0.08346223, 1.        , 0.06063391, ..., 0.02378257, 0.        ,
        0.02615329],
       [0.0860309 , 0.06063391, 1.        , ..., 0.02451452, 0.        ,
        0.        ],
       ...,
       [0.04499213, 0.02378257, 0.02451452, ..., 1.        , 0.03962144,
        0.04229549],
       [0.        , 0.        , 0.        , ..., 0.03962144, 1.        ,
        0.08714204],
       [0.        , 0.02615329, 0.        , ..., 0.04229549, 0.08714204,
        1.        ]])

In [85]:
#function works by taking a movie and outputs 5 similar movies 
def recommend(movie):
    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = similarity[movie_index]
    movies_list = sorted(list(enumerate(distances)),reverse=True,key=lambda x:x[1])[1:6]

    for i in movies_list:
        print(new_df.iloc[i[0]].title)

In [86]:
recommend('Avatar')

Aliens vs Predator: Requiem
Aliens
Falcon Rising
Independence Day
Titan A.E.


In [87]:
#The pickle module is a powerful tool in Python for serializing and deserializing objects
#it converts python objects into byte stream
import pickle

In [37]:
pickle.dump(new_df.to_dict(),open('movies_dict.pkl','wb'))

In [38]:
pickle.dump(similarity,open('similarity.pkl','wb'))