In [43]:
import numpy as np
import pandas as pd

In [44]:
movies = pd.read_csv("tmdb_5000_movies.csv")
credits = pd.read_csv("tmdb_5000_credits.csv")

In [45]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 20 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4803 non-null   int64  
 1   genres                4803 non-null   object 
 2   homepage              1712 non-null   object 
 3   id                    4803 non-null   int64  
 4   keywords              4803 non-null   object 
 5   original_language     4803 non-null   object 
 6   original_title        4803 non-null   object 
 7   overview              4800 non-null   object 
 8   popularity            4803 non-null   float64
 9   production_companies  4803 non-null   object 
 10  production_countries  4803 non-null   object 
 11  release_date          4802 non-null   object 
 12  revenue               4803 non-null   int64  
 13  runtime               4801 non-null   float64
 14  spoken_languages      4803 non-null   object 
 15  status               

In [46]:
movies.shape

(4803, 20)

In [48]:
movies = movies.merge(credits , on = "title")
# as they get merged based on the title thus we have 23 columns instead of 24

In [49]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4809 entries, 0 to 4808
Data columns (total 23 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   budget                4809 non-null   int64  
 1   genres                4809 non-null   object 
 2   homepage              1713 non-null   object 
 3   id                    4809 non-null   int64  
 4   keywords              4809 non-null   object 
 5   original_language     4809 non-null   object 
 6   original_title        4809 non-null   object 
 7   overview              4806 non-null   object 
 8   popularity            4809 non-null   float64
 9   production_companies  4809 non-null   object 
 10  production_countries  4809 non-null   object 
 11  release_date          4808 non-null   object 
 12  revenue               4809 non-null   int64  
 13  runtime               4807 non-null   float64
 14  spoken_languages      4809 non-null   object 
 15  status               

# Column Filtering 

In [39]:
# We will keep mainly the keyword for the tags and the ids for the poster

In [51]:
# Instead of using the drop function we will only write the columns we want to keep
movies = movies[["id" , "title" , "overview" , "keywords" , "cast" , "crew" , "genres"]]

In [53]:
movies.head(1)
# To check the columns

Unnamed: 0,id,title,overview,keywords,cast,crew,genres
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...","[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de...","[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam..."


# Making the tags(combination of cast , crea and keywords)

In [54]:
# checking for the missing values
movies.isnull().sum()
# We have only 3 NaN overviews

id          0
title       0
overview    3
keywords    0
cast        0
crew        0
genres      0
dtype: int64

In [55]:
# Dropping the NaN
movies.dropna(inplace = True)

In [57]:
# Checking duplication
movies.duplicated().sum()
# Therefore no duplicated data

0

In [58]:
# Finding the genres
movies.iloc[0].genres

'[{"id": 28, "name": "Action"}, {"id": 12, "name": "Adventure"}, {"id": 14, "name": "Fantasy"}, {"id": 878, "name": "Science Fiction"}]'

In [60]:
import ast
# This module with the help of ast.literal_eval() can change the string of list to a proper List

In [65]:
# We have to filter out only the names from the dictionaries in this String of List
def convert(obj):
    L = []
    for i in ast.literal_eval(obj):
        L.append(i['name'])
    return L

In [66]:
movies['genres'].apply(convert)

0       [Action, Adventure, Fantasy, Science Fiction]
1                        [Adventure, Fantasy, Action]
2                          [Action, Adventure, Crime]
3                    [Action, Crime, Drama, Thriller]
4                [Action, Adventure, Science Fiction]
                            ...                      
4804                        [Action, Crime, Thriller]
4805                                [Comedy, Romance]
4806               [Comedy, Drama, Romance, TV Movie]
4807                                               []
4808                                    [Documentary]
Name: genres, Length: 4806, dtype: object

In [67]:
movies['genres'] = movies['genres'].apply(convert)

In [68]:
# Same will do for keywords
movies['keywords'] = movies['keywords'].apply(convert)

In [69]:
def cast_mains(obj):
    count = 0
    L = []
    for i in ast.literal_eval(obj):
        if count != 3:
            L.append(i['name'])
        else :
            break
    return L

In [70]:
movies['cast'] = movies['cast'].apply(cast_mains)

In [71]:
movies['crew']

0       [{"credit_id": "52fe48009251416c750aca23", "de...
1       [{"credit_id": "52fe4232c3a36847f800b579", "de...
2       [{"credit_id": "54805967c3a36829b5002c41", "de...
3       [{"credit_id": "52fe4781c3a36847f81398c3", "de...
4       [{"credit_id": "52fe479ac3a36847f813eaa3", "de...
                              ...                        
4804    [{"credit_id": "52fe44eec3a36847f80b280b", "de...
4805    [{"credit_id": "52fe487dc3a368484e0fb013", "de...
4806    [{"credit_id": "52fe4df3c3a36847f8275ecf", "de...
4807    [{"credit_id": "52fe4ad9c3a368484e16a36b", "de...
4808    [{"credit_id": "58ce021b9251415a390165d9", "de...
Name: crew, Length: 4806, dtype: object

In [73]:
movies['crew'].iloc[0]
# Too compolicated and huge , we want only the director

'[{"credit_id": "52fe48009251416c750aca23", "department": "Editing", "gender": 0, "id": 1721, "job": "Editor", "name": "Stephen E. Rivkin"}, {"credit_id": "539c47ecc3a36810e3001f87", "department": "Art", "gender": 2, "id": 496, "job": "Production Design", "name": "Rick Carter"}, {"credit_id": "54491c89c3a3680fb4001cf7", "department": "Sound", "gender": 0, "id": 900, "job": "Sound Designer", "name": "Christopher Boyes"}, {"credit_id": "54491cb70e0a267480001bd0", "department": "Sound", "gender": 0, "id": 900, "job": "Supervising Sound Editor", "name": "Christopher Boyes"}, {"credit_id": "539c4a4cc3a36810c9002101", "department": "Production", "gender": 1, "id": 1262, "job": "Casting", "name": "Mali Finn"}, {"credit_id": "5544ee3b925141499f0008fc", "department": "Sound", "gender": 2, "id": 1729, "job": "Original Music Composer", "name": "James Horner"}, {"credit_id": "52fe48009251416c750ac9c3", "department": "Directing", "gender": 2, "id": 2710, "job": "Director", "name": "James Cameron"},

In [74]:
def director(obj):
    L = []
    for i in ast.literal_eval(obj):
        if i['job'] == "Director":
            L.append(i['name'])
            break
    return L

In [75]:
movies['crew'] = movies['crew'].apply(director)

In [77]:
movies.head(1)

Unnamed: 0,id,title,overview,keywords,cast,crew,genres
0,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[culture clash, future, space war, space colon...","[Sam Worthington, Zoe Saldana, Sigourney Weave...",[James Cameron],"[Action, Adventure, Fantasy, Science Fiction]"


In [82]:
# Removig spces from genres , keywords , crew and cast
movies['genres'] = movies['genres'].apply(lambda x : [i.replace(" ", "") for i in x])
movies['keywords'] = movies['keywords'].apply(lambda x : [i.replace(" ", "") for i in x])
movies['cast'] = movies['cast'].apply(lambda x : [i.replace(" ", "") for i in x])
movies['crew'] = movies['crew'].apply(lambda x : [i.replace(" ", "") for i in x])

In [80]:
# converting the overview column strings into a List of strings 
movies['overview'] = movies['overview'].apply(lambda x : x.split())
# here x is actually the string that goes in the lambda function
# Here each word is considered as a seperate element when seperatred by a space and made into a Lsit 

In [81]:
movies.head(1)

Unnamed: 0,id,title,overview,keywords,cast,crew,genres
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[cultureclash, future, spacewar, spacecolony, ...","[SamWorthington, ZoeSaldana, SigourneyWeaver, ...",[JamesCameron],"[Action, Adventure, Fantasy, ScienceFiction]"


In [83]:
movies['tags'] = movies['overview'] + movies['keywords'] + movies['cast'] + movies['crew'] + movies['genres']

In [84]:
updated_df = movies[['id' , 'title' , 'tags']]
# Creating this new updated data frame

In [85]:
updated_df

Unnamed: 0,id,title,tags
0,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin..."
1,285,Pirates of the Caribbean: At World's End,"[Captain, Barbossa,, long, believed, to, be, d..."
2,206647,Spectre,"[A, cryptic, message, from, Bond’s, past, send..."
3,49026,The Dark Knight Rises,"[Following, the, death, of, District, Attorney..."
4,49529,John Carter,"[John, Carter, is, a, war-weary,, former, mili..."
...,...,...,...
4804,9367,El Mariachi,"[El, Mariachi, just, wants, to, play, his, gui..."
4805,72766,Newlyweds,"[A, newlywed, couple's, honeymoon, is, upended..."
4806,231617,"Signed, Sealed, Delivered","[""Signed,, Sealed,, Delivered"", introduces, a,..."
4807,126186,Shanghai Calling,"[When, ambitious, New, York, attorney, Sam, is..."


In [87]:
updated_df['tags'][0]

['In',
 'the',
 '22nd',
 'century,',
 'a',
 'paraplegic',
 'Marine',
 'is',
 'dispatched',
 'to',
 'the',
 'moon',
 'Pandora',
 'on',
 'a',
 'unique',
 'mission,',
 'but',
 'becomes',
 'torn',
 'between',
 'following',
 'orders',
 'and',
 'protecting',
 'an',
 'alien',
 'civilization.',
 'cultureclash',
 'future',
 'spacewar',
 'spacecolony',
 'society',
 'spacetravel',
 'futuristic',
 'romance',
 'space',
 'alien',
 'tribe',
 'alienplanet',
 'cgi',
 'marine',
 'soldier',
 'battle',
 'loveaffair',
 'antiwar',
 'powerrelations',
 'mindandsoul',
 '3d',
 'SamWorthington',
 'ZoeSaldana',
 'SigourneyWeaver',
 'StephenLang',
 'MichelleRodriguez',
 'GiovanniRibisi',
 'JoelDavidMoore',
 'CCHPounder',
 'WesStudi',
 'LazAlonso',
 'DileepRao',
 'MattGerald',
 'SeanAnthonyMoran',
 'JasonWhyte',
 'ScottLawrence',
 'KellyKilgour',
 'JamesPatrickPitt',
 'SeanPatrickMurphy',
 'PeterDillon',
 'KevinDorman',
 'KelsonHenderson',
 'DavidVanHorn',
 'JacobTomuri',
 'MichaelBlain-Rozgay',
 'JonCurry',
 'Luke

In [89]:
updated_df['tags'] = updated_df['tags'].apply(lambda x : " ".join(x) )
# With the help of a space we are joining all the elements of the list and making into a string

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_df['tags'] = updated_df['tags'].apply(lambda x : " ".join(x) )


In [90]:
# All to lower case for simplicity
updated_df['tags'] = updated_df['tags'].apply(lambda x : x.lower())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_df['tags'] = updated_df['tags'].apply(lambda x : x.lower())


In [91]:
updated_df

Unnamed: 0,id,title,tags
0,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
1,285,Pirates of the Caribbean: At World's End,"captain barbossa, long believed to be dead, ha..."
2,206647,Spectre,a cryptic message from bond’s past sends him o...
3,49026,The Dark Knight Rises,following the death of district attorney harve...
4,49529,John Carter,"john carter is a war-weary, former military ca..."
...,...,...,...
4804,9367,El Mariachi,el mariachi just wants to play his guitar and ...
4805,72766,Newlyweds,a newlywed couple's honeymoon is upended by th...
4806,231617,"Signed, Sealed, Delivered","""signed, sealed, delivered"" introduces a dedic..."
4807,126186,Shanghai Calling,when ambitious new york attorney sam is sent t...


# Vectorisation

In [117]:
# we will remove the stop words (words used in sentence formation)
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 4806 , stop_words = 'english')

In [118]:
cv.fit_transform(updated_df['tags']).toarray().shape

(4806, 4806)

In [119]:
# Thus we made a 4806 by 4806 dimensional space for 4806 movies
vectors = cv.fit_transform(updated_df['tags']).toarray()
# This is the actual vectorisation process and here as sklearn returns in a scipy sparse matrix form thus we convert to a numpy array
# It is an array of dimensions 4806*4806 and each row is a vector of each movie

In [120]:
vectors[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [121]:
cv.get_feature_names()
# This will give the most common 4806 words after combining all the words in the tags column of all the movies
# This is BagOfWords Vectorisation

['000',
 '007',
 '10',
 '100',
 '11',
 '12',
 '13',
 '14',
 '15',
 '16',
 '17',
 '18',
 '18th',
 '19',
 '1930',
 '1940',
 '1950',
 '1960',
 '1960s',
 '1970',
 '1970s',
 '1980',
 '1990',
 '19th',
 '19thcenturi',
 '20',
 '20th',
 '24',
 '25',
 '30',
 '3d',
 '40',
 '50',
 '60',
 '70',
 'aaron',
 'aaroneckhart',
 'aarontaylor',
 'aasifmandvi',
 'abandon',
 'abduct',
 'abigailbreslin',
 'abil',
 'abl',
 'abov',
 'abus',
 'academi',
 'accept',
 'access',
 'accid',
 'accident',
 'accompani',
 'accomplish',
 'account',
 'accus',
 'ace',
 'achiev',
 'act',
 'action',
 'activ',
 'activist',
 'actor',
 'actress',
 'actual',
 'adam',
 'adambrodi',
 'adamgoldberg',
 'adamlefevr',
 'adamsandl',
 'adamscott',
 'adamshankman',
 'adapt',
 'add',
 'addict',
 'adewaleakinnuoye',
 'adjust',
 'admir',
 'admit',
 'adolesc',
 'adopt',
 'ador',
 'adrianmartinez',
 'adrienbrodi',
 'adult',
 'adulteri',
 'adulthood',
 'advanc',
 'adventur',
 'adventure',
 'advertis',
 'advic',
 'affair',
 'affect',
 'afghanista

# Stemming

In [112]:
pip install nltk





[notice] A new release of pip available: 22.2.2 -> 22.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [113]:
import nltk
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
# This is an object of the PorterStemmer class

In [114]:
def stemm(txt):
    y = []
    for i in txt.split():
        y.append(ps.stem(i))
    
    return " ".join(y)
# We return in the string format again by using the join command

In [116]:
updated_df['tags'] = updated_df['tags'].apply(stemm)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  updated_df['tags'] = updated_df['tags'].apply(stemm)


# Curse of Dimensionality

In [122]:
# Curse of dimensionality means that if we calculate the distance between two vectors then we must not do that using the euclidean distance . Instead we must use the cosine distance 
# Cosine distance is actually the measure of the angle between two vectors 
# Lesser the angle more are they similar

In [123]:
from sklearn.metrics.pairwise import cosine_similarity

In [124]:
cosine_similarity(vectors)
# This returns the value ranging from 0 to 1 . 0 means very far away while 1 means completely same

array([[1.        , 0.06897007, 0.07733089, ..., 0.04357102, 0.        ,
        0.        ],
       [0.06897007, 1.        , 0.07644708, ..., 0.02153652, 0.        ,
        0.02383656],
       [0.07733089, 0.07644708, 1.        , ..., 0.02414726, 0.        ,
        0.        ],
       ...,
       [0.04357102, 0.02153652, 0.02414726, ..., 1.        , 0.04307305,
        0.0451754 ],
       [0.        , 0.        , 0.        , ..., 0.04307305, 1.        ,
        0.09534626],
       [0.        , 0.02383656, 0.        , ..., 0.0451754 , 0.09534626,
        1.        ]])

In [125]:
similarity = cosine_similarity(vectors)

In [126]:
similarity.shape
# It will be (4806,4806) as it is the distance between each other movies

(4806, 4806)

In [127]:
similarity[1]
# here in the 1 th index we see that the value is 1 , as the distance from the second movie to the second movie is 0 , thus they are same

array([0.06897007, 1.        , 0.07644708, ..., 0.02153652, 0.        ,
       0.02383656])

In [128]:
# Thus all the diagonal elements will be havig the value of 1 throughout the array of arrays Similarity

In [136]:
def recommend(movie):
    movie_index = updated_df[updated_df['title'] == movie].index[0]
    distance = similarity[movie_index]
    movies_list = sorted(list(enumerate(distance)) , reverse = True , key = lambda x : x[1])[1:6]
    
#     here x is an enumerate object which when made into a List got converted into a tuple with the first element being the index number and the second element being the distance from the specified movie
#     reverse = True means in descendin gorder sort
#     key is telling that we must take the distance instead of the index while sorting
#     [1:6] means the first five elements

    for i in movies_list:
        print((updated_df.iloc[i[0]]).title)
#         In movies_list we get the list of tuples with the first element as the index and the second element as the distance 
#         i is the tuple and i[0] is the index and of that index we find the title

In [137]:
recommend("Batman Begins")

The Dark Knight
Batman
The Dark Knight Rises
Amidst the Devil's Wings
Batman
