# <font color = 'orange'> Movie Recommendation System

---

In [40]:
import pandas as pd

In [41]:
movies = pd.read_csv('./tmdb9000movies.csv', encoding='utf-8')

movies.head(2)

Unnamed: 0,id,original_language,original_title,overview,popularity,poster_path,release_date,title,vote_average,vote_count,crew,cast,keywords,genres
0,693134,en,Dune: Part Two,Follow the mythic journey of Paul Atreides as ...,3437.313,https://image.tmdb.org/t/p/w500/1pdfLvkbY9ohJl...,2024-02-27,Dune: Part Two,8.3,2935,"[{'adult': False, 'gender': 2, 'id': 137427, '...","[{'adult': False, 'gender': 2, 'id': 1190668, ...","[{'id': 6917, 'name': 'epic'}, {'id': 818, 'na...","['Science Fiction', 'Adventure']"
1,1011985,en,Kung Fu Panda 4,Po is gearing up to become the spiritual leade...,2340.977,https://image.tmdb.org/t/p/w500/kDp1vUBnMpe8ak...,2024-03-02,Kung Fu Panda 4,7.152,1072,"[{'adult': False, 'gender': 1, 'id': 2008023, ...","[{'adult': False, 'gender': 2, 'id': 70851, 'k...","[{'id': 779, 'name': 'martial arts'}, {'id': 7...","['Animation', 'Action', 'Family', 'Comedy', 'F..."


In [42]:
movies.shape

(9918, 14)

---

## <font color='blue'> Data Preprocessing

### Removing unwanted columns

In [43]:
movies.columns

Index(['id', 'original_language', 'original_title', 'overview', 'popularity',
       'poster_path', 'release_date', 'title', 'vote_average', 'vote_count',
       'crew', 'cast', 'keywords', 'genres'],
      dtype='object')

### Wanted columns
1. genres
2. id (used to fetch the posters of the movie)
3. keywords (are like tags)
4. title (will be english)
5. overview (says summary of movies)
6. cast (actor name is present)
7. crew (director name is present)

### Unwanted columns
1. Unnamed: 0
2. original_language
3. original_title (reginal language will be present)
4. popularity
5. release_date (can be used) (parents like old movies)
6. vote_average
7. vote_count

In [44]:
wanted_columns = ['id', 'title', 'overview', 'genres', 'keywords', 'cast', 'crew', 'poster_path']
movies = movies[wanted_columns]

movies.head(2)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,poster_path
0,693134,Dune: Part Two,Follow the mythic journey of Paul Atreides as ...,"['Science Fiction', 'Adventure']","[{'id': 6917, 'name': 'epic'}, {'id': 818, 'na...","[{'adult': False, 'gender': 2, 'id': 1190668, ...","[{'adult': False, 'gender': 2, 'id': 137427, '...",https://image.tmdb.org/t/p/w500/1pdfLvkbY9ohJl...
1,1011985,Kung Fu Panda 4,Po is gearing up to become the spiritual leade...,"['Animation', 'Action', 'Family', 'Comedy', 'F...","[{'id': 779, 'name': 'martial arts'}, {'id': 7...","[{'adult': False, 'gender': 2, 'id': 70851, 'k...","[{'adult': False, 'gender': 1, 'id': 2008023, ...",https://image.tmdb.org/t/p/w500/kDp1vUBnMpe8ak...


### Checking and droping null and duplicated rows

In [45]:
movies.isnull().sum()

id              0
title           0
overview        3
genres          0
keywords        0
cast            0
crew            0
poster_path    26
dtype: int64

In [46]:
movies.dropna(inplace=True)

In [47]:
movies.duplicated().sum()

0

### Create 'tags' column by merging ['overview', 'genres', 'keywords', 'cast', 'crew'] columns

1. Handling 'keywords' and 'genres' columns

In [48]:
movies.iloc[0]['keywords']

"[{'id': 6917, 'name': 'epic'}, {'id': 818, 'name': 'based on novel or book'}, {'id': 1721, 'name': 'fight'}, {'id': 1965, 'name': 'sandstorm'}, {'id': 5789, 'name': 'sand'}, {'id': 6074, 'name': 'spice'}, {'id': 4238, 'name': 'chosen one'}, {'id': 9663, 'name': 'sequel'}, {'id': 11239, 'name': 'distant future'}, {'id': 13031, 'name': 'creature'}, {'id': 13194, 'name': 'planet'}, {'id': 18034, 'name': 'desert'}, {'id': 40850, 'name': 'destiny'}, {'id': 158144, 'name': 'giant worm'}, {'id': 161176, 'name': 'space opera'}, {'id': 178080, 'name': 'sand dune'}, {'id': 194063, 'name': 'messiah'}, {'id': 232766, 'name': 'vengeance'}, {'id': 286709, 'name': 'giant creature'}, {'id': 288793, 'name': 'power'}, {'id': 312898, 'name': 'violence'}]"

In [49]:
# we want in this format ["Action", "Adventure", "Fantasy", "Science Fiction"]

import ast

def get_names(li):
    li = ast.literal_eval(li)   # converting string into list
    names = []
    for di in li:
        names.append(di['name'])
    return names

movies['keywords'] = movies['keywords'].apply(get_names)

In [50]:
movies.iloc[0]['genres']

"['Science Fiction', 'Adventure']"

In [51]:
movies['genres'] = movies['genres'].apply(lambda x:ast.literal_eval(x))

2. Handling 'cast' column

In [52]:
movies.iloc[0]['cast']

'[{\'adult\': False, \'gender\': 2, \'id\': 1190668, \'known_for_department\': \'Acting\', \'name\': \'Timothée Chalamet\', \'original_name\': \'Timothée Chalamet\', \'popularity\': 128.858, \'profile_path\': \'/BE2sdjpgsa2rNTFa66f7upkaOP.jpg\', \'cast_id\': 2, \'character\': \'Paul Atreides\', \'credit_id\': \'5e959c45955c6500159f1c98\', \'order\': 0}, {\'adult\': False, \'gender\': 1, \'id\': 505710, \'known_for_department\': \'Acting\', \'name\': \'Zendaya\', \'original_name\': \'Zendaya\', \'popularity\': 89.578, \'profile_path\': \'/3WdOloHpjtjL96uVOhFRRCcYSwq.jpg\', \'cast_id\': 3, \'character\': \'Chani\', \'credit_id\': \'5e959c5caf58cb001adc1c94\', \'order\': 1}, {\'adult\': False, \'gender\': 1, \'id\': 933238, \'known_for_department\': \'Acting\', \'name\': \'Rebecca Ferguson\', \'original_name\': \'Rebecca Ferguson\', \'popularity\': 159.11, \'profile_path\': \'/lJloTOheuQSirSLXNA3JHsrMNfH.jpg\', \'cast_id\': 8, \'character\': \'Jessica\', \'credit_id\': \'5f193d28519bbb003

In [53]:
# we will take out only top 3 actors of the movie

def get_3_actors_name(li):
    ''' returns top 3 actors name '''
    li = ast.literal_eval(li)
    names = []
    counter = 0
    for di in li:
        names.append(di['name'])
        counter += 1
        if counter == 3:
            break
    return names

movies['cast'] = movies['cast'].apply(get_3_actors_name)

3. Handling 'crew' column

In [54]:
movies.iloc[0]['crew']

'[{\'adult\': False, \'gender\': 2, \'id\': 137427, \'known_for_department\': \'Directing\', \'name\': \'Denis Villeneuve\', \'original_name\': \'Denis Villeneuve\', \'popularity\': 33.839, \'profile_path\': \'/433lXlkdMGXzrpwnKM4Ul1sln15.jpg\', \'credit_id\': \'5e959beedb72c00012ad7dcf\', \'department\': \'Directing\', \'job\': \'Director\'}, {\'adult\': False, \'gender\': 2, \'id\': 12506, \'known_for_department\': \'Writing\', \'name\': \'Frank Herbert\', \'original_name\': \'Frank Herbert\', \'popularity\': 7.06, \'profile_path\': \'/1uE09gzcwXMjCfXZfXXwNDIHaXG.jpg\', \'credit_id\': \'5e9ca1974a4bfc001cd6cbdc\', \'department\': \'Writing\', \'job\': \'Novel\'}, {\'adult\': False, \'gender\': 2, \'id\': 564940, \'known_for_department\': \'Writing\', \'name\': \'Jon Spaihts\', \'original_name\': \'Jon Spaihts\', \'popularity\': 8.031, \'profile_path\': \'/jzIwgRpEG2VXYHmIyk0vpb6S3TA.jpg\', \'credit_id\': \'5e9ca1bfe22d28001b61990e\', \'department\': \'Writing\', \'job\': \'Screenplay

In [55]:
# we need only the 'movie director name'.
# means we need name of a person whose 'job' is 'director'

def get_director_name(li):
    ''' returns the movie director name '''
    li = ast.literal_eval(li)
    director_name = []
    for di in li:
        if di['job'] == 'Director':
            director_name.append(di['name'])
            break
    return director_name

movies['crew'] = movies['crew'].apply(get_director_name)

4. Handling overview column

In [56]:
# let's convert into list so that it is easy to concatinate with other column values

movies['overview'] = movies['overview'].apply(lambda string : string.split())

movies.head(2)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,poster_path
0,693134,Dune: Part Two,"[Follow, the, mythic, journey, of, Paul, Atrei...","[Science Fiction, Adventure]","[epic, based on novel or book, fight, sandstor...","[Timothée Chalamet, Zendaya, Rebecca Ferguson]",[Denis Villeneuve],https://image.tmdb.org/t/p/w500/1pdfLvkbY9ohJl...
1,1011985,Kung Fu Panda 4,"[Po, is, gearing, up, to, become, the, spiritu...","[Animation, Action, Family, Comedy, Fantasy]","[martial arts, kung fu, china, sequel, panda, ...","[Jack Black, Awkwafina, Viola Davis]",[Mike Mitchell],https://image.tmdb.org/t/p/w500/kDp1vUBnMpe8ak...


* Remvoing space between the words

In [57]:
# as we cannot give space in the tag so we have to remove space 
# else if 2 director have same first name then both director movie will be recommended

remove_space_columns = ['genres', 'keywords', 'cast', 'crew']

for col in remove_space_columns:
    movies[col] = movies[col].apply(lambda lst : [string.replace(' ', '') for string in lst])
    
movies.head(2)

Unnamed: 0,id,title,overview,genres,keywords,cast,crew,poster_path
0,693134,Dune: Part Two,"[Follow, the, mythic, journey, of, Paul, Atrei...","[ScienceFiction, Adventure]","[epic, basedonnovelorbook, fight, sandstorm, s...","[TimothéeChalamet, Zendaya, RebeccaFerguson]",[DenisVilleneuve],https://image.tmdb.org/t/p/w500/1pdfLvkbY9ohJl...
1,1011985,Kung Fu Panda 4,"[Po, is, gearing, up, to, become, the, spiritu...","[Animation, Action, Family, Comedy, Fantasy]","[martialarts, kungfu, china, sequel, panda, an...","[JackBlack, Awkwafina, ViolaDavis]",[MikeMitchell],https://image.tmdb.org/t/p/w500/kDp1vUBnMpe8ak...


Creating the 'tags' column

In [58]:
movies['tags'] = movies['overview'] + movies['keywords'] + movies['cast'] + movies['crew']
remove_columns = ['overview', 'genres', 'keywords', 'cast', 'crew']

new_df = movies.drop(columns = remove_columns)
new_df.head(2)

Unnamed: 0,id,title,poster_path,tags
0,693134,Dune: Part Two,https://image.tmdb.org/t/p/w500/1pdfLvkbY9ohJl...,"[Follow, the, mythic, journey, of, Paul, Atrei..."
1,1011985,Kung Fu Panda 4,https://image.tmdb.org/t/p/w500/kDp1vUBnMpe8ak...,"[Po, is, gearing, up, to, become, the, spiritu..."


In [59]:
# convert list to string and convert words present in the string to lower case

new_df['tags'] = new_df['tags'].apply(lambda lst : ' '.join(lst))
new_df['tags'] = new_df['tags'].apply(lambda string : string.lower())

new_df.head(2)

Unnamed: 0,id,title,poster_path,tags
0,693134,Dune: Part Two,https://image.tmdb.org/t/p/w500/1pdfLvkbY9ohJl...,follow the mythic journey of paul atreides as ...
1,1011985,Kung Fu Panda 4,https://image.tmdb.org/t/p/w500/kDp1vUBnMpe8ak...,po is gearing up to become the spiritual leade...


---

## <font color='blue'> Feature Engineering

In [60]:
new_df

Unnamed: 0,id,title,poster_path,tags
0,693134,Dune: Part Two,https://image.tmdb.org/t/p/w500/1pdfLvkbY9ohJl...,follow the mythic journey of paul atreides as ...
1,1011985,Kung Fu Panda 4,https://image.tmdb.org/t/p/w500/kDp1vUBnMpe8ak...,po is gearing up to become the spiritual leade...
2,823464,Godzilla x Kong: The New Empire,https://image.tmdb.org/t/p/w500/tMefBSflR6PGQL...,"following their explosive showdown, godzilla a..."
3,601796,Alienoid,https://image.tmdb.org/t/p/w500/8QVDXDiOGHRcAD...,gurus in the late goryeo dynasty try to obtain...
4,359410,Road House,https://image.tmdb.org/t/p/w500/bXi6IQiQDHD00J...,ex-ufc fighter dalton takes a job as a bouncer...
...,...,...,...,...
9913,9367,El Mariachi,https://image.tmdb.org/t/p/w500/zRh7K4SV1xQ419...,el mariachi just wants to play his guitar and ...
9914,72766,Newlyweds,https://image.tmdb.org/t/p/w500/zMEfv8533tJpNX...,a newlywed couple's honeymoon is upended by th...
9915,231617,"Signed, Sealed, Delivered",https://image.tmdb.org/t/p/w500/6BVCgmhLeSTF8n...,"""signed, sealed, delivered"" introduces a dedic..."
9916,126186,Shanghai Calling,https://image.tmdb.org/t/p/w500/2a1q1RTxspKxGW...,when ambitious new york attorney sam is sent t...


#### Stemming

In [61]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()

def stem_text(text):
    words = []
    for word in text.split(' '):
        words.append(ps.stem(word))
    return ' '.join(words)

new_df['tags'] = new_df['tags'].apply(stem_text)
new_df.head(5)

Unnamed: 0,id,title,poster_path,tags
0,693134,Dune: Part Two,https://image.tmdb.org/t/p/w500/1pdfLvkbY9ohJl...,follow the mythic journey of paul atreid as he...
1,1011985,Kung Fu Panda 4,https://image.tmdb.org/t/p/w500/kDp1vUBnMpe8ak...,po is gear up to becom the spiritu leader of h...
2,823464,Godzilla x Kong: The New Empire,https://image.tmdb.org/t/p/w500/tMefBSflR6PGQL...,"follow their explos showdown, godzilla and kon..."
3,601796,Alienoid,https://image.tmdb.org/t/p/w500/8QVDXDiOGHRcAD...,guru in the late goryeo dynasti tri to obtain ...
4,359410,Road House,https://image.tmdb.org/t/p/w500/bXi6IQiQDHD00J...,ex-ufc fighter dalton take a job as a bouncer ...


#### Vectorization

In [62]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(max_features = 7000, stop_words = 'english')

vectors  = cv.fit_transform(new_df['tags']).toarray()

vectors

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [63]:
vectors.shape

# we have 4806 movies in dataset
# and i have taken top 5000 words
# representing each movie as a 5000 dimension vector

(9889, 7000)

#### Cosine similarity between vectors

In [64]:
from sklearn.metrics.pairwise import cosine_similarity

In [65]:
similarity = cosine_similarity(vectors)

similarity.shape
# this is the similarity of each movie with other movies

(9889, 9889)

In [66]:
# below gives the similarity of the first movie with all movies
print(similarity[0])

[1.         0.1246112  0.05572782 ... 0.02106314 0.08178608 0.04550158]


* 1 -> denotes 100% similarity with the particular movie  
so, similarity of movie itself is 100%
* Diagonal will be always 1.

---

## <font color='blue'> Recommendation function

In [67]:
list(enumerate(similarity[0]))  # to save the index position of sorting we are using enumerate

[(0, 0.9999999999999999),
 (1, 0.1246111965698067),
 (2, 0.05572782125753528),
 (3, 0.022750787759664503),
 (4, 0.06019292654288461),
 (5, 0.09968895725584535),
 (6, 0.07881104062391008),
 (7, 0.026481357066618805),
 (8, 0.027379283909669677),
 (9, 0.0),
 (10, 0.03940552031195504),
 (11, 0.1182165609358651),
 (12, 0.10767638041163309),
 (13, 0.41995538211085015),
 (14, 0.021739130434782608),
 (15, 0.04212627318711346),
 (16, 0.0),
 (17, 0.056750435383916574),
 (18, 0.04605312990585041),
 (19, 0.026919095102908273),
 (20, 0.021739130434782608),
 (21, 0.043013239361934026),
 (22, 0.02786391062876764),
 (23, 0.0),
 (24, 0.0),
 (25, 0.05897678246195886),
 (26, 0.03806934938134405),
 (27, 0.05572782125753528),
 (28, 0.0),
 (29, 0.1318760946791574),
 (30, 0.030743773095067286),
 (31, 0.10951713563867871),
 (32, 0.14457873299156007),
 (33, 0.028915746598312014),
 (34, 0.06950480468569159),
 (35, 0.0659380473395787),
 (36, 0.02948839123097943),
 (37, 0.0),
 (38, 0.021506619680967013),
 (39, 0.

In [68]:
sorted(list(enumerate(similarity[0])), reverse = True, key = lambda x : x[1]) [1:6]

# in this we want movies from index 1 to 5

[(13, 0.41995538211085015),
 (1566, 0.33701018549690775),
 (160, 0.31126635188256063),
 (1823, 0.30174873642981176),
 (9644, 0.2952931892760994)]

In [69]:
def recommend(movie):
    movie_index = new_df[ new_df['title'] == movie ].index[0]
    distance = similarity[movie_index]
    movies_list = sorted(list(enumerate(distance)), reverse = True, key = lambda x:x[1])[1:6]
    for i in movies_list:
        print(new_df.iloc[i[0]]['title'])
        print()

In [72]:
recommend('Fast X')

Furious 7

Kind Hearts and Coronets

Eat Drink Man Woman

Migration

A Madea Family Funeral



In [71]:
import dill

dill.dump(new_df, open('movies.pkl', 'wb'))
dill.dump(similarity, open('similarity.pkl', 'wb'))

---