In [77]:
import numpy as np
import pandas as pd
import ast

import pickle
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.stem.porter import PorterStemmer
from typing import List


In [78]:

movies = pd.read_csv('TMDB_movie_dataset_v11.csv')
credits = pd.read_csv('tmdb_5000_credits.csv')
movies = movies.merge(credits, on='title')

In [79]:
movies.head(1)

Unnamed: 0,id,title,vote_average,vote_count,status,release_date,revenue,runtime,adult,backdrop_path,...,poster_path,tagline,genres,production_companies,production_countries,spoken_languages,keywords,movie_id,cast,crew
0,27205,Inception,8.364,34495,Released,2010-07-15,825532764,148,False,/8ZTVqvKDQ8emSGUEMjsS4yHAwrp.jpg,...,/oYuLEt3zVCKq57qu2F8dT7NIa6f.jpg,Your mind is the scene of the crime.,"Action, Science Fiction, Adventure","Legendary Pictures, Syncopy, Warner Bros. Pict...","United Kingdom, United States of America","English, French, Japanese, Swahili","rescue, mission, dream, airplane, paris, franc...",27205,"[{""cast_id"": 1, ""character"": ""Dom Cobb"", ""cred...","[{""credit_id"": ""56e8462cc3a368408400354c"", ""de..."


In [80]:
movies.shape

(16294, 27)

In [81]:

movies = movies[['id', 'title', 'overview', 'genres', 'keywords', 'tagline', 'cast', 'crew']]


In [82]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,tagline,cast,crew
0,27205,Inception,"Cobb, a skilled thief who commits corporate es...","Action, Science Fiction, Adventure","rescue, mission, dream, airplane, paris, franc...",Your mind is the scene of the crime.,"[{""cast_id"": 1, ""character"": ""Dom Cobb"", ""cred...","[{""credit_id"": ""56e8462cc3a368408400354c"", ""de..."


In [83]:

movies['overview'] = movies['overview'].fillna('')
movies['tagline'] = movies['tagline'].fillna('')
movies['genres'] = movies['genres'].fillna('')
movies['keywords'] = movies['keywords'].fillna('')

In [84]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,tagline,cast,crew
0,27205,Inception,"Cobb, a skilled thief who commits corporate es...","Action, Science Fiction, Adventure","rescue, mission, dream, airplane, paris, franc...",Your mind is the scene of the crime.,"[{""cast_id"": 1, ""character"": ""Dom Cobb"", ""cred...","[{""credit_id"": ""56e8462cc3a368408400354c"", ""de..."


In [85]:
movies.isnull().sum()

id          0
title       0
overview    0
genres      0
keywords    0
tagline     0
cast        0
crew        0
dtype: int64

In [86]:

#movies.dropna(inplace=True)

In [87]:
movies.drop_duplicates(subset='title', inplace=True)


In [88]:
#movies.duplicated().sum()

In [89]:
movies.head(5)


Unnamed: 0,id,title,overview,genres,keywords,tagline,cast,crew
0,27205,Inception,"Cobb, a skilled thief who commits corporate es...","Action, Science Fiction, Adventure","rescue, mission, dream, airplane, paris, franc...",Your mind is the scene of the crime.,"[{""cast_id"": 1, ""character"": ""Dom Cobb"", ""cred...","[{""credit_id"": ""56e8462cc3a368408400354c"", ""de..."
3,157336,Interstellar,The adventures of a group of explorers who mak...,"Adventure, Drama, Science Fiction","rescue, future, spacecraft, race against time,...",Mankind was born on Earth. It was never meant ...,"[{""cast_id"": 9, ""character"": ""Joseph Cooper"", ...","[{""credit_id"": ""52fe4bbf9251416c910e4801"", ""de..."
4,155,The Dark Knight,Batman raises the stakes in his war on crime. ...,"Drama, Action, Crime, Thriller","joker, sadism, chaos, secret identity, crime f...",Welcome to a world without rules.,"[{""cast_id"": 35, ""character"": ""Bruce Wayne"", ""...","[{""credit_id"": ""55a0eb4a925141296b0010f8"", ""de..."
6,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","Action, Adventure, Fantasy, Science Fiction","future, society, culture clash, space travel, ...",Enter the world of Pandora.,"[{""cast_id"": 242, ""character"": ""Jake Sully"", ""...","[{""credit_id"": ""52fe48009251416c750aca23"", ""de..."
11,24428,The Avengers,When an unexpected enemy emerges and threatens...,"Science Fiction, Action, Adventure","new york city, superhero, shield, based on com...",Some assembly required.,"[{""cast_id"": 46, ""character"": ""Tony Stark / Ir...","[{""credit_id"": ""52fe4495c3a368484e02b1cf"", ""de..."


In [90]:
movies.iloc[0].genres

'Action, Science Fiction, Adventure'

In [91]:

def convert(obj: str) -> List[str]:
    try:
        if isinstance(obj, str) and obj.startswith('['):
            return [item["name"].strip() for item in ast.literal_eval(obj)]
        elif isinstance(obj, str):
            return [s.strip() for s in obj.split(",") if s.strip()]
        return []
    except:
        return []

movies['genres'] = movies['genres'].apply(convert)
movies['keywords'] = movies['keywords'].apply(convert)

In [92]:

def convert_cast(obj: str) -> List[str]:
    try:
        return [i['name'] for i in ast.literal_eval(obj)[:3]]
    except:
        return []

def get_director(obj: str) -> List[str]:
    try:
        return [i['name'] for i in ast.literal_eval(obj) if i['job'] == 'Director']
    except:
        return []

In [93]:
movies.head(1)

Unnamed: 0,id,title,overview,genres,keywords,tagline,cast,crew
0,27205,Inception,"Cobb, a skilled thief who commits corporate es...","[Action, Science Fiction, Adventure]","[rescue, mission, dream, airplane, paris, fran...",Your mind is the scene of the crime.,"[{""cast_id"": 1, ""character"": ""Dom Cobb"", ""cred...","[{""credit_id"": ""56e8462cc3a368408400354c"", ""de..."


In [94]:
movies['cast'] = movies['cast'].apply(convert_cast)
movies['crew'] = movies['crew'].apply(get_director)


In [95]:
movies.head(5)

Unnamed: 0,id,title,overview,genres,keywords,tagline,cast,crew
0,27205,Inception,"Cobb, a skilled thief who commits corporate es...","[Action, Science Fiction, Adventure]","[rescue, mission, dream, airplane, paris, fran...",Your mind is the scene of the crime.,"[Leonardo DiCaprio, Joseph Gordon-Levitt, Elle...",[Christopher Nolan]
3,157336,Interstellar,The adventures of a group of explorers who mak...,"[Adventure, Drama, Science Fiction]","[rescue, future, spacecraft, race against time...",Mankind was born on Earth. It was never meant ...,"[Matthew McConaughey, Jessica Chastain, Anne H...",[Christopher Nolan]
4,155,The Dark Knight,Batman raises the stakes in his war on crime. ...,"[Drama, Action, Crime, Thriller]","[joker, sadism, chaos, secret identity, crime ...",Welcome to a world without rules.,"[Christian Bale, Heath Ledger, Aaron Eckhart]",[Christopher Nolan]
6,19995,Avatar,"In the 22nd century, a paraplegic Marine is di...","[Action, Adventure, Fantasy, Science Fiction]","[future, society, culture clash, space travel,...",Enter the world of Pandora.,"[Sam Worthington, Zoe Saldana, Sigourney Weaver]",[James Cameron]
11,24428,The Avengers,When an unexpected enemy emerges and threatens...,"[Science Fiction, Action, Adventure]","[new york city, superhero, shield, based on co...",Some assembly required.,"[Robert Downey Jr., Chris Evans, Mark Ruffalo]",[Joss Whedon]


In [96]:

movies['overview'] = movies['overview'].apply(lambda x: x.split()) #convert lists
movies['tagline'] = movies['tagline'].apply(lambda x: x.split())


for col in ['cast', 'crew']:
    movies[col] = movies[col].apply(lambda x: [i.replace(" ", "") for i in x])  #space

movies["tags"] = (
    movies["overview"] 
    + movies["tagline"] 
    + movies["genres"] 
    + movies["keywords"] 
    + movies["cast"] 
    + movies["crew"]
)


movies['tags'] = movies['tags'].apply(lambda x: ' '.join(x).lower())


new_df = movies[['id', 'title', 'tags']]

In [97]:
movies.head(10)

Unnamed: 0,id,title,overview,genres,keywords,tagline,cast,crew,tags
0,27205,Inception,"[Cobb,, a, skilled, thief, who, commits, corpo...","[Action, Science Fiction, Adventure]","[rescue, mission, dream, airplane, paris, fran...","[Your, mind, is, the, scene, of, the, crime.]","[LeonardoDiCaprio, JosephGordon-Levitt, EllenP...",[ChristopherNolan],"cobb, a skilled thief who commits corporate es..."
3,157336,Interstellar,"[The, adventures, of, a, group, of, explorers,...","[Adventure, Drama, Science Fiction]","[rescue, future, spacecraft, race against time...","[Mankind, was, born, on, Earth., It, was, neve...","[MatthewMcConaughey, JessicaChastain, AnneHath...",[ChristopherNolan],the adventures of a group of explorers who mak...
4,155,The Dark Knight,"[Batman, raises, the, stakes, in, his, war, on...","[Drama, Action, Crime, Thriller]","[joker, sadism, chaos, secret identity, crime ...","[Welcome, to, a, world, without, rules.]","[ChristianBale, HeathLedger, AaronEckhart]",[ChristopherNolan],batman raises the stakes in his war on crime. ...
6,19995,Avatar,"[In, the, 22nd, century,, a, paraplegic, Marin...","[Action, Adventure, Fantasy, Science Fiction]","[future, society, culture clash, space travel,...","[Enter, the, world, of, Pandora.]","[SamWorthington, ZoeSaldana, SigourneyWeaver]",[JamesCameron],"in the 22nd century, a paraplegic marine is di..."
11,24428,The Avengers,"[When, an, unexpected, enemy, emerges, and, th...","[Science Fiction, Action, Adventure]","[new york city, superhero, shield, based on co...","[Some, assembly, required.]","[RobertDowneyJr., ChrisEvans, MarkRuffalo]",[JossWhedon],when an unexpected enemy emerges and threatens...
14,293660,Deadpool,"[The, origin, story, of, former, Special, Forc...","[Action, Adventure, Comedy]","[superhero, anti hero, mercenary, based on com...","[Witness, the, beginning, of, a, happy, ending.]","[RyanReynolds, MorenaBaccarin, EdSkrein]",[TimMiller],the origin story of former special forces oper...
15,550,Fight Club,"[A, ticking-time-bomb, insomniac, and, a, slip...",[Drama],"[dual identity, rage and hate, based on novel ...","[Mischief., Mayhem., Soap.]","[EdwardNorton, BradPitt, MeatLoaf]",[DavidFincher],a ticking-time-bomb insomniac and a slippery s...
19,118340,Guardians of the Galaxy,"[Light, years, from, Earth,, 26, years, after,...","[Action, Science Fiction, Adventure]","[spacecraft, based on comic, space, orphan, ad...","[All, heroes, start, somewhere.]","[ChrisPratt, ZoeSaldana, DaveBautista]",[JamesGunn],"light years from earth, 26 years after being a..."
20,680,Pulp Fiction,"[A, burger-loving, hit, man,, his, philosophic...","[Thriller, Crime]","[drug dealer, boxer, massage, stolen money, br...","[Just, because, you, are, a, character, doesn'...","[JohnTravolta, SamuelL.Jackson, UmaThurman]",[QuentinTarantino],"a burger-loving hit man, his philosophical par..."
21,13,Forrest Gump,"[A, man, with, a, low, IQ, has, accomplished, ...","[Comedy, Drama, Romance]","[vietnam war, vietnam veteran, mentally disabl...","[The, world, will, never, be, the, same, once,...","[TomHanks, RobinWright, GarySinise]",[RobertZemeckis],a man with a low iq has accomplished great thi...


In [98]:
new_df.head(10)

Unnamed: 0,id,title,tags
0,27205,Inception,"cobb, a skilled thief who commits corporate es..."
3,157336,Interstellar,the adventures of a group of explorers who mak...
4,155,The Dark Knight,batman raises the stakes in his war on crime. ...
6,19995,Avatar,"in the 22nd century, a paraplegic marine is di..."
11,24428,The Avengers,when an unexpected enemy emerges and threatens...
14,293660,Deadpool,the origin story of former special forces oper...
15,550,Fight Club,a ticking-time-bomb insomniac and a slippery s...
19,118340,Guardians of the Galaxy,"light years from earth, 26 years after being a..."
20,680,Pulp Fiction,"a burger-loving hit man, his philosophical par..."
21,13,Forrest Gump,a man with a low iq has accomplished great thi...


In [99]:
new_df.head(1)

Unnamed: 0,id,title,tags
0,27205,Inception,"cobb, a skilled thief who commits corporate es..."


In [100]:

cv = CountVectorizer(max_features=1000000, stop_words='english')   # Vectorization
vectors = cv.fit_transform(new_df['tags']).toarray()               #matrix convert

In [101]:

similarity = cosine_similarity(vectors)   # Similarity matrix  #angle

In [102]:
def recommend(movie):
    movie = movie.strip()
    if movie not in new_df['title'].values:
        return []

    movie_index = new_df[new_df['title'] == movie].index[0]
    distances = list(enumerate(similarity[movie_index]))
    recommended = sorted(distances, key=lambda x: x[1], reverse=True)[1:6]
    return [new_df.iloc[i[0]].title for i in recommended]


In [103]:
    print(f"\nTop 5 movies similar to ")
    
        
#Inception ,Interstellar ,The Avengers , Iron Man


        
    recommendations = recommend("Iron Man")   



    print(recommendations)



Top 5 movies similar to 
['Iron Man', 'Iron Man 2', 'Captain America: Civil War', 'Avengers: Age of Ultron', 'The Avengers']


In [104]:
# Save files
pickle.dump(new_df.to_dict(), open('movies_dict.pkl', 'wb'))
pickle.dump(similarity, open('similarity.pkl', 'wb'))