In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
df = pd.read_csv("imdb_top_1000.csv")

In [3]:
df.head()

Unnamed: 0,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,Star1,Star2,Star3,Star4,No_of_Votes,Gross
0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,Tim Robbins,Morgan Freeman,Bob Gunton,William Sadler,2343110,28341469
1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,Marlon Brando,Al Pacino,James Caan,Diane Keaton,1620367,134966411
2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,Christian Bale,Heath Ledger,Aaron Eckhart,Michael Caine,2303232,534858444
3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,Al Pacino,Robert De Niro,Robert Duvall,Diane Keaton,1129952,57300000
4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,Henry Fonda,Lee J. Cobb,Martin Balsam,John Fiedler,689845,4360000


In [4]:
df.shape

(1000, 16)

In [5]:
df['Cast'] = df['Star1']+","+df['Star2']+","+df['Star3']+","+df['Star4']
df.drop(['Star1','Star2','Star3','Star4'], axis=1,inplace=True)
df.index.rename('index',inplace = True)

In [6]:
df=df.reset_index('index') 

In [7]:
df.head()

Unnamed: 0,index,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,No_of_Votes,Gross,Cast
0,0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,2343110,28341469,"Tim Robbins,Morgan Freeman,Bob Gunton,William ..."
1,1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,1620367,134966411,"Marlon Brando,Al Pacino,James Caan,Diane Keaton"
2,2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,2303232,534858444,"Christian Bale,Heath Ledger,Aaron Eckhart,Mich..."
3,3,https://m.media-amazon.com/images/M/MV5BMWMwMG...,The Godfather: Part II,1974,A,202 min,"Crime, Drama",9.0,The early life and career of Vito Corleone in ...,90.0,Francis Ford Coppola,1129952,57300000,"Al Pacino,Robert De Niro,Robert Duvall,Diane K..."
4,4,https://m.media-amazon.com/images/M/MV5BMWU4N2...,12 Angry Men,1957,U,96 min,"Crime, Drama",9.0,A jury holdout attempts to prevent a miscarria...,96.0,Sidney Lumet,689845,4360000,"Henry Fonda,Lee J. Cobb,Martin Balsam,John Fie..."


In [8]:
df.shape

(1000, 14)

In [9]:
features = ['Overview','Cast','Genre','Director']
df[features].head(5)

Unnamed: 0,Overview,Cast,Genre,Director
0,Two imprisoned men bond over a number of years...,"Tim Robbins,Morgan Freeman,Bob Gunton,William ...",Drama,Frank Darabont
1,An organized crime dynasty's aging patriarch t...,"Marlon Brando,Al Pacino,James Caan,Diane Keaton","Crime, Drama",Francis Ford Coppola
2,When the menace known as the Joker wreaks havo...,"Christian Bale,Heath Ledger,Aaron Eckhart,Mich...","Action, Crime, Drama",Christopher Nolan
3,The early life and career of Vito Corleone in ...,"Al Pacino,Robert De Niro,Robert Duvall,Diane K...","Crime, Drama",Francis Ford Coppola
4,A jury holdout attempts to prevent a miscarria...,"Henry Fonda,Lee J. Cobb,Martin Balsam,John Fie...","Crime, Drama",Sidney Lumet


In [10]:
df[features].isnull().values.any()

False

In [11]:
for feature in features:
    df[feature]=df[feature].fillna('')

In [12]:
def combine_features(row):
    return row['Overview']+' '+row['Cast']+' '+row['Genre']+' '+row['Director']

In [13]:
df['combined_features']=df.apply(combine_features,axis=1)

In [14]:
df.head(3)

Unnamed: 0,index,Poster_Link,Series_Title,Released_Year,Certificate,Runtime,Genre,IMDB_Rating,Overview,Meta_score,Director,No_of_Votes,Gross,Cast,combined_features
0,0,https://m.media-amazon.com/images/M/MV5BMDFkYT...,The Shawshank Redemption,1994,A,142 min,Drama,9.3,Two imprisoned men bond over a number of years...,80.0,Frank Darabont,2343110,28341469,"Tim Robbins,Morgan Freeman,Bob Gunton,William ...",Two imprisoned men bond over a number of years...
1,1,https://m.media-amazon.com/images/M/MV5BM2MyNj...,The Godfather,1972,A,175 min,"Crime, Drama",9.2,An organized crime dynasty's aging patriarch t...,100.0,Francis Ford Coppola,1620367,134966411,"Marlon Brando,Al Pacino,James Caan,Diane Keaton",An organized crime dynasty's aging patriarch t...
2,2,https://m.media-amazon.com/images/M/MV5BMTMxNT...,The Dark Knight,2008,UA,152 min,"Action, Crime, Drama",9.0,When the menace known as the Joker wreaks havo...,84.0,Christopher Nolan,2303232,534858444,"Christian Bale,Heath Ledger,Aaron Eckhart,Mich...",When the menace known as the Joker wreaks havo...


In [15]:
count_matrix=CountVectorizer().fit_transform(df['combined_features'])

In [16]:
cosine_sim=cosine_similarity(count_matrix)
print(cosine_sim)

[[1.         0.09379581 0.19897096 ... 0.04888237 0.18706095 0.04175738]
 [0.09379581 1.         0.1928473  ... 0.11844484 0.09065192 0.10118058]
 [0.19897096 0.1928473  1.         ... 0.10050378 0.36857838 0.32910919]
 ...
 [0.04888237 0.11844484 0.10050378 ... 1.         0.19488113 0.1423737 ]
 [0.18706095 0.09065192 0.36857838 ... 0.19488113 1.         0.196744  ]
 [0.04175738 0.10118058 0.32910919 ... 0.1423737  0.196744   1.        ]]


In [17]:
cosine_sim.shape

(1000, 1000)

In [18]:
def get_title_from_index(index):
    return df[df.index == index]["Series_Title"].values[0]
def get_index_from_title(Series_Title):
    return df[df.Series_Title == Series_Title]["index"].values[0]

In [19]:
movie_user_likes = "The Godfather"
movie_index = get_index_from_title(movie_user_likes)
similar_movies = list(enumerate(cosine_sim[movie_index]))

In [20]:
print(similar_movies)

[(0, 0.09379580992210838), (1, 0.9999999999999999), (2, 0.1928473039599675), (3, 0.4304052898729295), (4, 0.25425669046549126), (5, 0.1297498240269205), (6, 0.20100756305184247), (7, 0.08471737420873576), (8, 0.09656090991705353), (9, 0.08703882797784893), (10, 0.0492365963917331), (11, 0.19837990021453855), (12, 0.050783671611902566), (13, 0.14346842995764314), (14, 0.07242068243779015), (15, 0.27247463045653303), (16, 0.08382594846159087), (17, 0.05723638070321425), (18, 0.08444006618414981), (19, 0.025125945381480306), (20, 0.0984731927834662), (21, 0.1250610798961445), (22, 0.15151515151515155), (23, 0.025391835805951283), (24, 0.08827348295047496), (25, 0.15235101483570768), (26, 0.24720661623652207), (27, 0.15151515151515155), (28, 0.21749089913081476), (29, 0.09065192181986734), (30, 0.07106690545187015), (31, 0.0657951694959769), (32, 0.11295649894498103), (33, 0.20571764439712248), (34, 0.1592795619605697), (35, 0.14712247158412492), (36, 0.08362420100070908), (37, 0.169434748

In [21]:
sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)[1:]

In [22]:
i = 0
print("Top 7 similar movies to " + movie_user_likes+" are:\n")
for element in sorted_similar_movies:
    print(get_title_from_index(element[0]))
    i = i+1
    if 1>=7:
        break

Top 7 similar movies to The Godfather are:

The Godfather: Part III
The Godfather: Part II
On the Waterfront
The Irishman
Drishyam
Haider
Do lok tin si
The Wolf of Wall Street
Fargo
Scarface: The Shame of the Nation
Road to Perdition
Goodfellas
Drishyam
Blood Simple
Tengoku to jigoku
Glengarry Glen Ross
The Gentlemen
The Ten Commandments
Jagten
In the Name of the Father
Dead Man's Shoes
Le passé
12 Angry Men
The Man Who Shot Liberty Valance
Ed Wood
Serpico
La vita è bella
Cat on a Hot Tin Roof
Captain Fantastic
Lord of War
Donnie Brasco
Zerkalo
Nightcrawler
Hamlet
Arsenic and Old Lace
A Wednesday
The Apartment
Swades: We, the People
Boksuneun naui geot
Out of the Past
Dip huet seung hung
Stand by Me
El secreto de sus ojos
Harvey
Knives Out
Walk the Line
Once Upon a Time in America
American Psycho
Rope
The Remains of the Day
The Butterfly Effect
Kaguyahime no monogatari
Smultronstället
Red River
Sherlock Jr.
Freaks
The Illusionist
Into the Wild
Batman Begins
The Peanut Butter Falcon
Neb

Fried Green Tomatoes
End of Watch
Harry Potter and the Sorcerer's Stone
The 39 Steps
X-Men: Days of Future Past
Hidden Figures
Mulan
Memento
Requiem for a Dream
American Beauty
Peeping Tom
Platoon
City Lights
There Will Be Blood
Room
Star Wars: Episode VII - The Force Awakens
A Man for All Seasons
Watership Down
Gravity
Lost in Translation
True Grit
21 Grams
The Help
The Boy in the Striped Pyjamas
Soorarai Pottru
Dev.D
M.S. Dhoni: The Untold Story
Naked
Apollo 13
Amadeus
The Hobbit: An Unexpected Journey
Philomena
Blood Diamond
Inception
Celda 211
Le Petit Prince
Avengers: Endgame
Stalker
Black Swan
The Wrestler
Serbuan maut
300
Finding Neverland
Metropolis
Indiana Jones and the Last Crusade
Jungfrukällan
The Blind Side
The Shawshank Redemption
A Beautiful Mind
Gully Boy
Vizontele
La dolce vita
Jab We Met
La migliore offerta
28 Days Later...
Star Wars: Episode VI - Return of the Jedi
Vertigo
Up
To Be or Not to Be
The Boondock Saints
Blazing Saddles
Dark Waters
Gone Baby Gone
Kelly's He