In [101]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# These function will return the title of the movies given their indexes
def get_title_from_index(index):
    return data[data.index == index]["title"].values[0]

def get_index_from_title(title):
    return data[data.title == title]["index"].values[0]


## Step 1: Read CSV File
data = pd.read_csv("/Users/tramy/Documents/IntrotoML/MovieRecommendationEngine/movie_dataset.csv")


In [80]:
# Preview data
data.head()
data_sum = data.describe()
data.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director',
       'combined_features'],
      dtype='object')

In [81]:
##Step 2: Select Features
features = ['genres','keywords','cast']

In [84]:
# Fill in missing values
for feature in features:
    data[feature] = data[feature].fillna('')

In [96]:
data.head()

Unnamed: 0,index,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,...,spoken_languages,status,tagline,title,vote_average,vote_count,cast,crew,director,combined_features
0,0,237000000,Action Adventure Fantasy Science Fiction,http://www.avatarmovie.com/,19995,culture clash future space war space colony so...,en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,...,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Sam Worthington Zoe Saldana Sigourney Weaver S...,"[{'name': 'Stephen E. Rivkin', 'gender': 0, 'd...",James Cameron,Action Adventure Fantasy Science Fictioncultur...
1,1,300000000,Adventure Fantasy Action,http://disney.go.com/disneypictures/pirates/,285,ocean drug abuse exotic island east india trad...,en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Johnny Depp Orlando Bloom Keira Knightley Stel...,"[{'name': 'Dariusz Wolski', 'gender': 2, 'depa...",Gore Verbinski,Adventure Fantasy Actionocean drug abuse exoti...
2,2,245000000,Action Adventure Crime,http://www.sonypictures.com/movies/spectre/,206647,spy based on novel secret agent sequel mi6,en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,...,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Daniel Craig Christoph Waltz L\u00e9a Seydoux ...,"[{'name': 'Thomas Newman', 'gender': 2, 'depar...",Sam Mendes,Action Adventure Crimespy based on novel secre...
3,3,250000000,Action Crime Drama Thriller,http://www.thedarkknightrises.com/,49026,dc comics crime fighter terrorist secret ident...,en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Christian Bale Michael Caine Gary Oldman Anne ...,"[{'name': 'Hans Zimmer', 'gender': 2, 'departm...",Christopher Nolan,Action Crime Drama Thrillerdc comics crime fig...
4,4,260000000,Action Adventure Science Fiction,http://movies.disney.com/john-carter,49529,based on novel mars medallion space travel pri...,en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,...,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Taylor Kitsch Lynn Collins Samantha Morton Wil...,"[{'name': 'Andrew Stanton', 'gender': 2, 'depa...",Andrew Stanton,Action Adventure Science Fictionbased on novel...


In [86]:
# Get an overview of missing data
data.isna().sum()

index                      0
budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    0
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                       0
crew                       0
director                  30
combined_features          0
dtype: int64

In [87]:
#Combine features
def combine_features(row):
    return row['genres'] + row['keywords'] +row['cast']
data["combined_features"] = data.apply(combine_features,axis=1)


In [89]:
print (data.iloc[1].combined_features)


Adventure Fantasy Actionocean drug abuse exotic island east india trading company love of one's lifeJohnny Depp Orlando Bloom Keira Knightley Stellan Skarsg\u00e5rd Chow Yun-fat


In [93]:
##Step 4: Create count matrix from this new combined column - this includes tokenizing text data and transform it into numerical data
cv = CountVectorizer()

count_matrix = cv.fit_transform(data["combined_features"])

In [95]:
##Step 5: Compute the Cosine Similarity based on the count_matrix 
cosine_sim = cosine_similarity(count_matrix)


In [97]:
# Pick a movie that we want to find something similar
movie_user_likes = 'The Dark Knight Rises'

In [102]:
## Step 6: Get index of this movie from its title
movie_index = get_index_from_title(movie_user_likes)

similar_movies =  list(enumerate(cosine_sim[movie_index]))

## Step 7: Get a list of similar movies in descending order of similarity score
sorted_similar_movies = sorted(similar_movies,key=lambda x:x[1],reverse=True)

## Step 8: Print titles of first 50 movies
i=0
for element in sorted_similar_movies:
    print (get_title_from_index(element[0]))
    i=i+1
    if i>50:
        break

The Dark Knight Rises
The Dark Knight
Batman Begins
Amidst the Devil's Wings
Batman & Robin
The Killer Inside Me
Takers
Hitman
10th & Wolf
Faster
The Art of War
Batman
RockNRolla
True Romance
Murder by Numbers
Blood and Wine
Lions for Lambs
Rampage
Batman Returns
London Has Fallen
The Prestige
Smilla's Sense of Snow
Child 44
The Replacement Killers
The Newton Boys
Dark Blue
Dear Wendy
Romeo Is Bleeding
Kiss of Death
Mr. & Mrs. Smith
Machine Gun McCain
The Other Side of Heaven
Kill the Messenger
Dead Man Down
Class of 1984
Suicide Squad
Interstellar
Superman
Gangster Squad
Street Kings
Harry Brown
Training Day
Get Smart
Miss Congeniality
Wild Card
Cradle 2 the Grave
The Baader Meinhof Complex
The Devil's Double
11:14
In Cold Blood
Point Blank
