In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
###### helper functions. Use them when needed #######
def get_title_from_index(index):
	return df[df.index == index]["title"].values[0]

def get_index_from_title(title):
	return df[df.title == title]["index"].values[0]
##################################################

In [5]:
##Step 1: Read CSV File
df = pd.read_csv("movie_dataset.csv")
##df.head()
df.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [20]:
##Step 2: Select Features
features = ['keywords','cast','genres','director']
ind = get_index_from_title("The Matrix")
print(ind)
print(df[df.index == ind]["director"].values[0])

634
Lilly Wachowski


In [6]:
##Step 3: Create a column in DF which combines all selected features
for feature in features:
    df[feature] = df[feature].fillna('')

def combine_features(row):
    try:
        return row["keywords"]+" "+row["cast"]+" "+row["genres"]+" "+row["director"]
    except:
        print ("Error: ", row)
        
df["Combined_Features"] = df.apply(combine_features,axis = 1)
print ("Combined Features:", df["Combined_Features"].head())

Combined Features: 0    culture clash future space war space colony so...
1    ocean drug abuse exotic island east india trad...
2    spy based on novel secret agent sequel mi6 Dan...
3    dc comics crime fighter terrorist secret ident...
4    based on novel mars medallion space travel pri...
Name: Combined_Features, dtype: object


In [12]:
##Step 4: Create count matrix from this new combined column
cv = CountVectorizer()
count_matrix = cv.fit_transform(df["Combined_Features"])
print (count_matrix)

  (0, 3115)	1
  (0, 2616)	1
  (0, 4886)	1
  (0, 12386)	2
  (0, 14235)	1
  (0, 2755)	1
  (0, 12299)	1
  (0, 11517)	1
  (0, 14561)	1
  (0, 14820)	1
  (0, 11490)	1
  (0, 12134)	1
  (0, 14291)	1
  (0, 12567)	1
  (0, 7496)	1
  (0, 8831)	1
  (0, 11217)	1
  (0, 86)	1
  (0, 144)	1
  (0, 4435)	1
  (0, 11745)	1
  (0, 4566)	1
  (0, 6542)	1
  (0, 2061)	1
  (1, 86)	1
  :	:
  (4801, 10069)	1
  (4801, 5844)	1
  (4801, 252)	1
  (4801, 4098)	1
  (4801, 14796)	1
  (4801, 11361)	1
  (4801, 2978)	1
  (4801, 12036)	1
  (4801, 6138)	1
  (4802, 9659)	1
  (4802, 3812)	1
  (4802, 1788)	2
  (4802, 4210)	1
  (4802, 5181)	1
  (4802, 2912)	1
  (4802, 3821)	1
  (4802, 1069)	1
  (4802, 11185)	1
  (4802, 3681)	1
  (4802, 5399)	1
  (4802, 3894)	1
  (4802, 2056)	1
  (4802, 3093)	1
  (4802, 4502)	1
  (4802, 5900)	2


In [21]:
##Step 5: Compute the Cosine Similarity based on the count_matrix
cosine_sim = cosine_similarity(count_matrix)
movie_user_likes = "Ice Age"



In [22]:
## Step 6: Get index of this movie from its title
movie_index = get_index_from_title(movie_user_likes)
similar_movies = list(enumerate(cosine_sim[movie_index]))


In [23]:
## Step 7: Get a list of similar movies in descending order of similarity score

sorted_similar_movies = sorted(similar_movies,key = lambda x:x[1], reverse=True)


In [24]:
## Step 8: Print titles of first 50 movies
i = 0
for movies in sorted_similar_movies:
    print (get_title_from_index(movies[0]))
    i = i+1
    if i>50:
        break;

Ice Age
Ice Age: Continental Drift
Ice Age: Dawn of the Dinosaurs
Ice Age: The Meltdown
Spy Kids
Bolt
Friday
Joe Dirt
Penguins of Madagascar
Monster's Ball
Beginners
Cars 2
Wreck-It Ralph
Kung Fu Panda
Mrs. Doubtfire
Spirited Away
The Shaggy Dog
Shrek the Third
Kung Fu Panda 3
Kung Pow: Enter the Fist
Walking With Dinosaurs
Rugrats in Paris: The Movie
Cold Mountain
The Nut Job
Legend of a Rabbit
Inside Out
Big Hero 6
In the Name of the King: A Dungeon Siege Tale
The Ant Bully
The Borrowers
Raising Helen
White Oleander
Shark Tale
The Lego Movie
Aliens in the Attic
Mary Poppins
Jesus' Son
The Next Best Thing
Super Mario Bros.
TMNT
Epic
Ballistic: Ecks vs. Sever
My Big Fat Greek Wedding 2
Robots
Goosebumps
Jimmy Neutron: Boy Genius
The Jungle Book 2
Ramona and Beezus
Stuart Little 2
Spirit: Stallion of the Cimarron
Titan A.E.
