In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
# CountVectorizer counts how often each word occurs in every document and returns the counts as a matrix, which can then be used to measure similarity.

In [3]:
from sklearn.metrics.pairwise import cosine_similarity
# cosine_similarity is a metric that measures how similar two documents are, irrespective of their size.
# It compares the orientation (angle) of the document vectors instead of the distance between them.
# Documents whose vectors point in similar directions are considered similar.

In [4]:
# Load the movie metadata; the path is relative to the current working directory.
df = pd.read_csv("movies.csv")

In [5]:
# Summarise each column's dtype and non-null count to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4803 entries, 0 to 4802
Data columns (total 24 columns):
index                   4803 non-null int64
budget                  4803 non-null int64
genres                  4775 non-null object
homepage                1712 non-null object
id                      4803 non-null int64
keywords                4391 non-null object
original_language       4803 non-null object
original_title          4803 non-null object
overview                4800 non-null object
popularity              4803 non-null float64
production_companies    4803 non-null object
production_countries    4803 non-null object
release_date            4802 non-null object
revenue                 4803 non-null int64
runtime                 4801 non-null float64
spoken_languages        4803 non-null object
status                  4803 non-null object
tagline                 3959 non-null object
title                   4803 non-null object
vote_average            4803 non-null fl

In [6]:
# Text columns that describe each movie for the similarity comparison.
features = [
    "keywords",
    "cast",
    "genres",
    "director",
]

In [7]:
# Replace missing values in the feature columns with empty strings so that
# the columns can be concatenated as text.
df[features] = df[features].fillna('')
    

In [8]:
def combine_features(row):
    """Join a movie's keywords, cast, genres and director into one string.

    Args:
        row: Mapping-like record (for example a DataFrame row) providing the
            keys 'keywords', 'cast', 'genres' and 'director'.

    Returns:
        str: The four fields, in that order, separated by single spaces.
        A field whose value is missing (None/NaN) contributes an empty
        string; any other non-string value is converted with ``str``.

    Raises:
        KeyError: If one of the four fields is absent from ``row``.
    """
    parts = []
    for column in ('keywords', 'cast', 'genres', 'director'):
        value = row[column]
        # A single missing field must not discard the whole row: always
        # return text, never None.
        parts.append('' if pd.isna(value) else str(value))
    return " ".join(parts)
        
# Build one combined text string per movie by applying combine_features to every row.
df['combined_features'] = df.apply(combine_features, axis='columns')

In [9]:
# Preview the first five combined strings.
df["combined_features"].head(5)

0    culture clash future space war space colony so...
1    ocean drug abuse exotic island east india trad...
2    spy based on novel secret agent sequel mi6 Dan...
3    dc comics crime fighter terrorist secret ident...
4    based on novel mars medallion space travel pri...
Name: combined_features, dtype: object

In [10]:
# Count the words of each movie's combined text; the result has one row per movie.
cv = CountVectorizer()
count_matrix = cv.fit_transform(df['combined_features'])

In [11]:
# Similarity score between every pair of rows (movies) of the count matrix.
cosine_sim = cosine_similarity(count_matrix)

# Title of the movie the recommendations are based on.
movie_user_likes = 'Avatar'

In [12]:
def get_title_from_index(index):
    """Return the title of the row of ``df`` whose index label equals ``index``.

    Only the first match is returned. An IndexError is raised when no row
    carries that label.
    """
    row_mask = df.index == index
    matching_titles = df.loc[row_mask, "title"]
    return matching_titles.values[0]

def get_index_from_title(title):
    """Return the index label of the first row of ``df`` whose title is ``title``.

    The label is taken from ``df.index`` (the same key that
    ``get_title_from_index`` looks rows up by) rather than from the CSV's
    separate "index" column, so the two helpers always round-trip.

    Args:
        title: Exact movie title to look for (case-sensitive comparison).

    Returns:
        The ``df.index`` label of the first matching row.

    Raises:
        IndexError: If no row has that title.
    """
    matches = df.index[df["title"] == title]
    if len(matches) == 0:
        # Same exception type as the former ``.values[0]`` on an empty
        # selection, but with a message that names the missing title.
        raise IndexError("no movie titled {!r} found in df".format(title))
    return matches[0]

In [13]:
# Look up the liked movie, then pair each entry of its similarity row
# with that entry's position in the row.
movie_index = get_index_from_title(movie_user_likes)
similar_movies = [pair for pair in enumerate(cosine_sim[movie_index])]


In [14]:
# Order the (position, score) pairs by score, highest first; the sort is
# stable, so equal scores keep their original relative order.
sorted_similar_movies = list(similar_movies)
sorted_similar_movies.sort(key=lambda pair: pair[1], reverse=True)

In [15]:
# Print the titles of the 11 highest-scoring entries (fewer if the list is
# shorter). The liked movie itself is part of the list being ranked, so it
# can appear among the printed titles.
for movie_idx, _score in sorted_similar_movies[:11]:
    print(get_title_from_index(movie_idx))

Avatar
Guardians of the Galaxy
Aliens
Star Wars: Clone Wars: Volume 1
Star Trek Into Darkness
Star Trek Beyond
Alien
Lockout
Jason X
The Helix... Loaded
Moonraker
