In [11]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity


In [12]:
def get_title_from_index(index):
    return df[df.index == index]["title"].values[0]

def get_index_from_title(title):
    return df[df.title == title]["index"].values[0]

    
    

In [18]:
#Read CSV file
df = pd.read_csv("movie_dataset.csv")
df.columns

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [25]:
#Select features
features = ['keywords','cast','genres','director']

for feature in features:
    df[feature] = df[features].fillna(' ') 

In [28]:
#Create a column in DF which combines all selected features

def combine_features(row):
    
        return row['keywords'] +" "+row['cast'] +" "+row['genres'] +" "+row['director'] 
  
df["combined_features"] = df.apply(combine_features,axis=1)
print(df["combined_features"].head())

0    culture clash future space war space colony so...
1    ocean drug abuse exotic island east india trad...
2    spy based on novel secret agent sequel mi6 spy...
3    dc comics crime fighter terrorist secret ident...
4    based on novel mars medallion space travel pri...
Name: combined_features, dtype: object


In [36]:
#Create count matrix from the new combined column

cv = CountVectorizer()
count_matrix = cv.fit_transform(df["combined_features"])

In [37]:
#Compute the cosine similarity based on count matrix
cosine_sim = cosine_similarity(count_matrix)
movie_user_like = "Avatar"

In [38]:
#Get index of this movie from the title
movie_index = get_index_from_title(movie_user_like)

similar_movies = list(enumerate(cosine_sim[movie_index]))


#Get the list of all similar movies
sorted_similar_movies = sorted(similar_movies,key = lambda x:x[1], reverse = True )

In [56]:
#Print titles of  first 50 movies
i = 0
for movie in sorted_similar_movies:
    print(get_title_from_index(movie[0]))
    i = i + 1
    if i > 50 :
        break
      
    

Avatar
Space Dogs
Gravity
Jason X
Cargo
Space Chimps
Star Wars: Clone Wars: Volume 1
Guardians of the Galaxy
Planet of the Apes
Alien
In the Shadow of the Moon
Silent Running
The Astronaut's Wife
Moonraker
Trekkies
Wing Commander
Pocahontas
Lockout
The Ice Pirates
The Right Stuff
Star Trek Into Darkness
Star Trek Beyond
Elysium
Oblivion
Titan A.E.
Space Cowboys
Brother
Saving Private Ryan
Lost in Space
Event Horizon
Alien³
Spaceballs
Subway
Unbroken
Jupiter Ascending
Star Trek
Starship Troopers
Mission to Mars
Deep Impact
Soldier
Sphere
Star Trek: Insurrection
Zathura: A Space Adventure
Chicken Little
Galaxy Quest
The Thing
Lifeforce
The Empire Strikes Back
Aliens
Invaders from Mars
Crocodile Dundee
