In [3]:
#Load useful libraries
import pandas as pd

movies = pd.read_csv('movie_dataset.csv') #data from UCI repository-- the famous movie_lens

In [4]:
movies.columns #check what data we have

Index(['index', 'budget', 'genres', 'homepage', 'id', 'keywords',
       'original_language', 'original_title', 'overview', 'popularity',
       'production_companies', 'production_countries', 'release_date',
       'revenue', 'runtime', 'spoken_languages', 'status', 'tagline', 'title',
       'vote_average', 'vote_count', 'cast', 'crew', 'director'],
      dtype='object')

In [5]:
features = ['keywords','cast','genres','director'] #pick features to use--this features explain the content of the movie

In [6]:
movies.isna().sum() #check if we have NAS

index                      0
budget                     0
genres                    28
homepage                3091
id                         0
keywords                 412
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
cast                      43
crew                       0
director                  30
dtype: int64

In [7]:
for feature in features:
    movies[feature] = movies[feature].fillna('') #replace NAs with empty string

In [8]:
#combine the features into a single string and store in a column called combine_features
movies['combined_features'] = movies['keywords']+" "+movies['cast']+" "+movies['genres']+" "+movies['director']

In [9]:
movies.iloc[0].combined_features #check that we got it right

'culture clash future space war space colony society Sam Worthington Zoe Saldana Sigourney Weaver Stephen Lang Michelle Rodriguez Action Adventure Fantasy Science Fiction James Cameron'

In [10]:
#let us convert the features to a count matrix using countvectorizer
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()
count_matrix = cv.fit_transform(movies['combined_features'])

In [11]:
# let us find the cosine similarity from the count matrix
from sklearn.metrics.pairwise import cosine_similarity
cosine_sim = cosine_similarity(count_matrix)

In [12]:
#let us define two functions that we'll use to retrieve the index and title of the movies
def get_title_from_index(index):
    return movies[movies.index == index]['title'].values[0]

def get_index_from_Title(title):
    return movies[movies.title == title]['index'].values[0]

Please **concentrate**, this is crucial but also tricky.

In [15]:
#get the movie liked by the user, get similar movies
movie_liked = 'Avatar'
movie_index = get_index_from_Title(movie_liked)
similar_movies = list(enumerate(cosine_sim[movie_index])) #return both index and score of a movie

In [16]:
#sort the movies by score and discard the first one since it is itself
sorted_similar_movies = sorted(similar_movies,key = lambda x:x[1],reverse=True)[1:]

In [17]:
#get the title names of the top k movies
i = 0
k = 5
print ('Top ' +str(k)+' similar movies to ' +movie_liked+ ' are: \n')
for element in sorted_similar_movies:
    print(get_title_from_index(element[0]))
    i+=1
    if i >k:
        break

Top 5 similar movies to Avatar are: 

Guardians of the Galaxy
Aliens
Star Wars: Clone Wars: Volume 1
Star Trek Into Darkness
Star Trek Beyond
Alien


Please Google **'similar movies to avatar'**

<img src="similarmovies.png">

Our simple movie recommendation engine works pretty good. Right? It’s good as a basic level implementation but obviously not anywhere near industry standards. This code be found in my github repository and here is the link https://github.com/AlexSananka/Machine-Learning-with-Python