In [None]:
# Building a movie recommendation engine using python
import pandas as pd
import numpy as np 
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
#Load data 

from google.colab import files
uploaded = files.upload()

Saving movie_data.csv to movie_data (1).csv


In [None]:
#store data 

df = pd.read_csv('movie_data.csv')
df.head(3)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0


In [None]:
# get count
df.shape

(1000, 12)

In [None]:
# create a list of important columns 

columns = ['Actors', 'Director', 'Genre', 'Title']

In [None]:
# show the data 
df[columns].head(3)

Unnamed: 0,Actors,Director,Genre,Title
0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",James Gunn,"Action,Adventure,Sci-Fi",Guardians of the Galaxy
1,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",Ridley Scott,"Adventure,Mystery,Sci-Fi",Prometheus
2,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",M. Night Shyamalan,"Horror,Thriller",Split


In [None]:
# check for any missing values in the imported columns 

df[columns].isnull().values.any()

False

In [None]:
# create a function to combine 
def get_important_features(data):
  important_features = []
  for i in range(0, data.shape[0]):
    important_features.append(data['Actors'][i]+' ' + data['Director'][i] + ' ' + data['Title'][i])
  return important_features

In [None]:
# create a column to hold the combined strings
df['important_features'] = get_important_features(df)

# show
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,important_features
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S..."
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,"Noomi Rapace, Logan Marshall-Green, Michael Fa..."
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar..."
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,"Matthew McConaughey,Reese Witherspoon, Seth Ma..."
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,"Will Smith, Jared Leto, Margot Robbie, Viola D..."


In [None]:
df.columns

Index(['Rank', 'Title', 'Genre', 'Description', 'Director', 'Actors', 'Year',
       'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)',
       'Metascore', 'important_features'],
      dtype='object')

In [None]:
# convert the text to a matrix of token counts 

cm = CountVectorizer().fit_transform(df['important_features'])

In [None]:
# get the cosine similarity matrix from the count matrix 

cs = cosine_similarity(cm)

print(cs)

[[1.         0.         0.07412493 ... 0.07142857 0.         0.        ]
 [0.         1.         0.         ... 0.         0.         0.        ]
 [0.07412493 0.         1.         ... 0.         0.         0.        ]
 ...
 [0.07142857 0.         0.         ... 1.         0.0805823  0.        ]
 [0.         0.         0.         ... 0.0805823  1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


In [None]:
cs.shape

(1000, 1000)

In [None]:
# get the title of the movie that the user likes 
title = 'Suicide Squad'

movie_id = df[df.Title == title]['Rank'].values[0] # at position 0


In [None]:
# create a list of enumerations for the similarity score

scores = list(enumerate(cs[movie_id]))

In [None]:
sorted_scores = sorted(scores, key = lambda x: x[1], reverse = True) # x corresponds to scores and x[1] element in position 1 , in descending order
#sorted_scores
sorted_scores = sorted_scores[1:]

In [None]:
print(sorted_scores)

[(99, 0.24019223070763074), (102, 0.24019223070763074), (176, 0.23652495839563303), (416, 0.23076923076923078), (427, 0.23076923076923078), (706, 0.23076923076923078), (604, 0.20180183819889375), (257, 0.18156825980064073), (313, 0.16984155512168939), (526, 0.16724840200141816), (622, 0.16724840200141816), (909, 0.16724840200141816), (53, 0.16012815380508716), (389, 0.16012815380508716), (743, 0.16012815380508716), (907, 0.16012815380508716), (17, 0.15384615384615385), (95, 0.15384615384615385), (137, 0.15384615384615385), (593, 0.15384615384615385), (720, 0.14824986333222026), (82, 0.14322297480788657), (271, 0.14322297480788657), (440, 0.14322297480788657), (779, 0.14322297480788657), (62, 0.1345345587992625), (302, 0.1345345587992625), (135, 0.1307440900921227), (209, 0.1307440900921227), (223, 0.1307440900921227), (518, 0.1307440900921227), (558, 0.1307440900921227), (843, 0.1307440900921227), (510, 0.12725695259515554), (739, 0.12725695259515554), (978, 0.12725695259515554), (308,

In [None]:
# create a loop to print the first seven similar movies

j = 0 
print ('The 7 most recommended movies to:' , title, 'are:\n')
for item in sorted_scores:
  movie_title = df[df.Rank == item[0]]['Title'].values[0]
  print (j+1, movie_title)
  j = j + 1 
  if j>6:
    break

The 7 most recommended movies to: Suicide Squad are:

1 Personal Shopper
2 Thor
3 Tomorrowland
4 Tusk
5 Sin City: A Dame to Kill For
6 The Neighbor
7 Kung Fu Panda 3
