In [None]:
#import the libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
#load the movie dataset from CSV into a DataFrame
#NOTE(review): hard-coded absolute path; presumably a hosted-notebook mount - adjust when running elsewhere
df=pd.read_csv("/content/Data/movie_data.csv")

In [None]:
#create movie id column: one id per row, equal to the row's position
#(sized from the frame itself so the cell works for any number of rows)
df['Movie_id'] = range(len(df))
df.head(3)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Movie_id
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,1
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,2


In [None]:
#get the number of movies (rows) and the number of columns in the dataset
df.shape

(1000, 13)

In [None]:
#columns whose text is combined to describe each movie
columns = [
  'Actors',
  'Director',
  'Genre',
  'Title',
]

In [None]:
#preview the first three rows, restricted to the important columns
df.loc[:, columns].head(3)

Unnamed: 0,Actors,Director,Genre,Title
0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",James Gunn,"Action,Adventure,Sci-Fi",Guardians of the Galaxy
1,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",Ridley Scott,"Adventure,Mystery,Sci-Fi",Prometheus
2,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",M. Night Shyamalan,"Horror,Thriller",Split


In [None]:
#True if at least one value in the important columns is missing
df[columns].isna().values.any()

False

In [None]:
#create a function to combine the values of the important columns into one string
def get_important_features(data):
  """Build one combined text string per row of ``data``.

  For every row, the 'Actors', 'Director', 'Genre' and 'Title' values are
  joined with single spaces, in that order.

  Rows are read by position, so the frame may carry any index (for example
  after filtering or sorting); it does not have to be labelled 0..n-1.

  Parameters:
    data: DataFrame with string columns 'Actors', 'Director', 'Genre', 'Title'.

  Returns:
    list of str, one entry per row, in row order.

  Raises:
    KeyError: if one of the four columns is missing.
    TypeError: if any value is not a string (e.g. a missing value).
  """
  feature_columns = ['Actors', 'Director', 'Genre', 'Title']
  #zip walks the column values in row order without relying on index labels
  rows = zip(*(data[name] for name in feature_columns))
  return [' '.join(values) for values in rows]

In [None]:
#create a column holding, for each movie, the combined actors/director/genre/title string
df['important_features'] = get_important_features(df)

#show the first three rows, now including the new column
df.head(3)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Movie_id,important_features
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S..."
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,1,"Noomi Rapace, Logan Marshall-Green, Michael Fa..."
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,2,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar..."


In [None]:
#convert each movie's combined text into a row of token counts (default CountVectorizer settings)
cm = CountVectorizer().fit_transform(df['important_features'])

In [None]:
#get the pairwise cosine similarity matrix from the count matrix:
#cs[i][j] is the similarity between the movies in rows i and j of cm
cs = cosine_similarity(cm)
#print the cosine similarity matrix
print(cs)

[[1.         0.1767767  0.06085806 ... 0.0571662  0.06537205 0.        ]
 [0.1767767  1.         0.         ... 0.         0.06933752 0.        ]
 [0.06085806 0.         1.         ... 0.         0.         0.        ]
 ...
 [0.0571662  0.         0.         ... 1.         0.06726728 0.        ]
 [0.06537205 0.06933752 0.         ... 0.06726728 1.         0.07161149]
 [0.         0.         0.         ... 0.         0.07161149 1.        ]]


In [None]:
#get the shape of the cosine similarity matrix (one row and one column per movie)
cs.shape

(1000, 1000)

In [None]:
#get the title of the movie that the user likes
title = 'The Dark Knight'

#find the movie's id; fail with a clear message when the title is not in the dataset
#(an empty selection would otherwise surface as an opaque IndexError)
matching_ids = df.loc[df['Title'] == title, 'Movie_id']
if matching_ids.empty:
  raise ValueError('Movie title %r not found in the dataset' % title)
#if several rows share the title, the first one is used
movie_id = matching_ids.iloc[0]

In [None]:
#pair every movie's row position with its similarity to the liked movie: [(position, score), ...]
scores = [(position, score) for position, score in enumerate(cs[movie_id])]

In [None]:
#rank every other movie by similarity, highest first
#the liked movie is removed by id rather than by dropping the first entry:
#with a tie at the top (another movie with identical features) the first entry
#is not guaranteed to be the liked movie itself
sorted_scores = sorted(
  (pair for pair in scores if pair[0] != movie_id),
  key = lambda pair: pair[1],
  reverse = True
)

In [None]:
#print the sorted (movie position, similarity score) pairs, most similar first
#NOTE(review): this prints the whole list; consider slicing (e.g. sorted_scores[:10]) to keep the output short
print(sorted_scores)

[(64, 0.5), (124, 0.5), (655, 0.3638034375544995), (759, 0.32274861218395134), (424, 0.3125), (852, 0.30316953129541624), (223, 0.2727723627949905), (89, 0.2672612419124244), (347, 0.2672612419124244), (482, 0.2672612419124244), (511, 0.2672612419124244), (856, 0.2672612419124244), (391, 0.2581988897471611), (521, 0.2581988897471611), (663, 0.2581988897471611), (738, 0.2581988897471611), (70, 0.25), (142, 0.25), (601, 0.25), (65, 0.24253562503633297), (107, 0.24253562503633297), (516, 0.24253562503633297), (700, 0.24253562503633297), (938, 0.24253562503633297), (438, 0.23570226039551587), (535, 0.23570226039551587), (821, 0.23570226039551587), (604, 0.22941573387056174), (62, 0.22360679774997896), (566, 0.22360679774997896), (135, 0.2182178902359924), (308, 0.20851441405707477), (59, 0.20801257358446093), (890, 0.20801257358446093), (26, 0.2004459314343183), (192, 0.2004459314343183), (240, 0.2004459314343183), (293, 0.2004459314343183), (329, 0.2004459314343183), (372, 0.2004459314343

In [None]:
#print the titles of the most similar movies
top_n = 7  #number of recommendations; used for both the heading and the loop
print('The', top_n, 'Most recommended movies to', title, 'are:\n')
#slicing caps the list at top_n entries and is safe when fewer are available
for rank, item in enumerate(sorted_scores[:top_n], start=1):
  #item is (movie id, similarity score); look the title up by id
  movie_title = df[df.Movie_id == item[0]]['Title'].values[0]
  print(rank, movie_title)

The 7 Most recommended movies to The Dark Knight are:

1 The Prestige
2 The Dark Knight Rises
3 Public Enemies
4 The Fighter
5 London Has Fallen
6 Out of the Furnace
7 The Girl with the Dragon Tattoo
