<a href="https://colab.research.google.com/github/Derek-Martineau/Games/blob/main/Movie_Recommendation_Engine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#import the libraries
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
#read the movie dataset from its CSV file into a DataFrame
movie_csv_path = "/content/movie_data.csv"
df = pd.read_csv(movie_csv_path)

In [None]:
#create movie id column
#one sequential id per row, derived from the actual row count so the cell
#still works when the CSV does not contain exactly 1000 movies
df['Movie_id'] = range(len(df))
df.head(3)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Movie_id
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,1
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,2


In [None]:
#display the column labels of the dataset
df.columns

Index(['Rank', 'Title', 'Genre', 'Description', 'Director', 'Actors', 'Year',
       'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)',
       'Metascore', 'Movie_id'],
      dtype='object')

In [None]:
#display the dataset dimensions as (number of rows, number of columns),
#i.e. how many movies and how many columns the dataset has
df.shape

(1000, 13)

In [None]:
#columns whose text is combined to describe each movie
columns = [
  'Actors',
  'Director',
  'Genre',
  'Title',
]

In [None]:
#preview the first rows, restricted to the important columns
df.loc[:, columns].head()

Unnamed: 0,Actors,Director,Genre,Title
0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",James Gunn,"Action,Adventure,Sci-Fi",Guardians of the Galaxy
1,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",Ridley Scott,"Adventure,Mystery,Sci-Fi",Prometheus
2,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",M. Night Shyamalan,"Horror,Thriller",Split
3,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",Christophe Lourdelet,"Animation,Comedy,Family",Sing
4,"Will Smith, Jared Leto, Margot Robbie, Viola D...",David Ayer,"Action,Adventure,Fantasy",Suicide Squad


In [None]:
#True if at least one cell in the important columns is missing, else False
df[columns].isna().to_numpy().any()

False

In [None]:
#create a function to combine the values of the important columns into one string
def get_important_features(data):
  """Build one combined feature string per row of ``data``.

  For every row, the values of the 'Actors', 'Director', 'Genre' and 'Title'
  columns are concatenated in that order, separated by single spaces.

  Rows are walked by position, not by index label, so the function also
  works on frames whose index is not 0..n-1 (for example a filtered frame).

  Parameters:
    data: DataFrame with string columns 'Actors', 'Director', 'Genre' and
      'Title'.

  Returns:
    A list of strings, one per row, in row order.

  Raises:
    KeyError: if one of the four columns is missing.
    TypeError: if a value is not a string (for example a missing value).
  """
  return [
    ' '.join(parts)
    for parts in zip(data['Actors'], data['Director'], data['Genre'], data['Title'])
  ]

In [None]:
#print the function object itself: the function is not called here, so the
#output is its repr and not the combined feature strings
print(get_important_features)

<function get_important_features at 0x7f8f91689940>


In [None]:
#build the combined feature string of every movie, then store the result
#in a new 'important_features' column
combined_features = get_important_features(df)
df['important_features'] = combined_features

#preview the first rows, now including the new column
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Movie_id,important_features
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S..."
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,1,"Noomi Rapace, Logan Marshall-Green, Michael Fa..."
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,2,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar..."
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,3,"Matthew McConaughey,Reese Witherspoon, Seth Ma..."
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,4,"Will Smith, Jared Leto, Margot Robbie, Viola D..."


In [None]:
#convert the text to a matrix of token counts
#the combined strings are tokenized and counted because ML models cannot
#interpret natural-language text directly; they need numeric input
vectorizer = CountVectorizer()
cm = vectorizer.fit_transform(df['important_features'])

In [None]:
#get the cosine similarity matrix from the count matrix:
#entry [i][j] is the similarity between the token counts of row i and row j
cs = cosine_similarity(cm)
#print the cosine similarity matrix
print(cs)

[[1.         0.1767767  0.06085806 ... 0.0571662  0.06537205 0.        ]
 [0.1767767  1.         0.         ... 0.         0.06933752 0.        ]
 [0.06085806 0.         1.         ... 0.         0.         0.        ]
 ...
 [0.0571662  0.         0.         ... 1.         0.06726728 0.        ]
 [0.06537205 0.06933752 0.         ... 0.06726728 1.         0.07161149]
 [0.         0.         0.         ... 0.         0.07161149 1.        ]]


In [None]:
#display the shape of the cosine similarity matrix as (rows, columns)
cs.shape

(1000, 1000)

In [None]:
#get the title of the movie that the user likes
title = 'Deadpool'

#find the movie's id
#select with a single .loc call and fail with a clear message when the title
#is not in the dataset (indexing an empty result would raise a bare IndexError)
matching_ids = df.loc[df['Title'] == title, 'Movie_id']
if matching_ids.empty:
  raise ValueError('No movie titled {!r} found in the dataset'.format(title))
#if several rows share the title, the first one is used
movie_id = matching_ids.iloc[0]

In [None]:
#pair every movie index with its similarity score to the chosen movie,
#giving a list of (index, score) tuples
scores = [(index, score) for index, score in enumerate(cs[movie_id])]

In [None]:
#sort the (movie index, score) pairs from most to least similar
#the chosen movie is excluded by its id rather than by dropping the first
#entry: another movie with a score tied at the top could otherwise be
#removed in its place, leaving the chosen movie in its own recommendations
sorted_scores = sorted(
  (pair for pair in scores if pair[0] != movie_id),
  key = lambda pair: pair[1],
  reverse = True,
)

In [None]:
#print the sorted (movie index, similarity score) pairs
#NOTE(review): this prints the entire list; consider a slice such as
#sorted_scores[:10] to keep the cell output short
print(sorted_scores)

[(448, 0.3450327796711771), (998, 0.28644594961577313), (673, 0.2581988897471611), (993, 0.2581988897471611), (8, 0.25048971643405976), (268, 0.25048971643405976), (67, 0.24343224778007383), (921, 0.2369395511036369), (115, 0.21483446221182984), (646, 0.21483446221182984), (846, 0.21483446221182984), (346, 0.20701966780270625), (372, 0.20701966780270625), (409, 0.20701966780270625), (464, 0.20701966780270625), (492, 0.20701966780270625), (565, 0.20701966780270625), (951, 0.20701966780270625), (14, 0.19999999999999996), (119, 0.19999999999999996), (179, 0.19999999999999996), (217, 0.19999999999999996), (331, 0.19999999999999996), (456, 0.19999999999999996), (597, 0.19999999999999996), (618, 0.19999999999999996), (688, 0.19999999999999996), (715, 0.19999999999999996), (728, 0.19999999999999996), (810, 0.19999999999999996), (833, 0.19999999999999996), (95, 0.19364916731037082), (104, 0.19364916731037082), (199, 0.19364916731037082), (220, 0.19364916731037082), (537, 0.19364916731037082), 

In [None]:
#print the top_n movies most similar to the chosen title
#top_n drives both the heading and the loop, so the two cannot drift apart
top_n = 7
print('The', top_n, 'Most recommended movies to watch, similar to', title, 'are:\n')

#sorted_scores is ordered from most to least similar, so its first top_n
#entries are the recommendations; enumerate supplies the 1-based rank
for rank, (similar_id, _score) in enumerate(sorted_scores[:top_n], start=1):
  #look up the title of the movie whose Movie_id matches this entry
  movie_title = df.loc[df['Movie_id'] == similar_id, 'Title'].iloc[0]
  print(rank, movie_title)

The 7 Most recommended movies to watch, similar to Deadpool are:

1 Big Hero 6
2 Search Party
3 Green Lantern
4 Resident Evil: Afterlife
5 The Lost City of Z
6 X-Men Origins: Wolverine
7 Mad Max: Fury Road
