In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity # References: C.D. Manning, P. Raghavan and H. Schütze (2008). Introduction to Information Retrieval. Cambridge University Press. https://nlp.stanford.edu/IR-book/html/htmledition/the-vector-space-model-for-scoring-1.html
from sklearn.feature_extraction.text import CountVectorizer
from google.colab import files

In [2]:
uploaded = files.upload()

Saving movies_dataset.csv to movies_dataset.csv


In [3]:
df = pd.read_csv('movies_dataset.csv')
df.head(20)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Movie_id
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,1
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,2
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,3
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,4
5,6,The Great Wall,"Action,Adventure,Fantasy",European mercenaries searching for black powde...,Yimou Zhang,"Matt Damon, Tian Jing, Willem Dafoe, Andy Lau",2016,103,6.1,56036,45.13,42.0,5
6,7,La La Land,"Comedy,Drama,Music",A jazz pianist falls for an aspiring actress i...,Damien Chazelle,"Ryan Gosling, Emma Stone, Rosemarie DeWitt, J....",2016,128,8.3,258682,151.06,93.0,6
7,8,Mindhorn,Comedy,A has-been actor best known for playing the ti...,Sean Foley,"Essie Davis, Andrea Riseborough, Julian Barrat...",2016,89,6.4,2490,,71.0,7
8,9,The Lost City of Z,"Action,Adventure,Biography","A true-life drama, centering on British explor...",James Gray,"Charlie Hunnam, Robert Pattinson, Sienna Mille...",2016,141,7.1,7188,8.01,78.0,8
9,10,Passengers,"Adventure,Drama,Romance",A spacecraft traveling to a distant colony pla...,Morten Tyldum,"Jennifer Lawrence, Chris Pratt, Michael Sheen,...",2016,116,7.0,192177,100.01,41.0,9


In [4]:
df.shape

(1000, 13)

In [5]:
coulmns = ['Actors', 'Director', 'Genre', 'Title']
df[coulmns].head()

Unnamed: 0,Actors,Director,Genre,Title
0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",James Gunn,"Action,Adventure,Sci-Fi",Guardians of the Galaxy
1,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",Ridley Scott,"Adventure,Mystery,Sci-Fi",Prometheus
2,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",M. Night Shyamalan,"Horror,Thriller",Split
3,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",Christophe Lourdelet,"Animation,Comedy,Family",Sing
4,"Will Smith, Jared Leto, Margot Robbie, Viola D...",David Ayer,"Action,Adventure,Fantasy",Suicide Squad


In [6]:
df[coulmns].isnull().values.any()

False

In [7]:
def important_features(data):
  important_features = []
  for i in range(0, data.shape[0]):
    important_features.append(data['Actors'][i] + ' ' + data['Director'][i] + ' '+ data['Genre'][i] + ' ' + data['Title'][i])

  return important_features



In [8]:
df['important_features'] = important_features(df)
df.head()

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore,Movie_id,important_features
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0,0,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S..."
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0,1,"Noomi Rapace, Logan Marshall-Green, Michael Fa..."
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0,2,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar..."
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0,3,"Matthew McConaughey,Reese Witherspoon, Seth Ma..."
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0,4,"Will Smith, Jared Leto, Margot Robbie, Viola D..."


In [9]:
vectorizer = CountVectorizer().fit_transform(df['important_features'])

In [10]:
cs = cosine_similarity(vectorizer)
print(cs)

[[1.         0.1767767  0.06085806 ... 0.0571662  0.06537205 0.        ]
 [0.1767767  1.         0.         ... 0.         0.06933752 0.        ]
 [0.06085806 0.         1.         ... 0.         0.         0.        ]
 ...
 [0.0571662  0.         0.         ... 1.         0.06726728 0.        ]
 [0.06537205 0.06933752 0.         ... 0.06726728 1.         0.07161149]
 [0.         0.         0.         ... 0.         0.07161149 1.        ]]


In [11]:
cs.shape

(1000, 1000)

In [13]:
# write the movie name
title = 'Guardians of the Galaxy'
movie_id = df[df.Title == title]['Movie_id'].values[0]

In [15]:
scores = list(enumerate(cs[movie_id]))

In [19]:
sorted_scores = sorted(scores, key = lambda x:x[1], reverse=True)
sorted_scores = sorted_scores[1:]

In [20]:
print(sorted_scores)

[(48, 0.4001633653325207), (362, 0.4001633653325207), (257, 0.3771236166328254), (94, 0.3600411499115478), (85, 0.34299717028501775), (388, 0.34299717028501775), (162, 0.32444284226152503), (195, 0.32444284226152503), (216, 0.32444284226152503), (710, 0.32444284226152503), (566, 0.31622776601683794), (822, 0.31622776601683794), (87, 0.314970394174356), (558, 0.3086066999241839), (140, 0.3042903097250923), (728, 0.3042903097250923), (944, 0.3042903097250923), (32, 0.2946278254943948), (38, 0.2946278254943948), (76, 0.2946278254943948), (205, 0.2946278254943948), (924, 0.2946278254943948), (8, 0.2858309752375148), (200, 0.2858309752375148), (396, 0.2858309752375148), (712, 0.2858309752375148), (852, 0.2858309752375148), (176, 0.28284271247461906), (126, 0.27777777777777785), (253, 0.27777777777777785), (316, 0.27777777777777785), (325, 0.27777777777777785), (384, 0.27777777777777785), (35, 0.27036903521793754), (60, 0.27036903521793754), (429, 0.27036903521793754), (408, 0.25717224993681

In [22]:
# print the first 10 movies recommended

print('the recommended movies are:\n')
k = 0
for index in sorted_scores:
  movie_title = df[df.Movie_id == index[0]]["Title"].values[0]
  print(k+1, movie_title)
  k += 1
  if k>10:
    break

the recommended movies are:

1 Star Trek Beyond
2 Star Trek Into Darkness
3 Rise of the Planet of the Apes
4 Avengers: Age of Ultron
5 Jurassic World
6 The Wolverine
7 X-Men: Days of Future Past
8 Captain America: The First Avenger
9 Captain America: The Winter Soldier
10 Transformers: Revenge of the Fallen
11 Transformers: Dark of the Moon
