In [122]:
#Description: Build a movie recommendation engine using python

## Import Libraries

In [123]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

## Import Data

In [124]:
df=pd.read_csv('MoviesData.csv')

In [125]:
df.head()

Unnamed: 0,index,tmdbId,title,release_year,release_day,genres,original_language,runtime,content,production_companies,...,production_countries,status,popularity,vote_average,vote_count,keywords,cast,director,rating_count,mean_rating
0,0,119450,Dawn of the Planet of the Apes,2014,Thursday,Science Fiction|Action|Drama|Thriller,English,130.0,A group of scientists in San Francisco struggl...,Ingenious Media|Chernin Entertainment|TSG Ente...,...,United States of America,Released,75.385211,7.3,4511.0,"['leader', 'colony', 'post-apocalyptic', 'dyst...",Andy Serkis|Jason Clarke|Gary Oldman|Keri Russ...,Matt Reeves,341.0,4.054252
1,1,2124,Color of Night,1994,Friday,Drama|Mystery|Romance|Thriller,English,121.0,When New York psychiatrist Bill Capa visits Lo...,Hollywood Pictures|Cinergi Pictures Entertainment,...,United States of America,Released,14.228963,5.4,117.0,"['suicide', 'california', 'sex', 'secret ident...",Bruce Willis|Jane March|Rubén Blades|Lesley An...,Richard Rush,324.0,4.256173
2,2,75656,Now You See Me,2013,Wednesday,Thriller|Crime,English,115.0,An FBI agent and an Interpol detective track a...,Summit Entertainment|K/O Paper Products|SOIXAN...,...,United States of America|France,Released,17.852022,7.3,5635.0,"['paris', 'bank', 'secret', 'fbi', 'vault', 'm...",Jesse Eisenberg|Mark Ruffalo|Woody Harrelson|M...,Louis Leterrier,311.0,4.487138
3,3,567,Rear Window,1954,Sunday,Drama|Mystery|Thriller,English,112.0,"Professional photographer L.B. ""Jeff"" Jeffries...",Paramount Pictures,...,United States of America,Released,17.911314,8.2,1531.0,"['nurse', 'photographer', 'suspicion of murder...",James Stewart|Grace Kelly|Wendell Corey|Thelma...,Alfred Hitchcock,304.0,4.138158
4,4,24428,The Avengers,2012,Wednesday,Science Fiction|Action|Adventure,English,143.0,When an unexpected enemy emerges and threatens...,Paramount Pictures|Marvel Studios,...,United States of America,Released,89.887648,7.4,12000.0,"['new york', 'shield', 'marvel comic', 'superh...",Robert Downey Jr.|Chris Evans|Mark Ruffalo|Chr...,Joss Whedon,291.0,4.221649


In [126]:
#Get a count of the number of rows/movies in the dataset and the number of the columns 
df.shape

(9081, 22)

In [140]:
#Create a list of important columns for the recommendation engine
columns = ['title','genres', 'director', 'cast']

In [141]:
df2= df[columns]
df2.head()

Unnamed: 0,title,genres,director,cast
0,Dawn of the Planet of the Apes,Science Fiction|Action|Drama|Thriller,Matt Reeves,Andy Serkis|Jason Clarke|Gary Oldman|Keri Russ...
1,Color of Night,Drama|Mystery|Romance|Thriller,Richard Rush,Bruce Willis|Jane March|Rubén Blades|Lesley An...
2,Now You See Me,Thriller|Crime,Louis Leterrier,Jesse Eisenberg|Mark Ruffalo|Woody Harrelson|M...
3,Rear Window,Drama|Mystery|Thriller,Alfred Hitchcock,James Stewart|Grace Kelly|Wendell Corey|Thelma...
4,The Avengers,Science Fiction|Action|Adventure,Joss Whedon,Robert Downey Jr.|Chris Evans|Mark Ruffalo|Chr...


In [142]:
#Check for any missing values in the important columns
df2.isnull().sum()

title        0
genres      35
director     0
cast        90
dtype: int64

In [143]:
df2.dropna(how = 'any', inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2.dropna(how = 'any', inplace=True)


In [144]:
df2.isnull().sum()

title       0
genres      0
director    0
cast        0
dtype: int64

In [145]:
df2.shape

(8962, 4)

### df2 and df merged 

In [146]:
df3= pd.merge(df,df2)
df3.head()

Unnamed: 0,index,tmdbId,title,release_year,release_day,genres,original_language,runtime,content,production_companies,...,production_countries,status,popularity,vote_average,vote_count,keywords,cast,director,rating_count,mean_rating
0,0,119450,Dawn of the Planet of the Apes,2014,Thursday,Science Fiction|Action|Drama|Thriller,English,130.0,A group of scientists in San Francisco struggl...,Ingenious Media|Chernin Entertainment|TSG Ente...,...,United States of America,Released,75.385211,7.3,4511.0,"['leader', 'colony', 'post-apocalyptic', 'dyst...",Andy Serkis|Jason Clarke|Gary Oldman|Keri Russ...,Matt Reeves,341.0,4.054252
1,1,2124,Color of Night,1994,Friday,Drama|Mystery|Romance|Thriller,English,121.0,When New York psychiatrist Bill Capa visits Lo...,Hollywood Pictures|Cinergi Pictures Entertainment,...,United States of America,Released,14.228963,5.4,117.0,"['suicide', 'california', 'sex', 'secret ident...",Bruce Willis|Jane March|Rubén Blades|Lesley An...,Richard Rush,324.0,4.256173
2,2,75656,Now You See Me,2013,Wednesday,Thriller|Crime,English,115.0,An FBI agent and an Interpol detective track a...,Summit Entertainment|K/O Paper Products|SOIXAN...,...,United States of America|France,Released,17.852022,7.3,5635.0,"['paris', 'bank', 'secret', 'fbi', 'vault', 'm...",Jesse Eisenberg|Mark Ruffalo|Woody Harrelson|M...,Louis Leterrier,311.0,4.487138
3,3,567,Rear Window,1954,Sunday,Drama|Mystery|Thriller,English,112.0,"Professional photographer L.B. ""Jeff"" Jeffries...",Paramount Pictures,...,United States of America,Released,17.911314,8.2,1531.0,"['nurse', 'photographer', 'suspicion of murder...",James Stewart|Grace Kelly|Wendell Corey|Thelma...,Alfred Hitchcock,304.0,4.138158
4,4,24428,The Avengers,2012,Wednesday,Science Fiction|Action|Adventure,English,143.0,When an unexpected enemy emerges and threatens...,Paramount Pictures|Marvel Studios,...,United States of America,Released,89.887648,7.4,12000.0,"['new york', 'shield', 'marvel comic', 'superh...",Robert Downey Jr.|Chris Evans|Mark Ruffalo|Chr...,Joss Whedon,291.0,4.221649


In [147]:
df3.shape

(8962, 22)

In [148]:
df3.reset_index(inplace=True)

In [149]:
df3.rename({'level_0':'indexx'}, axis = 'columns', inplace=True)

In [150]:
#Create a function to combine the values of the important columns into a single string
def get_important_features(data):
    important_features = []
    for i in range(0,data.shape[0]):
        important_features.append(data['director'][i]+' '+data['genres'][i]+' '+data['title'][i]+ ' '+data['cast'][i])
    return important_features

In [151]:
df3.head()

Unnamed: 0,indexx,index,tmdbId,title,release_year,release_day,genres,original_language,runtime,content,...,production_countries,status,popularity,vote_average,vote_count,keywords,cast,director,rating_count,mean_rating
0,0,0,119450,Dawn of the Planet of the Apes,2014,Thursday,Science Fiction|Action|Drama|Thriller,English,130.0,A group of scientists in San Francisco struggl...,...,United States of America,Released,75.385211,7.3,4511.0,"['leader', 'colony', 'post-apocalyptic', 'dyst...",Andy Serkis|Jason Clarke|Gary Oldman|Keri Russ...,Matt Reeves,341.0,4.054252
1,1,1,2124,Color of Night,1994,Friday,Drama|Mystery|Romance|Thriller,English,121.0,When New York psychiatrist Bill Capa visits Lo...,...,United States of America,Released,14.228963,5.4,117.0,"['suicide', 'california', 'sex', 'secret ident...",Bruce Willis|Jane March|Rubén Blades|Lesley An...,Richard Rush,324.0,4.256173
2,2,2,75656,Now You See Me,2013,Wednesday,Thriller|Crime,English,115.0,An FBI agent and an Interpol detective track a...,...,United States of America|France,Released,17.852022,7.3,5635.0,"['paris', 'bank', 'secret', 'fbi', 'vault', 'm...",Jesse Eisenberg|Mark Ruffalo|Woody Harrelson|M...,Louis Leterrier,311.0,4.487138
3,3,3,567,Rear Window,1954,Sunday,Drama|Mystery|Thriller,English,112.0,"Professional photographer L.B. ""Jeff"" Jeffries...",...,United States of America,Released,17.911314,8.2,1531.0,"['nurse', 'photographer', 'suspicion of murder...",James Stewart|Grace Kelly|Wendell Corey|Thelma...,Alfred Hitchcock,304.0,4.138158
4,4,4,24428,The Avengers,2012,Wednesday,Science Fiction|Action|Adventure,English,143.0,When an unexpected enemy emerges and threatens...,...,United States of America,Released,89.887648,7.4,12000.0,"['new york', 'shield', 'marvel comic', 'superh...",Robert Downey Jr.|Chris Evans|Mark Ruffalo|Chr...,Joss Whedon,291.0,4.221649


In [139]:
#Create a column to hold the combined strings
df3['important_features'] = get_important_features(df3)

TypeError: can only concatenate str (not "float") to str

In [119]:
#Show the data
df3.head(3)

Unnamed: 0,indexx,index,tmdbId,title,release_year,release_day,genres,original_language,runtime,content,...,status,popularity,vote_average,vote_count,keywords,cast,director,rating_count,mean_rating,important_features
0,0,0,119450,Dawn of the Planet of the Apes,2014,Thursday,Science Fiction|Action|Drama|Thriller,English,130.0,A group of scientists in San Francisco struggl...,...,Released,75.385211,7.3,4511.0,"['leader', 'colony', 'post-apocalyptic', 'dyst...",Andy Serkis|Jason Clarke|Gary Oldman|Keri Russ...,Matt Reeves,341.0,4.054252,Matt Reeves Science Fiction|Action|Drama|Thril...
1,1,1,2124,Color of Night,1994,Friday,Drama|Mystery|Romance|Thriller,English,121.0,When New York psychiatrist Bill Capa visits Lo...,...,Released,14.228963,5.4,117.0,"['suicide', 'california', 'sex', 'secret ident...",Bruce Willis|Jane March|Rubén Blades|Lesley An...,Richard Rush,324.0,4.256173,Richard Rush Drama|Mystery|Romance|Thriller Co...
2,2,2,75656,Now You See Me,2013,Wednesday,Thriller|Crime,English,115.0,An FBI agent and an Interpol detective track a...,...,Released,17.852022,7.3,5635.0,"['paris', 'bank', 'secret', 'fbi', 'vault', 'm...",Jesse Eisenberg|Mark Ruffalo|Woody Harrelson|M...,Louis Leterrier,311.0,4.487138,Louis Leterrier Thriller|Crime Now You See Me


In [109]:
#Convert the text to a matrix of token counts
cm = CountVectorizer().fit_transform(df3['important_features'])

In [110]:
#Get the cosine similarity matrix from the count matrix
cs = cosine_similarity(cm)

#Print the cosine similarity matrix
print(cs)

[[1.         0.31426968 0.08333333 ... 0.16666667 0.15713484 0.15713484]
 [0.31426968 1.         0.11785113 ... 0.         0.11111111 0.        ]
 [0.08333333 0.11785113 1.         ... 0.         0.         0.        ]
 ...
 [0.16666667 0.         0.         ... 1.         0.11785113 0.23570226]
 [0.15713484 0.11111111 0.         ... 0.11785113 1.         0.11111111]
 [0.15713484 0.         0.         ... 0.23570226 0.11111111 1.        ]]


In [111]:
#Get the shape of the cosine similarity matrix
cs.shape

(9046, 9046)

In [112]:
#Get the title of one movie
title='Dawn of the Planet of the Apes'
title2= 'Get Him to the Greek'

#Find the movie id
movie_id= df3[df3.title == title]['index'].values[0]
movie_id2= df3[df3.title == title2]['indexx'].values[0]

In [113]:
movie_id2

9043

In [114]:
#Create a list of enumerations for the similarity score [ (movie_id, similarity score), (...)]
scores = list(enumerate(cs[movie_id]))
scores2 = list(enumerate(cs[movie_id2]))

In [115]:
#Sort the list
sorted_scores =  sorted(scores, key= lambda x: x[1], reverse = True)
sorted_scores = sorted_scores[1:]
sorted_scores2 =  sorted(scores2, key= lambda x: x[1], reverse = True)
sorted_scores2 = sorted_scores2[1:]

In [68]:
#Print the sorted scores
print(sorted_scores)

[(978, 0.8333333333333335), (6237, 0.7660323462854266), (9, 0.7106690545187015), (1349, 0.7106690545187015), (1095, 0.6929348671835832), (7032, 0.6929348671835832), (4525, 0.669438681395203), (7577, 0.669438681395203), (6391, 0.629940788348712), (163, 0.596284793999944), (2760, 0.5948118774794626), (7752, 0.5883484054145521), (7972, 0.5883484054145521), (9004, 0.5883484054145521), (1583, 0.5689945423921312), (6545, 0.5689945423921312), (220, 0.5685352436149612), (5005, 0.5685352436149612), (7168, 0.5685352436149612), (1479, 0.5499719409228704), (6320, 0.5499719409228704), (1636, 0.5499719409228703), (3027, 0.5499719409228703), (1585, 0.5477225575051661), (2587, 0.5477225575051661), (1685, 0.5443310539518175), (3487, 0.5443310539518175), (4586, 0.5443310539518175), (7121, 0.5443310539518175), (7881, 0.5443310539518175), (8653, 0.5443310539518175), (1508, 0.5407380704358751), (646, 0.5406205059012896), (6351, 0.5406205059012896), (6278, 0.5345224838248488), (2326, 0.5303300858899107), (3

In [116]:
#Print the sorted scores
print(sorted_scores2)

[(6240, 0.5345224838248487), (6980, 0.5303300858899106), (1354, 0.4330127018922194), (2466, 0.40089186286863654), (7277, 0.40089186286863654), (250, 0.3749999999999999), (849, 0.3749999999999999), (868, 0.3749999999999999), (2479, 0.3749999999999999), (3042, 0.3749999999999999), (7745, 0.3749999999999999), (1789, 0.36514837167011066), (5537, 0.36514837167011066), (2588, 0.35355339059327373), (2985, 0.35355339059327373), (3339, 0.35355339059327373), (7753, 0.35355339059327373), (8461, 0.35355339059327373), (8870, 0.35355339059327373), (6845, 0.34299717028501764), (348, 0.33541019662496846), (2650, 0.33541019662496846), (3631, 0.33541019662496846), (4379, 0.33541019662496846), (6042, 0.33541019662496846), (7495, 0.33541019662496846), (623, 0.324442842261525), (359, 0.3198010745334156), (2742, 0.3198010745334156), (3967, 0.3198010745334156), (5035, 0.3198010745334156), (5046, 0.3198010745334156), (5946, 0.3198010745334156), (6565, 0.3198010745334156), (6731, 0.3198010745334156), (8278, 0.

In [118]:
#Create a loop to print the 7 similar movies
j=0
print('The 7 most recommended movies to', title, 'are: \n')
for item in sorted_scores:
    movie_title = df3[df3.indexx == item[0]]['title'].values[0]
    print(j+1, movie_title)
    j+=1
    if j>6:
        break

The 7 most recommended movies to Dawn of the Planet of the Apes are: 

1 Rise of the Planet of the Apes
2 Conquest of the Planet of the Apes
3 Planet of the Apes
4 Planet of the Apes
5 Battle for the Planet of the Apes
6 Escape from the Planet of the Apes
7 Until the End of the World


In [120]:
#Create a loop to print the 7 similar movies
j=0
print('The 7 most recommended movies to', title, 'are: \n')
for item in sorted_scores2:
    movie_title = df3[df3.indexx == item[0]]['title'].values[0]
    print(j+1, movie_title)
    j+=1
    if j>6:
        break

The 7 most recommended movies to Dawn of the Planet of the Apes are: 

1 The Five-Year Engagement
2 Neighbors
3 Neighbors 2: Sorority Rising
4 The History Boys
5 Welcome to the Jungle
6 Forgetting Sarah Marshall
7 Back to the Beach
