In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
data = pd.read_csv("movie_data.csv")
data.head()

Unnamed: 0,director_name,duration,actor_2_name,genres,actor_1_name,movie_title,num_voted_users,actor_3_name,movie_imdb_link,num_user_for_reviews,language,country,title_year,imdb_score
0,James Cameron,178.0,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,Wes Studi,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,3054.0,English,USA,2009.0,7.9
1,Gore Verbinski,169.0,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,Jack Davenport,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,1238.0,English,USA,2007.0,7.1
2,Sam Mendes,148.0,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,Stephanie Sigman,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,994.0,English,UK,2015.0,6.8
3,Christopher Nolan,164.0,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,Joseph Gordon-Levitt,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,2701.0,English,USA,2012.0,8.5
4,Doug Walker,,Rob Walker,Documentary,Doug Walker,Star Wars: Episode VII - The Force Awakens ...,8,,http://www.imdb.com/title/tt5289954/?ref_=fn_t...,,,,,7.1


In [3]:
data.shape

(5043, 14)

# Data wrangling

### Missing value

In [4]:
data.isna().sum()

director_name           104
duration                 15
actor_2_name             13
genres                    0
actor_1_name              7
movie_title               0
num_voted_users           0
actor_3_name             23
movie_imdb_link           0
num_user_for_reviews     21
language                 12
country                   5
title_year              108
imdb_score                0
dtype: int64

In [5]:
# These are user define value, we can't fill these value
# Also there has 5043 data

data.dropna(inplace = True)
data.shape

(4884, 14)

### ckeck duplicated in title column

In [6]:
data["movie_title"].duplicated().sum()

124

124 Duplications(drop it)

In [7]:
data = data[~data["movie_title"].duplicated()]       # boolean masking - filtering
data.head()

Unnamed: 0,director_name,duration,actor_2_name,genres,actor_1_name,movie_title,num_voted_users,actor_3_name,movie_imdb_link,num_user_for_reviews,language,country,title_year,imdb_score
0,James Cameron,178.0,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,Wes Studi,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,3054.0,English,USA,2009.0,7.9
1,Gore Verbinski,169.0,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,Jack Davenport,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,1238.0,English,USA,2007.0,7.1
2,Sam Mendes,148.0,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,Stephanie Sigman,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,994.0,English,UK,2015.0,6.8
3,Christopher Nolan,164.0,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,Joseph Gordon-Levitt,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,2701.0,English,USA,2012.0,8.5
5,Andrew Stanton,132.0,Samantha Morton,Action|Adventure|Sci-Fi,Daryl Sabara,John Carter,212204,Polly Walker,http://www.imdb.com/title/tt0401729/?ref_=fn_t...,738.0,English,USA,2012.0,6.6


In [8]:
data.shape

(4760, 14)

In [9]:
# reindex

data.reset_index(inplace = True)
data.drop("index", axis = 1, inplace = True)
data.head()

Unnamed: 0,director_name,duration,actor_2_name,genres,actor_1_name,movie_title,num_voted_users,actor_3_name,movie_imdb_link,num_user_for_reviews,language,country,title_year,imdb_score
0,James Cameron,178.0,Joel David Moore,Action|Adventure|Fantasy|Sci-Fi,CCH Pounder,Avatar,886204,Wes Studi,http://www.imdb.com/title/tt0499549/?ref_=fn_t...,3054.0,English,USA,2009.0,7.9
1,Gore Verbinski,169.0,Orlando Bloom,Action|Adventure|Fantasy,Johnny Depp,Pirates of the Caribbean: At World's End,471220,Jack Davenport,http://www.imdb.com/title/tt0449088/?ref_=fn_t...,1238.0,English,USA,2007.0,7.1
2,Sam Mendes,148.0,Rory Kinnear,Action|Adventure|Thriller,Christoph Waltz,Spectre,275868,Stephanie Sigman,http://www.imdb.com/title/tt2379713/?ref_=fn_t...,994.0,English,UK,2015.0,6.8
3,Christopher Nolan,164.0,Christian Bale,Action|Thriller,Tom Hardy,The Dark Knight Rises,1144337,Joseph Gordon-Levitt,http://www.imdb.com/title/tt1345836/?ref_=fn_t...,2701.0,English,USA,2012.0,8.5
4,Andrew Stanton,132.0,Samantha Morton,Action|Adventure|Sci-Fi,Daryl Sabara,John Carter,212204,Polly Walker,http://www.imdb.com/title/tt0401729/?ref_=fn_t...,738.0,English,USA,2012.0,6.6


# Content Based Recommenders

In [10]:
# use content of each movies

In [11]:
df = data[["movie_title", "genres", "director_name", "actor_1_name","actor_2_name", "actor_3_name", "language"]]
df.head()

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name,language
0,Avatar,Action|Adventure|Fantasy|Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,English
1,Pirates of the Caribbean: At World's End,Action|Adventure|Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,English
2,Spectre,Action|Adventure|Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,English
3,The Dark Knight Rises,Action|Thriller,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,English
4,John Carter,Action|Adventure|Sci-Fi,Andrew Stanton,Daryl Sabara,Samantha Morton,Polly Walker,English


In [12]:
# we can't do that in separate columns, (combine all contenet)

x = " ".join(np.array(df.iloc[0, 1:]))
x

'Action|Adventure|Fantasy|Sci-Fi James Cameron CCH Pounder Joel David Moore Wes Studi English'

In [13]:
" ".join(x.split("|"))

'Action Adventure Fantasy Sci-Fi James Cameron CCH Pounder Joel David Moore Wes Studi English'

In [14]:
" ".join(" ".join(np.array(df.iloc[0, 1:])).split("|"))

'Action Adventure Fantasy Sci-Fi James Cameron CCH Pounder Joel David Moore Wes Studi English'

In [15]:
features = []

for i in range(df.shape[0]):
    features.append(" ".join(" ".join(np.array(df.iloc[i, 1:])).split("|")))
    
features

['Action Adventure Fantasy Sci-Fi James Cameron CCH Pounder Joel David Moore Wes Studi English',
 'Action Adventure Fantasy Gore Verbinski Johnny Depp Orlando Bloom Jack Davenport English',
 'Action Adventure Thriller Sam Mendes Christoph Waltz Rory Kinnear Stephanie Sigman English',
 'Action Thriller Christopher Nolan Tom Hardy Christian Bale Joseph Gordon-Levitt English',
 'Action Adventure Sci-Fi Andrew Stanton Daryl Sabara Samantha Morton Polly Walker English',
 'Action Adventure Romance Sam Raimi J.K. Simmons James Franco Kirsten Dunst English',
 'Adventure Animation Comedy Family Fantasy Musical Romance Nathan Greno Brad Garrett Donna Murphy M.C. Gainey English',
 'Action Adventure Sci-Fi Joss Whedon Chris Hemsworth Robert Downey Jr. Scarlett Johansson English',
 'Adventure Family Fantasy Mystery David Yates Alan Rickman Daniel Radcliffe Rupert Grint English',
 'Action Adventure Sci-Fi Zack Snyder Henry Cavill Lauren Cohan Alan D. Purwin English',
 'Action Adventure Sci-Fi Bryan 

In [16]:
df["feature"] = features
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["feature"] = features


Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name,language,feature
0,Avatar,Action|Adventure|Fantasy|Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,English,Action Adventure Fantasy Sci-Fi James Cameron ...
1,Pirates of the Caribbean: At World's End,Action|Adventure|Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,English,Action Adventure Fantasy Gore Verbinski Johnny...
2,Spectre,Action|Adventure|Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,English,Action Adventure Thriller Sam Mendes Christoph...
3,The Dark Knight Rises,Action|Thriller,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,English,Action Thriller Christopher Nolan Tom Hardy Ch...
4,John Carter,Action|Adventure|Sci-Fi,Andrew Stanton,Daryl Sabara,Samantha Morton,Polly Walker,English,Action Adventure Sci-Fi Andrew Stanton Daryl S...


In [17]:
df = df[["movie_title", "feature"]]
df.head()

Unnamed: 0,movie_title,feature
0,Avatar,Action Adventure Fantasy Sci-Fi James Cameron ...
1,Pirates of the Caribbean: At World's End,Action Adventure Fantasy Gore Verbinski Johnny...
2,Spectre,Action Adventure Thriller Sam Mendes Christoph...
3,The Dark Knight Rises,Action Thriller Christopher Nolan Tom Hardy Ch...
4,John Carter,Action Adventure Sci-Fi Andrew Stanton Daryl S...


# # Fetaure count matrix - CountVectorizer

In [18]:
cvec = CountVectorizer()
cv_df = cvec.fit_transform(df["feature"])

In [19]:
cv_df.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [20]:
cv_df.shape

(4760, 8450)

In [21]:
df.shape

(4760, 2)

# Cosine similarity

In [22]:
cs = cosine_similarity(cv_df)
cs

array([[1.        , 0.2981424 , 0.2236068 , ..., 0.1490712 , 0.06900656,
        0.06900656],
       [0.2981424 , 1.        , 0.25      , ..., 0.08333333, 0.07715167,
        0.07715167],
       [0.2236068 , 0.25      , 1.        , ..., 0.16666667, 0.07715167,
        0.07715167],
       ...,
       [0.1490712 , 0.08333333, 0.16666667, ..., 1.        , 0.15430335,
        0.07715167],
       [0.06900656, 0.07715167, 0.07715167, ..., 0.15430335, 1.        ,
        0.07142857],
       [0.06900656, 0.07715167, 0.07715167, ..., 0.07715167, 0.07142857,
        1.        ]])

In [23]:
cs.shape

(4760, 4760)

In [24]:
cs[0]

array([1.        , 0.2981424 , 0.2236068 , ..., 0.1490712 , 0.06900656,
       0.06900656])

# Recommendations

In [25]:
title = "John Carter"       # we can't set title like that, because title has some hidden character

df[df["movie_title"] == title]

Unnamed: 0,movie_title,feature


No any movie like "John Carter"

In [26]:
df["movie_title"][4]

'John Carter\xa0'

In [27]:
title in df["movie_title"][4]

True

In [28]:
df[[title in name for name in df["movie_title"]]]

Unnamed: 0,movie_title,feature
4,John Carter,Action Adventure Sci-Fi Andrew Stanton Daryl S...


In [29]:
movie_idx = df[[title in name for name in df["movie_title"]]].index[0]
movie_idx

4

In [30]:
score = list(enumerate(cs[movie_idx]))
score

[(0, 0.35805743701971643),
 (1, 0.24019223070763074),
 (2, 0.24019223070763074),
 (3, 0.16012815380508716),
 (4, 0.9999999999999998),
 (5, 0.2508726030021272),
 (6, 0.14322297480788657),
 (7, 0.37062465833055064),
 (8, 0.15384615384615385),
 (9, 0.38461538461538464),
 (10, 0.38461538461538464),
 (11, 0.2508726030021272),
 (12, 0.24019223070763074),
 (13, 0.24019223070763074),
 (14, 0.37062465833055064),
 (15, 0.3076923076923077),
 (16, 0.37062465833055064),
 (17, 0.24019223070763074),
 (18, 0.34668762264076824),
 (19, 0.16724840200141816),
 (20, 0.3202563076101743),
 (21, 0.21483446221182984),
 (22, 0.16724840200141816),
 (23, 0.15384615384615385),
 (24, 0.23076923076923078),
 (25, 0.08362420100070908),
 (26, 0.37062465833055064),
 (27, 0.37062465833055064),
 (28, 0.35805743701971643),
 (29, 0.24019223070763074),
 (30, 0.24019223070763074),
 (31, 0.37062465833055064),
 (32, 0.16012815380508716),
 (33, 0.35805743701971643),
 (34, 0.14824986333222026),
 (35, 0.38461538461538464),
 (36, 0

In [31]:
# most recommended movies

sorted_scores = sorted(score, key = lambda x : x[1], reverse = True)
sorted_scores[1:11]        

[(1363, 0.501280411827603),
 (1767, 0.501280411827603),
 (38, 0.46153846153846156),
 (483, 0.46153846153846156),
 (57, 0.44474958999666075),
 (162, 0.44474958999666075),
 (972, 0.44474958999666075),
 (43, 0.4003203845127179),
 (47, 0.4003203845127179),
 (108, 0.4003203845127179)]

In [32]:
# index of most similarity movies

top_10 = [i[0] for i in sorted_scores[1:11]]
top_10

[1363, 1767, 38, 483, 57, 162, 972, 43, 47, 108]

In [33]:
# most similarity movies for "John Carter"

df.iloc[top_10]["movie_title"]

1363                                     Spy Kids 
1767    Spy Kids: All the Time in the World in 4D 
38                       The Amazing Spider-Man 2 
483                                      Timeline 
57                                         WALL·E 
162                                       Stealth 
972                                      The Host 
43                           Terminator Salvation 
47                        Star Trek Into Darkness 
108                            Terminator Genisys 
Name: movie_title, dtype: object

# Function for get top 10 Recommend

In [34]:
def top_10(name, n):
    movie_idx = df[[title in name for name in df["movie_title"]]].index[0]
    score = list(enumerate(cs[movie_idx]))
    orted_scores = sorted(score, key = lambda x : x[1], reverse = True)
    sorted_scores[1:n+1]    
    top_n = [i[0] for i in sorted_scores[1:n+1]]
    
    return df.iloc[top_n]["movie_title"]

In [35]:
top_10("Spy Kids", 20)

1363                                     Spy Kids 
1767    Spy Kids: All the Time in the World in 4D 
38                       The Amazing Spider-Man 2 
483                                      Timeline 
57                                         WALL·E 
162                                       Stealth 
972                                      The Host 
43                           Terminator Salvation 
47                        Star Trek Into Darkness 
108                            Terminator Genisys 
158                                     Star Trek 
182                                       Ant-Man 
279                    Terminator 2: Judgment Day 
9              Batman v Superman: Dawn of Justice 
10                               Superman Returns 
35            Transformers: Revenge of the Fallen 
36                Transformers: Age of Extinction 
39                                   TRON: Legacy 
41                                  Green Lantern 
52                 Transformers