# Content Based Recommendation Model

In [1]:
import pandas as pd
import numpy as np

In [2]:
#import final dataset having required attributes and after text preprocessing
df = pd.read_csv("../Dataset/reqAttr.csv")

In [3]:
df.head()

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name,combined
0,avatar,Action Adventure Fantasy Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi James Cameron ...
1,pirates of the caribbean: at world's end,Action Adventure Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy Gore Verbinski Johnny...
2,spectre,Action Adventure Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller Sam Mendes Christoph...
3,the dark knight rises,Action Thriller,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller Christopher Nolan Tom Hardy Ch...
4,star wars: episode vii - the force awakens ...,Documentary,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary Doug Walker Doug Walker Rob Walker...


In [4]:
from sklearn.feature_extraction.text import CountVectorizer
import pickle

#### Using count vectorizer and cosine similarity we will recommend movies

count vectorizer will create a sparse matrix with row equal to number of movies in our database. This matrix will be used by the cosing similarity to make a square matrix of dimension equal to number of movies in the list. Each row in the square matrix will represent a movie and columns of that particular row will give the similarity of that movie with other movies.

In [5]:
cntVec = CountVectorizer()

In [6]:
cntMat = cntVec.fit_transform(df['combined'])

In [7]:
cntMat.shape

(6670, 10649)

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

In [9]:
similarity = cosine_similarity(cntMat)

In [10]:
similarity.shape

(6670, 6670)

In [11]:
for i in similarity[0]:
    print(i,end=", ")

0.9999999999999999, 0.24174688920761409, 0.16116459280507606, 0.08058229640253803, 0.0, 0.3086066999241839, 0.253546276418555, 0.14285714285714288, 0.2964997266644405, 0.2314550249431379, 0.3086066999241839, 0.3086066999241839, 0.16903085094570333, 0.24174688920761409, 0.16116459280507606, 0.37062465833055064, 0.2314550249431379, 0.2964997266644405, 0.24174688920761409, 0.3450327796711771, 0.253546276418555, 0.24174688920761409, 0.14285714285714288, 0.253546276418555, 0.15430334996209194, 0.15430334996209194, 0.16903085094570333, 0.2964997266644405, 0.2964997266644405, 0.28571428571428575, 0.16116459280507606, 0.3223291856101521, 0.2964997266644405, 0.16116459280507606, 0.3571428571428572, 0.14824986333222026, 0.3086066999241839, 0.3086066999241839, 0.24174688920761409, 0.38575837490522985, 0.38575837490522985, 0.07412493166611013, 0.3086066999241839, 0.14824986333222026, 0.3223291856101521, 0.16116459280507606, 0.28571428571428575, 0.3571428571428572, 0.3223291856101521, 0.16903085094

, 0.0, 0.0, 0.0, 0.0, 0.08058229640253803, 0.0, 0.0, 0.08058229640253803, 0.08451542547285167, 0.0, 0.08451542547285167, 0.0, 0.0, 0.0, 0.0, 0.0, 0.15430334996209194, 0.0, 0.2314550249431379, 0.16116459280507606, 0.0, 0.08058229640253803, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.08058229640253803, 0.0, 0.28571428571428575, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07142857142857144, 0.0, 0.0, 0.0, 0.0, 0.2223747949983304, 0.0, 0.0, 0.14285714285714288, 0.2964997266644405, 0.15430334996209194, 0.0, 0.0, 0.0, 0.06900655593423542, 0.07715167498104597, 0.0, 0.16116459280507606, 0.0, 0.15430334996209194, 0.0, 0.2314550249431379, 0.08451542547285167, 0.07142857142857144, 0.0, 0.0, 0.0, 0.0, 0.2223747949983304, 0.2314550249431379, 0.08058229640253803, 0.0, 0.0, 0.14824986333222026, 0.0, 0.0, 0.08451542547285167, 0.0, 0.08451542547285167, 0.3086066999241839, 0.0, 0.0, 0.0, 0.0, 0.07142857142857144, 0.15430334996209194, 0.0, 0.08058229640253803, 0.14285714285714288, 0.0, 0.0, 0.0, 0.0, 0.0741249316

, 0.0, 0.07412493166611013, 0.0, 0.0, 0.0, 0.0, 0.0, 0.06900655593423542, 0.15430334996209194, 0.08058229640253803, 0.08058229640253803, 0.0, 0.0, 0.08451542547285167, 0.0, 0.08451542547285167, 0.08058229640253803, 0.07715167498104597, 0.0, 0.07715167498104597, 0.0, 0.0, 0.14824986333222026, 0.16116459280507606, 0.0, 0.0, 0.14285714285714288, 0.0, 0.08058229640253803, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.07715167498104597, 0.14285714285714288, 0.0, 0.08058229640253803, 0.0, 0.0, 0.0, 0.0, 0.08058229640253803, 0.08058229640253803, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.08058229640253803, 0.14285714285714288, 0.0, 0.0, 0.0, 0.0, 0.08058229640253803, 0.0, 0.0, 0.08058229640253803, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.08451542547285167, 0.07715167498104597, 0.0, 0.08058229640253803, 0.08058229640253803, 0.0, 0.0, 0.0, 0.08058229640253803, 0.14824986333222026, 0.2142857142857143, 0.0, 0.0, 0.08451542547285167, 0.0, 0.0, 0.08451542547285167, 0.0, 0.08058229640253803, 0.0, 0.0741

In [12]:
df.loc[df["movie_title"]=="the amazing spider-man 2"]

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name,combined
39,the amazing spider-man 2,Action Adventure Fantasy Sci-Fi,Marc Webb,Emma Stone,Andrew Garfield,B.J. Novak,Action Adventure Fantasy Sci-Fi Marc Webb Emma...


In [13]:
entry = df.loc[df["movie_title"]=="the amazing spider-man 2"]
entry

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name,combined
39,the amazing spider-man 2,Action Adventure Fantasy Sci-Fi,Marc Webb,Emma Stone,Andrew Garfield,B.J. Novak,Action Adventure Fantasy Sci-Fi Marc Webb Emma...


In [14]:
ind = entry.index[0]
ind

39

In [15]:
for val in similarity[ind]:
    print("{:.4f}".format(val),end=", ")

0.3858, 0.2611, 0.1741, 0.0870, 0.0000, 0.4167, 0.1826, 0.1543, 0.3203, 0.1667, 0.3333, 0.3333, 0.2739, 0.2611, 0.1741, 0.4003, 0.3333, 0.3203, 0.2611, 0.3727, 0.1826, 0.7833, 0.1543, 0.1826, 0.1667, 0.1667, 0.0000, 0.3203, 0.3203, 0.3086, 0.1741, 0.2611, 0.3203, 0.1741, 0.3858, 0.1601, 0.3333, 0.3333, 0.1741, 1.0000, 0.3333, 0.0801, 0.3333, 0.1601, 0.3482, 0.0870, 0.3858, 0.3858, 0.3482, 0.1826, 0.0000, 0.2500, 0.2981, 0.3333, 0.2611, 0.1443, 0.1601, 0.3203, 0.3203, 0.0833, 0.3333, 0.0833, 0.3333, 0.1667, 0.2611, 0.3333, 0.0833, 0.0833, 0.2981, 0.2801, 0.0801, 0.2402, 0.2402, 0.3086, 0.0870, 0.3333, 0.3203, 0.3086, 0.1543, 0.1667, 0.2801, 0.2500, 0.2315, 0.3086, 0.3203, 0.2402, 0.3333, 0.1543, 0.3086, 0.2801, 0.2236, 0.1667, 0.3333, 0.1667, 0.2739, 0.3333, 0.2500, 0.3086, 0.2981, 0.1826, 0.0870, 0.0870, 0.3333, 0.2315, 0.2402, 0.1667, 0.1741, 0.1601, 0.2611, 0.3482, 0.1741, 0.0833, 0.3086, 0.2165, 0.1667, 0.1667, 0.0913, 0.1543, 0.1667, 0.0833, 0.1826, 0.1491, 0.1667, 0.3858, 0.2611, 

0.0833, 0.2500, 0.0772, 0.2315, 0.0000, 0.0833, 0.0000, 0.1601, 0.0801, 0.1667, 0.0000, 0.0870, 0.0000, 0.2981, 0.0000, 0.0833, 0.2165, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1667, 0.0000, 0.0000, 0.0870, 0.0000, 0.0772, 0.0870, 0.0833, 0.0000, 0.1601, 0.0000, 0.0000, 0.0962, 0.0000, 0.0833, 0.0870, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1741, 0.0833, 0.0000, 0.3203, 0.0000, 0.2315, 0.0000, 0.0000, 0.0000, 0.0000, 0.0833, 0.0833, 0.0870, 0.0772, 0.0870, 0.0000, 0.0000, 0.0801, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0870, 0.0833, 0.1667, 0.0870, 0.0833, 0.0000, 0.0000, 0.0000, 0.1601, 0.0000, 0.0000, 0.0913, 0.0833, 0.1601, 0.0913, 0.0000, 0.0000, 0.0000, 0.1491, 0.0000, 0.1667, 0.0000, 0.0000, 0.0870, 0.0000, 0.0913, 0.0000, 0.0870, 0.0833, 0.0000, 0.1667, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2402, 0.0000, 0.0000, 0.1667, 0.0000, 0.2887, 0.0000, 0.0801, 0.1543, 0.1601, 0.2100, 0.2722, 0.0000, 0.0000, 0.0833, 0.0833, 0.0000, 0.0722, 0.0000, 0.0833, 0.0000, 

, 0.0000, 0.0000, 0.0000, 0.0000, 0.0833, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0870, 0.0870, 0.0000, 0.0000, 0.0000, 0.0801, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0801, 0.2315, 0.0000, 0.0870, 0.2611, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2981, 0.0000, 0.0000, 0.0870, 0.0000, 0.0870, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1667, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1667, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1601, 0.0870, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0870, 0.0000, 0.2500, 0.0000, 0.0000, 0.0000, 0.0962, 0.1667, 0.0000, 0.0801, 0.0801, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0801, 0.2500, 0.0913, 0.0000, 0.0801, 0.0913, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0833, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0870, 0.0000, 0.0000, 0.0870, 0.0000, 0.0000, 0.0000, 0.0000

0.0833, 0.0000, 0.2981, 0.0000, 0.0000, 0.0801, 0.0000, 0.0833, 0.0000, 0.0000, 0.0000, 0.0000, 0.0913, 0.1667, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.1543, 0.2611, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.3333, 0.0000, 0.0801, 0.0833, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0913, 0.0000, 0.1925, 0.0000, 0.1826, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0913, 0.2981, 0.0000, 0.0000, 0.0000, 0.1543, 0.0000, 0.2100, 0.0962, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0833, 0.0000, 0.0000, 0.0000, 0.0000, 0.0913, 0.0000, 0.0000, 0.0801, 0.0962, 0.1491, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.2315, 0.0772, 0.1826, 0.1601, 0.0000, 0.1667, 0.0000, 0.0000, 0.0000, 0.1543, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.3333, 0.2041, 0.0000, 0.0000, 0.0772, 0.0000, 0.1667, 0.0000, 0.0000, 0.0000, 0.0000, 0.0000, 0.0801, 0.0000, 0.0000, 0.1667, 0.0000, 0.1543, 0.0000, 0.0870, 0.2500, 0.0000, 0.0000, 0.0000, 0.0000, 0.0801, 0.1741, 0.0000, 

#### Enumerate will help to find id of the movies

enumerate function will generate unique key for each column in a row. <a href="https://www.geeksforgeeks.org/enumerate-in-python/">check here for more</a><br>
We will sort each the row equal to the movie id of the movie given by user. we will take 10 10 reult i.e having highest similarity score and return to user 

In [16]:
listOfSimilarity = list(enumerate(similarity[ind]))

In [17]:
for key,val in listOfSimilarity:
    val = "{:.4f}".format(val)
    print(f"{key}: {val}")

0: 0.3858
1: 0.2611
2: 0.1741
3: 0.0870
4: 0.0000
5: 0.4167
6: 0.1826
7: 0.1543
8: 0.3203
9: 0.1667
10: 0.3333
11: 0.3333
12: 0.2739
13: 0.2611
14: 0.1741
15: 0.4003
16: 0.3333
17: 0.3203
18: 0.2611
19: 0.3727
20: 0.1826
21: 0.7833
22: 0.1543
23: 0.1826
24: 0.1667
25: 0.1667
26: 0.0000
27: 0.3203
28: 0.3203
29: 0.3086
30: 0.1741
31: 0.2611
32: 0.3203
33: 0.1741
34: 0.3858
35: 0.1601
36: 0.3333
37: 0.3333
38: 0.1741
39: 1.0000
40: 0.3333
41: 0.0801
42: 0.3333
43: 0.1601
44: 0.3482
45: 0.0870
46: 0.3858
47: 0.3858
48: 0.3482
49: 0.1826
50: 0.0000
51: 0.2500
52: 0.2981
53: 0.3333
54: 0.2611
55: 0.1443
56: 0.1601
57: 0.3203
58: 0.3203
59: 0.0833
60: 0.3333
61: 0.0833
62: 0.3333
63: 0.1667
64: 0.2611
65: 0.3333
66: 0.0833
67: 0.0833
68: 0.2981
69: 0.2801
70: 0.0801
71: 0.2402
72: 0.2402
73: 0.3086
74: 0.0870
75: 0.3333
76: 0.3203
77: 0.3086
78: 0.1543
79: 0.1667
80: 0.2801
81: 0.2500
82: 0.2315
83: 0.3086
84: 0.3203
85: 0.2402
86: 0.3333
87: 0.1543
88: 0.3086
89: 0.2801
90: 0.2236
91: 0.166

2042: 0.0000
2043: 0.0000
2044: 0.2236
2045: 0.0000
2046: 0.0801
2047: 0.1667
2048: 0.4003
2049: 0.1667
2050: 0.0000
2051: 0.4003
2052: 0.0801
2053: 0.0000
2054: 0.0000
2055: 0.0000
2056: 0.1741
2057: 0.0000
2058: 0.1667
2059: 0.0000
2060: 0.0000
2061: 0.1741
2062: 0.0000
2063: 0.0000
2064: 0.0000
2065: 0.0833
2066: 0.0833
2067: 0.0000
2068: 0.0000
2069: 0.0000
2070: 0.0801
2071: 0.0000
2072: 0.0000
2073: 0.0000
2074: 0.0000
2075: 0.0870
2076: 0.0000
2077: 0.0870
2078: 0.0913
2079: 0.2402
2080: 0.0000
2081: 0.0833
2082: 0.0000
2083: 0.0833
2084: 0.0000
2085: 0.0000
2086: 0.0000
2087: 0.1741
2088: 0.0000
2089: 0.0000
2090: 0.0801
2091: 0.0000
2092: 0.0000
2093: 0.1741
2094: 0.0000
2095: 0.0000
2096: 0.0000
2097: 0.0000
2098: 0.0000
2099: 0.1491
2100: 0.0000
2101: 0.0000
2102: 0.0000
2103: 0.0000
2104: 0.0000
2105: 0.3086
2106: 0.0000
2107: 0.0000
2108: 0.0000
2109: 0.0833
2110: 0.1667
2111: 0.0000
2112: 0.2500
2113: 0.3727
2114: 0.0000
2115: 0.0000
2116: 0.0870
2117: 0.0000
2118: 0.0000

3541: 0.3858
3542: 0.0000
3543: 0.0000
3544: 0.0000
3545: 0.0000
3546: 0.0000
3547: 0.0000
3548: 0.0000
3549: 0.0000
3550: 0.0000
3551: 0.0000
3552: 0.0772
3553: 0.0833
3554: 0.0000
3555: 0.0000
3556: 0.0000
3557: 0.0000
3558: 0.0000
3559: 0.0000
3560: 0.0833
3561: 0.0000
3562: 0.0000
3563: 0.1741
3564: 0.1741
3565: 0.0000
3566: 0.0000
3567: 0.0000
3568: 0.0000
3569: 0.0000
3570: 0.0000
3571: 0.0000
3572: 0.0000
3573: 0.1667
3574: 0.0000
3575: 0.2611
3576: 0.0000
3577: 0.0000
3578: 0.1667
3579: 0.0000
3580: 0.0000
3581: 0.0000
3582: 0.0913
3583: 0.0000
3584: 0.0000
3585: 0.0833
3586: 0.0000
3587: 0.1667
3588: 0.0000
3589: 0.0000
3590: 0.0000
3591: 0.0000
3592: 0.0000
3593: 0.0801
3594: 0.0000
3595: 0.0000
3596: 0.0000
3597: 0.0000
3598: 0.2611
3599: 0.0000
3600: 0.0000
3601: 0.0602
3602: 0.0000
3603: 0.0000
3604: 0.0000
3605: 0.0833
3606: 0.0000
3607: 0.0000
3608: 0.0000
3609: 0.0000
3610: 0.0870
3611: 0.0000
3612: 0.0000
3613: 0.0000
3614: 0.0000
3615: 0.0000
3616: 0.0000
3617: 0.0000

5541: 0.1826
5542: 0.0000
5543: 0.0700
5544: 0.1601
5545: 0.0000
5546: 0.1667
5547: 0.0000
5548: 0.2611
5549: 0.0833
5550: 0.0000
5551: 0.0000
5552: 0.0000
5553: 0.0000
5554: 0.0000
5555: 0.0000
5556: 0.0801
5557: 0.0913
5558: 0.0000
5559: 0.2500
5560: 0.2500
5561: 0.0000
5562: 0.0000
5563: 0.1826
5564: 0.0000
5565: 0.0000
5566: 0.0000
5567: 0.0000
5568: 0.1667
5569: 0.1741
5570: 0.0000
5571: 0.0000
5572: 0.3086
5573: 0.0000
5574: 0.0000
5575: 0.2402
5576: 0.0000
5577: 0.0000
5578: 0.0870
5579: 0.2500
5580: 0.0870
5581: 0.0000
5582: 0.0000
5583: 0.0000
5584: 0.0000
5585: 0.1667
5586: 0.0000
5587: 0.0000
5588: 0.0000
5589: 0.0000
5590: 0.0000
5591: 0.0000
5592: 0.0000
5593: 0.0000
5594: 0.0000
5595: 0.0000
5596: 0.0000
5597: 0.0870
5598: 0.2315
5599: 0.0870
5600: 0.0000
5601: 0.0000
5602: 0.0000
5603: 0.0000
5604: 0.0000
5605: 0.0000
5606: 0.0000
5607: 0.0000
5608: 0.0000
5609: 0.0000
5610: 0.0913
5611: 0.0000
5612: 0.0000
5613: 0.0000
5614: 0.0000
5615: 0.0000
5616: 0.0000
5617: 0.0000

In [18]:
len(similarity)

6670

In [19]:
listOfSimilarity = sorted(listOfSimilarity,key = lambda x: x[1], reverse=True)

In [20]:
for i in range(1,11):
    num = "{:.4f}".format(listOfSimilarity[i][1])
    print(f"{listOfSimilarity[i][0]}: {num}")

21: 0.7833
3634: 0.4804
5: 0.4167
15: 0.4003
236: 0.4003
237: 0.4003
240: 0.4003
520: 0.4003
1002: 0.4003
1536: 0.4003


#### Movies simillar to "the amazing spider-man 2"

In [21]:
for i in range(1,11):
    print(f"{df.loc[listOfSimilarity[i][0]].movie_title}")

the amazing spider-man
beastmaster 2: through the portal of time
john carter
man of steel
star wars: episode iii - revenge of the sith
star wars: episode ii - attack of the clones
star wars: episode i - the phantom menace
the league of extraordinary gentlemen
the host
star wars: episode vi - return of the jedi


# Another approach for Content Based recommender system

<p>Major drawback of our previous model was that it will fail if any new movie is present to it. Existing algorithm for previous model was to make a correlation matrix among every movie <b>known to it</b>. Based on correlation score it was sorting matrix and giving you result.</p>
<p>
    Now to extend its potential we will form a string with al the required attribute i.e. <b>genres	director_name, actor_1_name, actor_2_name and actor_3_name</b>. Now using conventional method to find correlation we will find top 10 movies among them.
</p>

In [22]:
df.head()

Unnamed: 0,movie_title,genres,director_name,actor_1_name,actor_2_name,actor_3_name,combined
0,avatar,Action Adventure Fantasy Sci-Fi,James Cameron,CCH Pounder,Joel David Moore,Wes Studi,Action Adventure Fantasy Sci-Fi James Cameron ...
1,pirates of the caribbean: at world's end,Action Adventure Fantasy,Gore Verbinski,Johnny Depp,Orlando Bloom,Jack Davenport,Action Adventure Fantasy Gore Verbinski Johnny...
2,spectre,Action Adventure Thriller,Sam Mendes,Christoph Waltz,Rory Kinnear,Stephanie Sigman,Action Adventure Thriller Sam Mendes Christoph...
3,the dark knight rises,Action Thriller,Christopher Nolan,Tom Hardy,Christian Bale,Joseph Gordon-Levitt,Action Thriller Christopher Nolan Tom Hardy Ch...
4,star wars: episode vii - the force awakens ...,Documentary,Doug Walker,Doug Walker,Rob Walker,unknown,Documentary Doug Walker Doug Walker Rob Walker...


In [24]:
df.iloc[0]["combined"]

'Action Adventure Fantasy Sci-Fi James Cameron CCH Pounder Joel David Moore Wes Studi'

### A test string to find movies relevant to Suicide Squad released in year 2016
###### It has every required attributes like genres protagonists name and director name  

In [25]:
testStr = "Action Adventure Fantasy David Ayer Will Smith Jaime Fitz Simons Ike BArinholtz"

In [26]:
import math
import re
from collections import Counter

WORD = re.compile(r"\w+")


def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])
    sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator


In [27]:
def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)

In [67]:
corr = []

In [78]:
vector1 = text_to_vector(testStr)
for ind, row in df.iterrows():
    vector2 = text_to_vector(row["combined"].lower())
    cosine = get_cosine(vector1, vector2)
    corr.append((row["movie_title"],cosine))

In [79]:
corr

[('avatar', 0.3086066999241839),
 ("pirates of the caribbean: at world's end", 0.26111648393354675),
 ('spectre', 0.17407765595569785),
 ('the dark knight rises', 0.08703882797784893),
 ('star wars: episode vii - the force awakens\xa0           ', 0.0),
 ('john carter', 0.16666666666666669),
 ('spider-man 3', 0.16666666666666669),
 ('tangled', 0.14433756729740646),
 ('avengers: age of ultron', 0.16012815380508716),
 ('harry potter and the half-blood prince', 0.25000000000000006),
 ('batman v superman: dawn of justice', 0.16012815380508716),
 ('superman returns', 0.16666666666666669),
 ('quantum of solace', 0.18257418583505536),
 ("pirates of the caribbean: dead man's chest", 0.26111648393354675),
 ('the lone ranger', 0.17407765595569785),
 ('man of steel', 0.24019223070763074),
 ('the chronicles of narnia: prince caspian', 0.25000000000000006),
 ('the avengers', 0.16012815380508716),
 ('pirates of the caribbean: on stranger tides', 0.26111648393354675),
 ('men in black 3', 0.3726779962

In [80]:
dfCorr = pd.DataFrame(corr, columns = ["movie_title","correlation"])

In [81]:
dfCorr.head()

Unnamed: 0,movie_title,correlation
0,avatar,0.308607
1,pirates of the caribbean: at world's end,0.261116
2,spectre,0.174078
3,the dark knight rises,0.087039
4,star wars: episode vii - the force awakens ...,0.0


In [82]:
res= dfCorr.sort_values(by=["correlation"], ascending=False)

##### After arranging in descending orders we have our top 10 recommended movies

In [88]:
res.head(10)

Unnamed: 0,movie_title,correlation
73,suicide squad,0.540062
19,men in black 3,0.372678
152,men in black ii,0.360844
6273,pokémon detective pikachu,0.348155
522,independence day,0.333333
6284,aladdin,0.333333
197,after earth,0.320256
482,mighty joe young,0.320256
6252,hellboy,0.320256
0,avatar,0.308607


In [86]:
df.iloc[73]

movie_title                                          suicide squad
genres                              Action Adventure Comedy Sci-Fi
director_name                                           David Ayer
actor_1_name                                            Will Smith
actor_2_name                                    Robin Atkin Downes
actor_3_name                                        Ike Barinholtz
combined         Action Adventure Comedy Sci-Fi David Ayer Will...
Name: 73, dtype: object

In [87]:
df.iloc[19]

movie_title                                         men in black 3
genres               Action Adventure Comedy Family Fantasy Sci-Fi
director_name                                     Barry Sonnenfeld
actor_1_name                                            Will Smith
actor_2_name                                     Michael Stuhlbarg
actor_3_name                                    Nicole Scherzinger
combined         Action Adventure Comedy Family Fantasy Sci-Fi ...
Name: 19, dtype: object