In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [22]:
!pip install ipynb
#%run Preprocessing.ipynb 
from ipynb.fs.full.preprocessing import *



In [14]:
# Preview the first rows of the overview text the recommender is built on.
movies.loc[:, 'overview'].head()

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [15]:
# Turn every overview into a TF-IDF vector, ignoring English stop words.
# The result is an m x n matrix:
#   m = number of rows in the dataset,
#   n = number of distinct terms found in the overview column.
# Recommending for a given row or title then means finding other rows of this
# matrix with similar content.
vectorizer = TfidfVectorizer(stop_words='english')
vectorized = vectorizer.fit_transform(raw_documents=movies['overview'])
print("shape", vectorized.shape)

shape (4803, 20978)


In [16]:
# Pairwise linear kernel (dot product) between all TF-IDF rows. With a single
# argument, linear_kernel compares the matrix against itself.
# The result is a square 2-D array: entry [i, j] is the similarity score between
# rows i and j, so row i holds movie i's score against every movie.
similarityMatrix = linear_kernel(vectorized)
similarityMatrix

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.02160533, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.01488159, 0.        ,
        0.        ],
       ...,
       [0.        , 0.02160533, 0.01488159, ..., 1.        , 0.01609091,
        0.00701914],
       [0.        , 0.        , 0.        , ..., 0.01609091, 1.        ,
        0.01171696],
       [0.        , 0.        , 0.        , ..., 0.00701914, 0.01171696,
        1.        ]])

The similarity scores for a movie can be fetched from the matrix by using the index of its row. So if a user searches for a particular movie, we need to get the index of that movie and then fetch its row of similarity scores. Instead of searching the dataset for the title on every lookup, we can create a reverse map that stores titles as keys and row indices as values.

In [17]:
# Reverse lookup from movie title to its index in `movies`, so a title can be
# turned into a row of the similarity matrix without scanning the frame.
indexMap = pd.Series(movies.index, index=movies['title'])
# Titles are not guaranteed to be unique, and a repeated title would make
# `indexMap[title]` return several indices instead of a single one. Keep only
# the first occurrence of each title so every lookup yields exactly one index.
# (Checking `indexMap.duplicated()` would test the index values, not the titles.)
indexMap = indexMap[~indexMap.index.duplicated(keep='first')]
assert indexMap.index.is_unique
indexMap.head()

title
Avatar                                      0
Pirates of the Caribbean: At World's End    1
Spectre                                     2
The Dark Knight Rises                       3
John Carter                                 4
dtype: int64

Once we have the similarity scores for a particular movie, we sort them to find the indices of the 10 most similar movies and then fetch their titles to recommend to the user. Since any movie has a similarity score of 1 when compared with itself, its own index appears at the top of the list, so we skip it and take the next 10 items from the results.

In [18]:
# Worked example for the movie in row 0.
# Rank all rows by their similarity to row 0 (argsort is ascending).
sim_sorted = similarityMatrix[0].argsort()
# Reverse to get most-similar-first and keep 11 entries: the movie itself is
# expected in first place, followed by its 10 closest matches.
sim_indices = sim_sorted[::-1][:11]
# Look up the titles for those rows.
sim_titles = movies['title'].iloc[sim_indices]
# Drop the leading entry and display the remaining ten titles.
sim_titles.iloc[1:]

3604                       Apollo 18
2130                    The American
634                       The Matrix
1341            The Inhabited Island
529                 Tears of the Sun
1610                           Hanna
311     The Adventures of Pluto Nash
847                         Semi-Pro
775                        Supernova
2628             Blood and Chocolate
Name: title, dtype: object

In [19]:
def overviewRecommender(name, top_n=10):
    """Recommend the movies whose overview is most similar to that of ``name``.

    Relies on the notebook-level objects ``indexMap`` (title -> row index),
    ``similarityMatrix`` (pairwise similarity scores) and ``movies``.

    Parameters
    ----------
    name : str
        Title of the movie to find recommendations for.
    top_n : int, default 10
        Number of recommendations to return.

    Returns
    -------
    pandas.Series
        Titles of the ``top_n`` most similar movies, most similar first.

    Raises
    ------
    KeyError
        If ``name`` is not a key of ``indexMap``.
    """
    # A title that occurs more than once makes the lookup return several
    # indices instead of a scalar; flatten and use the first match so that
    # `scores` is always a single 1-D row of the similarity matrix.
    index = int(np.asarray(indexMap[name]).ravel()[0])
    scores = similarityMatrix[index]

    # Sort once, most similar first.
    ranked = np.argsort(scores)[::-1]
    # Remove the queried movie explicitly instead of assuming it is ranked
    # first: a movie that ties with it could otherwise push it to a lower
    # position, and the query itself would then be recommended.
    ranked = ranked[ranked != index]
    top_indices = ranked[:top_n]

    print(f"Top {top_n} similarity scores", scores[top_indices])
    print(top_indices)
    recommendations = movies.iloc[top_indices]['title']
    return recommendations

In [20]:
# Example: recommendations based on the overview of 'Spider-Man'.
overviewRecommender(name='Spider-Man')

Top 10 similarity scores [0.28269378 0.24281586 0.223084   0.20914974 0.18502292 0.16479718
 0.14647248 0.13076654 0.11323179 0.11282744]
[   5   38   20   30 1534  953 1720 2740 3216 1572]


5                    Spider-Man 3
38       The Amazing Spider-Man 2
20         The Amazing Spider-Man
30                   Spider-Man 2
1534                Arachnophobia
953     Gremlins 2: The New Batch
1720                     Kick-Ass
2740                  The New Guy
3216                     Election
1572    Forgetting Sarah Marshall
Name: title, dtype: object