In [231]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

In [232]:
%%capture capt
%run userAnalysis.ipynb

In [233]:
movies['overview'].head()

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [234]:
tfidf = TfidfVectorizer(stop_words='english')
overviewMatrix = tfidf.fit_transform(movies['overview'])
print("shape" , overviewMatrix.shape)
# it is mxn matrix with m equal to the total rows of  dataset. 
# n is total no of different words used in overview column
# now if given a row or title we need to find other rows with similar data in the matrix

shape (4803, 20978)


In [235]:
overviewSimilarity = linear_kernel(overviewMatrix, overviewMatrix)
# it finds the similarity with all other rows in matrix and returns a list 
overviewSimilarity

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.02160533, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.01488159, 0.        ,
        0.        ],
       ...,
       [0.        , 0.02160533, 0.01488159, ..., 1.        , 0.01609091,
        0.00701914],
       [0.        , 0.        , 0.        , ..., 0.01609091, 1.        ,
        0.01171696],
       [0.        , 0.        , 0.        , ..., 0.00701914, 0.01171696,
        1.        ]])

Similar score matrix data can be fetched by using index of the required row. So if user searches for a particular movie, we need to get the index of that movie and then get the similartiy scores data. For this conversion instead of converting each time during search we can create a reverse map to store tiles as key & index as values. 

In [236]:
indexMap = pd.Series(movies.index, index=movies['tmdbId'])
indexMap.head()
#indexMap.duplicated().sum()

tmdbId
19995     0
285       1
206647    2
49026     3
49529     4
dtype: int64

Once we get similarity scores for a particular movie, we need to sort them to find top 10 similar movies indices & then fetch the titles to recommend to the user. As any movie will have 1 as similarity score when compared with itself, its own index appears at top of list. So we ignore it and take next 10 items from the results. 

In [237]:
#sorting
sim_sorted = np.argsort(overviewSimilarity[0])
#top 10 
sim_indices = sim_sorted[::-1][:11]
#titles
sim_titles = movies.iloc[sim_indices]['title']
sim_titles[1:]

3604                       Apollo 18
2130                    The American
634                       The Matrix
1341            The Inhabited Island
529                 Tears of the Sun
1610                           Hanna
311     The Adventures of Pluto Nash
847                         Semi-Pro
775                        Supernova
2628             Blood and Chocolate
Name: title, dtype: object

In [247]:
def contenSimilarMovies(tmdbId,similarity):
    index = indexMap[tmdbId]
    scores = similarity[index]
    score_sorted = np.sort(scores)
    score_index = np.argsort(scores)
    top = score_index[::-1][1:11]
    recommendations = np.array(movies.iloc[top]['tmdbId'])
    return recommendations,score_sorted[::-1][1:11]

userId :  388
total movies rated :  503
avg ratings : 3.6888667992047712


84               Pirates of the Caribbean: At World's End
317            Pirates of the Caribbean: Dead Man's Chest
615                                               Titanic
782                                          Spider-Man 2
1258    The Chronicles of Narnia: The Lion, the Witch ...
Name: title, dtype: object

In [241]:
def contentBasedRecom(randomUser,similarity):
    likedMovies = ratedMovies[ratedMovies['rating'] >= 4 ]
    ids = likedMovies['tmdbId']
    moviesDict = dict()
    for i in ids:
        movies,score = overviewRecommender(i,similarity)
        for j in range(len(movies)):
            if(movies[j] in moviesDict):
                if(moviesDict[movies[j]] < score[j]):
                    moviesDict.update({movies[j]:score[j]})
            else:
                moviesDict[movies[j]] = score[j]
    duplicates = []
    for i in moviesDict:
        if(i in ratedMovies['tmdbId']):
            duplicates.append(i)
    for i in duplicates:
        moviesDict.pop(i)
    return moviesDict

In [242]:
def getContentRecomMovies(user,similarity):
    moviesDict = contentBasedRecom(user,similarity)
    recommendedMovieIds = moviesDict.keys()
    contentRecomIds = np.array(list(recommendedMovieIds))
    contentRecomDf = pd.DataFrame({'tmdbId':moviesDict.keys(),'score':moviesDict.values()})
    contentRecomDf = contentRecomDf.sort_values('score',ascending=False)
    contentRecomTitles = []
    for i in contentRecomIds:
        contentRecomTitles.append(movies['title'].iloc[indexMap[i]])
    contentRecomDf['titles'] = contentRecomTitles
    contentRecomDf.head()
    return contentRecomDf

In [251]:
getContentRecomMovies(randomUser, overviewSimilarity)

Unnamed: 0,tmdbId,score,titles
442,296099,0.470145,Pirates of the Caribbean: The Curse of the Bla...
118,1891,0.422981,Pirates of the Caribbean: On Stranger Tides
123,11,0.422981,Firewall
552,238,0.412329,Highway
551,240,0.412329,Ladder 49
...,...,...,...
1053,37718,0.062835,College
1055,16653,0.062302,Be Kind Rewind
72,49018,0.061650,Sex Tape
328,10521,0.059626,Vaalu


In [244]:
contentMatrix = tfidf.fit_transform(movies['content'])
contentSimilarity = cosine_similarity(contentMatrix,contentMatrix)

In [259]:
rec1 = getContentRecomMovies(randomUser, contentSimilarity)

In [273]:
rec1['score']

81     0.994632
80     0.912649
79     0.912649
158    0.818371
57     0.804658
Name: score, dtype: float64