In [21]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [22]:
%%capture capt
# Execute the companion notebook in this kernel; its output is captured into `capt`
# instead of being displayed. Presumably this is what provides `pd`, `np`, `random`,
# `movies` and `movieRatings` used by the cells below -- TODO confirm.
%run userAnalysis.ipynb

In [23]:
# Preview the plot summaries; this column is the text corpus that gets vectorised.
movies['overview'].head()

0    In the 22nd century, a paraplegic Marine is di...
1    Captain Barbossa, long believed to be dead, ha...
2    A cryptic message from Bond’s past sends him o...
3    Following the death of District Attorney Harve...
4    John Carter is a war-weary, former military ca...
Name: overview, dtype: object

In [24]:
# Build TF-IDF features from the overview text, ignoring common English stop words.
vectorizer = TfidfVectorizer(stop_words='english')
vectorized = vectorizer.fit_transform(movies['overview'])
print("shape" , vectorized.shape)
# The result is an m x n matrix: m is the number of rows in the dataset and
# n is the number of distinct terms kept from the overview column.
# Given a row (or a title), the next step is to find the other rows whose
# vectors are most similar to it.

shape (4803, 20978)


In [25]:
# Dot product of every overview vector with every other overview vector.
# TfidfVectorizer L2-normalises each row by default, so these dot products are
# cosine similarities.
similarityMatrix = linear_kernel(vectorized, vectorized)
# The result is a square 2-D array (not a list): row i holds the similarity of
# movie i to every movie, and the diagonal is each movie compared with itself.
similarityMatrix

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.02160533, 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.01488159, 0.        ,
        0.        ],
       ...,
       [0.        , 0.02160533, 0.01488159, ..., 1.        , 0.01609091,
        0.00701914],
       [0.        , 0.        , 0.        , ..., 0.01609091, 1.        ,
        0.01171696],
       [0.        , 0.        , 0.        , ..., 0.00701914, 0.01171696,
        1.        ]])

The similarity scores for a movie can be fetched from the matrix using that movie's row index. So when a user searches for a particular movie, we need to get its row index and then read the corresponding row of similarity scores. Instead of searching for the index on every lookup, we can build a reverse map that stores each movie's tmdbId as the key and its row index as the value.

In [26]:
# Reverse lookup: tmdbId (the Series index) -> row label in `movies` (the Series values).
indexMap = pd.Series(movies.index, index=movies['tmdbId'])
indexMap.head()
# NOTE(review): scalar lookups such as indexMap[tmdbId] assume every tmdbId is unique.
# The check for that is indexMap.index.duplicated().sum() == 0; calling
# indexMap.duplicated() would test the row labels (the values) instead of the keys.

tmdbId
19995     0
285       1
206647    2
49026     3
49529     4
dtype: int64

Once we have the similarity scores for a particular movie, we sort them to find the indices of the 10 most similar movies and then fetch their titles to recommend to the user. Because every movie has a similarity score of 1 with itself, its own index appears at the top of the list, so we skip it and take the next 10 items from the results.

In [27]:
#sorting
sim_sorted = np.argsort(similarityMatrix[0])
#top 10 
sim_indices = sim_sorted[::-1][:11]
#titles
sim_titles = movies.iloc[sim_indices]['title']
sim_titles[1:]

3604                       Apollo 18
2130                    The American
634                       The Matrix
1341            The Inhabited Island
529                 Tears of the Sun
1610                           Hanna
311     The Adventures of Pluto Nash
847                         Semi-Pro
775                        Supernova
2628             Blood and Chocolate
Name: title, dtype: object

In [103]:
def overviewRecommender(tmdbId, n=10):
    """Return the movies whose overviews are most similar to the given movie.

    Parameters
    ----------
    tmdbId : int
        TMDB id of the query movie; it must be a key of ``indexMap``.
    n : int, default 10
        Maximum number of recommendations to return.

    Returns
    -------
    recommendations : numpy.ndarray
        tmdbIds of the most similar movies, best match first.
    scores : numpy.ndarray
        Similarity scores aligned element-wise with ``recommendations``.

    Raises
    ------
    KeyError
        If ``tmdbId`` is not present in ``indexMap``.
    """
    index = indexMap[tmdbId]
    scores = similarityMatrix[index]
    # Single sort: positions ordered from most to least similar.
    ranked = np.argsort(scores)[::-1]
    # Drop the query movie explicitly instead of assuming it is ranked first.
    # A movie whose overview has no terms has self-similarity 0, and tied scores
    # can also push it off the top, so slicing [1:] could recommend the movie
    # itself while discarding a genuine match.
    top_n = ranked[ranked != index][:n]
    recommendations = np.array(movies.iloc[top_n]['tmdbId'])
    return recommendations, scores[top_n]

In [29]:
overviewRecommender(19995)

Top 10 similarity scores [0.18681001 0.17015105 0.13230697 0.12393265 0.12248959 0.10469903
 0.10091388 0.09799623 0.09447938 0.09313854]
[3604 2130  634 1341  529 1610  311  847  775 2628]


3604                       Apollo 18
2130                    The American
634                       The Matrix
1341            The Inhabited Island
529                 Tears of the Sun
1610                           Hanna
311     The Adventures of Pluto Nash
847                         Semi-Pro
775                        Supernova
2628             Blood and Chocolate
Name: title, dtype: object

In [104]:
# Pick one user uniformly at random from the distinct user ids.
# random.choice() indexes its argument with seq[i]; on a pandas Series that is a
# label lookup, which only works when the index happens to be 0..n-1, and drawing
# from the raw column would also favour users with many ratings. Drawing from
# the array of unique ids avoids both problems.
randomUser = random.choice(movieRatings['userId'].unique())
# All ratings given by that user, and the user's mean rating.
ratedMovies = movieRatings[movieRatings['userId'] == randomUser]
avgRating = ratedMovies['rating'].mean()
print("userId : ", randomUser)
print("total movies rated : ",len(ratedMovies))
print("avg ratings :" , avgRating)
ratedMovies['title']

userId :  217
total movies rated :  79
avg ratings : 4.265822784810126


3219                                       Armageddon
3525                                       Spider-Man
4516                                               X2
5211     Star Wars: Episode II - Attack of the Clones
5453        Star Wars: Episode I - The Phantom Menace
                             ...                     
61117                                       Airplane!
61639                                    Animal House
62737                              Young Frankenstein
63275                           From Russia with Love
65762                        The Kentucky Fried Movie
Name: title, Length: 79, dtype: object

In [105]:
def contentBasedRecom(randomUser):
    """Collect overview-based recommendations for every movie a user liked.

    A movie counts as "liked" when the user's rating for it is at least that
    user's own mean rating. Each liked movie is passed to
    ``overviewRecommender`` and the results are merged; when the same movie is
    recommended more than once, its highest similarity score is kept.

    Parameters
    ----------
    randomUser : int
        ``userId`` of the user to build recommendations for.

    Returns
    -------
    dict
        Maps each recommended tmdbId to its best similarity score.
    """
    # Derive the user's ratings from the argument. Previously the parameter was
    # ignored and the notebook-level `ratedMovies` / `avgRating` were read, so
    # calling this for any other user silently returned the same result.
    userRatings = movieRatings[movieRatings['userId'] == randomUser]
    userAvgRating = userRatings['rating'].mean()
    likedMovies = userRatings[userRatings['rating'] >= userAvgRating]

    moviesDict = dict()
    for likedId in likedMovies['tmdbId']:
        # Local names chosen so the global `movies` DataFrame is not shadowed.
        recIds, recScores = overviewRecommender(likedId)
        for recId, recScore in zip(recIds, recScores):
            # Keep the highest score seen for each recommended movie.
            if recId not in moviesDict or moviesDict[recId] < recScore:
                moviesDict[recId] = recScore
    return moviesDict
    
        

In [119]:
# Build the merged recommendations for the sampled user.
moviesDict = contentBasedRecom(randomUser)
# The dictionary keys are the recommended tmdbIds.
recommendedMovieIds = moviesDict.keys()
np.array([*recommendedMovieIds])

array([ 18712,  49538,  36668,  36657,   2080,   7504, 356987,  26428,
         9433,   9824,   1895, 333355,   1893,  10179,  27329,   1891,
         5820,   9869,   1892,  39833,   1894, 193610,  13515,  17768,
       146203,  18885,    218,  87101,  53457,    296,    534,  92182,
        12090,  11648,  20024, 157909,    810,  10192,    808,  12220,
       417859,   9716,  11091,  11247,  82505,  14442,   1597,  39451,
        10571,  20766,  60307,  82693,  18808, 254904,   9754,    693,
        82650, 361505,  53862,  16205,   1389,    809, 294272,  17047,
        16300,   8584,   2310,    848,     85,    177,  67911,    818,
       354110,     58,     87, 268171, 125052,    542,  68812,  49081,
         8060, 266857,  11615,  16633, 332411,   1907,  94348,   4148,
          157,    154,    168,    172, 192577,  54138,    174, 188927,
        34016,  11935,  18886,   2275,  13909, 279759,  10592, 373314,
         9422,   1885,   8078, 205587,     11,  18442,   9550, 407887,
      

In [87]:
# NOTE(review): looks like a leftover scratch cell -- the execution count is out of
# sequence and the result is not assigned to anything. Candidate for deletion.
range(10)

range(0, 10)