In [None]:
'''
Principle:
Using correlation matrix to find similar movies. Similar movies tend to have higher correlation factor.

Output:
This recommender system will recommend movies to an USER, based on all the movies he/she has rated.

Logic: Item Based Collaborative filtering
1. make an user/movie matrix, keeping ratings as the values.

2. now make a correlation matrix of the above user/movie matrix.

3. decide a threshold and keep only those movies which are rated by maximum numbers of users i.e, above the threshold.

4. extract all movies rated by an user, also extract the movies which have higher correlation to the movies he rated.

5. remove the movies from the list which he already rated, if present.

6. recommend the top movies with higher correlation factor.
'''

In [1]:
import numpy as np
import pandas as pd

In [2]:
r_cols = ['user_id','movie_id','rating']
ratings = pd.read_csv("C:\\Users\\DHRUBAJIT\\Desktop\\Datasets\\movielens\\ml-100k\\u.data",names=r_cols,usecols=range(3), sep='\t')
print(ratings.head())
print("")
m_cols = ['movie_id','title']
movies = pd.read_csv("C:/Users/DHRUBAJIT/Desktop/Datasets/movielens/ml-100k/u.item",names=m_cols, sep='|',usecols=range(2),encoding='latin-1')
print(movies.head())

   user_id  movie_id  rating
0      196       242       3
1      186       302       3
2       22       377       1
3      244        51       2
4      166       346       1

   movie_id              title
0         1   Toy Story (1995)
1         2   GoldenEye (1995)
2         3  Four Rooms (1995)
3         4  Get Shorty (1995)
4         5     Copycat (1995)


In [3]:
#Merging both dataframes
ratings = pd.merge(movies, ratings)
ratings.head()

Unnamed: 0,movie_id,title,user_id,rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3


In [4]:
#user/movie rating matrix
user_movie_matrix = ratings.pivot_table(index='user_id', columns='title',values='rating')
user_movie_matrix.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,2.0,5.0,,,3.0,4.0,,,...,,,,5.0,3.0,,,,4.0,
2,,,,,,,,,1.0,,...,,,,,,,,,,
3,,,,,2.0,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,2.0,,,,,4.0,,,...,,,,4.0,,,,,4.0,


In [5]:
#correlation matrix for each movie with all other movies where rating count is greater than 250.
user_movie_corr = user_movie_matrix.corr(method='pearson', min_periods=250)
user_movie_corr.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
1-900 (1994),,,,,,,,,,,...,,,,,,,,,,
101 Dalmatians (1996),,,,,,,,,,,...,,,,,,,,,,
12 Angry Men (1957),,,,,,,,,,,...,,,,,,,,,,
187 (1997),,,,,,,,,,,...,,,,,,,,,,


In [15]:
def recommend_movie(data_matrix,corr_matrix, userid):
    user_rating = data_matrix.loc[userid].dropna()

    similarity_score = pd.Series()
    for i in range(0, len(user_rating.index)):
    
        #extracting the correlation of all movies to the ones the user has rated.
        sims = corr_matrix[user_rating.index[i]].dropna()
        
        #scale the similarity score by how well the user rated the movies.
        sims = sims.apply(lambda x: x*user_rating[i])
    
        similarity_score = similarity_score.append(sims)

    #there are few movies that appeared more than once, so lets add their similarity scores.
    similarity_score = similarity_score.groupby(similarity_score.index).sum()
    similarity_score = similarity_score.sort_values(ascending='False')


    #filter out all the movies that the user has already rated.
    filter_movies = []
    for elements in similarity_score.index:
        if elements not in user_rating.index:
            filter_movies.append(elements)

    filter_movies = pd.DataFrame(filter_movies)
    filter_movies = filter_movies.rename(columns={0:'top 5 movies'})
    print(filter_movies.head(5))

In [16]:
recommend_movie(user_movie_matrix, user_movie_corr, 100)

                    top 5 movies
0                   Fargo (1996)
1  Independence Day (ID4) (1996)
2               Star Wars (1977)
3      Return of the Jedi (1983)
4               Toy Story (1995)
