In [1]:
import numpy as np
import pandas as pd

In [2]:
# Folder holding the MovieLens 100k files. Change this single line to point at
# your own copy of the dataset; both reads below are built from it.
DATA_DIR = "C:/Users/DHRUBAJIT/Desktop/Datasets/movielens/ml-100k"

# u.data is read as tab-separated with no header row; only its first three
# columns are loaded and named user_id / movie_id / rating.
r_cols = ['user_id','movie_id','rating']
ratings = pd.read_csv(DATA_DIR + "/u.data", names=r_cols, usecols=range(3), sep='\t')
print(ratings.head())
print("")
# u.item is read as pipe-separated, latin-1 encoded; only its first two
# columns are loaded and named movie_id / title.
m_cols = ['movie_id','title']
movies = pd.read_csv(DATA_DIR + "/u.item", names=m_cols, sep='|', usecols=range(2), encoding='latin-1')
print(movies.head())

   user_id  movie_id  rating
0      196       242       3
1      186       302       3
2       22       377       1
3      244        51       2
4      166       346       1

   movie_id              title
0         1   Toy Story (1995)
1         2   GoldenEye (1995)
2         3  Four Rooms (1995)
3         4  Get Shorty (1995)
4         5     Copycat (1995)


In [3]:
# Attach each movie's title to its ratings. No join key is passed, so pandas
# joins on the column(s) the two frames have in common (default inner join).
ratings = movies.merge(ratings)
ratings.head()

Unnamed: 0,movie_id,title,user_id,rating
0,1,Toy Story (1995),308,4
1,1,Toy Story (1995),287,5
2,1,Toy Story (1995),148,4
3,1,Toy Story (1995),280,4
4,1,Toy Story (1995),66,3


In [4]:
# Order the merged ratings by user_id, keep only the three columns used from
# here on, and renumber the rows from 0 (the old index is discarded).
df = (
    ratings
    .sort_values('user_id')
    .loc[:, ['user_id', 'title', 'rating']]
    .reset_index(drop=True)
)
df.head()

Unnamed: 0,user_id,title,rating
0,1,Strange Days (1995),4
1,1,Jurassic Park (1993),5
2,1,Richard III (1995),3
3,1,Monty Python and the Holy Grail (1974),5
4,1,Ed Wood (1994),4


In [5]:
# Sum of all rating values received by each title (one row per title).
total_ratings = (
    df.groupby('title')['rating']
    .sum()
    .rename('total_rating')
    .reset_index()
)
total_ratings.head()

Unnamed: 0,title,total_rating
0,'Til There Was You (1997),21
1,1-900 (1994),13
2,101 Dalmatians (1996),317
3,12 Angry Men (1957),543
4,187 (1997),124


In [6]:
# Left-join the per-title totals back onto every individual rating row.
new_df = df.merge(total_ratings, how='left', on='title')
new_df.head()

Unnamed: 0,user_id,title,rating,total_rating
0,1,Strange Days (1995),4,284
1,1,Jurassic Park (1993),5,971
2,1,Richard III (1995),3,341
3,1,Monty Python and the Holy Grail (1974),5,1285
4,1,Ed Wood (1994),4,479


In [7]:
# Summary statistics of total_rating across all rating rows.
new_df['total_rating'].describe()

count    100000.000000
mean        626.952900
std         489.334281
min           1.000000
25%         248.000000
50%         513.000000
75%         902.000000
max        2541.000000
Name: total_rating, dtype: float64

In [8]:
# Popularity cut-off: keep only rating rows whose movie has a summed rating
# strictly above this value. 248 is the 25% figure printed by
# new_df.total_rating.describe(); note that describe() ran over one row per
# rating, so it is a quartile of rating rows, not of distinct movies.
MIN_TOTAL_RATING = 248

new_df = new_df[new_df['total_rating'] > MIN_TOTAL_RATING]
# Show a sample rather than dumping the whole filtered frame.
new_df.head(10)

Unnamed: 0,user_id,title,rating,total_rating
0,1,Strange Days (1995),4,284
1,1,Jurassic Park (1993),5,971
2,1,Richard III (1995),3,341
3,1,Monty Python and the Holy Grail (1974),5,1285
4,1,Ed Wood (1994),4,479
6,1,Blade Runner (1982),5,1138
7,1,Eat Drink Man Woman (1994),5,324
8,1,Dead Poets Society (1989),5,983
9,1,"Last of the Mohicans, The (1992)",4,454
10,1,"Wrong Trousers, The (1993)",5,527


In [9]:
# Remove repeat ratings: keep a single row per (user_id, title) pair
# (drop_duplicates keeps the first occurrence by default) and report how
# many rows were removed.

initial_rows = len(new_df)
print("Initial shape: {0}".format(initial_rows))
new_df = new_df.drop_duplicates(subset=['user_id','title'])
current_rows = len(new_df)
print("Current shape: {0}".format(current_rows))
print("total rows deleted: {0}".format(initial_rows - current_rows))

Initial shape: 74710
Current shape: 74479
total rows deleted: 231


In [10]:
# Build the title x user_id rating table, then store it as a CSR sparse
# matrix for the nearest-neighbour search.

from scipy.sparse import csr_matrix

# One row per title, one column per user_id; a (title, user) pair with no
# rating becomes 0.
data_pivot = new_df.pivot_table(values='rating', index='title', columns='user_id')
data_pivot = data_pivot.fillna(0)

# Row i of data_sparse corresponds to data_pivot.index[i].
data_sparse = csr_matrix(data_pivot.values)
print(data_sparse)

  (0, 0)	2.0
  (0, 4)	2.0
  (0, 12)	2.0
  (0, 14)	3.0
  (0, 37)	5.0
  (0, 42)	2.0
  (0, 44)	4.0
  (0, 48)	2.0
  (0, 55)	2.0
  (0, 56)	3.0
  (0, 59)	3.0
  (0, 61)	3.0
  (0, 62)	2.0
  (0, 69)	3.0
  (0, 74)	2.0
  (0, 81)	3.0
  (0, 82)	3.0
  (0, 83)	4.0
  (0, 91)	3.0
  (0, 93)	3.0
  (0, 100)	3.0
  (0, 140)	3.0
  (0, 158)	4.0
  (0, 160)	1.0
  (0, 166)	3.0
  :	:
  (456, 708)	5.0
  (456, 710)	3.0
  (456, 711)	3.0
  (456, 714)	4.0
  (456, 726)	3.0
  (456, 745)	3.0
  (456, 748)	4.0
  (456, 756)	3.0
  (456, 772)	3.0
  (456, 773)	2.0
  (456, 780)	3.0
  (456, 789)	4.0
  (456, 795)	3.0
  (456, 825)	3.0
  (456, 845)	3.0
  (456, 863)	4.0
  (456, 867)	1.0
  (456, 879)	4.0
  (456, 885)	3.0
  (456, 888)	3.0
  (456, 895)	3.0
  (456, 896)	5.0
  (456, 915)	3.0
  (456, 932)	1.0
  (456, 942)	4.0


In [11]:
# Fit a brute-force, cosine-distance nearest-neighbour index on the sparse
# title x user matrix (each movie is one sample).

from sklearn.neighbors import NearestNeighbors

# fit() returns the estimator itself, so knn is the fitted model.
knn = NearestNeighbors(metric='cosine', algorithm='brute', n_neighbors=10).fit(data_sparse)
knn

NearestNeighbors(algorithm='brute', leaf_size=30, metric='cosine',
         metric_params=None, n_jobs=1, n_neighbors=10, p=2, radius=1.0)

In [39]:
def recommend_movies(data, movie_id, topmovies, model=None):
    """Print the movies nearest to one row of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Table whose index holds the movie titles and whose rows are the
        vectors handed to the model. The neighbour positions returned by the
        model are looked up in ``data.index``, so ``data`` must have the same
        row order as the matrix the model was fitted on.
    movie_id : int
        Row *position* in ``data`` (used with ``iloc``), not a MovieLens id.
    topmovies : int
        Number of neighbours requested from the model. The query movie is
        normally one of them, so pass N + 1 to get N recommendations.
    model : object, optional
        Fitted object exposing ``kneighbors(X, n_neighbors=...)``. Defaults
        to the notebook-level ``knn``.

    Returns
    -------
    None
        The header and the ranked recommendations are printed.

    Raises
    ------
    IndexError
        If ``movie_id`` is outside the rows of ``data``.
    """
    if model is None:
        # Backward-compatible default: the model fitted earlier in the notebook.
        model = knn

    # Normalise the position (handles negative values, rejects out-of-range)
    # so it can be compared with the positions the model returns.
    query_pos = range(len(data))[movie_id]

    # Calculate distances and extract the nearest rows to the requested movie.
    query_vector = data.iloc[movie_id, :].values.reshape(1, -1)
    distances, indices = model.kneighbors(query_vector, n_neighbors=topmovies)
    distances = distances.flatten()
    indices = indices.flatten()
    if indices.size == 0:
        return

    print('Recommendations for movie : {0}\n'.format(data.index[movie_id]))

    # Drop the query movie by position instead of assuming it is neighbour #0:
    # with tied distances another movie can be returned first, and the query
    # would then be recommended to itself.
    keep = indices != query_pos
    if keep.all():
        # Query not among the results; drop the farthest neighbour so the
        # number of printed recommendations stays topmovies - 1.
        keep[-1] = False

    for rank, (pos, dist) in enumerate(zip(indices[keep], distances[keep]), start=1):
        print('{0}: {1}, with distance of {2}'.format(rank, data.index[pos], dist))

In [40]:
# Top 5 recommendations for the movie at row position 200 of data_pivot
# (6 neighbours are requested; the first one is skipped by recommend_movies).
recommend_movies(data=data_pivot, movie_id=200, topmovies=6)

Recommendations for movie : High Noon (1952)

1: Treasure of the Sierra Madre, The (1948), with distance of 0.4598674110773895
2: Lawrence of Arabia (1962), with distance of 0.4728286574136151
3: Bridge on the River Kwai, The (1957), with distance of 0.48589953579845657
4: Some Like It Hot (1959), with distance of 0.49666743427199955
5: Graduate, The (1967), with distance of 0.522065620703685


In [63]:
# Top 10 recommendations for the movie at row position 150 of data_pivot
# (11 neighbours are requested; the first one is skipped by recommend_movies).
recommend_movies(data=data_pivot, movie_id=150, topmovies=11)

Recommendations for movie : Field of Dreams (1989)

1: Forrest Gump (1994), with distance of 0.31955147909131
2: E.T. the Extra-Terrestrial (1982), with distance of 0.3197929685094689
3: Dead Poets Society (1989), with distance of 0.32851432603624786
4: Dances with Wolves (1990), with distance of 0.3615932433466067
5: Wizard of Oz, The (1939), with distance of 0.37689775611422915
6: Back to the Future (1985), with distance of 0.37931027704312636
7: Empire Strikes Back, The (1980), with distance of 0.399855036637936
8: Lion King, The (1994), with distance of 0.40004068564888784
9: Apollo 13 (1995), with distance of 0.4027203978453807
10: Indiana Jones and the Last Crusade (1989), with distance of 0.4029386555433191
