## To make a movie recommender system, first I import necessary csv files.The movie file contains movielists and rating file contains rating given to various movies by different users

In [None]:
import numpy as np
import pandas as pd

In [2]:
movie=pd.read_csv("movie.csv",encoding='latin-1',error_bad_lines=False)
rating=pd.read_csv("rating.csv",encoding='latin-1',error_bad_lines=False)

In [3]:
movie.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
rating.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,2005-04-02 23:53:47
1,1,29,3.5,2005-04-02 23:31:16
2,1,32,3.5,2005-04-02 23:33:39
3,1,47,3.5,2005-04-02 23:32:07
4,1,50,3.5,2005-04-02 23:29:40


In [6]:
print(movie.shape)
print(rating.shape)

(27278, 3)
(20000263, 4)


In [7]:
# Total number of movies and users in rating dataset

rating['movieId'].nunique()

26744

In [8]:
rating['userId'].nunique()

138493

In [9]:
movie.columns

Index(['movieId', 'title', 'genres'], dtype='object')

In [10]:
rating.columns

Index(['userId', 'movieId', 'rating', 'timestamp'], dtype='object')

In [11]:
movie['genres'].value_counts()

Drama                                                4520
Comedy                                               2294
Documentary                                          1942
Comedy|Drama                                         1264
Drama|Romance                                        1075
                                                     ... 
Action|Adventure|Crime|Drama|Sci-Fi                     1
Adventure|Drama|Thriller|War                            1
Adventure|Animation|Drama|Fantasy|Musical|Romance       1
Mystery|Romance|Sci-Fi|Thriller                         1
Action|Animation|Children|Comedy|Musical                1
Name: genres, Length: 1342, dtype: int64

In [12]:
rating['userId'].value_counts()

118205    9254
8405      7515
82418     5646
121535    5520
125794    5491
          ... 
59390       20
23558       20
34668       20
80291       20
58028       20
Name: userId, Length: 138493, dtype: int64

## There are total 138493 users who rated 26744 different movies in total. Some of them are movielovers i.e. they have seen and rated many movies and some have rated so small amount of movies that we may not include in our system.

In [13]:
# Users who have rated movies at least 300 times

x=rating['userId'].value_counts()>300
movlov=x[x].index.tolist()
movlov

[118205,
 8405,
 82418,
 121535,
 125794,
 74142,
 34576,
 131904,
 83090,
 59477,
 130767,
 79159,
 8963,
 15617,
 92011,
 71975,
 20132,
 46470,
 88820,
 63147,
 130459,
 120575,
 9544,
 31122,
 18611,
 125978,
 18138,
 91193,
 111549,
 68026,
 41267,
 51703,
 92269,
 70201,
 35128,
 105580,
 14705,
 54465,
 114406,
 136268,
 12131,
 53346,
 24688,
 107326,
 131347,
 26867,
 27469,
 119048,
 123606,
 67346,
 86529,
 22901,
 129583,
 131894,
 91867,
 7201,
 24219,
 62812,
 61168,
 68063,
 51558,
 97853,
 32344,
 80092,
 103223,
 107640,
 128258,
 79531,
 128309,
 92956,
 118754,
 76630,
 106441,
 59414,
 113668,
 122995,
 116189,
 50297,
 52260,
 72008,
 33736,
 52009,
 43194,
 117144,
 3907,
 137202,
 27053,
 31404,
 42929,
 119531,
 135425,
 66763,
 116317,
 64843,
 131961,
 2261,
 42204,
 903,
 69793,
 73611,
 49554,
 58953,
 95301,
 23173,
 4358,
 80920,
 16676,
 72983,
 4222,
 133811,
 55765,
 101044,
 34651,
 99754,
 52636,
 110758,
 134567,
 32514,
 75810,
 60159,
 57735,
 2139

In [14]:
ratings=rating[rating['userId'].isin(movlov)]
ratings.shape

(9891292, 4)

In [15]:
len(movlov)

16184

## We have found 16184 such users who rated a lot of movies, now I am going to remove timestamp column which is not useful in this case and then I will merge movie and rating files. 

In [16]:
movie=movie[['movieId','title','genres']]
ratings=ratings[['userId','movieId','rating']]

In [17]:
movie_with_rating=ratings.merge(movie,on='movieId')

## Now I would keep those movies, which have got at least 300 times rating from users i.e. only famous movies will be there in the list.

In [18]:
a=movie_with_rating['movieId'].value_counts()>300
famous=a[a].index.tolist()
len(famous)

4887

In [19]:
movie_with_rating=movie_with_rating[movie_with_rating['movieId'].isin(famous)]

In [20]:
movie_with_rating.shape

(9057628, 5)

In [21]:
movie_with_rating.drop_duplicates(subset=['userId','title'],inplace=True)

In [22]:
movie_with_rating.shape

(9057628, 5)

## Now I shall create a pivot table where the index will contain movie names, columns will contain user ids and values will be rating given to a particular movie by a particular user 

In [23]:
pt=movie_with_rating.pivot_table(columns='userId',index='title',values='rating')

In [24]:
pt

userId,11,24,54,58,91,96,104,116,131,132,...,138406,138411,138414,138436,138437,138454,138456,138472,138474,138493
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The (1989)",,,3.0,,,,,,,,...,,2.0,,,,,3.0,,4.0,
(500) Days of Summer (2009),,,,,,,,,,,...,,,,2.5,4.5,,,3.5,,
*batteries not included (1987),5.0,,,,3.0,,,,,,...,,,,,,,,,,
...And Justice for All (1979),,,4.0,,,,,,,,...,,,,,,,,,,
10 (1979),,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
[REC] (2007),,,,,,,,,,,...,4.0,,,,,,,,,
eXistenZ (1999),5.0,3.0,,,,,3.0,,,,...,,,,,,,,,,
xXx (2002),,,,,,,,2.0,,,...,,,,,,,,,,
xXx: State of the Union (2005),,,,,,,,,,,...,,,,,,,,,,


In [25]:
pt.shape

(4887, 16184)

## The pivot table contains 4887 different movies and 16184 different users; My next aim is to replace the NaN values with -1 in the pivot table to get rid of ambiguity. 

In [26]:
pt.fillna(-1,inplace=True)

In [27]:
from scipy.sparse import csr_matrix
matrix=csr_matrix(pt)

In [28]:
matrix

<4887x16184 sparse matrix of type '<class 'numpy.float64'>'
	with 79091208 stored elements in Compressed Sparse Row format>

## I have converted the pivot table to sparse matrix to train the model 

In [29]:
from sklearn.neighbors import NearestNeighbors
NN=NearestNeighbors(algorithm='brute')

In [30]:
pt.shape

(4887, 16184)

In [31]:
NN.fit(matrix)

NearestNeighbors(algorithm='brute', leaf_size=30, metric='minkowski',
                 metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                 radius=1.0)

## The n_neighbors is set 11 by me to predict 10 similar movies with the selected movie. 

In [39]:
d,s=NN.kneighbors(pt.iloc[500,:].values.reshape(1,16184),n_neighbors=11)

In [40]:
pt.iloc[0,:].values.reshape(1,16184)

array([[-1., -1.,  3., ..., -1.,  4., -1.]])

In [106]:
# It is kept here to check different movies with index no.

pt.index[1933]

'Harry Potter and the Deathly Hallows: Part 1 (2010)'

## Few examples: 

In [48]:
d,s=NN.kneighbors(pt.iloc[500,:].values.reshape(1,16184),n_neighbors=11)

print(s)

for i in s:
    print(pt.index[i],end=' ')

[[ 500  499 4082 3439 3440 4556 3441 2937 3692 3438 3706]]
Index(['Beverly Hills Cop III (1994)', 'Beverly Hills Cop II (1987)',
       'Specialist, The (1994)', 'Police Academy 4: Citizens on Patrol (1987)',
       'Police Academy 5: Assignment: Miami Beach (1988)',
       'Under Siege 2: Dark Territory (1995)',
       'Police Academy 6: City Under Siege (1989)', 'Money Train (1995)',
       'RoboCop 3 (1993)', 'Police Academy 3: Back in Training (1986)',
       'Rocky V (1990)'],
      dtype='object', name='title') 

In [47]:
d,s=NN.kneighbors(pt.iloc[3436,:].values.reshape(1,16184),n_neighbors=11)

print(s)

for i in s:
    print(pt.index[i],end=' ')

[[3436 3437 3438 3439 3440 3441 1024 1399 3454 2388 2935]]
Index(['Police Academy (1984)',
       'Police Academy 2: Their First Assignment (1985)',
       'Police Academy 3: Back in Training (1986)',
       'Police Academy 4: Citizens on Patrol (1987)',
       'Police Academy 5: Assignment: Miami Beach (1988)',
       'Police Academy 6: City Under Siege (1989)',
       'Crocodile Dundee II (1988)',
       'European Vacation (aka National Lampoon's European Vacation) (1985)',
       'Porky's (1982)', 'Karate Kid, Part III, The (1989)',
       'Money Pit, The (1986)'],
      dtype='object', name='title') 

In [52]:
d,s=NN.kneighbors(pt.iloc[4000,:].values.reshape(1,16184),n_neighbors=11)

print(s)

for i in s:
    print(pt.index[i],end=' ')

[[4000  882 3366 3634  396    8 4266  161 3633 3365 1297]]
Index(['Sleeping Beauty (1959)', 'Cinderella (1950)', 'Peter Pan (1953)',
       'Rescuers, The (1977)', 'Bambi (1942)',
       '101 Dalmatians (One Hundred and One Dalmatians) (1961)',
       'Sword in the Stone, The (1963)', 'Alice in Wonderland (1951)',
       'Rescuers Down Under, The (1990)', 'Pete's Dragon (1977)',
       'Dumbo (1941)'],
      dtype='object', name='title') 

In [68]:
d,s=NN.kneighbors(pt.iloc[2261,:].values.reshape(1,16184),n_neighbors=11)

print(s)

for i in s:
    print(pt.index[i],end=' ')

[[2261 2262   99 2263 2910 2909 2817 2545  554 2425 4166]]
Index(['Iron Eagle (1986)', 'Iron Eagle II (1988)',
       'Aces: Iron Eagle III (1992)', 'Iron Eagle IV (1995)',
       'Missing in Action 2: The Beginning (1985)', 'Missing in Action (1984)',
       'Meatballs Part II (1984)', 'Leonard Part 6 (1987)', 'Black Dog (1998)',
       'King Kong Lives (1986)', 'Steel (1997)'],
      dtype='object', name='title') 

In [58]:
d,s=NN.kneighbors(pt.iloc[46,:].values.reshape(1,16184),n_neighbors=11)

print(s)

for i in s:
    print(pt.index[i],end=' ')

[[  46   47   48 4052 4518 2094 1390 1761 3432 2263 4166]]
Index(['3 Ninjas (1992)', '3 Ninjas Kick Back (1994)',
       '3 Ninjas: High Noon On Mega Mountain (1998)', 'Son of the Mask (2005)',
       'Turbo: A Power Rangers Movie (1997)', 'House of the Dead, The (2003)',
       'Ernest Scared Stupid (1991)', 'Glitter (2001)',
       'PokÃ©mon 3: The Movie (2001)', 'Iron Eagle IV (1995)', 'Steel (1997)'],
      dtype='object', name='title') 

In [88]:
d,s=NN.kneighbors(pt.iloc[2266,:].values.reshape(1,16184),n_neighbors=11)

print(s)

for i in s:
    print(pt.index[i],end=' ')

[[2266 4377  774 4840  341 2267 4838  196 3923 2915 3672]]
Index(['Iron Man 2 (2010)', 'Thor (2011)',
       'Captain America: The First Avenger (2011)',
       'X-Men: First Class (2011)', 'Avengers, The (2012)',
       'Iron Man 3 (2013)', 'X-Men Origins: Wolverine (2009)',
       'Amazing Spider-Man, The (2012)',
       'Sherlock Holmes: A Game of Shadows (2011)',
       'Mission: Impossible - Ghost Protocol (2011)',
       'Rise of the Planet of the Apes (2011)'],
      dtype='object', name='title') 

In [89]:
d,s=NN.kneighbors(pt.iloc[4377,:].values.reshape(1,16184),n_neighbors=11)

print(s)

for i in s:
    print(pt.index[i],end=' ')

[[4377  774 4378  196 2267 4840 1844  775 2743 2915 4473]]
Index(['Thor (2011)', 'Captain America: The First Avenger (2011)',
       'Thor: The Dark World (2013)', 'Amazing Spider-Man, The (2012)',
       'Iron Man 3 (2013)', 'X-Men: First Class (2011)',
       'Green Lantern (2011)', 'Captain America: The Winter Soldier (2014)',
       'Man of Steel (2013)', 'Mission: Impossible - Ghost Protocol (2011)',
       'Transformers: Dark of the Moon (2011)'],
      dtype='object', name='title') 

In [90]:
d,s=NN.kneighbors(pt.iloc[774,:].values.reshape(1,16184),n_neighbors=11)

print(s)

for i in s:
    print(pt.index[i],end=' ')

[[ 774 4377  196  775 1844 4378 2267 2743 1000 4473 4808]]
Index(['Captain America: The First Avenger (2011)', 'Thor (2011)',
       'Amazing Spider-Man, The (2012)',
       'Captain America: The Winter Soldier (2014)', 'Green Lantern (2011)',
       'Thor: The Dark World (2013)', 'Iron Man 3 (2013)',
       'Man of Steel (2013)', 'Cowboys & Aliens (2011)',
       'Transformers: Dark of the Moon (2011)', 'Wolverine, The (2013)'],
      dtype='object', name='title') 

In [98]:
d,s=NN.kneighbors(pt.iloc[1933,:].values.reshape(1,16184),n_neighbors=11)

print(s)

for i in s:
    print(pt.index[i],end=' ')

[[1933 1934 1936 4377 3403  774  196 2127 3923 2267 4294]]
Index(['Harry Potter and the Deathly Hallows: Part 1 (2010)',
       'Harry Potter and the Deathly Hallows: Part 2 (2011)',
       'Harry Potter and the Half-Blood Prince (2009)', 'Thor (2011)',
       'Pirates of the Caribbean: On Stranger Tides (2011)',
       'Captain America: The First Avenger (2011)',
       'Amazing Spider-Man, The (2012)', 'Hunger Games, The (2012)',
       'Sherlock Holmes: A Game of Shadows (2011)', 'Iron Man 3 (2013)',
       'Tangled (2010)'],
      dtype='object', name='title') 

In [101]:
d,s=NN.kneighbors(pt.iloc[1932,:].values.reshape(1,16184),n_neighbors=11)

print(s)

for i in s:
    print(pt.index[i],end=' ')

[[1932 1939 1938 1935 1937 2843 4141 3947 2798 4842 4095]]
Index(['Harry Potter and the Chamber of Secrets (2002)',
       'Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)',
       'Harry Potter and the Prisoner of Azkaban (2004)',
       'Harry Potter and the Goblet of Fire (2005)',
       'Harry Potter and the Order of the Phoenix (2007)',
       'Men in Black II (a.k.a. MIIB) (a.k.a. MIB 2) (2002)',
       'Star Wars: Episode II - Attack of the Clones (2002)', 'Shrek 2 (2004)',
       'Matrix Reloaded, The (2003)', 'X2: X-Men United (2003)',
       'Spider-Man 2 (2004)'],
      dtype='object', name='title') 

In [102]:
d,s=NN.kneighbors(pt.iloc[4094,:].values.reshape(1,16184),n_neighbors=11)

print(s)

for i in s:
    print(pt.index[i],end=' ')

[[4094 4095 2894 3404 2639 2637 4141 3201 4837 3946 4842]]
Index(['Spider-Man (2002)', 'Spider-Man 2 (2004)', 'Minority Report (2002)',
       'Pirates of the Caribbean: The Curse of the Black Pearl (2003)',
       'Lord of the Rings: The Two Towers, The (2002)',
       'Lord of the Rings: The Fellowship of the Ring, The (2001)',
       'Star Wars: Episode II - Attack of the Clones (2002)',
       'Ocean's Eleven (2001)', 'X-Men (2000)', 'Shrek (2001)',
       'X2: X-Men United (2003)'],
      dtype='object', name='title') 

In [105]:
d,s=NN.kneighbors(pt.iloc[4474,:].values.reshape(1,16184),n_neighbors=11)

print(s)

for i in s:
    print(pt.index[i],end=' ')

[[4474 4473 1684 1844  902 2140 1688 3014 2804  431 3541]]
Index(['Transformers: Revenge of the Fallen (2009)',
       'Transformers: Dark of the Moon (2011)',
       'G.I. Joe: The Rise of Cobra (2009)', 'Green Lantern (2011)',
       'Clash of the Titans (2010)', 'I Am Number Four (2011)', 'Gamer (2009)',
       'Mummy: Tomb of the Dragon Emperor, The (2008)', 'Max Payne (2008)',
       'Battle: Los Angeles (2011)', 'Push (2009)'],
      dtype='object', name='title') 