In [2]:
#Dataframe manipulation library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [3]:
# Load the two source tables: the movie information and the user ratings.
movies, ratings = (pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv'))

In [4]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


In [6]:
# Titles look like "Toy Story (1995)". Capture the four digits of a year that is
# wrapped in parentheses; requiring the parentheses avoids matching numbers that
# belong to the title itself (e.g. "The 2000 Year Old Man (1975)").
movies['year'] = movies['title'].str.extract(r'\((\d{4})\)', expand=False)
# Remove the "(YYYY)" part from the title. regex=True is required: since
# pandas 2.0 str.replace treats the pattern as a literal string by default,
# which would silently leave the year in the title.
movies['title'] = movies['title'].str.replace(r'\(\d{4}\)', '', regex=True)
# Trim the whitespace left behind where the year used to be.
movies['title'] = movies['title'].str.strip()

Let's look at the dataframe now, after extracting the year.

In [7]:
movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,Adventure|Animation|Children|Comedy|Fantasy,1995
1,2,Jumanji,Adventure|Children|Fantasy,1995
2,3,Grumpier Old Men,Comedy|Romance,1995
3,4,Waiting to Exhale,Comedy|Drama|Romance,1995
4,5,Father of the Bride Part II,Comedy,1995


We don't need the genres column here, so let's drop it.

In [8]:
# Drop the genres column. The column is named through the `columns=` keyword
# because passing the axis positionally (`drop('genres', 1)`) was deprecated in
# pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
movies = movies.drop(columns='genres')

In [9]:
movies.head()

Unnamed: 0,movieId,title,year
0,1,Toy Story,1995
1,2,Jumanji,1995
2,3,Grumpier Old Men,1995
3,4,Waiting to Exhale,1995
4,5,Father of the Bride Part II,1995


In [10]:
movies.tail(10)

Unnamed: 0,movieId,title,year
34198,151685,Alone Yet Not Alone,2013
34199,151687,Risen,2016
34200,151689,Obsession: Radical Islam's War Against the West,2005
34201,151691,Hollywood High,1976
34202,151695,The Survivalist,2015
34203,151697,Grand Slam,1967
34204,151701,Bloodmoney,2010
34205,151703,The Butterfly Circus,2009
34206,151709,Zero,2015
34207,151711,The 2000 Year Old Man,1975


Next, let's look at the ratings dataframe.

In [11]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496


We don't need the timestamp column here, so let's drop it.

In [12]:
# Drop the timestamp column. The column is named through the `columns=` keyword
# because passing the axis positionally (`drop('timestamp', 1)`) was deprecated
# in pandas 1.3 and raises a TypeError from pandas 2.0 onwards.
ratings = ratings.drop(columns='timestamp')

In [13]:
ratings.head()

Unnamed: 0,userId,movieId,rating
0,1,169,2.5
1,1,2471,3.0
2,1,48516,5.0
3,2,2571,3.5
4,2,109487,4.0


In [14]:
ratings.shape

(22884377, 3)

In [15]:
movies.shape

(34208, 3)

In [16]:
# Count how many ratings each user has submitted (indexed by userId).
a = ratings.userId.value_counts()

In [17]:
# Boolean mask over users: True where the user has more than 200 ratings.
b = ratings['userId'].value_counts().gt(200)

In [18]:
# userIds of the users that pass the activity mask `b`.
new_users = a.loc[b].index.tolist()

In [19]:
len(new_users)

28729

In [20]:
# Keep only the ratings that were submitted by the users listed in `new_users`.
rating = ratings.loc[ratings['userId'].isin(new_users)]

In [21]:
rating.shape

(13524074, 3)

In [22]:
rating.head()

Unnamed: 0,userId,movieId,rating
749,15,1,4.0
750,15,7,3.0
751,15,17,3.5
752,15,24,4.0
753,15,32,4.5


In [23]:
# Attach the movie columns (title, year) to every rating via the shared movieId key.
rating_with_movie_name = pd.merge(rating, movies, on='movieId')

In [24]:
rating_with_movie_name.head()

Unnamed: 0,userId,movieId,rating,title,year
0,15,1,4.0,Toy Story,1995
1,17,1,5.0,Toy Story,1995
2,37,1,4.5,Toy Story,1995
3,47,1,4.0,Toy Story,1995
4,50,1,4.0,Toy Story,1995


In [25]:
rating_with_movie_name.shape

(13524074, 5)

In [26]:
# Number of ratings per title; reset_index turns the result back into a
# two-column frame (title, rating) where `rating` now holds the count.
movie_num_ratings = (
    rating_with_movie_name
    .groupby('title')['rating']
    .count()
    .reset_index()
)

In [27]:
movie_num_ratings.head()

Unnamed: 0,title,rating
0,"""Great Performances"" Cats",49
1,#1 Cheerleader Camp,3
2,#chicagoGirl: The Social Network Takes on a Di...,2
3,$ (Dollars),11
4,$5 a Day,18


In [28]:
# Keep only the titles that collected more than 500 ratings.
movie_num_ratings = movie_num_ratings.loc[movie_num_ratings['rating'].gt(500)]

In [29]:
movie_num_ratings.head()

Unnamed: 0,title,rating
20,"'burbs, The",2348
22,(500) Days of Summer,3703
28,*batteries not included,1082
32,...And Justice for All,824
40,10,564


In [30]:
movie_num_ratings.shape

(4203, 2)

In [36]:
# Inner-join the ratings with the filtered per-title counts. Both frames have a
# `rating` column, so the result carries rating_x (the user's rating) and
# rating_y (the title's rating count).
ratings_final = pd.merge(rating_with_movie_name, movie_num_ratings, on='title')

In [37]:
ratings_final.shape

(12183811, 6)

In [38]:
# A (userId, title) pair must appear only once before pivoting; keep the first occurrence.
ratings_final = ratings_final.drop_duplicates(subset=['userId', 'title'], keep='first')

In [39]:
ratings_final.shape

(12117522, 6)

In [40]:
ratings_final

Unnamed: 0,userId,movieId,rating_x,title,year,rating_y
0,15,1,4.0,Toy Story,1995,20504
1,17,1,5.0,Toy Story,1995,20504
2,37,1,4.5,Toy Story,1995,20504
3,47,1,4.0,Toy Story,1995,20504
4,50,1,4.0,Toy Story,1995,20504
...,...,...,...,...,...,...
12183806,246450,1670,3.0,Welcome to Sarajevo,1997,509
12183807,246553,1670,5.0,Welcome to Sarajevo,1997,509
12183808,247348,1670,2.0,Welcome to Sarajevo,1997,509
12183809,247424,1670,3.5,Welcome to Sarajevo,1997,509


In [41]:
# Build the title x user table: one row per title, one column per userId,
# each cell holding that user's rating (NaN where the user did not rate the title).
movie_pivot = ratings_final.pivot_table(
    index='title',
    columns='userId',
    values='rating_x',
)

In [43]:
movie_pivot

userId,15,17,20,37,38,39,47,50,59,68,...,247624,247662,247700,247702,247704,247705,247725,247730,247732,247735
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The",,3.0,,,,,,,,,...,,,,3.0,,4.0,,,,
(500) Days of Summer,,,,,,,4.0,,,,...,,,,,,,,,4.5,
*batteries not included,,,,,,,,,,,...,,,,0.5,,,3.0,,,
...And Justice for All,,,,,,,,,,,...,,,,1.5,,,,,,
10,,,,,,,,,,,...,,,,3.0,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
[REC],,,,,,,,,,,...,,,,,,,,,,
eXistenZ,,,,,,,,,,,...,4.0,3.5,,,,,,,,
xXx,,,,,,,,,,,...,,0.5,,,,,,,,4.0
xXx: State of the Union,,,,,,,,,,,...,,1.0,,,,,,,,


In [44]:
# Replace the missing ratings with 0 so that every title has a complete numeric vector.
movie_pivot = movie_pivot.fillna(value=0)

In [45]:
movie_pivot

userId,15,17,20,37,38,39,47,50,59,68,...,247624,247662,247700,247702,247704,247705,247725,247730,247732,247735
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
"'burbs, The",0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,4.0,0.0,0.0,0.0,0.0
(500) Days of Summer,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.5,0.0
*batteries not included,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.5,0.0,0.0,3.0,0.0,0.0,0.0
...And Justice for All,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.5,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
[REC],0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
eXistenZ,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,4.0,3.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
xXx,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
xXx: State of the Union,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
from scipy.sparse import csr_matrix

# Store the mostly-zero title x user table in compressed sparse row format.
movie_matrix = csr_matrix(movie_pivot.values)

In [47]:
# Expand the sparse matrix into a dense NumPy array to eyeball its contents.
# NOTE(review): this materialises every zero entry as well - fine for a quick
# look, but avoid feeding the dense copy to downstream steps.
movie_matrix.toarray()

array([[0. , 3. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 4.5, 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [0. , 0. , 0. , ..., 0. , 0. , 4. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 2. , 0. , ..., 0. , 0. , 0. ]])

In [57]:
from sklearn.neighbors import NearestNeighbors

In [235]:
# Brute-force nearest-neighbour search using the Minkowski metric with p=2.
model = NearestNeighbors(
    n_neighbors=5,
    radius=1.0,
    algorithm='brute',
    leaf_size=30,
    metric='minkowski',
    p=2,
    metric_params=None,
)

In [236]:
# Fit the neighbour search on the sparse matrix built from movie_pivot (one row per title).
model.fit(movie_matrix)

NearestNeighbors(algorithm='brute')

In [237]:
movie_pivot.index[166]

'Altered States'

In [238]:
# Ask for the six nearest neighbours of the title stored in row 1 of movie_pivot.
# kneighbors expects a 2-D input, hence the reshape to a single-row array.
distance, suggestion = model.kneighbors(
    movie_pivot.iloc[1].values.reshape(1, -1),
    n_neighbors=6,
)

In [239]:
suggestion

array([[   1,   56,  867, 3945,   97, 2866]], dtype=int64)

In [240]:
def recommend(movie, n_neighbors=6):
    """Print the titles whose rating vectors are closest to that of `movie`.

    Parameters
    ----------
    movie : str
        Title exactly as it appears in ``movie_pivot.index``.
    n_neighbors : int, default 6
        Number of titles to print. The list comes straight from
        ``model.kneighbors``, so it can include `movie` itself.

    Raises
    ------
    IndexError
        If `movie` is not present in ``movie_pivot.index``.
    """
    matches = np.flatnonzero(movie_pivot.index == movie)
    if matches.size == 0:
        # Same exception type the bare `[0][0]` lookup used to raise, but with a
        # message that names the offending title instead of "index 0 is out of bounds".
        raise IndexError(f"Movie {movie!r} not found in movie_pivot.index")
    m_id = matches[0]
    # kneighbors expects a 2-D input, hence the reshape to a single-row array.
    query = movie_pivot.iloc[m_id, :].values.reshape(1, -1)
    # Only the neighbour positions are needed; the distances are discarded.
    _, suggestion = model.kneighbors(query, n_neighbors=n_neighbors)
    for i in suggestion[0]:
        print(movie_pivot.index[i])

In [259]:
recommend("Mean Girls")

Mean Girls
13 Going on 30
She's the Man
Saved!
Sisterhood of the Traveling Pants, The
Legally Blonde 2: Red, White & Blonde
