# Movie recommendations

### This notebook contains 3 functions for three different recommenders for a movie database.
This project attempts to launch a webpage supported by a database of more than 9000 movies and the idea is to come up with recommendations based on 3 types of user inputs. 

1. The user can input a number and a function will diplay the entered number of top hits.
2. The user can enter the name of a movie and a number resulting in the entered number of similar movies.
3. The user can enter a user ID and will get a list of movies that were favored by that user.

In [145]:
import numpy as np
import pandas as pd
pd.pandas.set_option('display.max_rows', None)

In [139]:
links = pd.read_csv('links.csv')
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')
tags = pd.read_csv('tags.csv')

In [146]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
5,6,Heat (1995),Action|Crime|Thriller
6,7,Sabrina (1995),Comedy|Romance
7,8,Tom and Huck (1995),Adventure|Children
8,9,Sudden Death (1995),Action
9,10,GoldenEye (1995),Action|Adventure|Thriller


In [4]:
movies.shape

(9742, 3)

In [5]:
features_with_na=[features for features in movies.columns if movies[features].isnull().sum()>0]
for feature in features_with_na:
    print(feature, np.round((movies[feature].isnull().mean())*100, 3), ' % null values')

In [6]:
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [7]:
links.isnull().sum()

movieId    0
imdbId     0
tmdbId     8
dtype: int64

In [8]:
features_with_na=[features for features in links.columns if links[features].isnull().sum()>0]
for feature in features_with_na:
    print(feature, np.round((links[feature].isnull().mean())*100, 3), ' % null values')

tmdbId 0.082  % null values


In [9]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [43]:
ratings.query('movieId ==88448')

Unnamed: 0,userId,movieId,rating,timestamp
77875,483,88448,5.0,1315437602


In [17]:
features_with_na=[features for features in ratings.columns if ratings[features].isnull().sum()>0]
for feature in features_with_na:
    print(feature, np.round((ratings[feature].isnull().mean())*100, 3), ' % null values')

In [18]:
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200


In [19]:
features_with_na=[features for features in tags.columns if tags[features].isnull().sum()>0]
for feature in features_with_na:
    print(feature, np.round((tags[feature].isnull().mean())*100, 3), ' % null values')

In [20]:
movie_ratings = movies.merge(ratings, on = 'movieId', how = 'left')
movie_ratings

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,4.0,9.649827e+08
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,8.474350e+08
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,7.0,4.5,1.106636e+09
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,15.0,2.5,1.510578e+09
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,17.0,4.5,1.305696e+09
...,...,...,...,...,...,...
100849,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,184.0,4.0,1.537109e+09
100850,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,184.0,3.5,1.537110e+09
100851,193585,Flint (2017),Drama,184.0,3.5,1.537110e+09
100852,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,184.0,3.5,1.537110e+09


In [31]:
sorted_movies = pd.DataFrame(movie_ratings.groupby('movieId').rating.mean().sort_values(ascending = False))
sorted_movies

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
88448,5.0
100556,5.0
143031,5.0
143511,5.0
143559,5.0
...,...
30892,
32160,
32371,
34482,


In [41]:
tags.query('movieId == 88448')

Unnamed: 0,userId,movieId,tag,timestamp


In [44]:
ratings.groupby('movieId').userId.count().sort_values(ascending = False)

movieId
356       329
318       317
296       307
593       279
2571      278
         ... 
4093        1
4089        1
58351       1
4083        1
193609      1
Name: userId, Length: 9724, dtype: int64

In [None]:
sorted_movies['number_of_ratings']=ratings.groupby('movieId').userId.count()
sorted_movies = sorted_movies.drop(columns = ['no_ratings'], axis = 1)


In [55]:
sorted_movies

Unnamed: 0_level_0,rating,number_of_ratings
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
88448,5.0,1.0
100556,5.0,1.0
143031,5.0,1.0
143511,5.0,1.0
143559,5.0,1.0
...,...,...
30892,,
32160,,
32371,,
34482,,


In [57]:
sorted_movies['rating_value'] = sorted_movies['rating'] * sorted_movies['number_of_ratings']

In [72]:
top_list = pd.DataFrame(sorted_movies['rating_value'].sort_values(ascending = False).reset_index())

In [73]:
top_list

Unnamed: 0,movieId,rating_value
0,318,1404.0
1,356,1370.0
2,296,1288.5
3,2571,1165.5
4,593,1161.0
...,...,...
9737,30892,
9738,32160,
9739,32371,
9740,34482,


In [90]:
top_movies = []
for movieId in top_list['movieId']:
    top_movies.append(movies.loc[movies['movieId'] == movieId, 'title'].to_list())

In [92]:
top_movies = pd.DataFrame(top_movies)

In [102]:
top_movies.rename(columns={'O':'Title'}, inplace = True)

In [103]:
top_movies

Unnamed: 0,0
0,"Shawshank Redemption, The (1994)"
1,Forrest Gump (1994)
2,Pulp Fiction (1994)
3,"Matrix, The (1999)"
4,"Silence of the Lambs, The (1991)"
...,...
9737,In the Realms of the Unreal (2004)
9738,Twentieth Century (1934)
9739,Call Northside 777 (1948)
9740,"Browning Version, The (1951)"


In [63]:
movies[movies['movieId']== 318]

Unnamed: 0,movieId,title,genres
277,318,"Shawshank Redemption, The (1994)",Crime|Drama


In [116]:
def hit_movies(n):
    movie_ratings = movies.merge(ratings, on = 'movieId', how = 'left')
    sorted_movies = pd.DataFrame(movie_ratings.groupby('movieId').rating.mean().sort_values(ascending = False))
    sorted_movies['number_of_ratings']=ratings.groupby('movieId').userId.count()
    sorted_movies['rating_value'] = sorted_movies['rating'] * sorted_movies['number_of_ratings']
    top_list = pd.DataFrame(sorted_movies['rating_value'].sort_values(ascending = False).reset_index())
    top_movies = []
    for movieId in top_list['movieId']:
        top_movies.append(movies.loc[movies['movieId'] == movieId, 'title'].items())
    top_movies = pd.DataFrame(top_movies)
    return top_movies.head(n)

hit_movies(5)

Unnamed: 0,0
0,"(277, Shawshank Redemption, The (1994))"
1,"(314, Forrest Gump (1994))"
2,"(257, Pulp Fiction (1994))"
3,"(1939, Matrix, The (1999))"
4,"(510, Silence of the Lambs, The (1991))"


## Item based selection

Here we are trying to recommend movies based on the name of a movie that is given as input.  "Similarity" will be defined by how well other movies correlate with input. We are creating a matrix where we have all the users in the rows and all the movies in the columns. It has many NaNs because most of the time users have not seen or rated the movie. This is a sparse matrix.

In [147]:
names = movies['title']
names

0                                        Toy Story (1995)
1                                          Jumanji (1995)
2                                 Grumpier Old Men (1995)
3                                Waiting to Exhale (1995)
4                      Father of the Bride Part II (1995)
5                                             Heat (1995)
6                                          Sabrina (1995)
7                                     Tom and Huck (1995)
8                                     Sudden Death (1995)
9                                        GoldenEye (1995)
10                         American President, The (1995)
11                     Dracula: Dead and Loving It (1995)
12                                           Balto (1995)
13                                           Nixon (1995)
14                                Cutthroat Island (1995)
15                                          Casino (1995)
16                           Sense and Sensibility (1995)
17            

In [118]:
rating_crosstab = pd.pivot_table(data=ratings, values='rating', index='userId', columns='movieId')
rating_crosstab.head(10)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
6,,4.0,5.0,3.0,5.0,4.0,4.0,3.0,,3.0,...,,,,,,,,,,
7,4.5,,,,,,,,,,...,,,,,,,,,,
8,,4.0,,,,,,,,2.0,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


## Evaluating Similarity Based on Correlation

In [119]:
#Enter a movie ID
m_ID = 314
m_ratings = rating_crosstab[m_ID]
m_ratings[m_ratings>=0] # exclude NaNs
similar_to_m_ID = rating_crosstab.corrwith(m_ratings)
similar_to_m_ID

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


movieId
1         0.298926
2         0.862354
3         0.743593
4              NaN
5         1.000000
            ...   
193581         NaN
193583         NaN
193585         NaN
193587         NaN
193609         NaN
Length: 9724, dtype: float64

### Drop the NaNs

In [121]:
corr_m_ID = pd.DataFrame(similar_to_m_ID, columns=['PearsonR'])
corr_m_ID.dropna(inplace=True)
corr_m_ID.head(10)

Unnamed: 0_level_0,PearsonR
movieId,Unnamed: 1_level_1
1,0.298926
2,0.862354
3,0.743593
5,1.0
6,0.05886
7,0.897348
10,-0.135174
11,0.280336
13,-0.5
15,0.897731


### Adding a column rating count to the DF above

In [123]:
rating2 = pd.DataFrame(ratings.groupby('movieId')['rating'].mean())
rating2['rating_count'] = ratings.groupby('movieId')['rating'].count()

In [124]:
movie_corr_summary = corr_m_ID.join(rating2['rating_count'])
movie_corr_summary.drop(m_ID, inplace=True) # drop the selected movie
movie_corr_summary

Unnamed: 0_level_0,PearsonR,rating_count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.298926,215
2,0.862354,110
3,0.743593,52
5,1.000000,49
6,0.058860,102
...,...,...
176371,1.000000,18
177615,1.000000,3
178061,1.000000,3
179817,1.000000,3


Create a DF by selecting the movies that were rated by atleat 10 users (Arbitrary value) and sort the PearsonR column

In [126]:
top10 = movie_corr_summary[movie_corr_summary['rating_count']>=10].sort_values('PearsonR', ascending=False).head(10)
top10

Unnamed: 0_level_0,PearsonR,rating_count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
176371,1.0,18
1438,1.0,24
45950,1.0,13
1199,1.0,59
45720,1.0,34
44840,1.0,11
44199,1.0,40
1223,1.0,28
1231,1.0,22
40583,1.0,16


Create another DF consisting of movie Id and name

In [130]:
m_name = movies[['movieId', 'title']]


Merge m_name with the top 10

In [131]:
top10 = top10.merge(m_name, left_index=True, right_on="movieId")
top10

Unnamed: 0,PearsonR,rating_count,movieId,title
9604,1.0,18,176371,Blade Runner 2049 (2017)
1105,1.0,24,1438,Dante's Peak (1997)
6228,1.0,13,45950,"Inconvenient Truth, An (2006)"
901,1.0,59,1199,Brazil (1985)
6220,1.0,34,45720,"Devil Wears Prada, The (2006)"
6181,1.0,11,44840,"Benchwarmers, The (2006)"
6155,1.0,40,44199,Inside Man (2006)
924,1.0,28,1223,"Grand Day Out with Wallace and Gromit, A (1989)"
931,1.0,22,1231,"Right Stuff, The (1983)"
6054,1.0,16,40583,Syriana (2005)


# Create a function that takes the name of a movie and a number 

In [137]:
s_movies = []
def sim_movies(name, n):
    rating_crosstab = pd.pivot_table(data=ratings, values='rating', index='userId', columns='movieId')
    m_ID = name
    m_ratings = rating_crosstab[m_ID]
    m_ratings[m_ratings>=0] # exclude NaNs
    similar_to_m_ID = rating_crosstab.corrwith(m_ratings)
    corr_m_ID = pd.DataFrame(similar_to_m_ID, columns=['PearsonR'])
    corr_m_ID.dropna(inplace=True)
    rating2 = pd.DataFrame(ratings.groupby('movieId')['rating'].mean())
    rating2['rating_count'] = ratings.groupby('movieId')['rating'].count()
    movie_corr_summary = corr_m_ID.join(rating2['rating_count'])
    movie_corr_summary.drop(m_ID, inplace=True) # drop the selected movie
    topn = movie_corr_summary[movie_corr_summary['rating_count']>=10].sort_values('PearsonR', ascending=False).head(n)
    m_name = movies[['movieId', 'title']]
    topn = topn.merge(m_name, left_index=True, right_on="movieId")
    return topn

In [138]:
sim_movies(234, 19)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Unnamed: 0,PearsonR,rating_count,movieId,title
5260,1.0,79,8636,Spider-Man 2 (2004)
1263,1.0,10,1678,"Joy Luck Club, The (1993)"
1349,1.0,17,1835,City of Angels (1998)
1435,1.0,22,1958,Terms of Endearment (1983)
1592,1.0,22,2134,Weird Science (1985)
324,1.0,12,366,Wes Craven's New Nightmare (Nightmare on Elm S...
695,1.0,44,913,"Maltese Falcon, The (1941)"
290,1.0,12,332,Village of the Damned (1995)
1824,1.0,50,2424,You've Got Mail (1998)
1886,1.0,18,2505,8MM (1999)
