# Recommender Systems

Dataset: [MovieLens](https://grouplens.org/datasets/movielens/)

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the movie catalogue and the user ratings (paths are relative to the working directory).
movies, ratings = (pd.read_csv(path) for path in ('movies.csv', 'ratings.csv'))

In [3]:
# Preview the first five rows of the movie catalogue.
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
# Preview the first five rating records.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [5]:
# Attach each movie's title and genres to its ratings (inner join on the shared movieId key).
df = ratings.merge(movies, on='movieId')

In [6]:
# Preview the first five rows of the merged ratings + movie metadata.
df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,31,2.5,1260759144,Dangerous Minds (1995),Drama
1,7,31,3.0,851868750,Dangerous Minds (1995),Drama
2,31,31,4.0,1273541953,Dangerous Minds (1995),Drama
3,32,31,4.0,834828440,Dangerous Minds (1995),Drama
4,36,31,3.0,847057202,Dangerous Minds (1995),Drama


In [7]:
# Mean rating per title, highest first. The mean is not weighted by how many
# ratings a title received.
mean_rating_by_title = df.groupby('title')['rating'].mean()
mean_rating_by_title.sort_values(ascending=False).head()

title
Burn Up! (1991)                                     5.0
Absolute Giganten (1999)                            5.0
Gentlemen of Fortune (Dzhentlmeny udachi) (1972)    5.0
Erik the Viking (1989)                              5.0
Reality (2014)                                      5.0
Name: rating, dtype: float64

In [8]:
# Number of ratings per title, most-rated first.
rating_count_by_title = df.groupby('title')['rating'].count()
rating_count_by_title.sort_values(ascending=False).head()

title
Forrest Gump (1994)                          341
Pulp Fiction (1994)                          324
Shawshank Redemption, The (1994)             311
Silence of the Lambs, The (1991)             304
Star Wars: Episode IV - A New Hope (1977)    291
Name: rating, dtype: int64

In [9]:
# One row per title (the index) with a single 'rating' column holding the mean rating.
rating = df.groupby('title')['rating'].mean().to_frame()

In [10]:
# Preview the first five rows of the per-title mean ratings.
rating.head(n=5)

Unnamed: 0_level_0,rating
title,Unnamed: 1_level_1
"""Great Performances"" Cats (1998)",1.75
$9.99 (2008),3.833333
'Hellboy': The Seeds of Creation (2004),2.0
'Neath the Arizona Skies (1934),0.5
'Round Midnight (1986),2.25


In [None]:
# Count how many ratings each title received and store it next to the mean;
# the Series is aligned to `rating` through the shared title index.
ratings_per_title = df.groupby('title')['rating'].count()
rating['n_ratings'] = ratings_per_title
rating.head()

Unnamed: 0_level_0,rating,n_ratings
title,Unnamed: 1_level_1,Unnamed: 2_level_1
"""Great Performances"" Cats (1998)",1.75,2
$9.99 (2008),3.833333,3
'Hellboy': The Seeds of Creation (2004),2.0,1
'Neath the Arizona Skies (1934),0.5,1
'Round Midnight (1986),2.25,2


## Exploratory Data Analysis (EDA)


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Side-by-side histograms: number of ratings per movie (left) and
# mean rating per movie (right).
f, (ax1, ax2) = plt.subplots(nrows=1, ncols=2, figsize=(10, 6))

# Left panel: distribution of the number of ratings each movie received.
ax1.hist(rating['n_ratings'], bins=30)
ax1.set_title('No. of ratings')
ax1.set_xlabel('No of ratings for a movie')
ax1.set_ylabel('No. of movies / frequency')

# Right panel: distribution of the per-movie mean rating. Means below 1 occur
# (a 0.5 shows up in `rating.head()`), so the label must not claim "1 to 5".
ax2.hist(rating['rating'], bins=30)
ax2.set_title('Rating')
ax2.set_xlabel('Mean rating per movie (0.5 to 5)')
ax2.set_ylabel('No. of movies / frequency')

# Keep the two panels' tick and axis labels from overlapping.
f.tight_layout();

In [None]:
# Joint distribution of mean rating vs. number of ratings, one point per movie.
sns.jointplot(data=rating, x='rating', y='n_ratings', alpha=0.5);

In [None]:
# Reminder of the merged frame's layout before pivoting it.
df.head(n=5)

In [None]:
# User x title matrix of ratings: one row per userId, one column per title.
# Cells are NaN where a user has not rated a title.
rating_mat = pd.pivot_table(df, index='userId', columns='title', values='rating')

In [None]:
# Preview the first five users of the user x title matrix.
rating_mat.head(n=5)

In [None]:
# The ten titles with the most ratings.
rating.sort_values(by='n_ratings', ascending=False).head(n=10)

In [None]:
# Look up the catalogue entry for Forrest Gump.
movies.loc[movies['title'] == 'Forrest Gump (1994)']

In [None]:
# Look up the catalogue entry for The Matrix.
movies.loc[movies['title'] == 'Matrix, The (1999)']

In [None]:
#Getting ratings from rating_mat
FG_user_ratings = rating_mat['Forrest Gump (1994)']
Matrix_user_ratings = rating_mat['Matrix, The (1999)']
#Displaying the heads
FG_user_ratings.head(), Matrix_user_ratings.head()

In [None]:
# Correlate every title's rating column with the reference movie's column.
# The result is a Series indexed by title.
similar_to_FG = rating_mat.corrwith(other=FG_user_ratings)
similar_to_matrix = rating_mat.corrwith(other=Matrix_user_ratings)

In [None]:
# Wrap the Forrest Gump correlations in a one-column frame indexed by title.
# Series.to_frame(name=...) always carries the values over, whereas
# pd.DataFrame(series, columns=[...]) only does so while the Series is unnamed
# (a named Series is matched by name and the data is lost).
corr_FG = similar_to_FG.to_frame(name='correlation')
corr_FG.head()

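The correlation is undefined (NaN) for movies that have too few raters in common with Forrest Gump. Let's drop those rows and check the head again!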

In [None]:
# Discard titles whose correlation with Forrest Gump is undefined (NaN).
corr_FG = corr_FG.dropna()
corr_FG.head()

Let's do the same for The Matrix (`similar_to_matrix`).

In [None]:
# Build the one-column correlation frame for The Matrix and drop titles whose
# correlation is undefined (NaN). Series.to_frame(name=...) keeps the values
# regardless of the Series' name, unlike pd.DataFrame(series, columns=[...]).
corr_matrix = similar_to_matrix.to_frame(name='correlation').dropna()
corr_matrix.head()

In the dataframes we just created (`corr_FG` and `corr_matrix`), the index is the movie title, and the `correlation` column tells us how strongly the user ratings of each movie correlate with the user ratings of Forrest Gump and The Matrix, respectively. <br>
In principle, if we sort each dataframe by correlation, we should get the movies most similar to Forrest Gump and The Matrix, in order. <br>

Let's check whether this works, using The Matrix only (you can do the same for Forrest Gump afterwards).

In [None]:
# Ten titles with the highest correlation to The Matrix.
corr_matrix.sort_values(by='correlation', ascending=False).head(n=10)

In [None]:
# Zoom in on the low end of the ratings-per-movie distribution
# (x limited to 0-200 ratings, y limited to 0-500 movies).
plt.hist(rating['n_ratings'], bins=50)
plt.xlim(0, 200)
plt.ylim(0, 500)
# Label the axes so the figure can be read on its own; wording matches the
# earlier ratings-count histogram.
plt.xlabel('No of ratings for a movie')
plt.ylabel('No. of movies / frequency');

In [None]:
# Attach each title's rating count to its correlation with The Matrix.
# Selecting only 'correlation' first keeps the cell safe to re-run: joining
# onto a frame that already has 'n_ratings' raises
# "ValueError: columns overlap but no suffix specified".
corr_matrix = corr_matrix[['correlation']].join(rating['n_ratings'])
corr_matrix.head()

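A movie with only a handful of ratings can show a high correlation that means very little, so we keep only movies with more than 50 ratings and then sort by correlation from high to low!<br>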

In [None]:
# Keep titles with more than 50 ratings, then rank by correlation (highest first).
enough_ratings = corr_matrix['n_ratings'] > 50
corr_matrix.loc[enough_ratings].sort_values(by='correlation', ascending=False).head()

In [None]:
# Attach each title's rating count to its correlation with Forrest Gump.
# Selecting only 'correlation' first keeps the cell safe to re-run: joining
# onto a frame that already has 'n_ratings' raises
# "ValueError: columns overlap but no suffix specified".
corr_FG = corr_FG[['correlation']].join(rating['n_ratings'])
# Keep titles with more than 50 ratings, then rank by correlation (highest first).
corr_FG.loc[corr_FG['n_ratings'] > 50].sort_values(
    by='correlation', ascending=False).head()