# Collaborative Filter Recomendation System

Referenced code from: Banik, Rounak. 2018. Hands-On Recommendation Systems with Python: Start building powerful and personalized, recommendation engines with Python. Packt Publishing.

In [1]:
import pandas as pd
import numpy as np
import os
os.environ['DATA_PATH'] = '/Users/connorranson/Downloads/'

In [56]:
# configure file path
data_path = os.environ['DATA_PATH']
movies_filename = 'movies.csv'
ratings_filename = 'ratings.csv'

# read data
df_movies = pd.read_csv(
    os.path.join(data_path, movies_filename),
    usecols=['movieId', 'title', 'genres'],
    dtype={'movieId': 'int32', 'title': 'str', 'genres': 'str'})

df_ratings = pd.read_csv(
    os.path.join(data_path, ratings_filename),
    usecols=['userId', 'movieId', 'rating'],
    dtype={'userId': 'int32', 'movieId': 'int32', 'rating': 'float32'})


In [3]:
df_movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [4]:
#Using regular expressions to find a year stored between parentheses
#We specify the parentheses so we don't conflict with movies that have years in their titles.
df_movies['year'] = df_movies.title.str.extract('(\(\d\d\d\d\))',expand=False)
#Removing the parentheses.
df_movies['year'] = df_movies.year.str.extract('(\d\d\d\d)',expand=False)
# Note that expand=False simply means do not add this adjustment as an additional column to the data frame.
#Removing the years from the 'title' column.
df_movies['title'] = df_movies.title.str.replace('(\(\d\d\d\d\))', '')
#Applying the strip function to get rid of any ending white space characters that may have appeared, using lambda function.
df_movies['title'] = df_movies['title'].apply(lambda x: x.strip())

In [5]:
#Every genre is separated by a | so we simply have to call the split function on |.
df_movies['genres'] = df_movies.genres.str.split('|')
df_movies.head()

Unnamed: 0,movieId,title,genres,year
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995
2,3,Grumpier Old Men,"[Comedy, Romance]",1995
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995
4,5,Father of the Bride Part II,[Comedy],1995


In [6]:
df_movies.isna().sum()

movieId     0
title       0
genres      0
year       13
dtype: int64

In [7]:
# First let's make a copy of the movies_df.
movies_with_genres = df_movies.copy(deep=True)

In [8]:
# Let's iterate through movies_df, then append the movie genres as columns of 1s or 0s.
# 1 if that column contains movies in the genre at the present index and 0 if not.
x = []
for index, row in df_movies.iterrows():
    x.append(index)
    for genre in row['genres']:
        movies_with_genres.at[index, genre] = 1
# Confirm that every row has been iterated and acted upon.
print(len(x) == len(df_movies))
movies_with_genres.head(3)

True


Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,,...,,,,,,,,,,
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,,1.0,,1.0,,...,,,,,,,,,,
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,,,,1.0,,1.0,...,,,,,,,,,,


In [27]:
#Filling in the NaN values with 0 to show that a movie doesn't have that column's genre.
movies_with_genres = movies_with_genres.fillna(0)
movies_with_genres.head(-5)

Unnamed: 0,movieId,title,genres,year,Adventure,Animation,Children,Comedy,Fantasy,Romance,...,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
0,1,Toy Story,"[Adventure, Animation, Children, Comedy, Fantasy]",1995,1.0,1.0,1.0,1.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,Jumanji,"[Adventure, Children, Fantasy]",1995,1.0,0.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,Grumpier Old Men,"[Comedy, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Waiting to Exhale,"[Comedy, Drama, Romance]",1995,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Father of the Bride Part II,[Comedy],1995,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9732,193565,Gintama: The Movie,"[Action, Animation, Comedy, Sci-Fi]",2010,0.0,1.0,0.0,1.0,0.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9733,193567,anohana: The Flower We Saw That Day - The Movie,"[Animation, Drama]",2013,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9734,193571,Silver Spoon,"[Comedy, Drama]",2014,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9735,193573,Love Live! The School Idol Movie,[Animation],2015,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [10]:
# print out the shape and first five rows of ratings data.
print('Df_ratings shape:',df_ratings.shape)          
df_ratings.head()

Df_ratings shape: (100836, 3)


Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0
3,1,47,5.0
4,1,50,5.0


In [11]:
df_ratings.isna().sum()

userId     0
movieId    0
rating     0
dtype: int64

In [12]:
#Import the train_test_split function
from sklearn.model_selection import train_test_split

#Assign X as the original ratings dataframe and y as the user_id column of ratings.
X = df_ratings.copy()
y = df_ratings['userId']

#Split into training and test datasets, stratified along user_id
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify=y, random_state=42)

In [13]:
#Import the mean_squared_error function
from sklearn.metrics import mean_squared_error

#Function that computes the root mean squared error (or RMSE)
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

In [14]:
#Define the baseline model to always return 3.
def baseline(userId, movieId):
    return 3.0

In [192]:
#Function to compute the RMSE score obtained on the testing set by a model
def score(cf_model):
    
    #Construct a list of user-movie tuples from the testing dataset
    id_pairs = zip(X_test['userId'], X_test['movieId'])
    
    #Predict the rating for every user-movie tuple
    y_pred = np.array([cf_model(user, movie) for (user, movie) in id_pairs])
    
    #Extract the actual ratings given by the users in the test data
    y_true = np.array(X_test['rating'])
    
    #Return the final RMSE score
    return round(rmse(y_true, y_pred),2)

In [193]:
score(baseline)

1.15

## User Based Collaborative Filtering

### Ratings Matrix

In [17]:
#Build the ratings matrix using pivot_table function
r_matrix = X_train.pivot_table(values='rating', index='userId', columns='movieId')

r_matrix.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,190213,191005,193565,193571,193573,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [18]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(userId, movieId):
    
    #Check if movie_id exists in r_matrix
    if movieId in r_matrix:
        #Compute the mean of all the ratings given to the movie
        mean_rating = r_matrix[movieId].mean()
    
    else:
        #Default to a rating of 3.0 in the absence of any information
        mean_rating = 3.0
    
    return mean_rating

In [194]:
#Compute RMSE for the Mean model
score(cf_user_mean)

0.98

In [203]:
#Create a dummy ratings matrix with all null values imputed to 0
r_matrix_dummy = r_matrix.copy().fillna(0)
r_matrix_dummy.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,190213,191005,193565,193571,193573,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [204]:
# Import cosine_score 
from sklearn.metrics.pairwise import cosine_similarity

#Compute the cosine similarity matrix using the dummy ratings matrix
cosine_sim = cosine_similarity(r_matrix_dummy, r_matrix_dummy)

In [205]:
#Convert into pandas dataframe 
cosine_sim = pd.DataFrame(cosine_sim, index=r_matrix.index, columns=r_matrix.index)
cosine_sim.head(10)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.01762,0.057824,0.121903,0.109929,0.090673,0.12022,0.04858,0.043903,0.0,...,0.070603,0.097086,0.162277,0.061565,0.071947,0.125049,0.183326,0.207957,0.057411,0.116425
2,0.01762,1.0,0.0,0.0,0.021348,0.032609,0.027067,0.035505,0.0,0.0351,...,0.146052,0.021671,0.0,0.0,0.0,0.030044,0.016681,0.040715,0.035274,0.072883
3,0.057824,0.0,1.0,0.003343,0.007344,0.004038,0.0,0.007328,0.0,0.0,...,0.004101,0.004473,0.018068,0.0,0.015745,0.013637,0.002066,0.015913,0.0,0.022464
4,0.121903,0.0,0.003343,1.0,0.067377,0.06805,0.093982,0.02017,0.015231,0.029079,...,0.080902,0.08618,0.21227,0.048952,0.039061,0.161836,0.122741,0.104404,0.042941,0.100051
5,0.109929,0.021348,0.007344,0.067377,1.0,0.184347,0.075121,0.358651,0.0,0.006518,...,0.072024,0.342528,0.049651,0.218031,0.090653,0.071691,0.091188,0.128873,0.235796,0.053052
6,0.090673,0.032609,0.004038,0.06805,0.184347,1.0,0.06141,0.301626,0.007884,0.027391,...,0.016695,0.354044,0.083393,0.289172,0.065278,0.076179,0.111342,0.149578,0.135835,0.045021
7,0.12022,0.027067,0.0,0.093982,0.075121,0.06141,1.0,0.117624,0.067652,0.114285,...,0.148202,0.100809,0.066935,0.061259,0.13312,0.14417,0.182677,0.22943,0.043897,0.128478
8,0.04858,0.035505,0.007328,0.02017,0.358651,0.301626,0.117624,1.0,0.0,0.032519,...,0.077764,0.46645,0.063702,0.180316,0.108231,0.062385,0.147373,0.163574,0.193468,0.056998
9,0.043903,0.0,0.0,0.015231,0.0,0.007884,0.067652,0.0,1.0,0.027503,...,0.075983,0.0,0.040407,0.025207,0.077419,0.036465,0.0,0.067541,0.0,0.052496
10,0.0,0.0351,0.0,0.029079,0.006518,0.027391,0.114285,0.032519,0.027503,1.0,...,0.14578,0.013894,0.020117,0.015653,0.115945,0.071631,0.010695,0.047045,0.029077,0.092937


In [206]:
#User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(userId, movieId):
    
    #Check if movie_id exists in r_matrix
    if movieId in r_matrix:
        
        #Get the similarity scores for the user in question with every other user
        sim_scores = cosine_sim[userId]
        
        #Get the user ratings for the movie in question
        m_ratings = r_matrix[movieId]
        
        #Extract the indices containing NaN in the m_ratings series
        idx = m_ratings[m_ratings.isnull()].index
        
        #Drop the NaN values from the m_ratings Series
        m_ratings = m_ratings.dropna()
        
        #Drop the corresponding cosine scores from the sim_scores series
        sim_scores = sim_scores.drop(idx)
        
        #Compute the final weighted mean
        wmean_rating = np.dot(sim_scores, m_ratings)/ sim_scores.sum()
    
    else:
        #Default to a rating of 3.0 in the absence of any information
        wmean_rating = 3.0
    
    return wmean_rating

In [207]:
cf_user_wmean(1, 1)

3.9022305

In [208]:
score(cf_user_wmean)



ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

## Model Based Approaches

In [28]:
#Import the required classes and methods from the surprise library
from surprise import Reader, Dataset, KNNBasic
from surprise import SVD
from surprise import Dataset
from surprise.model_selection import cross_validate
#Define a Reader object
#The Reader object helps in parsing the file or dataframe containing ratings
reader = Reader()

#Create the dataset to be used for building the filter
data = Dataset.load_from_df(df_ratings, reader)

#Define the algorithm object; in this case kNN
knn = KNNBasic()

#Evaluate the performance in terms of RMSE
cross_validate(knn, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9477  0.9436  0.9444  0.9422  0.9469  0.9450  0.0021  
MAE (testset)     0.7264  0.7249  0.7238  0.7233  0.7247  0.7246  0.0011  
Fit time          0.27    0.41    0.19    0.18    0.23    0.26    0.08    
Test time         3.31    2.62    2.10    2.43    2.71    2.64    0.40    


{'test_rmse': array([0.94774274, 0.94361742, 0.94436259, 0.94215987, 0.94687294]),
 'test_mae': array([0.72644085, 0.72492256, 0.72379782, 0.72327396, 0.72472266]),
 'fit_time': (0.2749900817871094,
  0.41469287872314453,
  0.19458675384521484,
  0.1783609390258789,
  0.23356103897094727),
 'test_time': (3.313129186630249,
  2.622143030166626,
  2.096595048904419,
  2.432569742202759,
  2.711932897567749)}

In [29]:
#Import SVD
from surprise import SVD

#Define the SVD algorithm object
svd = SVD()

#Evaluate the performance in terms of RMSE
cross_validate(svd, data, measures=['RMSE','MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8752  0.8673  0.8682  0.8856  0.8762  0.8745  0.0066  
MAE (testset)     0.6729  0.6660  0.6688  0.6787  0.6712  0.6715  0.0043  
Fit time          7.58    5.46    5.59    5.49    5.41    5.91    0.84    
Test time         0.20    0.27    0.18    0.25    0.16    0.21    0.04    


{'test_rmse': array([0.87520641, 0.86728427, 0.86823701, 0.88560993, 0.87618423]),
 'test_mae': array([0.6729143 , 0.66600136, 0.66880197, 0.67871784, 0.67120016]),
 'fit_time': (7.581944227218628,
  5.462339162826538,
  5.587456941604614,
  5.4919891357421875,
  5.407311916351318),
 'test_time': (0.19724583625793457,
  0.26877379417419434,
  0.1750020980834961,
  0.2529489994049072,
  0.16002416610717773)}