## Building Collaborative Filters 

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Column names for the pipe-delimited user file, supplied via `names=` below.
u_cols = [
    'user_id',
    'age',
    'sex',
    'occupation',
    'zip_code',
]

# Read the user table into a DataFrame.
users = pd.read_csv(
    'u.users',
    sep='|',
    names=u_cols,
    encoding='latin-1',
)

# Preview the first rows.
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [3]:
# Column names for the pipe-delimited movie file: five descriptive fields
# followed by one 0/1 flag column per genre.
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDb URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]

# Read the movie table into a DataFrame.
items = pd.read_csv(
    'u.item',
    sep='|',
    names=i_cols,
    encoding='latin-1',
)

# Preview the first rows.
items.head()

Unnamed: 0,movie_id,title,release date,video release date,IMDb URL,unknown,Action,Adventure,Animation,Children's,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,Toy Story (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,0,1,1,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?GoldenEye%20(...,0,1,1,0,0,...,0,0,0,0,0,0,0,1,0,0
2,3,Four Rooms (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Four%20Rooms%...,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0
3,4,Get Shorty (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Get%20Shorty%...,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,Copycat (1995),01-Jan-1995,,http://us.imdb.com/M/title-exact?Copycat%20(1995),0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [4]:
# Only the id and the title of each movie are kept; every other column is discarded.
items = items.loc[:, ['movie_id', 'title']]

In [5]:
# Column names for the tab-delimited ratings file, supplied via `names=` below.
r_cols = [
    'user_id',
    'movie_id',
    'rating',
    'timestamp',
]

# Read the ratings table into a DataFrame.
ratings = pd.read_csv(
    'u.data',
    sep='\t',
    names=r_cols,
    encoding='latin-1',
)

# Preview the first rows.
ratings.head()

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [6]:
from sklearn.model_selection import train_test_split

# The "features" are a copy of the whole ratings table; the "label" is the
# user id, which is only needed so that the split can be stratified on it.
X = ratings.copy()
y = ratings['user_id']

# Hold out 20% of the rows, stratified on user_id, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=1234,
)

In [7]:
from sklearn.metrics import mean_squared_error

def rmse(y_true, y_pred):
    """Return the root mean squared error between `y_true` and `y_pred`."""
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


In [8]:
def baseline(user_id, movie_id):
    """Trivial model: predict the same rating, 3, for every user-movie pair.

    Both arguments are accepted only so the function has the same call
    signature as the other models; neither is used.
    """
    constant_rating = 3
    return constant_rating

### Memory-Based Approach

In [9]:
def score(cf_model):
    """Return the RMSE that `cf_model` achieves on the test set `X_test`.

    `cf_model` is called as `cf_model(user_id, movie_id)` once per test row
    and must return the predicted rating for that pair.
    """
    # Ratings the users actually gave in the test data
    actual = np.array(X_test['rating'])

    # One prediction per (user, movie) pair, in the same row order
    predicted = np.array([
        cf_model(uid, mid)
        for uid, mid in zip(X_test['user_id'], X_test['movie_id'])
    ])

    return rmse(actual, predicted)

In [10]:
# RMSE of the constant-3 baseline on the test set
score(baseline)

1.2425176055090728

In [11]:
# Pivot the training ratings into a matrix with one row per user and one
# column per movie.
r_matrix = X_train.pivot_table(
    index='user_id',
    columns='movie_id',
    values='rating',
)

r_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,,3.0,,4.0,1.0,,3.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [12]:
#User Based Collaborative Filter using Mean Ratings
def cf_user_mean(user_id, movie_id):
    """Predict the rating as the mean of the movie's ratings in `r_matrix`.

    `user_id` is not used: every user gets the same prediction for a movie.
    A movie that has no column in `r_matrix` gets the default rating 3.0.
    """
    # No information about this movie: fall back to the default
    if movie_id not in r_matrix:
        return 3.0

    # Mean of all ratings the movie received (missing entries are skipped)
    return r_matrix[movie_id].mean()

In [13]:
# RMSE of the per-movie mean model on the test set
score(cf_user_mean)

1.024301898161866

In [14]:
## The previous model gave every user's rating equal weight. Intuitively, though, ratings from users whose tastes resemble those of the user in question should count for more than ratings from users whose tastes do not.
## Therefore, let's alter the previous model by weighting each rating with a similarity coefficient.

In [15]:
from sklearn.metrics.pairwise import cosine_similarity

# Fill the missing ratings with 0 so that every user row is a complete vector;
# fillna returns a new frame, so r_matrix itself keeps its NaNs.
r_matrix_dummy = r_matrix.fillna(0)

# Cosine similarity between every pair of user rows
cosine_sim = cosine_similarity(r_matrix_dummy, r_matrix_dummy)

In [16]:
# Label both axes of the similarity array with the user ids of r_matrix
cosine_sim = pd.DataFrame(
    cosine_sim,
    index=r_matrix.index,
    columns=r_matrix.index,
)
cosine_sim.head(5)

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.125281,0.034147,0.049267,0.324706,0.389582,0.388138,0.227552,0.074174,0.322966,...,0.278933,0.08017,0.217887,0.150727,0.133058,0.06701,0.252319,0.125382,0.16653,0.352302
2,0.125281,1.0,0.077349,0.119518,0.073371,0.206238,0.076969,0.125264,0.134383,0.141437,...,0.141113,0.251849,0.330861,0.386714,0.232602,0.163137,0.233202,0.102082,0.189146,0.052515
3,0.034147,0.077349,1.0,0.268606,0.0,0.045174,0.066061,0.106839,0.022381,0.040034,...,0.009009,0.054608,0.143081,0.049379,0.124511,0.034575,0.132474,0.10351,0.154617,0.034068
4,0.049267,0.119518,0.268606,1.0,0.028655,0.061581,0.062597,0.172654,0.131779,0.063905,...,0.02947,0.0,0.088558,0.090451,0.113832,0.0,0.149822,0.072232,0.09742,0.076415
5,0.324706,0.073371,0.0,0.028655,1.0,0.193508,0.309779,0.204629,0.063691,0.176067,...,0.249967,0.064749,0.067593,0.04982,0.107642,0.046711,0.208778,0.071408,0.10796,0.265664


In [17]:
#User Based Collaborative Filter using Weighted Mean Ratings
def cf_user_wmean(user_id, movie_id):
    """Predict a rating as the similarity-weighted mean of the movie's ratings.

    Every user who rated `movie_id` in `r_matrix` contributes their rating,
    weighted by their similarity to `user_id` taken from `cosine_sim`.

    Fallbacks:
      * movie has no column (or no ratings) in `r_matrix` -> 3.0
      * `user_id` has no column in `cosine_sim`, or the weights of all raters
        sum to zero -> plain (unweighted) mean of the movie's ratings. Without
        this guard the weighted mean would be 0/0 = NaN.
    """
    #Default to a rating of 3.0 in the absence of any information
    if movie_id not in r_matrix:
        return 3.0

    #Keep only the users who actually rated the movie
    m_ratings = r_matrix[movie_id].dropna()
    if m_ratings.empty:
        return 3.0

    #No similarity information for this user: every rater counts equally
    if user_id not in cosine_sim:
        return m_ratings.mean()

    #Similarity of the user in question to each of those raters
    sim_scores = cosine_sim[user_id].loc[m_ratings.index]

    #A zero total weight would turn the weighted mean into NaN
    total_weight = sim_scores.sum()
    if total_weight <= 0:
        return m_ratings.mean()

    #Compute the final weighted mean
    return np.dot(sim_scores, m_ratings) / total_weight

In [18]:
# RMSE of the similarity-weighted mean model on the test set
score(cf_user_wmean)

1.0180929427414687

In [19]:
#Merge the original users dataframe with the training set
# The join key is named explicitly instead of letting pandas infer it from
# whichever column names the two frames happen to share. `on` may refer to a
# column or to an index level, so users may carry user_id either way.
# `validate` makes the merge fail loudly if a user id occurs more than once in
# users, which would otherwise silently duplicate rating rows.
merged_df = pd.merge(X_train, users, on='user_id', validate='many_to_one')

merged_df.head()

Unnamed: 0,user_id,movie_id,rating,timestamp,age,sex,occupation,zip_code
0,489,312,2,891366748,55,M,other,45218
1,489,343,5,891447913,55,M,other,45218
2,489,1293,5,891446623,55,M,other,45218
3,489,321,3,891447845,55,M,other,45218
4,489,682,4,891366606,55,M,other,45218


In [20]:
# Average rating per (movie_id, sex) pair, as a Series indexed by both keys
gender_mean = (
    merged_df[['movie_id', 'sex', 'rating']]
    .groupby(['movie_id', 'sex'])['rating']
    .mean()
)

In [21]:
#Set the index of the users dataframe to the user_id
# Guarded so that the cell can be executed more than once: after the first run
# user_id is the index and no longer a column, so an unconditional set_index
# would raise KeyError.
if 'user_id' in users.columns:
    users = users.set_index('user_id')

In [22]:
#Gender Based Collaborative Filter using Mean Ratings
def cf_gender(user_id, movie_id):
    """Predict the rating as the movie's mean rating among users of the same sex.

    Reads three notebook globals: `r_matrix` (to check that the movie is
    known), `users` (looked up by `user_id` via `.loc`, so it must be indexed
    by user id) and `gender_mean` (mean rating per movie and sex).

    Returns 3.0 when the movie is not in `r_matrix`, or when `gender_mean`
    has no entry for the user's sex for this movie.
    """
    # Unknown movie: nothing to go on, use the default rating
    if movie_id not in r_matrix:
        return 3.0

    # Sex of the user in question
    gender = users.loc[user_id, 'sex']

    # Mean ratings of this movie, keyed by sex
    movie_means = gender_mean[movie_id]

    # Nobody of this sex rated the movie: use the default rating
    if gender not in movie_means:
        return 3.0

    return movie_means[gender]

In [23]:
# RMSE of the gender-based mean model on the test set
score(cf_gender)

1.034558645630906

In [24]:
# Mean rating of every movie for each (occupation, sex) combination:
# one row per movie, one column per combination.
gen_occ_mean = pd.pivot_table(
    merged_df[['sex', 'rating', 'movie_id', 'occupation']],
    index='movie_id',
    columns=['occupation', 'sex'],
    values='rating',
    aggfunc='mean',
)

gen_occ_mean.head()

occupation,administrator,administrator,artist,artist,doctor,educator,educator,engineer,engineer,entertainment,...,salesman,salesman,scientist,scientist,student,student,technician,technician,writer,writer
sex,F,M,F,M,M,F,M,F,M,F,...,F,M,F,M,F,M,F,M,F,M
movie_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
1,4.083333,4.090909,4.4,3.25,3.666667,3.3,3.842105,4.0,4.026316,4.5,...,4.0,3.5,3.5,3.888889,3.826087,3.758621,4.0,4.142857,4.0,3.125
2,3.0,4.0,,3.0,,3.5,3.25,,3.066667,,...,,,,3.0,3.25,3.107143,,2.6,4.5,2.5
3,3.5,2.5,,,,4.0,2.0,,3.333333,,...,,1.0,,,2.0,3.434783,,4.333333,,1.0
4,2.666667,3.4,,4.5,3.0,3.0,3.583333,4.0,3.647059,,...,4.0,3.5,,3.75,3.222222,3.733333,,3.333333,4.5,3.5
5,4.0,2.5,,,,4.0,1.0,,2.75,,...,4.0,,,3.0,4.0,2.923077,,3.0,4.0,2.4


In [25]:
# Keep only the user, movie and rating columns (i.e. discard the timestamp).
# Selecting the columns by name instead of dropping 'timestamp' lets the cell
# be re-run safely: a second drop would raise KeyError because the column is
# already gone.
ratings = ratings[['user_id', 'movie_id', 'rating']]

### Model-Based Approach

In [26]:
from surprise import Reader, Dataset, KNNBasic
from surprise.model_selection import KFold, cross_validate

#The Reader object helps in parsing the file or dataframe containing ratings.
#The rating scale is spelled out; (1, 5) is also Reader's default.
reader = Reader(rating_scale=(1, 5))

#Create the dataset to be used for building the filter.
#The three columns are selected explicitly, in user/item/rating order, so this
#cell does not depend on an earlier cell having removed the timestamp column.
data = Dataset.load_from_df(df = ratings[['user_id', 'movie_id', 'rating']], reader = reader)

knn = KNNBasic()

#A seeded KFold makes the five folds, and therefore the reported RMSE values,
#identical on every run (same seed as the train/test split above).
cross_validate(knn, data, measures=['RMSE'], cv = KFold(n_splits=5, random_state=1234))

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.


{'test_rmse': array([0.97391451, 0.97533073, 0.98257617, 0.97805233, 0.98219584]),
 'fit_time': (0.5327925682067871,
  0.8020319938659668,
  0.5563888549804688,
  0.5578947067260742,
  0.6317310333251953),
 'test_time': (3.936882972717285,
  4.442366600036621,
  3.6521222591400146,
  3.75113844871521,
  3.9428300857543945)}

In [27]:
from surprise import SVD
from surprise.model_selection import KFold
#Define the SVD algorithm object; random_state fixes its random initialisation
svd = SVD(random_state=1234)
#Evaluate the performance in terms of RMSE on five seeded folds, so that the
#reported scores are the same on every run
cross_validate(svd, data, measures=['RMSE'], cv = KFold(n_splits=5, random_state=1234))

{'test_rmse': array([0.9449126 , 0.92946283, 0.9430689 , 0.93085542, 0.92548216]),
 'fit_time': (5.162607192993164,
  6.181471586227417,
  5.896905422210693,
  6.896382570266724,
  7.506019592285156),
 'test_time': (0.14709210395812988,
  0.18476295471191406,
  0.141432523727417,
  0.2373669147491455,
  0.13621950149536133)}