In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

- We see that this file gives us information regarding the movie's title, release date, IMDb URL, and its genre(s). Since we are focused on building only collaborative filters, we do not require any of this information, apart from the movie title and its corresponding ID.

In [2]:
#Load the u.user file into a dataframe
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('inputs/ml-100k/u.user', sep='|', names=u_cols, encoding='latin-1')

In [3]:
#Load the u.items file into a dataframe
i_cols = ['movie_id', 'title' ,'release date','video release date', 'IMDb', 'URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama','Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']

movies = pd.read_csv('inputs/ml-100k/u.item', sep='|', names=i_cols, encoding='latin-1')

In [4]:
#Remove all information except Movie ID and title
movies = movies[['movie_id', 'title']]


#Load the u.data file into a dataframe
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('inputs/ml-100k/u.data', sep='\t', names=r_cols, encoding='latin-1')

#Drop the timestamp column
ratings = ratings.drop('timestamp', axis=1)

In [5]:
#Assign X as the original ratings dataframe and y as the user_id column of ratings.
X = ratings.copy()
y = ratings['user_id']

#Split into training and test datasets, stratified along user_id
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify = y, random_state=42)

In [6]:
def rmse(y_true, y_pred):
    return np.sqrt(mean_squared_error(y_true, y_pred))

- All our collaborative filter (or CF) models will take in a user_id and movie_id as input and output a floating point number between 1 and 5.

In [7]:
#Define the baseline model to always return 3.
def baseline(user_id, movie_id):
    return 3.0

In [8]:
#We compute the RMSE obtained by that particular model for all user-movie pairs in the test dataset.
def score(cf_model):
    #Construct a list of user-movie tuples from the test dataset.
    id_pairs = zip(X_test['user_id'], X_test['movie_id'])
    
    #Predict the rating for every user-movie tuple.
    y_pred = np.array([cf_model(user, movie) for (user, movie) in id_pairs])
    
    #Extract the actual ratings given by the users in the test data.
    y_true = np.array(X_test['rating'])
    
    #Return the final RMSE score.
    return rmse(y_true, y_pred)
    

In [9]:
score(baseline)

1.2488234462885457

# User-based collaborative filtering.

In [10]:
r_matrix = X_train.pivot_table(values = 'rating', index = 'user_id', columns = 'movie_id')
r_matrix.head()

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1676,1677,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


## Mean rating.

In [11]:
def cf_user_mean(user_id, movie_id):
    #Check if movie_id exists in the r_matrix.
    if movie_id in r_matrix:
        mean_rating = r_matrix[movie_id].mean()
    else:
        mean_rating = 3.0
        
    return mean_rating

In [12]:
score(cf_user_mean)

1.0300824802393536

## Weighted mean.

- We assigned equal weights to all the users. However, it makes intuitive sense to give more preference to those users whose ratings are similar to the user in question than the other users whose ratings are not.

In [13]:
#Use of cosine score as the similarity function.
r_matrix_dummy = r_matrix.copy().fillna(0)

cosine_sim = cosine_similarity(r_matrix_dummy, r_matrix_dummy)

#Compute the similarity matrix using the dummy ratings matrix.
cosine_sim = pd.DataFrame(cosine_sim, index = r_matrix.index, columns = r_matrix.index)

cosine_sim.head()

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.108361,0.046638,0.029577,0.245753,0.335853,0.344724,0.191582,0.057149,0.251979,...,0.257073,0.069412,0.231643,0.108093,0.176842,0.104799,0.232472,0.051528,0.129555,0.256333
2,0.108361,1.0,0.057613,0.130237,0.054918,0.190552,0.079399,0.076146,0.167992,0.147376,...,0.136993,0.252887,0.255454,0.285193,0.232751,0.149088,0.102807,0.062386,0.109143,0.107686
3,0.046638,0.057613,1.0,0.139805,0.0,0.032485,0.043869,0.080968,0.022263,0.059925,...,0.027402,0.0,0.17506,0.010343,0.105635,0.019052,0.127099,0.023917,0.060392,0.0
4,0.029577,0.130237,0.139805,1.0,0.0,0.04519,0.088586,0.199526,0.135013,0.026919,...,0.055392,0.049773,0.076549,0.139382,0.113886,0.0,0.130343,0.077357,0.15789,0.063911
5,0.245753,0.054918,0.0,0.0,1.0,0.176443,0.28186,0.132205,0.03879,0.1342,...,0.183969,0.019305,0.073714,0.041807,0.081088,0.029743,0.188392,0.068342,0.055557,0.207259


- With the user cosine similarity matrix in hand, we are now in a position to efficiently calculate the weighted mean scores for this model.

In [14]:
#User-based collaborative filtering using weighted mean ratings.
def cf_user_wmean(user_id, movie_id):
    if movie_id in r_matrix:
        
        #Get similarity scores for the user in question with every other user.
        sim_scores = cosine_sim[user_id]
        
        #Get the user ratings for the movie in question.
        m_ratings = r_matrix[movie_id]
        
        #Extract the indices containing NaN in the m_ratings series.
        idx = m_ratings[m_ratings.isnull()].index
        
        #Drop the NaN values from the m_ratings Series.
        m_ratings = m_ratings.dropna()
        
        sim_scores = sim_scores.drop(idx)
        wmean_rating = np.dot(sim_scores, m_ratings) / sim_scores.sum()
    else:
        #Default to a rating of 3 in the abscence of any information.
        wmean_rating = 3.0
    
    return wmean_rating

In [15]:
score(cf_user_wmean)

  wmean_rating = np.dot(sim_scores, m_ratings) / sim_scores.sum()


ValueError: Input contains NaN, infinity or a value too large for dtype('float64').

## User demographics.
- The code below leverages on user demographic information.
- The basic idea behind these filters is that users of the same demographic tend to have similar tastes.
- Thus, these filters only look at those users that fit a certain demographic.

In [19]:
#Build a gender demographic filter.
merged_df = pd.merge(X_train, users)
merged_df.head()

Unnamed: 0,user_id,movie_id,rating,age,sex,occupation,zip_code
0,862,177,4,25,M,executive,13820
1,862,416,3,25,M,executive,13820
2,862,1093,5,25,M,executive,13820
3,862,168,4,25,M,executive,13820
4,862,568,3,25,M,executive,13820


In [20]:
#Compute the mean rating of every movie by gender.
gender_mean = merged_df[['movie_id', 'sex', 'rating']].groupby(['movie_id', 'sex'])
gender_mean['rating'].mean()

movie_id  sex
1         F      3.797872
          M      3.888446
2         F      3.285714
          M      3.202703
3         F      2.916667
                   ...   
1677      F      3.000000
1679      M      3.000000
1680      M      2.000000
1681      M      3.000000
1682      M      3.000000
Name: rating, Length: 3047, dtype: float64

In [21]:
#Define a function that identifies the gender of the user, extracts the average rating given to the movie by the particular gender and return the value as output.

#Set the index of the users dataframe  to the user_id
users = users.set_index('user_id')

#Gender-based collaborative filtering using mean ratings.
def cf_gender(user_id, movie_id):
    #Check if movie exists in the r_matrix (or train set)
    if movie_id in r_matrix:
        
        #Identify the gender of the user.
        gender = users.loc[user_id]['sex']
        
        #Check if the gender has rated the movie.
        if gender in gender_mean[movie_id]:
            
            #Compute the mean rating given by that gender to the movie.
            gender_rating = gender_mean[movie_id][gender]
        
        else:
            gender_rating = 3.0
    else:
        #Default to a rating of 3.0 in the abscence of any information.
        gender_rating = 3.0
        
    return gender_rating

In [22]:
score(cf_gender)

KeyError: 'Column not found: 16'