# Recommender System baseline version
## - Based on Collaborative Filter

In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn import model_selection
from sklearn.metrics import mean_absolute_error, mean_squared_error

### Load ratings.csv

In [14]:
ratings_df = pd.read_csv('../ml-latest-small/ratings.csv')
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100004 entries, 0 to 100003
Data columns (total 4 columns):
userId       100004 non-null int64
movieId      100004 non-null int64
rating       100004 non-null float64
timestamp    100004 non-null int64
dtypes: float64(1), int64(3)
memory usage: 3.1 MB


### Get train&test rating matrix

In [15]:
train, test = model_selection.train_test_split(ratings_df, test_size=0.25)
rating_matrix_train = np.zeros((ratings_df.userId.unique().shape[0], max(ratings_df.movieId)))
rating_matrix_test = np.zeros(rating_matrix_train.shape)

for row in train.itertuples():
    rating_matrix_train[row[1]-1, row[2]-1] = row[3]
for row in test.itertuples():
    rating_matrix_test[row[1]-1, row[2]-1] = row[3]
print 'There are {0: d} users, {1: d} movies.'.format(rating_matrix_train.shape[0], rating_matrix_train.shape[1])

There are  671 users,  163949 movies.


### Collaborative Filter Process

In [16]:
class CF(object):
    def get_similarity(self, rating_matrix, based_on='user'):
        if based_on == 'user':
            similarity = rating_matrix.dot(rating_matrix.T) + 1e-9
            similarity /= np.array([np.diagonal(similarity)])
        elif based_on == 'item':
            print 'Lack of memory...'
#             similarity = rating_matrix.T.dot(rating_matrix) + 1e-9
#             similarity /= np.array([np.diagonal(similarity)])
        return similarity
    
    def get_prediction(self, rating_matrix, similarity, based_on='user'):
        if based_on == 'user':
            rating_mean = rating_matrix.sum(axis=1) / (rating_matrix>0).sum(axis=1)
            rating_diff = rating_matrix - rating_mean[:, np.newaxis]
            prediction = rating_mean[:, np.newaxis] + similarity.dot(rating_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
        elif based_on == 'item':
            print 'Lack of memory...'
#             prediction = rating_matrix.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
        prediction[prediction < 0] = 0
        prediction[prediction > 5] = 5
        return prediction
    
    def get_prediction_topk(self, rating_matrix, similarity, based_on='user', k=5):
        prediction = np.zeros(rating_matrix.shape)
        if based_on == 'user':
            rating_mean = rating_matrix.sum(axis=1) / (rating_matrix>0).sum(axis=1)
            for u in xrange(rating_matrix.shape[0]):
                topk_users_index = [np.argsort(similarity[u,:])[:-k-1:-1]]
                topk_similarity = similarity[u, :][topk_users_index]
                topk_similarity = topk_similarity[np.newaxis, :]
                rating_diff = (rating_matrix[topk_users_index, :] - rating_mean[topk_users_index, np.newaxis])[0]
                prediction[u, :] = rating_mean[u] + topk_similarity.dot(rating_diff) / np.abs(topk_similarity).sum(axis=1)
        
        elif based_on == 'item':
            print 'Lack of memory...'
#             for i in xrange(rating_matrix.shape[1]):
#                 topk_movies_index = [np.argsort(similarity[:, i])[:-k-1:-1]]
#                 topk_similarity = similarity[:, i][topk_movies_index]
#                 topk_similarity = topk_similarity[:, np.newaxis]
#                 prediction[:, i] = rating_matrix[:, topk_movies_index].dot(topk_similarity) / np.abs(topk_similarity).sum(axis=0)
        prediction[prediction < 0] = 0
        prediction[prediction > 5] = 5
        return prediction
    
    def get_evaluation(self, rating_matrix, prediction, type='mae'):
        real = rating_matrix[rating_matrix.nonzero()].flatten()
        prediction = prediction[rating_matrix.nonzero()].flatten()
        if type == 'mae':
            return mean_absolute_error(real, prediction)
        elif type == 'mse':
            return mean_squared_error(real, prediction)
            

### Basic CF v.s. TopK CF(based on users)

In [17]:
user_cf = CF()
similarity = user_cf.get_similarity(rating_matrix_train, based_on='user')
prediction_base = user_cf.get_prediction(rating_matrix_test, similarity, based_on='user')
prediction_topk = user_cf.get_prediction_topk(rating_matrix_test, similarity, based_on='user', k=5)

mae_base = user_cf.get_evaluation(rating_matrix_test, prediction_base, type='mae')
mae_topk = user_cf.get_evaluation(rating_matrix_test, prediction_topk, type='mae')

print 'User-based CF baseVer MAE: {0: .2f}'.format(mae_base)
print 'User-based CF topkVer MAE: {0: .2f}'.format(mae_topk)

User-based CF baseVer MAE:  3.32
User-based CF topkVer MAE:  2.23


### Choose K 

In [21]:
k_set = [5, 10, 25, 50, 100]
for k in k_set:
    prediction_topk = user_cf.get_prediction_topk(rating_matrix_test, similarity, based_on='user', k=k)
    mae_topk = user_cf.get_evaluation(rating_matrix_test, prediction_topk, type='mae')
    print 'Top{0: d}-Users-based CF MAE: {1: .2f}'.format(k, mae_topk)

Top 5-Users-based CF MAE:  2.23
Top 10-Users-based CF MAE:  2.70
Top 25-Users-based CF MAE:  3.04
Top 50-Users-based CF MAE:  3.18
Top 100-Users-based CF MAE:  3.25
