In [1]:
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

In [2]:
class CollaborativeFiltering(object):
    def __init__(self, data, k, dist_func = cosine_similarity, uuCF = 1):
        self.uuCF = uuCF
        # data: a 2D numpy array, each row is a triple (u, i, r)
        self.data = data
        # k: number of neighbors
        self.k = k
        # distance function
        self.dist_func = dist_func
        self.ybar_data = None

        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = int(np.max(self.data[:, 0])) + 1 
        self.n_items = int(np.max(self.data[:, 1])) + 1

    def normalize(self):
        # Normalize data: subtract mean rating of each user
        users = self.data[:, 0]
        self.ybar_data = self.data.copy()
        # mu: mean rating of each user
        self.mu = np.zeros((self.n_users,))

        for n in range(self.n_users):
            # row indices of ratings made by user n
            ids = np.where(users == n)[0].astype(np.int32)
            # indices of all items rated by user n
            item_ids = self.data[ids, 1]
            # ratings made by user n
            ratings = self.data[ids, 2]
            # calculate mean rating of user n
            m = np.mean(ratings)
            # if m is nan (user n has not rated any item)
            if np.isnan(m):
                m = 0
            # normalize the ratings of user n
            self.ybar_data[ids, 2] = ratings - m
            self.mu[n] = m
        
        self.Ybar = sparse.coo_matrix((self.ybar_data[:, 2],
            (self.ybar_data[:, 1], self.ybar_data[:, 0])), (self.n_items, self.n_users))
        self.Ybar = self.Ybar.tocsr()
    
    def similarity(self):
        self.S = self.dist_func(self.Ybar.T, self.Ybar.T)

    def refresh(self):
        """
        Normalize data and calculate similarity matrix again (after
        some few ratings added)
        """
        self.normalize()
        self.similarity() 
        
    def fit(self):
        self.refresh()

    def __pred(self, u, i, normalized = 1):
        """ 
        predict the rating of user u for item i (normalized)
        if you need the un
        """
        # Step 1: find all users who rated i
        ids = np.where(self.data[:, 1] == i)[0].astype(np.int32)
        # Step 2: 
        users_rated_i = (self.data[ids, 0]).astype(np.int32)
        # Step 3: find similarity btw the current user and others 
        # who already rated i
        sim = self.S[u, users_rated_i]
        # Step 4: find the k most similarity users
        a = np.argsort(sim)[-self.k:] 
        # and the corresponding similarity levels
        nearest_s = sim[a]
        # How did each of 'near' users rated item i
        r = self.Ybar[i, users_rated_i[a]]
        if normalized:
            # add a small number, for instance, 1e-8, to avoid dividing by 0
            return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8)

        return (r*nearest_s)[0]/(np.abs(nearest_s).sum() + 1e-8) + self.mu[u]
    
    
    def pred(self, u, i, normalized = 1):
        """ 
        predict the rating of user u for item i (normalized)
        if you need the un
        """
        if self.uuCF: return self.__pred(u, i, normalized)
        return self.__pred(i, u, normalized)
    
    def recommend(self, u, normalized = 1):
        """
            Determine all items should be recommended for user u
            The decision is made based on all i such that:
            self.pred(u, i) > 0. Suppose we are considering items which
            have not been rated by u yet.
        """
        ids = np.where(self.data[:, 0] == u)[0]
        items_rated_by_u = self.data[ids, 1].tolist()
        recommended_items = []
        for i in range(self.n_items):
            if i not in items_rated_by_u:
                rating = self.pred(u, i, normalized)
                if rating > 0: 
                    recommended_items.append(i)
        
        return recommended_items
    
    def print_recommendation(self):
        """
        print all items which should be recommended for each user 
        """
        print ('Recommendation: ')
        for u in range(self.n_users):
            recommended_items = self.recommend(u)
            if self.uuCF:
                print ('Recommend item(s):', recommended_items, 'to user', u)
            else: 
                print ('Recommend item', u, 'to user(s) : ', recommended_items)

In [3]:
r_cols = ['user_id', 'item_id', 'rating']
ratings = pd.read_csv('data/ex.dat', sep = ' ', names = r_cols, encoding='latin-1')
data = np.array(ratings)

rs = CollaborativeFiltering(data, k = 2, uuCF = 1)
rs.fit()

rs.print_recommendation()

Recommendation: 
Recommend item(s): [2] to user 0
Recommend item(s): [1] to user 1
Recommend item(s): [] to user 2
Recommend item(s): [4] to user 3
Recommend item(s): [4] to user 4
Recommend item(s): [0, 3, 4] to user 5
Recommend item(s): [1] to user 6


In [4]:
rating = np.array(ratings)
users = rating[:, 0]
ids = np.where(users == 2)

---

In [6]:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']

ratings_base = pd.read_csv('data/ml-100k/ub.base', sep='\t', names=r_cols)
ratings_test = pd.read_csv('data/ml-100k/ub.test', sep='\t', names=r_cols)

rate_train = np.array(ratings_base)
rate_test = np.array(ratings_test)

# indices start from 0
rate_train[:, :2] -= 1
rate_test[:, :2] -= 1

In [7]:
%%time
rs = CollaborativeFiltering(rate_train, k = 30, uuCF = 1)
rs.fit()

n_tests = rate_test.shape[0]
SE = 0 # squared error
for n in range(n_tests):
    pred = rs.pred(rate_test[n, 0], rate_test[n, 1], normalized = 0)
    SE += (pred - rate_test[n, 2])**2 

RMSE = np.sqrt(SE/n_tests)
print('User-user CF, RMSE =', RMSE)

User-user CF, RMSE = 0.9951816576105769
CPU times: user 2.63 s, sys: 38.9 ms, total: 2.67 s
Wall time: 2.66 s


In [8]:
# rs = CollaborativeFiltering(rate_train, k = 30, uuCF = 0)
# rs.fit()

# n_tests = rate_test.shape[0]
# SE = 0 # squared error
# for n in range(n_tests):
#     pred = rs.pred(rate_test[n, 0], rate_test[n, 1], normalized = 0)
#     SE += (pred - rate_test[n, 2])**2 

# RMSE = np.sqrt(SE/n_tests)
# print('Item-item CF, RMSE =', RMSE)