<a href="https://colab.research.google.com/github/AnnaK8090/CIND-820_Big-Data-Analytics-Project/blob/main/3_Collaborative_Filtering_ItemBased_%26_UserBased.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Collaborative Filtering: Item Based and User Based Models

In [1]:
# 1. Importing libraries:
import numpy as np 
import pandas as pd      

In [2]:
# 2. Loading csv file and saving it into a dataframe:
result1 = pd.read_csv('result1.csv', on_bad_lines='skip')

In [3]:
result1.shape

(11375, 4)

In [4]:
# 3. Descriptive statistics:
# Number of users
print('The dataset has', result1['customer_unique_id'].nunique(), 'unique users')
# Number of products
print('The ratings dataset has', result1['product_id'].nunique(), 'unique products')
# Number of product categories
print('The ratings dataset has', result1['product_category_name_english'].nunique(), 'unique product categories')
# Number of ratings
print('The ratings dataset has', result1['review_score'].nunique(), 'unique ratings:', sorted(result1['review_score'].unique()))

The dataset has 5167 unique users
The ratings dataset has 7448 unique products
The ratings dataset has 68 unique product categories
The ratings dataset has 5 unique ratings: [1, 2, 3, 4, 5]


In [5]:
# 4. User-Item matrix

matrix = result1.pivot_table(index='customer_unique_id', columns='product_id', values='review_score')
matrix.head()

product_id,0011c512eb256aa0dbbb544d8dffcf6e,001b72dfd63e9833e8c02742adf472e3,00210e41887c2a8ef9f791ebc780cc36,00250175f79f584c14ab5cecd80553cd,002959d7a0b0990fe2d69988affcbc80,0042f1a9a7e0edd1400c6cd0fda065f8,005030ef108f58b46b78116f754d8d38,0060b415594c5e1200324ef1a18493c4,007c63ae4b346920756b5adcad8095de,008cff0e5792219fae03e570f980b330,...,ff96895c6b1d31f34b2d82f86670fa85,ffaaddefb271481c66d4bd79844ecdae,ffb2e8c1ddc7c3e590d2bc4c91de53e1,ffbb3c00e9687ad738ace3977e821da5,ffbbf6b9097237a1122f17e7341a3fb2,ffbc83054b3741a8d67fc59d9cf9d42d,ffc0b406806006602c5853b00ab5f7fd,ffcfaba393e8ef71937c6e8421bc2868,ffd4bf4306745865e5692f69bd237893,ffe8083298f95571b4a66bfbc1c05524
customer_unique_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00053a61a98854899e70ed204dd4bafe,,,,,,,,,,,...,,,,,,,,,,
000de6019bb59f34c099a907c151d855,,,,,,,,,,,...,,,,,,,,,,
000fbf0473c10fc1ab6f8d2d286ce20c,,,,,,,,,,,...,,,,,,,,,,
001926cef41060fae572e2e7b30bd2a4,,,,,,,,,,,...,,,,,,,,,,
001928b561575b2821c92254a2327d06,,,,,,,,,,,...,,,,,,,,,,


In [6]:
type(matrix)

pandas.core.frame.DataFrame

In [7]:
# 5. Transforming matrix into array:
matrixA = np.squeeze(np.asarray(matrix))

In [8]:
type(matrixA)

numpy.ndarray

In [9]:
matrixA[np. isnan(matrixA)] = 0 

In [10]:
matrixA.shape[0]

5167

In [11]:
matrixA.shape[1]

7448

In [12]:
float(len(matrixA.nonzero()[0]))

11375.0

In [13]:
# 6. Checking user-item matrix sparsity - % of the user-item ratings have a value. 
sparsity = float(len(matrixA.nonzero()[0]))
sparsity /= (matrixA.shape[0] * matrixA.shape[1])
sparsity *= 100
print ('Sparsity: {:4.2f}%'.format(sparsity))

Sparsity: 0.03%


In [14]:
ratings=matrixA.copy()
ratings

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [15]:
 # 7. Splitting the data into training and test sets by removing 1 rating per user from the training set and placing them in the test set:
 def train_test_split(ratings):
    test = np.zeros(ratings.shape)
    train = ratings.copy()
    for user in range(ratings.shape[0]):
        test_ratings = np.random.choice(ratings[user, :].nonzero()[0], 
                                        size=1, 
                                        replace=False)
        train[user, test_ratings] = 0.
        test[user, test_ratings] = ratings[user, test_ratings]
        
    # Test and training are truly disjoint
    assert(np.all((train * test) == 0)) 
    return train, test

In [16]:
train, test = train_test_split(ratings)

In [17]:
# 8. Collaborative filtering for user- and item-based scenario needs a similarity matrix. 
# For user-based collaborative filtering, the user-similarity matrix will consist of some distance metric that measures the similarity between any two pairs of users. 
# Likewise, the item-similarity matrix will measure the similarity between any two pairs of items.
# A common distance metric is cosine similarity. The metric is measured as the cosine of the angle between the two users’ or two items’ vectors. 

def fast_similarity(ratings, kind='user', epsilon=1e-9):
    # epsilon -> small number for handling dived-by-zero errors
    if kind == 'user':
        sim = ratings.dot(ratings.T) + epsilon
    elif kind == 'item':
        sim = ratings.T.dot(ratings) + epsilon
    norms = np.array([np.sqrt(np.diagonal(sim))])
    return (sim / norms / norms.T)

In [18]:
# 9. Similarity matrices:
item_similarity = fast_similarity(train, kind='item')
print (item_similarity[:4, :4])

[[1.00000000e+00 6.32455532e-06 1.00000000e+00 6.32455532e-06]
 [6.32455532e-06 1.00000000e+00 6.32455532e-06 4.00000000e-11]
 [1.00000000e+00 6.32455532e-06 1.00000000e+00 6.32455532e-06]
 [6.32455532e-06 4.00000000e-11 6.32455532e-06 1.00000000e+00]]


In [19]:
user_similarity = fast_similarity(train, kind='user')
print (user_similarity[:4, :4])

[[1.00e+00 5.00e-10 2.00e-10 2.50e-10]
 [5.00e-10 1.00e+00 1.00e-10 1.25e-10]
 [2.00e-10 1.00e-10 1.00e+00 5.00e-11]
 [2.50e-10 1.25e-10 5.00e-11 1.00e+00]]


In [20]:
# 10. We can now predict the ratings that were not included with the data. 
# Using these predictions, we can then compare them with the test data to attempt to validate the quality of our recommender model.
def predict_fast_simple(ratings, similarity, kind='user'):
    if kind == 'user':
        return similarity.dot(ratings) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif kind == 'item':
        return ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])

In [21]:
%timeit predict_fast_simple(train, user_similarity, kind='user')

13.2 s ± 176 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [22]:
# 11. We’ll use MSE, RMSE and MAE as our validation metrics for accuracy  
# Comparing user- and item-based collaborative filtering, it looks like item-based collaborative filtering gives us a slightly better result.

from sklearn.metrics import mean_squared_error

def get_mse(pred, actual):
    # Ignore nonzero terms.
    pred = pred[actual.nonzero()].flatten()
    actual = actual[actual.nonzero()].flatten()
    return mean_squared_error(pred, actual)

In [23]:
import math
from sklearn.metrics import mean_absolute_error

In [24]:
# 12. Predictions and MSE metric calculation: 
item_prediction = predict_fast_simple(train, item_similarity, kind='item')
user_prediction = predict_fast_simple(train, user_similarity, kind='user')

print ('User-based CF MSE: ' + str(get_mse(user_prediction, test)))
print ('Item-based CF MSE: ' + str(get_mse(item_prediction, test)))

print ('User-based CF RMSE: ' + str(math.sqrt(get_mse(user_prediction, test))))
print ('Item-based CF RMSE: ' + str(math.sqrt(get_mse(item_prediction, test))))

print ('User-based CF MAE: ' + str(mean_absolute_error(user_prediction, test)))
print ('Item-based CF MAE: ' + str(mean_absolute_error(item_prediction, test)))

User-based CF MSE: 16.385412978361867
Item-based CF MSE: 16.375804039143066
User-based CF RMSE: 4.047889941483324
Item-based CF RMSE: 4.04670286024846
User-based CF MAE: 0.0011074870479455608
Item-based CF MAE: 0.0010987917725354354


In [25]:
# 13. Top-kk Collaborative Filtering - an attempt to improve our prediction MSE by only considering the top kk users who are most similar to the input user (or, similarly, the top kk items).
def predict_topk(ratings, similarity, kind='user', k=20):
    pred = np.zeros(ratings.shape)
    if kind == 'user':
        for i in range(ratings.shape[0]):
            top_k_users = [np.argsort(similarity[:,i])[:-k-1:-1]]
            for j in range(ratings.shape[1]):
                pred[i, j] = similarity[i, :][top_k_users].dot(ratings[:, j][top_k_users]) 
                pred[i, j] /= np.sum(np.abs(similarity[i, :][top_k_users]))
    if kind == 'item':
        for j in range(ratings.shape[1]):
            top_k_items = [np.argsort(similarity[:,j])[:-k-1:-1]]
            for i in range(ratings.shape[0]):
                pred[i, j] = similarity[j, :][top_k_items].dot(ratings[i, :][top_k_items].T) 
                pred[i, j] /= np.sum(np.abs(similarity[j, :][top_k_items]))        
    
    return pred

In [26]:
# Top-20 neighbors MSE calculation: 
pred = predict_topk(train, user_similarity, kind='user', k=20)
print ('Top-k User-based CF MSE: ' + str(get_mse(pred, test)))

pred = predict_topk(train, item_similarity, kind='item', k=20)
print ('Top-k Item-based CF MSE: ' + str(get_mse(pred, test)))

  pred[i, j] = similarity[i, :][top_k_users].dot(ratings[:, j][top_k_users])
  pred[i, j] /= np.sum(np.abs(similarity[i, :][top_k_users]))


Top-k User-based CF MSE: 16.40787038451949


  pred[i, j] = similarity[j, :][top_k_items].dot(ratings[i, :][top_k_items].T)
  pred[i, j] /= np.sum(np.abs(similarity[j, :][top_k_items]))


Top-k Item-based CF MSE: 16.374597921649123


In [28]:
# Top-10 neighbors MSE calculation: 
pred = predict_topk(train, user_similarity, kind='user', k=10)
print ('Top-k User-based CF MSE: ' + str(get_mse(pred, test)))

pred = predict_topk(train, item_similarity, kind='item', k=10)
print ('Top-k Item-based CF MSE: ' + str(get_mse(pred, test)))

  pred[i, j] = similarity[i, :][top_k_users].dot(ratings[:, j][top_k_users])
  pred[i, j] /= np.sum(np.abs(similarity[i, :][top_k_users]))


Top-k User-based CF MSE: 16.41465778508225


  pred[i, j] = similarity[j, :][top_k_items].dot(ratings[i, :][top_k_items].T)
  pred[i, j] /= np.sum(np.abs(similarity[j, :][top_k_items]))


Top-k Item-based CF MSE: 16.374587293738408


In [29]:
# Top-30 neighbors MSE calculation: 
pred = predict_topk(train, user_similarity, kind='user', k=30)
print ('Top-k User-based CF MSE: ' + str(get_mse(pred, test)))

pred = predict_topk(train, item_similarity, kind='item', k=30)
print ('Top-k Item-based CF MSE: ' + str(get_mse(pred, test)))

  pred[i, j] = similarity[i, :][top_k_users].dot(ratings[:, j][top_k_users])
  pred[i, j] /= np.sum(np.abs(similarity[i, :][top_k_users]))


Top-k User-based CF MSE: 16.39554761549413


  pred[i, j] = similarity[j, :][top_k_items].dot(ratings[i, :][top_k_items].T)
  pred[i, j] /= np.sum(np.abs(similarity[j, :][top_k_items]))


Top-k Item-based CF MSE: 16.374602362110377


In [30]:
# Top-40 neighbors MSE calculation: 
pred = predict_topk(train, user_similarity, kind='user', k=40)
print ('Top-k User-based CF MSE: ' + str(get_mse(pred, test)))

pred = predict_topk(train, item_similarity, kind='item', k=40)
print ('Top-k Item-based CF MSE: ' + str(get_mse(pred, test)))

  pred[i, j] = similarity[i, :][top_k_users].dot(ratings[:, j][top_k_users])
  pred[i, j] /= np.sum(np.abs(similarity[i, :][top_k_users]))


Top-k User-based CF MSE: 16.38649518530183


  pred[i, j] = similarity[j, :][top_k_items].dot(ratings[i, :][top_k_items].T)
  pred[i, j] /= np.sum(np.abs(similarity[j, :][top_k_items]))


Top-k Item-based CF MSE: 16.37460680148631


In [31]:
# Top-60 neighbors MSE calculation: 
pred = predict_topk(train, user_similarity, kind='user', k=60)
print ('Top-k User-based CF MSE: ' + str(get_mse(pred, test)))

pred = predict_topk(train, item_similarity, kind='item', k=60)
print ('Top-k Item-based CF MSE: ' + str(get_mse(pred, test)))

  pred[i, j] = similarity[i, :][top_k_users].dot(ratings[:, j][top_k_users])
  pred[i, j] /= np.sum(np.abs(similarity[i, :][top_k_users]))


Top-k User-based CF MSE: 16.385517695330236


  pred[i, j] = similarity[j, :][top_k_items].dot(ratings[i, :][top_k_items].T)
  pred[i, j] /= np.sum(np.abs(similarity[j, :][top_k_items]))


Top-k Item-based CF MSE: 16.374615676985105


In [32]:
# Top-100 neighbors MSE calculation: 
pred = predict_topk(train, user_similarity, kind='user', k=100)
print ('Top-k User-based CF MSE: ' + str(get_mse(pred, test)))

pred = predict_topk(train, item_similarity, kind='item', k=100)
print ('Top-k Item-based CF MSE: ' + str(get_mse(pred, test)))

  pred[i, j] = similarity[i, :][top_k_users].dot(ratings[:, j][top_k_users])
  pred[i, j] /= np.sum(np.abs(similarity[i, :][top_k_users]))


Top-k User-based CF MSE: 16.3854129697592


  pred[i, j] = similarity[j, :][top_k_items].dot(ratings[i, :][top_k_items].T)
  pred[i, j] /= np.sum(np.abs(similarity[j, :][top_k_items]))


Top-k Item-based CF MSE: 16.374633414990793
