# Getting The Dataset

In [1]:
import numpy as np
import pandas as pd

In [2]:
books = pd.read_csv('books.csv', sep=',') #books in the dataset
ratings = pd.read_csv('ratings.csv', sep =',') #users and their ratings for books read
ratings.tail()

Unnamed: 0,user_id,book_id,rating
5976474,49925,510,5
5976475,49925,528,4
5976476,49925,722,4
5976477,49925,949,5
5976478,49925,1023,4


In [3]:
num_users = ratings.user_id.unique().shape[0]
num_books = books.book_id.unique().shape[0]
print ('Number of users in the set:' + str(num_users) + ' | Number of books in the set:' + str(num_books))

Number of users in the set:53424 | Number of books in the set:10000


# Split Data into Testing and Training

In [4]:
from sklearn import cross_validation as cv



In [19]:
train_data, test_data = cv.train_test_split(ratings, test_size = 0.25)

In [20]:
#Creating user-item matrices, one to train the model and one to test the model

#training data as a matrix
train_data_matrix = np.zeros((num_users, num_books))
for line in train_data.itertuples():
    train_data_matrix[line[1]-1, line[2]-1] = line[3]
#print(train_data_matrix.resize(5000, 25000))
#train_mag = np.sqrt(((train_data_matrix)*(train_data_matrix)).sum(axis=1))
    
#testing data as a matrix
test_data_matrix = np.zeros((num_users, num_books))
for line in test_data.itertuples():
    test_data_matrix[line[1]-1, line[2]-1] = line[3]

#test_mag = np.sqrt(((test_data_matrix)*(test_data_matrix)).sum(axis=1))


# Calculating Cosine Similarity

In [38]:
from sklearn.metrics.pairwise import pairwise_distances

In [52]:
user_similarity = pairwise_distances(train_data_matrix[:25000], metric = 'cosine')
#book_similarity = pairwise_distances(train_data_matrix.T[:3000], metric = 'cosine')

# Let's Make Predictions

In [47]:
def predict(ratings, similarity, type = 'user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis = 1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff)/np.array([np.abs(similarity).sum(axis=1)]).T
    
    #elif type == 'book':
        #pred = ratings.dot(similarity)/np.array([np.abs(similarity).sum(axis=1)])
    
    return pred

In [48]:
#item_prediction = predict(train_data_matrix[:3000], book_similarity, type='book')
user_prediction = predict(train_data_matrix[:25000], user_similarity, type ='user')

# Evaluate the Difference

In [49]:
from sklearn.metrics import mean_squared_error
from math import sqrt

In [50]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten()
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [51]:
print ('User-based CF RMSE: ' + str(rmse(user_prediction, train_data_matrix[:25000])))

User-based CF RMSE: 3.665727556815496
