In [139]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

In [140]:
df_user=pd.read_csv('BX-Users.csv',encoding='latin-1')

In [141]:
df_book=pd.read_csv("BX-Books.csv",encoding='latin-1')

In [142]:
df=pd.read_csv("BX-Book-Ratings.csv",encoding='latin-1',nrows=10000)

In [143]:
df=pd.merge(df,df_book,on='isbn')

In [144]:
n_users = df.user_id.nunique()
n_books = df.isbn.nunique()

print('Num. of Users: '+ str(n_users))
print('Num of Books: '+str(n_books))

Num. of Users: 828
Num of Books: 8051


In [145]:
isbn_list=df.isbn.unique()
def get_isbn_numeric_id(isbn):
    itemindex=np.where(isbn_list==isbn)
    return itemindex[0][0]

In [146]:
userid_list=df.user_id.unique()
def get_user_id_numeric_id(user_id):
    itemindex=np.where(userid_list==user_id)
    return itemindex[0][0]

In [147]:
df['user_id_order']=df['user_id'].apply(get_user_id_numeric_id)

In [148]:
df['isbn_id']=df['isbn'].apply(get_isbn_numeric_id)

In [149]:
df.head()

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher,user_id_order,isbn_id
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,0,0
1,276726,155061224,5,Rites of Passage,Judith Rae,2001,Heinle,1,1
2,276727,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,2,2
3,278418,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,3,2
4,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,4,3


In [150]:
df.columns

Index(['user_id', 'isbn', 'rating', 'book_title', 'book_author',
       'year_of_publication', 'publisher', 'user_id_order', 'isbn_id'],
      dtype='object')

In [151]:
new_col=['user_id_order', 'isbn_id', 'rating', 'book_title', 'book_author','year_of_publication','publisher','isbn','user_id']
df=df.reindex(columns=new_col)

In [152]:
df.head()

Unnamed: 0,user_id_order,isbn_id,rating,book_title,book_author,year_of_publication,publisher,isbn,user_id
0,0,0,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,034545104X,276725
1,1,1,5,Rites of Passage,Judith Rae,2001,Heinle,155061224,276726
2,2,2,0,The Notebook,Nicholas Sparks,1996,Warner Books,446520802,276727
3,3,2,0,The Notebook,Nicholas Sparks,1996,Warner Books,446520802,278418
4,4,3,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,052165615X,276729


In [153]:
from sklearn.model_selection import train_test_split

In [154]:
train_data,test_data=train_test_split(df,test_size=0.3)

In [155]:
train_data_matix=np.zeros((n_users,n_books))

In [156]:
for line in train_data.itertuples():
    train_data_matix[line[1]-1,line[2]-1]=line[3]

In [157]:
test_data_matrix=np.zeros((n_users,n_books))

In [158]:
for line in test_data.itertuples():
    test_data_matrix[line[1]-1,line[2]-1]=line[3]

In [159]:
from sklearn.metrics.pairwise import pairwise_distances

In [160]:
user_similarity=pairwise_distances(train_data_matix,metric='cosine')
item_similarity=pairwise_distances(train_data_matix.T,metric='cosine')

In [161]:
item_similarity

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [162]:
def predict(ratings,similarity,type='user'):
    if type=='user':
        mean_user_rating=ratings.mean(axis=1)
        rating_diff=(ratings-mean_user_rating[:,np.newaxis])
        pred=mean_user_rating[:,np.newaxis]+similarity.dot(rating_diff)/np.array([np.abs(similarity).sum(axis=1)]).T
    elif type=='item':
        pred=ratings.dot(similarity)/np.array([np.abs(similarity).sum(axis=1)])
        
    return pred

In [163]:
item_prediction = predict(train_data_matix, item_similarity, type='item')
user_prediction = predict(train_data_matix, user_similarity, type='user')

In [164]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [165]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 7.744428889463506
Item-based CF RMSE: 7.743777737836296
