Importing required libraries

In [1]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings('ignore')

# Read the books dataset

In [2]:
users_data=pd.read_csv(r'BX-Users.csv',encoding='Latin-1')

In [3]:
users_data.head()

Unnamed: 0,user_id,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


Checking shape of dataset

In [4]:
users_data.shape

(278859, 3)

Check for null values

In [5]:
users_data.isnull().sum()

user_id          0
Location         1
Age         110763
dtype: int64

Replacing null values with zero in column age to avoid lose of valuable data

In [6]:
users_data['Age'].fillna(0,inplace=True)

In [7]:
users_data.isnull().sum()

user_id     0
Location    1
Age         0
dtype: int64

# Clean up NaN values
Dropping null values

In [8]:
users_data1=users_data.dropna()

In [9]:
users_data1.isnull().sum()

user_id     0
Location    0
Age         0
dtype: int64

Reading books data

In [10]:
book_data=pd.read_csv(r"BX-Books.csv",encoding="latin-1")

In [11]:
book_data.head()

Unnamed: 0,isbn,book_title,book_author,year_of_publication,publisher
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company


In [12]:
book_data.isnull().any()

isbn                   False
book_title             False
book_author             True
year_of_publication    False
publisher               True
dtype: bool

In [13]:
book_data.isnull().sum()

isbn                   0
book_title             0
book_author            1
year_of_publication    0
publisher              2
dtype: int64

In [14]:
book_data1=book_data.dropna()

In [15]:
book_data1.isnull().any()

isbn                   False
book_title             False
book_author            False
year_of_publication    False
publisher              False
dtype: bool

# Read the data where ratings are given by users

(reading only first 10000 rows to avoid memory issue)

In [16]:
book_ratings=pd.read_csv(r"BX-Book-Ratings.csv",encoding='latin-1', nrows=10000)

In [17]:
book_ratings.head()

Unnamed: 0,user_id,isbn,rating
0,276725,034545104X,0
1,276726,155061224,5
2,276727,446520802,0
3,276729,052165615X,3
4,276729,521795028,6


In [18]:
book_ratings.isnull().any()

user_id    False
isbn       False
rating     False
dtype: bool

In [19]:
book_ratings.describe()

Unnamed: 0,user_id,rating
count,10000.0,10000.0
mean,265844.3796,1.9747
std,56937.189618,3.424884
min,2.0,0.0
25%,277478.0,0.0
50%,278418.0,0.0
75%,278418.0,4.0
max,278854.0,10.0


Merging the dataframes

In [20]:
Dataframe=pd.merge(book_ratings,book_data1, on='isbn')

In [21]:
Dataframe.head()

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,276726,155061224,5,Rites of Passage,Judith Rae,2001,Heinle
2,276727,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books
3,278418,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books
4,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press


# Take a quick look at the number of unique users and books

In [22]:
n_users=Dataframe.user_id.nunique()
n_books=Dataframe.isbn.nunique()

In [23]:
print("Number of users:",str(n_users))
print("Number of books:",str(n_books))

Number of users: 828
Number of books: 8051


# Convert ISBN variables to numeric numbers in the correct order

In [24]:
isbnLIST=Dataframe.isbn.unique()

In [25]:
isbnLIST

array(['034545104X', '155061224', '446520802', ..., '425098834',
       '425163407', '425164403'], dtype=object)

In [26]:
print('Length of isbn list', len(isbnLIST))

Length of isbn list 8051


In [27]:
def get_isbn_numeric_id(isbn):
    item_index=np.where(isbnLIST==isbn)
    return item_index[0][0]

# Convert the user_id variable to numeric numbers in the correct order

In [28]:
userid_list=Dataframe.user_id.unique()

In [29]:
userid_list

array([276725, 276726, 276727, 278418, 276729, 276733, 276744, 276746,
       277427, 278026, 276747, 278843, 276748, 276751, 276754, 276964,
       276755,     99, 276762, 276772, 276774, 276780, 276786, 276788,
       278356, 276796, 277195,    232, 276798, 277375, 276800, 278100,
       276804, 276808, 276811, 276904, 278314, 276813, 276814, 276817,
       276820, 276822, 278554, 276828, 277681, 276830, 276832, 277143,
       276833, 276835, 276837, 276838, 276840, 278633, 276847, 276861,
       276848, 276936, 276850, 277639, 276852, 276853, 276854, 276856,
       276859, 276862, 276866, 278778, 278249, 276869, 278251, 276872,
       276875, 277712, 276878, 276884, 276887, 276888, 276889, 277523,
       277710, 276890, 276896, 278194, 276905, 276911, 276912, 276915,
       276916, 276925, 277623, 277923, 277413, 277439, 277752, 277965,
          243, 277589, 277648, 277689, 278137, 278221, 278390,     67,
       276939, 276954, 277042, 277157, 277378, 277478, 277530, 277932,
      

In [30]:
print("length of user-id:",len(userid_list))

length of user-id: 828


In [31]:
def get_userid_numeric(user_id):
    userid_index=np.where(userid_list==user_id)
    return userid_index[0][0]

# Convert both user_id and ISBN to the ordered list, i.e., from 0...n-1

In [32]:
Dataframe['user_id_ordered']=Dataframe['user_id'].apply(get_userid_numeric)

In [33]:
Dataframe['user_id_ordered']

0        0
1        1
2        2
3        3
4        4
        ..
8696    96
8697    96
8698    96
8699    96
8700    96
Name: user_id_ordered, Length: 8701, dtype: int64

In [34]:
Dataframe['isbn_ordered']=Dataframe['isbn'].apply(get_isbn_numeric_id)

In [35]:
Dataframe['isbn_ordered']

0          0
1          1
2          2
3          2
4          3
        ... 
8696    8046
8697    8047
8698    8048
8699    8049
8700    8050
Name: isbn_ordered, Length: 8701, dtype: int64

In [36]:
Dataframe.head()

Unnamed: 0,user_id,isbn,rating,book_title,book_author,year_of_publication,publisher,user_id_ordered,isbn_ordered
0,276725,034545104X,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books,0,0
1,276726,155061224,5,Rites of Passage,Judith Rae,2001,Heinle,1,1
2,276727,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,2,2
3,278418,446520802,0,The Notebook,Nicholas Sparks,1996,Warner Books,3,2
4,276729,052165615X,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press,4,3


# Re-index the columns to build a matrix

In [37]:
new_columns=['user_id_ordered','isbn_ordered','rating','book_title','book_author','year_of_publication','publisher']
Dataframe=Dataframe.reindex(columns=new_columns)
Dataframe

Unnamed: 0,user_id_ordered,isbn_ordered,rating,book_title,book_author,year_of_publication,publisher
0,0,0,0,Flesh Tones: A Novel,M. J. Rose,2002,Ballantine Books
1,1,1,5,Rites of Passage,Judith Rae,2001,Heinle
2,2,2,0,The Notebook,Nicholas Sparks,1996,Warner Books
3,3,2,0,The Notebook,Nicholas Sparks,1996,Warner Books
4,4,3,3,Help!: Level 1,Philip Prowse,1999,Cambridge University Press
...,...,...,...,...,...,...,...
8696,96,8046,7,A Map of the World,Jane Hamilton,1999,Anchor Books/Doubleday
8697,96,8047,0,The Accidental Tourist,Anne Tyler,1994,Berkley Publishing Group
8698,96,8048,0,If Morning Ever Comes,Anne Tyler,1983,Berkley Publishing Group
8699,96,8049,9,Unnatural Exposure,Patricia Daniels Cornwell,1998,Berkley Publishing Group


# Split your data into two sets (training and testing)

In [38]:
from sklearn.model_selection import train_test_split
train_data,test_data=train_test_split(Dataframe,test_size=0.3)

creating two user-book matrices

In [39]:
train_data_matrix=np.zeros([n_users,n_books])
for line in train_data.itertuples():
    train_data_matrix[line[1]-1,line[2]-1]=line[3]

In [40]:
test_data_matrix=np.zeros([n_users,n_books])
for line in test_data.itertuples():
    test_data_matrix[line[1]-1,line[2]-1]=line[3]

using pairwise distances to check cosine similarity

In [41]:
from sklearn.metrics.pairwise import pairwise_distances
user_similarity=pairwise_distances(train_data_matrix,metric='cosine')
item_similarity=pairwise_distances(train_data_matrix.T,metric='cosine')

In [42]:
user_similarity

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

In [43]:
item_similarity

array([[0., 1., 1., ..., 1., 1., 1.],
       [1., 0., 1., ..., 1., 1., 1.],
       [1., 1., 0., ..., 1., 1., 1.],
       ...,
       [1., 1., 1., ..., 0., 1., 1.],
       [1., 1., 1., ..., 1., 0., 1.],
       [1., 1., 1., ..., 1., 1., 0.]])

# Making predictions based on user and item variables

In [44]:
def predict(ratings, similarity, type='user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis=1)
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis]) 
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return pred

In [45]:
item_prediction=predict(test_data_matrix,item_similarity,type='item')
user_prediction=predict(test_data_matrix,user_similarity,type='user')

In [46]:
item_prediction

array([[0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.02322981, 0.02322981, 0.02322981, ..., 0.02324545, 0.02322981,
        0.02322981],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [47]:
user_prediction

array([[-0.00073714, -0.00073714,  0.00289043, ..., -0.00073714,
        -0.00073714, -0.00073714],
       [-0.00073714, -0.00073714,  0.00289043, ..., -0.00073714,
        -0.00073714, -0.00073714],
       [ 0.02251778,  0.02251778,  0.02614585, ...,  0.02251778,
         0.02251778,  0.02251778],
       ...,
       [-0.00073714, -0.00073714,  0.00289043, ..., -0.00073714,
        -0.00073714, -0.00073714],
       [-0.00073714, -0.00073714,  0.00289043, ..., -0.00073714,
        -0.00073714, -0.00073714],
       [-0.00073714, -0.00073714,  0.00289043, ..., -0.00073714,
        -0.00073714, -0.00073714]])

# Use RMSE to evaluate the predictions

In [48]:
from sklearn.metrics import mean_squared_error
from math import sqrt
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [49]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 7.678649622020115
Item-based CF RMSE: 7.679310516921285


BOTH THE APPROACH YEILDS ALMOST SAME RESULT