# Book Recommendation System (Using KNN Algorithm)


## Minor Project


BACHELOR OF TECHNOLOGY\
Computer Science and Engineering\
2019-2023

## Team
1. Konark Lohat
2. Gautam Jain
3. Jaskamal Singh

Dataset - http://www2.informatik.uni-freiburg.de/~cziegler/BX/

# Importing necessary packages

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Reading the datasets with their respective (and required) features

In [None]:
# Load the three Book-Crossing CSV files. Each file is ';'-separated and
# latin-1 encoded. Malformed rows are skipped with a warning instead of
# aborting the load: `on_bad_lines='warn'` is the replacement for
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed in
# pandas 2.0 (where it raises TypeError).

# Books dataset
books = pd.read_csv('./data/BX-Books.csv', sep=';', on_bad_lines='warn', encoding='latin-1')
books.columns = ['ISBN', 'bookTitle', 'bookAuthor', 'yearOfPublication', 'publisher', 'imageUrlS', 'imageUrlM', 'imageUrlL']

# Users dataset
users = pd.read_csv('./data/BX-Users.csv', sep=';', on_bad_lines='warn', encoding='latin-1')
users.columns = ['userID', 'Location', 'Age']

# Ratings dataset
ratings = pd.read_csv('./data/BX-Book-Ratings.csv', sep=';', on_bad_lines='warn', encoding='latin-1')
ratings.columns = ['userID', 'ISBN', 'bookRating']

In [None]:
# Print a short summary of every loaded table: a bold heading (ANSI escape
# codes) followed by the row count, the column count and the column names.
# The Users and Ratings headings start with a newline so the summaries are
# separated by a blank line.
dataset_summaries = (
    ("\033[1mBooks Dataset\033[0m", books),
    ("\n\033[1mUsers Dataset\033[0m", users),
    ("\n\033[1mRatings Dataset\033[0m", ratings),
)

for heading, frame in dataset_summaries:
    n_rows, n_cols = frame.shape
    print(heading)
    print("Rows - ", n_rows)
    print("Cols - ", n_cols)
    print("Cols are - ", list(frame.columns))

# Rating Distribution

In [None]:
# Bar chart of how many times each rating value occurs in `ratings`.
plt.rc('font', size=15)

rating_frequencies = ratings['bookRating'].value_counts(sort=True)
rating_frequencies.plot(kind='bar')

plt.title("Rating Distribution (Bar Graph)\n")
plt.xlabel('Rating')
plt.ylabel('Count')

# Save before show(): the figure is written to disk first, then displayed.
plt.savefig('./figures/rating-distribution.png', bbox_inches='tight')
plt.show()

# User's Age Distribution

In [None]:
# Histogram of user ages using explicit bin edges: decade-wide bins up to 50
# and a single wide bin covering 50-100.
age_bin_edges = [0, 10, 20, 30, 40, 50, 100]
users['Age'].hist(bins=age_bin_edges)

plt.title('Age Distribution\n')
plt.xlabel('Age')
plt.ylabel('Count')

plt.savefig('./figures/user-age-distribution.png', bbox_inches='tight')
plt.show()

# To ensure statistical significance, users with fewer than 200 ratings and books with fewer than 100 ratings won't be used by the model.

In [None]:
# User's ratings: keep only rows from users who submitted at least 200 ratings.
userRatingCount = ratings['userID'].value_counts()
ratings = ratings[ratings['userID'].isin(userRatingCount[userRatingCount >= 200].index)]

# Book's ratings: keep only books (identified by ISBN) that have at least 100
# ratings among the remaining rows. The counts must be taken per ISBN;
# counting the `bookRating` column instead would tally how often each rating
# *score* occurs and would not remove any rarely-rated book.
bookRatingCount = ratings['ISBN'].value_counts()
popularBooks = bookRatingCount[bookRatingCount >= 100].index
print(popularBooks)
ratings = ratings[ratings['ISBN'].isin(popularBooks)]

In [None]:
# TESTING THE EXCLUSION
# Re-draw the rating distribution from the filtered `ratings` frame.
# NOTE: the figure is saved to './figures/rating-distribution.png'; an
# existing file at that path is overwritten.
plt.rc('font', size=15)

filtered_rating_frequencies = ratings['bookRating'].value_counts(sort=True)
filtered_rating_frequencies.plot(kind='bar')

plt.title("Rating Distribution (Bar Graph)\n")
plt.xlabel('Rating')
plt.ylabel('Count')

plt.savefig('./figures/rating-distribution.png', bbox_inches='tight')
plt.show()

# Collaborative Filtering Using K-Nearest Neighbours (KNN)


KNN is a machine learning algorithm used to find clusters of similar users based on common book ratings and to make predictions using the average rating of the top-k nearest neighbours. To do this, we first arrange the ratings in a matrix with one row for each item (book) and one column for each user.

In [None]:
# Join the ratings with the book metadata on ISBN (rows are kept only where
# the ISBN appears in both frames), then discard the book attributes listed
# below so that only the rating columns and the title remain.
columns = ['yearOfPublication', 'publisher', 'bookAuthor', 'imageUrlS', 'imageUrlM', 'imageUrlL']  # Dropped columns
combine_book_rating = ratings.merge(books, on='ISBN').drop(columns=columns)
combine_book_rating.head()

#### Now we will group by book title and create a new column for the total rating count

In [None]:
# Discard rows whose bookTitle is missing.
combine_book_rating = combine_book_rating.dropna(subset=['bookTitle'])

# Number of non-null ratings recorded for each title, returned as a
# two-column frame: bookTitle, totalRatingCount.
ratings_per_title = combine_book_rating.groupby('bookTitle')['bookRating'].count()
book_rating_count = ratings_per_title.reset_index(name='totalRatingCount')

book_rating_count.head()

#### We combine the rating data with the total rating count data. This gives us exactly what we need to find out which books are popular and to filter out lesser-known books.

In [None]:
# Left-join the per-title totals onto every rating row, so each row also
# carries the totalRatingCount of its book.
rating_with_total_rating_count = pd.merge(
    combine_book_rating,
    book_rating_count,
    on='bookTitle',
    how='left',
)
rating_with_total_rating_count.head()

In [None]:
# A title counts as "popular" when it has at least this many ratings.
popularity_threshold = 50

# Keep only the rating rows that belong to popular titles.
is_popular = rating_with_total_rating_count['totalRatingCount'] >= popularity_threshold
rating_popular_book = rating_with_total_rating_count[is_popular]
rating_popular_book.head()

In [None]:
# Display the (rows, columns) shape of the popularity-filtered ratings frame.
rating_popular_book.shape

# Filter users of USA and Canada only

In [None]:
# Attach each rater's user record (Location, Age) to the popular-book ratings.
# A left join keeps every rating row even when no matching user record exists,
# in which case Location and Age are NaN for that row.
combined = rating_popular_book.merge(users, left_on='userID', right_on='userID', how='left')

# Keep only rows whose Location text contains 'usa' or 'canada'.
# `na=False` makes a missing Location count as "no match"; without it the mask
# would contain NaN and boolean indexing would raise ValueError.
# NOTE(review): this is a plain substring match, so any Location that merely
# contains the letters "usa" or "canada" is also kept - confirm this is
# acceptable for the data.
is_us_or_canada = combined['Location'].str.contains('usa|canada', na=False)
us_canada_user_rating = combined[is_us_or_canada]

# Age is not used by the recommender, so drop it.
us_canada_user_rating = us_canada_user_rating.drop('Age', axis=1)
us_canada_user_rating.head()

# Implementing KNN (Cosine Similarity)


We convert our table to a 2D matrix, and fill the missing values with zeros (since we will calculate distances between rating vectors). We then transform the values (ratings) of the matrix dataframe into a SciPy sparse matrix for more efficient calculations.


To find the nearest neighbours, we use the unsupervised NearestNeighbors learner from sklearn.neighbors. The algorithm we use to compute the nearest neighbours is "brute", and we specify "metric = cosine" so that the algorithm will calculate the cosine similarity between rating vectors. Finally, we fit the model.

In [None]:
from scipy.sparse import csr_matrix

# pivot() requires each (bookTitle, userID) pair to be unique, so keep only
# the first rating a user gave to a given title.
us_canada_user_rating = us_canada_user_rating.drop_duplicates(subset=['userID', 'bookTitle'])

# Title-by-user rating table: one row per bookTitle, one column per userID.
# Combinations with no rating are filled with 0.
ratings_by_title_and_user = us_canada_user_rating.pivot(
    index='bookTitle',
    columns='userID',
    values='bookRating',
)
us_canada_user_rating_pivot = ratings_by_title_and_user.fillna(0)

# Sparse (CSR) copy of the same values, used to fit the neighbour model.
us_canada_user_rating_matrix = csr_matrix(us_canada_user_rating_pivot.values)

In [None]:
from sklearn.neighbors import NearestNeighbors

# Neighbour search over the rows (books) of the sparse rating matrix, using
# brute-force search with the cosine metric.
knn_settings = {'metric': 'cosine', 'algorithm': 'brute'}
model_knn = NearestNeighbors(**knn_settings)
model_knn.fit(us_canada_user_rating_matrix)

In [None]:
# Pick a random row position of the pivot table (i.e. a random book) as query.
n_books = us_canada_user_rating_pivot.shape[0]
query_index = np.random.choice(n_books)

# kneighbors() expects a 2-D input, so the book's rating vector is reshaped
# into a single-row matrix. Six neighbours are requested.
# NOTE(review): presumably the closest hit is the query book itself, leaving
# five actual recommendations - confirm.
query_vector = us_canada_user_rating_pivot.iloc[query_index, :].values.reshape(1, -1)
distances, indices = model_knn.kneighbors(query_vector, n_neighbors=6)

In [None]:
# Display the pivot-table index label (book title) at the chosen query position.
us_canada_user_rating_pivot.index[query_index]

In [None]:
# Print the query title as a header, then every further neighbour with its
# rank, title and distance. Position 0 of the result is not listed as a
# recommendation; it only triggers the header line.
book_titles = us_canada_user_rating_pivot.index
neighbour_pairs = zip(indices.flatten(), distances.flatten())

for rank, (neighbour_position, neighbour_distance) in enumerate(neighbour_pairs):
    if rank == 0:
        print("Recommendations for {0}:\n".format(book_titles[query_index]))
        continue
    print("{0}: {1}, with distance of {2}".format(rank, book_titles[neighbour_position], neighbour_distance))