In [23]:
# import the necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix

In [2]:
# reading csv files into dataframes
user_data = pd.read_csv('data/BX-CSV/BX-Users.csv', delimiter=';', encoding='latin-1')
book_data = pd.read_csv('data/BX-CSV/BX-Books.csv', delimiter=';', encoding='latin-1', on_bad_lines='skip',low_memory=False)
rating_data = pd.read_csv('data/BX-CSV/BX-Book-Ratings.csv', delimiter=';', encoding='latin-1')

## Preprocess the data

In [3]:
# creating a new feature 'country' after spliting values in 'location' and getting a singular value
user_data['Country'] = user_data['Location'].map(lambda x: x.split(', '))
user_data['Country'] = user_data['Country'].map(lambda x: x[-1])

# dropping off unnecessary columns
user_data = user_data.drop(['Location', 'Age'], axis=1)

In [4]:
# selecting users from specific regions particularly usa and spain
regional_data = user_data[(user_data['Country'] == 'spain')|(user_data['Country'] == 'usa')]

In [5]:
book_data = book_data.drop(['Publisher','Image-URL-S', 'Image-URL-M', 'Image-URL-L'], axis=1)

In [6]:
# merging data to get the title of the books and their ratings
ratings = rating_data.merge(book_data, on='ISBN')

In [7]:
# getting the popularity of a book by knowing the total no of rating done on it
# setting the threshold to 50
count_ratings = ratings['Book-Title'].value_counts() > 50
col = count_ratings[count_ratings == True]
rated_data = ratings[ratings['Book-Title'].isin(col.index)]

## Collaborative filtering method

In [8]:
# getting specific features from the rated_data
ratings = rated_data[['Book-Title', 'User-ID', 'Book-Rating']]

In [9]:
# converting feature datatype to correct one
ratings['Book-Title'] = ratings['Book-Title'].astype(str)
ratings['User-ID'] = ratings['User-ID'].astype(str)
ratings['Book-Rating'] = ratings['Book-Rating'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['Book-Title'] = ratings['Book-Title'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['User-ID'] = ratings['User-ID'].astype(str)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ratings['Book-Rating'] = ratings['Book-Rating'].astype(int)


In [10]:
# error in pivot due to shape
# dropping duplicates
ratings = ratings.drop_duplicates(['Book-Title', 'User-ID'])

In [29]:
# pivot table for cluster of users around a particular book
pivoted_data = pd.pivot_table(ratings, columns='User-ID', index='Book-Title', values='Book-Rating', fill_value=0)

In [70]:
# pivot table for pearson correlation mapping similar user experience of books for correlations
pivoted_data_corr =  pd.pivot_table(ratings, index='User-ID', columns='Book-Title', values='Book-Rating', fill_value=0)

In [30]:
pivoted_data.head()

User-ID,100001,100002,100004,100009,10001,100010,100015,100025,100029,100030,...,99954,99955,99963,99970,99973,99980,99982,99992,99996,99997
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
10 Lb. Penalty,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16 Lighthouse Road,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1984,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1st to Die: A Novel,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2010: Odyssey Two,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [74]:
pivoted_data_corr.head()

Book-Title,10 Lb. Penalty,16 Lighthouse Road,1984,1st to Die: A Novel,2010: Odyssey Two,204 Rosewood Lane,2061: Odyssey Three,24 Hours,2nd Chance,3rd Degree,...,YOU BELONG TO ME,Year of Wonders,You Belong To Me,You Shall Know Our Velocity,Young Wives,Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,Zoya,"\O\"" Is for Outlaw""","\Surely You're Joking, Mr. Feynman!\"": Adventures of a Curious Character""",stardust
User-ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
100001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100002,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100004,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
100009,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
10001,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [84]:
# creating sparse matrix for the cluster
sparse_matrix = csr_matrix(pivoted_data.values)

## Ranking using pearson correlation

In [63]:
name_rate = pivoted_data_corr['10 Lb. Penalty']
name = pivoted_data_corr.corrwith(name_rate)

In [68]:
name.sort_values(ascending=False)[:10]

10 Lb. Penalty                                             1.000000
Devil's Waltz (Alex Delaware Novels (Paperback))           0.125520
The Cat Who Played Post Office (Cat Who... (Paperback))    0.120795
Nemesis                                                    0.117074
I Is for Innocent                                          0.111409
Out of Africa ; and, Shadows on the grass                  0.109654
Decider                                                    0.101307
The 13th Juror                                             0.101028
Split Second (Maggie O'Dell Novels (Paperback))            0.090771
Parallel Lies                                              0.090357
dtype: float64

## Using K-Nearest Neighbours

In [103]:
pivoted_data.iloc[2380,:]

User-ID
100001    0
100002    0
100004    0
100009    0
10001     0
         ..
99980     0
99982     0
99992     0
99996     0
99997     0
Name: stardust, Length: 47740, dtype: int64

In [85]:
pivoted_data.shape

(2381, 47740)

In [107]:
pivoted_data.index

Index(['10 Lb. Penalty', '16 Lighthouse Road', '1984', '1st to Die: A Novel',
       '2010: Odyssey Two', '204 Rosewood Lane', '2061: Odyssey Three',
       '24 Hours', '2nd Chance', '3rd Degree',
       ...
       'YOU BELONG TO ME', 'Year of Wonders', 'You Belong To Me',
       'You Shall Know Our Velocity', 'Young Wives',
       'Zen and the Art of Motorcycle Maintenance: An Inquiry into Values',
       'Zoya', '\O\" Is for Outlaw"',
       '\Surely You're Joking, Mr. Feynman!\": Adventures of a Curious Character"',
       'stardust'],
      dtype='object', name='Book-Title', length=2381)

In [108]:
neigbours = NearestNeighbors(n_neighbors=10,metric="cosine", algorithm='brute')
neigbours.fit(sparse_matrix)

In [110]:
distance, books = neigbours.kneighbors(pivoted_data.iloc[0,:].values.reshape(1, -1))
for i in books:
    for j in i:
        print(pivoted_data.iloc[j,:].name)

10 Lb. Penalty
Devil's Waltz (Alex Delaware Novels (Paperback))
The Cat Who Played Post Office (Cat Who... (Paperback))
Nemesis
I Is for Innocent
Out of Africa ; and, Shadows on the grass
Decider
The 13th Juror
Split Second (Maggie O'Dell Novels (Paperback))
Parallel Lies
