Starting the project by reading in the data from all three files

In [3]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

# Parser options shared by the three BX CSV files: semicolon-delimited,
# latin-1 encoded. Rows with the wrong number of fields are skipped and
# reported rather than aborting the load. `on_bad_lines='warn'` replaces
# `error_bad_lines=False`, which was deprecated in pandas 1.3 and removed
# in pandas 2.0.
CSV_OPTIONS = dict(sep=';', encoding="latin-1", on_bad_lines='warn')

books = pd.read_csv("Data/BX-Books.csv", **CSV_OPTIONS)
users = pd.read_csv("Data/BX-Users.csv", **CSV_OPTIONS)
ratings = pd.read_csv("Data/BX-Book-Ratings.csv", **CSV_OPTIONS)

b'Skipping line 6452: expected 8 fields, saw 9\nSkipping line 43667: expected 8 fields, saw 10\nSkipping line 51751: expected 8 fields, saw 9\n'
b'Skipping line 92038: expected 8 fields, saw 9\nSkipping line 104319: expected 8 fields, saw 9\nSkipping line 121768: expected 8 fields, saw 9\n'
b'Skipping line 144058: expected 8 fields, saw 9\nSkipping line 150789: expected 8 fields, saw 9\nSkipping line 157128: expected 8 fields, saw 9\nSkipping line 180189: expected 8 fields, saw 9\nSkipping line 185738: expected 8 fields, saw 9\n'
b'Skipping line 209388: expected 8 fields, saw 9\nSkipping line 220626: expected 8 fields, saw 9\nSkipping line 227933: expected 8 fields, saw 11\nSkipping line 228957: expected 8 fields, saw 10\nSkipping line 245933: expected 8 fields, saw 9\nSkipping line 251296: expected 8 fields, saw 9\nSkipping line 259941: expected 8 fields, saw 9\nSkipping line 261529: expected 8 fields, saw 9\n'


Next we will take a look at the head of one dataset

In [11]:
# Plain-text preview of the first five rows of `books`.
# NOTE(review): the saved output shows the renamed columns (title, author, ...),
# so this cell was last executed after the preprocessing cell further down;
# on a fresh top-to-bottom run it will list the raw CSV column names instead.
print(books.head(n=5))

         ISBN                                              title  \
0  0195153448                                Classical Mythology   
1  0002005018                                       Clara Callan   
2  0060973129                               Decision in Normandy   
3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
4  0393045218                             The Mummies of Urumchi   

                 author  year                   publisher  
0    Mark P. O. Morford  2002     Oxford University Press  
1  Richard Bruce Wright  2001       HarperFlamingo Canada  
2          Carlo D'Este  1991             HarperPerennial  
3      Gina Bari Kolata  1999        Farrar Straus Giroux  
4       E. J. W. Barber  1999  W. W. Norton &amp; Company  


Next we preprocess the data

In [5]:
# Rename first, then select by the new names. `rename` silently skips labels
# that are not present, so re-running this cell on already-renamed frames is
# a no-op instead of a KeyError, and nothing is mutated in place on a
# column slice (which is what triggers SettingWithCopyWarning).
books = books.rename(columns={
    'Book-Title': 'title',
    'Book-Author': 'author',
    'Year-Of-Publication': 'year',
    'Publisher': 'publisher',
})[['ISBN', 'title', 'author', 'year', 'publisher']]

users = users.rename(columns={
    'User-ID': 'user_id',
    'Location': 'location',
    'Age': 'age',
})

ratings = ratings.rename(columns={
    'User-ID': 'user_id',
    'Book-Rating': 'rating',
})

Again we can look at the head

In [10]:
# Plain-text preview of the first five rows of `books`.
print(books.head(n=5))

         ISBN                                              title  \
0  0195153448                                Classical Mythology   
1  0002005018                                       Clara Callan   
2  0060973129                               Decision in Normandy   
3  0374157065  Flu: The Story of the Great Influenza Pandemic...   
4  0393045218                             The Mummies of Urumchi   

                 author  year                   publisher  
0    Mark P. O. Morford  2002     Oxford University Press  
1  Richard Bruce Wright  2001       HarperFlamingo Canada  
2          Carlo D'Este  1991             HarperPerennial  
3      Gina Bari Kolata  1999        Farrar Straus Giroux  
4       E. J. W. Barber  1999  W. W. Norton &amp; Company  


Now to prepare for modeling: keep only users with more than 200 ratings and join their ratings to the book details

In [7]:
# A user is kept only with strictly more than this many ratings.
USER_RATING_THRESHOLD = 200

# Count ratings per user once and keep the ids that clear the threshold.
ratings_per_user = ratings['user_id'].value_counts()
active_user_ids = ratings_per_user[ratings_per_user > USER_RATING_THRESHOLD].index
print(active_user_ids.shape)

# Restrict the ratings to those users, then attach the book columns.
# `merge` defaults to an inner join, so ratings whose ISBN has no match
# in `books` are dropped here.
ratings = ratings[ratings['user_id'].isin(active_user_ids)]
rating_with_books = ratings.merge(books, on='ISBN')

(899,)


Now looking at the head of rating_with_books

In [12]:
# Plain-text preview of the first five rows of the merged ratings/books frame.
print(rating_with_books.head(n=5))

   user_id        ISBN  rating  \
0   277427  002542730X      10   
1     3363  002542730X       0   
2    11676  002542730X       6   
3    12538  002542730X      10   
4    13552  002542730X       0   

                                               title             author  year  \
0  Politically Correct Bedtime Stories: Modern Ta...  James Finn Garner  1994   
1  Politically Correct Bedtime Stories: Modern Ta...  James Finn Garner  1994   
2  Politically Correct Bedtime Stories: Modern Ta...  James Finn Garner  1994   
3  Politically Correct Bedtime Stories: Modern Ta...  James Finn Garner  1994   
4  Politically Correct Bedtime Stories: Modern Ta...  James Finn Garner  1994   

                   publisher  
0  John Wiley &amp; Sons Inc  
1  John Wiley &amp; Sons Inc  
2  John Wiley &amp; Sons Inc  
3  John Wiley &amp; Sons Inc  
4  John Wiley &amp; Sons Inc  


Keeping only books with at least 50 ratings

In [13]:
# A title is kept only with at least this many ratings.
MIN_RATINGS_PER_BOOK = 50

# Number of non-null rating rows per title.
number_rating = (
    rating_with_books
    .groupby('title')['rating']
    .count()
    .reset_index(name='number_of_ratings')
)

# Attach the per-title count to every rating row, keep the sufficiently
# rated titles, and keep one row per (user, title) pair so the later pivot
# has a single value per cell. Each step reassigns instead of mutating a
# filtered slice in place.
final_rating = rating_with_books.merge(number_rating, on='title')
final_rating = final_rating[final_rating['number_of_ratings'] >= MIN_RATINGS_PER_BOOK]
final_rating = final_rating.drop_duplicates(['user_id', 'title'])

Creating a Pivot Table

In [14]:
# Title-by-user matrix of ratings (one row per title, one column per user).
# User/title pairs with no rating come out of the pivot as NaN and are
# replaced with 0.
book_pivot = final_rating.pivot_table(
    index='title',
    columns='user_id',
    values="rating",
).fillna(0)

In [26]:
# Save the title-by-user matrix as a spreadsheet in the working directory.
book_pivot.to_excel(excel_writer='book_pivot.xlsx')

In [16]:
from scipy.sparse import csr_matrix

# Compressed sparse row copy of the pivot values, used to fit the model.
rating_matrix = book_pivot.to_numpy()
book_sparse = csr_matrix(rating_matrix)

Now to train the model

In [21]:
from sklearn.neighbors import NearestNeighbors

# Brute-force nearest-neighbour index over the title rows of the sparse
# matrix; every other setting is left at the library default.
model = NearestNeighbors(algorithm='brute')
model.fit(X=book_sparse)

In [37]:
# Rating vector of the title at row position 191, shaped (1, n_users) because
# kneighbors expects a 2-D batch of query rows.
query_vector = book_pivot.iloc[191].to_numpy().reshape(1, -1)
distances, suggestions = model.kneighbors(query_vector)

In [41]:
# The index label at row position 191 is the title that was queried.
print(f"Suggesting for: {book_pivot.index[191]}")

Suggesting for: Fahrenheit 451


In [46]:
# `suggestions` holds one array of row positions per query row; map each
# array back to the titles in the pivot index and print it.
for neighbor_positions in suggestions:
    print(book_pivot.index[neighbor_positions])

Index(['Fahrenheit 451', 'No Safe Place', 'Long After Midnight',
       'Primary Colors: A Novel of Politics', 'Abduction'],
      dtype='object', name='title')
