In [3]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
import matplotlib.pyplot as plt 
import os

In [10]:
# Relative paths (resolved against the kernel's working directory) of the two
# BX CSV files that the loading cell reads: the book catalogue and the ratings.
books_filename = 'Dataset/BX-Books.csv'
ratings_filename = 'Dataset/BX-Book-Ratings.csv'

In [11]:
def _read_bx_csv(path, column_dtypes):
    """Read one ';'-separated, ISO-8859-1 encoded CSV file into a DataFrame.

    `column_dtypes` maps the column names to assign (in order) to their dtypes.
    The file's own header row is consumed (`header=0`) and replaced by those
    names. No bad-line handling is configured, so pandas' default applies.
    """
    columns = list(column_dtypes)
    return pd.read_csv(
        path,
        sep=";",
        encoding="ISO-8859-1",
        header=0,
        names=columns,
        usecols=columns,
        dtype=column_dtypes,
    )


# Book catalogue: ISBN, title and author, all kept as strings.
df_books = _read_bx_csv(
    books_filename,
    {'isbn': 'str', 'title': 'str', 'author': 'str'},
)

# Ratings: one row per (user, isbn) pair with a numeric rating.
df_ratings = _read_bx_csv(
    ratings_filename,
    {'user': 'int32', 'isbn': 'str', 'rating': 'float32'},
)

In [12]:
# Preview the loaded book catalogue (columns: isbn, title, author).
df_books

Unnamed: 0,isbn,title,author
0,0195153448,Classical Mythology,Mark P. O. Morford
1,0002005018,Clara Callan,Richard Bruce Wright
2,0060973129,Decision in Normandy,Carlo D'Este
3,0374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
4,0393045218,The Mummies of Urumchi,E. J. W. Barber
...,...,...,...
271374,0440400988,There's a Bat in Bunk Five,Paula Danziger
271375,0525447644,From One to One Hundred,Teri Sloat
271376,006008667X,Lily Dale : The True Story of the Town that Ta...,Christine Wicker
271377,0192126040,Republic (World's Classics),Plato


In [13]:
# Preview the loaded ratings table (columns: user, isbn, rating).
df_ratings

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0.0
1,276726,0155061224,5.0
2,276727,0446520802,0.0
3,276729,052165615X,3.0
4,276729,0521795028,6.0
...,...,...,...
1149775,276704,1563526298,9.0
1149776,276706,0679447156,0.0
1149777,276709,0515107662,10.0
1149778,276721,0590442449,10.0


In [14]:
# Number of rating rows per user and per book, measured on the table as it
# stands before any filtering in this cell.
counts1 = df_ratings['user'].value_counts()
counts2 = df_ratings['isbn'].value_counts()

# Keep a row only when its user has more than 200 ratings AND its book has
# more than 100 ratings. Both thresholds use the pre-filter counts above, so a
# book can end up with far fewer ratings once inactive users are removed.
keep_user = df_ratings['user'].isin(counts1[counts1 > 200].index)
keep_book = df_ratings['isbn'].isin(counts2[counts2 > 100].index)
df_ratings = df_ratings[keep_user & keep_book]

# Ratings left per book after filtering.
df_ratings['isbn'].value_counts()

0971880107    363
0316666343    270
0060928336    220
0440214041    218
0385504209    215
             ... 
076790592X     23
039592720X     22
0684833395     22
0091867770     19
0671027344     16
Name: isbn, Length: 721, dtype: int64

In [15]:
# Discard every row whose rating equals 0 (presumably a "no explicit rating"
# marker in this dataset -- TODO confirm against the dataset description).
has_nonzero_rating = df_ratings['rating'].ne(0)
df_ratings = df_ratings[has_nonzero_rating]
df_ratings

Unnamed: 0,user,isbn,rating
1456,277427,002542730X,10.0
1474,277427,0061009059,9.0
1522,277427,0316776963,8.0
1543,277427,0345413903,10.0
1581,277427,0385486804,9.0
...,...,...,...
1146735,275970,038572179X,9.0
1146737,275970,0385722206,10.0
1146809,275970,043936213X,9.0
1146820,275970,0440180295,9.0


In [16]:
# Attach book metadata to every remaining rating (inner join on ISBN).
rated_books = df_books.merge(df_ratings, on='isbn')

# Per-ISBN summary: mean rating and number of rating rows.
by_isbn = rated_books.groupby('isbn')
book_stats = pd.DataFrame({
    'rating': by_isbn['rating'].mean(),
    'count': by_isbn['isbn'].count(),
})

# One (isbn, title) row per book, joined with its summary and ordered from the
# most-rated book to the least-rated one.
book_titles = rated_books[['isbn', 'title']].drop_duplicates()
final = (
    book_titles
    .merge(book_stats, on='isbn')
    .sort_values('count', ascending=False)
    # Index by ISBN while keeping 'isbn' as a regular column too, because the
    # lookup helpers filter on the column.
    .set_index('isbn', drop=False)
)
final

Unnamed: 0_level_0,isbn,title,rating,count
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0316666343,0316666343,The Lovely Bones: A Novel,8.317307,104
0385504209,0385504209,The Da Vinci Code,8.582278,79
059035342X,059035342X,Harry Potter and the Sorcerer's Stone (Harry P...,8.893939,66
0312195516,0312195516,The Red Tent (Bestselling Backlist),8.629032,62
0446310786,0446310786,To Kill a Mockingbird,8.950000,60
...,...,...,...,...
0451207521,0451207521,Jackdaws,5.333333,3
1551668998,1551668998,Beach House,8.666667,3
0553571818,0553571818,Long After Midnight,7.666667,3
0446365505,0446365505,Pleading Guilty,5.500000,2


In [17]:
# Reshape the long ratings table into a book-by-user matrix: one row per ISBN,
# one column per user, cell = that user's rating of that book.
ratings_pivot = df_ratings.pivot(
    index='isbn',
    columns='user',
    values='rating',
)
# Combinations with no rating come out as NaN; store them as 0 instead.
ratings_pivot = ratings_pivot.fillna(0)
ratings_pivot

user,254,2276,2766,2977,3363,4017,4385,6242,6251,6323,...,273979,274004,274061,274301,274308,274808,275970,277427,277639,278418
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
002542730X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0
0060008032,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060096195,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
006016848X,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
0060173289,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1573227331,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1573229326,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1573229571,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1592400876,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
# Each row of the pivot table is one book's rating vector; hand it to the
# estimator as a SciPy CSR matrix.
book_vectors = csr_matrix(ratings_pivot.values)

# Exhaustive (brute-force) neighbour search using cosine distance between rows.
model = NearestNeighbors(algorithm='brute', metric='cosine')
model.fit(book_vectors)

NearestNeighbors(algorithm='brute', metric='cosine')

In [19]:
def get_name(isbn):
  """Return the title recorded in `final` for the given ISBN.

  The lookup ends with `Series.item()`, so a ValueError is raised unless
  exactly one row of `final` has this ISBN.
  """
  matching_titles = final.loc[final['isbn'] == isbn, 'title']
  return matching_titles.item()

In [20]:
def get_recommends(book = "", n_recommendations = 5):
  """Recommend the books whose rating vectors are closest to `book`'s.

  Parameters
  ----------
  book : str
      Exact title as stored in `final.title`.
  n_recommendations : int, default 5
      Number of neighbouring books to return (the query book is excluded).

  Returns
  -------
  list
      `[book, [[title, distance], ...]]`. The inner list is ordered from the
      most distant neighbour to the closest one; distances come from
      `model.kneighbors`.

  Raises
  ------
  ValueError
      If no row of `final` carries the title `book`. scikit-learn also raises
      ValueError when more neighbours are requested than fitted rows exist.
  """
  matching_isbns = final.loc[final.title == book, 'isbn']
  if matching_isbns.empty:
    # Fail with a clear message instead of letting an empty query array reach
    # kneighbors and blow up there.
    raise ValueError(f"No rated book with title {book!r} was found")

  # Several ISBNs may share one title. Querying with all of them at once would
  # interleave their neighbour lists, so use a single edition: the first match
  # (the most-rated one if `final` is still sorted by count -- verify against
  # the cell that builds it).
  query_isbn = matching_isbns.iloc[0]
  query_position = ratings_pivot.index.get_loc(query_isbn)
  query_vector = ratings_pivot.loc[[query_isbn]].values

  # Ask for one extra neighbour because the query book itself is normally
  # returned as its own nearest neighbour.
  distances, indices = model.kneighbors(
      query_vector, n_neighbors=n_recommendations + 1)

  # Drop the query book by identity rather than by position, so a tie at the
  # same distance cannot leave the query in the list and discard a real match.
  neighbours = [
      [get_name(ratings_pivot.index[position]), distance]
      for position, distance in zip(indices.flatten(), distances.flatten())
      if position != query_position
  ][:n_recommendations]

  return [book, neighbours[::-1]]

In [21]:
# Example query. The result is [title, [[neighbour title, cosine distance], ...]],
# with neighbours listed from the most distant to the closest.
get_recommends("The Queen of the Damned (Vampire Chronicles (Paperback))")

['The Queen of the Damned (Vampire Chronicles (Paperback))',
 [['Lasher: Lives of the Mayfair Witches (Lives of the Mayfair Witches)',
   0.7833433],
  ['The Witching Hour (Lives of the Mayfair Witches)', 0.7362787],
  ['Interview with the Vampire', 0.73450685],
  ['The Tale of the Body Thief (Vampire Chronicles (Paperback))', 0.5298544],
  ['The Vampire Lestat (Vampire Chronicles, Book II)', 0.51784116]]]