<b><h1>Books Recommendation System

In [1]:
import re
import operator
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
from pandas.api.types import is_numeric_dtype
from sklearn.metrics.pairwise import cosine_similarity

import warnings
warnings.filterwarnings("ignore")

<b><h3>Dataset

In [2]:
books=pd.read_csv('Books.csv')
users=pd.read_csv('users.csv')
ratings=pd.read_csv('Ratings.csv')

In [3]:
print("Books Data:    ", books.shape)
print("Users Data:    ", users.shape)
print("Book_ratings: ", ratings.shape)

Books Data:     (271360, 8)
Users Data:     (278858, 3)
Book_ratings:  (1149780, 3)


<b><h3>Pre-processing

<b>Books Dataset Pre-processing

In [4]:
print("Columns: ", list(books.columns))
books.head()

Columns:  ['ISBN', 'Book-Title', 'Book-Author', 'Year-Of-Publication', 'Publisher', 'Image-URL-S', 'Image-URL-M', 'Image-URL-L']


Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...,http://images.amazon.com/images/P/0195153448.0...
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...,http://images.amazon.com/images/P/0002005018.0...
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...,http://images.amazon.com/images/P/0060973129.0...
3,374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...,http://images.amazon.com/images/P/0374157065.0...
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...,http://images.amazon.com/images/P/0393045218.0...


In [5]:
## Checking for null values
books.isnull().sum() 

ISBN                   0
Book-Title             0
Book-Author            1
Year-Of-Publication    0
Publisher              2
Image-URL-S            0
Image-URL-M            0
Image-URL-L            3
dtype: int64

In [6]:
books.loc[books['Book-Author'].isnull(),:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
187689,9627982032,The Credit Suisse Guide to Managing Your Perso...,,1995,Edinburgh Financial Publishing,http://images.amazon.com/images/P/9627982032.0...,http://images.amazon.com/images/P/9627982032.0...,http://images.amazon.com/images/P/9627982032.0...


In [7]:
books.loc[books['Publisher'].isnull(),:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
128890,193169656X,Tyrant Moon,Elaine Corvidae,2002,,http://images.amazon.com/images/P/193169656X.0...,http://images.amazon.com/images/P/193169656X.0...,http://images.amazon.com/images/P/193169656X.0...
129037,1931696993,Finders Keepers,Linnea Sinclair,2001,,http://images.amazon.com/images/P/1931696993.0...,http://images.amazon.com/images/P/1931696993.0...,http://images.amazon.com/images/P/1931696993.0...


In [8]:
books.at[187689 ,'Book-Author'] = 'Other'

books.at[128890 ,'Publisher'] = 'Other'
books.at[129037 ,'Publisher'] = 'Other'

In [9]:
## Checking for column Year-of-publication
books['Year-Of-Publication'].unique()

array([2002, 2001, 1991, 1999, 2000, 1993, 1996, 1988, 2004, 1998, 1994,
       2003, 1997, 1983, 1979, 1995, 1982, 1985, 1992, 1986, 1978, 1980,
       1952, 1987, 1990, 1981, 1989, 1984, 0, 1968, 1961, 1958, 1974,
       1976, 1971, 1977, 1975, 1965, 1941, 1970, 1962, 1973, 1972, 1960,
       1966, 1920, 1956, 1959, 1953, 1951, 1942, 1963, 1964, 1969, 1954,
       1950, 1967, 2005, 1957, 1940, 1937, 1955, 1946, 1936, 1930, 2011,
       1925, 1948, 1943, 1947, 1945, 1923, 2020, 1939, 1926, 1938, 2030,
       1911, 1904, 1949, 1932, 1928, 1929, 1927, 1931, 1914, 2050, 1934,
       1910, 1933, 1902, 1924, 1921, 1900, 2038, 2026, 1944, 1917, 1901,
       2010, 1908, 1906, 1935, 1806, 2021, '2000', '1995', '1999', '2004',
       '2003', '1990', '1994', '1986', '1989', '2002', '1981', '1993',
       '1983', '1982', '1976', '1991', '1977', '1998', '1992', '1996',
       '0', '1997', '2001', '1974', '1968', '1987', '1984', '1988',
       '1963', '1956', '1970', '1985', '1978', '1973', '1980'

In [10]:
pd.set_option('display.max_colwidth', -1)

In [11]:
books.loc[books['Year-Of-Publication'] == 'DK Publishing Inc',:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
209538,078946697X,"DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)\"";Michael Teitelbaum""",2000,DK Publishing Inc,http://images.amazon.com/images/P/078946697X.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/078946697X.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/078946697X.01.LZZZZZZZ.jpg,
221678,0789466953,"DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)\"";James Buckley""",2000,DK Publishing Inc,http://images.amazon.com/images/P/0789466953.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0789466953.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0789466953.01.LZZZZZZZ.jpg,


In [12]:
books.loc[books['Year-Of-Publication'] == 'Gallimard',:]

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
220731,2070426769,"Peuple du ciel, suivi de 'Les Bergers\"";Jean-Marie Gustave Le ClÃ?Â©zio""",2003,Gallimard,http://images.amazon.com/images/P/2070426769.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/2070426769.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/2070426769.01.LZZZZZZZ.jpg,


In [13]:
books.at[209538 ,'Publisher'] = 'DK Publishing Inc'
books.at[209538 ,'Year-Of-Publication'] = 2000
books.at[209538 ,'Book-Title'] = 'DK Readers: Creating the X-Men, How It All Began (Level 4: Proficient Readers)'
books.at[209538 ,'Book-Author'] = 'Michael Teitelbaum'

books.at[221678 ,'Publisher'] = 'DK Publishing Inc'
books.at[221678 ,'Year-Of-Publication'] = 2000
books.at[209538 ,'Book-Title'] = 'DK Readers: Creating the X-Men, How Comic Books Come to Life (Level 4: Proficient Readers)'
books.at[209538 ,'Book-Author'] = 'James Buckley'

books.at[220731 ,'Publisher'] = 'Gallimard'
books.at[220731 ,'Year-Of-Publication'] = '2003'
books.at[209538 ,'Book-Title'] = 'Peuple du ciel - Suivi de Les bergers '
books.at[209538 ,'Book-Author'] = 'Jean-Marie Gustave Le ClÃ?Â©zio'

In [14]:
## Converting year of publication in Numbers
books['Year-Of-Publication'] = books['Year-Of-Publication'].astype(int)

In [15]:
print(sorted(list(books['Year-Of-Publication'].unique())))

[0, 1376, 1378, 1806, 1897, 1900, 1901, 1902, 1904, 1906, 1908, 1909, 1910, 1911, 1914, 1917, 1919, 1920, 1921, 1922, 1923, 1924, 1925, 1926, 1927, 1928, 1929, 1930, 1931, 1932, 1933, 1934, 1935, 1936, 1937, 1938, 1939, 1940, 1941, 1942, 1943, 1944, 1945, 1946, 1947, 1948, 1949, 1950, 1951, 1952, 1953, 1954, 1955, 1956, 1957, 1958, 1959, 1960, 1961, 1962, 1963, 1964, 1965, 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983, 1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2008, 2010, 2011, 2012, 2020, 2021, 2024, 2026, 2030, 2037, 2038, 2050]


In [16]:
## Replacing Invalid years with max year
count = Counter(books['Year-Of-Publication'])
[k for k, v in count.items() if v == max(count.values())]

[2002]

In [17]:
books.loc[books['Year-Of-Publication'] > 2021, 'Year-Of-Publication'] = 2002
books.loc[books['Year-Of-Publication'] == 0, 'Year-Of-Publication'] = 2002

In [18]:
## Uppercasing all alphabets in ISBN
books['ISBN'] = books['ISBN'].str.upper()

In [19]:
## Drop duplicate rows
books.drop_duplicates(keep='last', inplace=True) 
books.reset_index(drop = True, inplace = True)

In [20]:
books.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 271047 entries, 0 to 271046
Data columns (total 8 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 271047 non-null  object
 1   Book-Title           271047 non-null  object
 2   Book-Author          271047 non-null  object
 3   Year-Of-Publication  271047 non-null  int32 
 4   Publisher            271047 non-null  object
 5   Image-URL-S          271047 non-null  object
 6   Image-URL-M          271047 non-null  object
 7   Image-URL-L          271044 non-null  object
dtypes: int32(1), object(7)
memory usage: 15.5+ MB


In [21]:
books.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,195153448,Classical Mythology,Mark P. O. Morford,2002,Oxford University Press,http://images.amazon.com/images/P/0195153448.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0195153448.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0195153448.01.LZZZZZZZ.jpg
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0002005018.01.LZZZZZZZ.jpg
2,60973129,Decision in Normandy,Carlo D'Este,1991,HarperPerennial,http://images.amazon.com/images/P/0060973129.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0060973129.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0060973129.01.LZZZZZZZ.jpg
3,374157065,Flu: The Story of the Great Influenza Pandemic of 1918 and the Search for the Virus That Caused It,Gina Bari Kolata,1999,Farrar Straus Giroux,http://images.amazon.com/images/P/0374157065.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0374157065.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0374157065.01.LZZZZZZZ.jpg
4,393045218,The Mummies of Urumchi,E. J. W. Barber,1999,W. W. Norton &amp; Company,http://images.amazon.com/images/P/0393045218.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0393045218.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0393045218.01.LZZZZZZZ.jpg


<b>Users Dataset Pre-processing

In [22]:
print("Columns: ", list(users.columns))
users.head()

Columns:  ['User-ID', 'Location', 'Age']


Unnamed: 0,User-ID,Location,Age
0,1,"nyc, new york, usa",
1,2,"stockton, california, usa",18.0
2,3,"moscow, yukon territory, russia",
3,4,"porto, v.n.gaia, portugal",17.0
4,5,"farnborough, hants, united kingdom",


In [23]:
## Checking null values
print(users.isna().sum())               

User-ID     0     
Location    0     
Age         110762
dtype: int64


In [24]:
## Check for all values present in Age column
print(sorted(list(users['Age'].unique())))

[nan, 0.0, 1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0, 10.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 17.0, 18.0, 19.0, 20.0, 21.0, 22.0, 23.0, 24.0, 25.0, 26.0, 27.0, 28.0, 29.0, 30.0, 31.0, 32.0, 33.0, 34.0, 35.0, 36.0, 37.0, 38.0, 39.0, 40.0, 41.0, 42.0, 43.0, 44.0, 45.0, 46.0, 47.0, 48.0, 49.0, 50.0, 51.0, 52.0, 53.0, 54.0, 55.0, 56.0, 57.0, 58.0, 59.0, 60.0, 61.0, 62.0, 63.0, 64.0, 65.0, 66.0, 67.0, 68.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 76.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 84.0, 85.0, 86.0, 87.0, 88.0, 89.0, 90.0, 91.0, 92.0, 93.0, 94.0, 95.0, 96.0, 97.0, 98.0, 99.0, 100.0, 101.0, 102.0, 103.0, 104.0, 105.0, 106.0, 107.0, 108.0, 109.0, 110.0, 111.0, 113.0, 114.0, 115.0, 116.0, 118.0, 119.0, 123.0, 124.0, 127.0, 128.0, 132.0, 133.0, 136.0, 137.0, 138.0, 140.0, 141.0, 143.0, 146.0, 147.0, 148.0, 151.0, 152.0, 156.0, 157.0, 159.0, 162.0, 168.0, 172.0, 175.0, 183.0, 186.0, 189.0, 199.0, 200.0, 201.0, 204.0, 207.0, 208.0, 209.0, 210.0, 212.0, 219.0, 220.0, 223.0, 226.0

In [25]:
required = users[users['Age'] <= 80]
required = required[required['Age'] >= 10]

In [26]:
mean = round(required['Age'].mean())   
mean

35

In [27]:
users.loc[users['Age'] > 80, 'Age'] = mean    #outliers with age grater than 80 are substituted with mean 
users.loc[users['Age'] < 10, 'Age'] = mean    #outliers with age less than 10 years are substitued with mean
users['Age'] = users['Age'].fillna(mean)      #filling null values with mean
users['Age'] = users['Age'].astype(int)       #changing Datatype to int

In [28]:
list_ = users.Location.str.split(', ')

city = []
state = []
country = []
count_no_state = 0    
count_no_country = 0

for i in range(0,len(list_)):
    if list_[i][0] == ' ' or list_[i][0] == '' or list_[i][0]=='n/a' or list_[i][0] == ',':  #removing invalid entries too
        city.append('other')
    else:
        city.append(list_[i][0].lower())

    if(len(list_[i])<2):
        state.append('other')
        country.append('other')
        count_no_state += 1
        count_no_country += 1
    else:
        if list_[i][1] == ' ' or list_[i][1] == '' or list_[i][1]=='n/a' or list_[i][1] == ',':   #removing invalid entries 
            state.append('other')
            count_no_state += 1            
        else:
            state.append(list_[i][1].lower())
        
        if(len(list_[i])<3):
            country.append('other')
            count_no_country += 1
        else:
            if list_[i][2] == ''or list_[i][1] == ',' or list_[i][2] == ' ' or list_[i][2] == 'n/a':
                country.append('other')
                count_no_country += 1
            else:
                country.append(list_[i][2].lower())
        


temp = []
for ent in city:
    c = ent.split('/')            #handling cases where city/state entries from city list as state is already given 
    temp.append(c[0])

df_city = pd.DataFrame(temp,columns=['City'])
df_state = pd.DataFrame(state,columns=['State'])
df_country = pd.DataFrame(country,columns=['Country'])

users = pd.concat([users, df_city], axis=1)
users = pd.concat([users, df_state], axis=1)
users = pd.concat([users, df_country], axis=1)

print(count_no_country)   #printing the number of countries didnt have any values 
print(count_no_state)     #printing the states which didnt have any values

4659
16044


In [29]:
## Drop duplicate rows
users.drop_duplicates(keep='last', inplace=True)
users.reset_index(drop=True, inplace=True)

In [30]:
users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278858 entries, 0 to 278857
Data columns (total 6 columns):
 #   Column    Non-Null Count   Dtype 
---  ------    --------------   ----- 
 0   User-ID   278858 non-null  int64 
 1   Location  278858 non-null  object
 2   Age       278858 non-null  int32 
 3   City      278858 non-null  object
 4   State     278858 non-null  object
 5   Country   278858 non-null  object
dtypes: int32(1), int64(1), object(4)
memory usage: 11.7+ MB


In [31]:
users.head()

Unnamed: 0,User-ID,Location,Age,City,State,Country
0,1,"nyc, new york, usa",35,nyc,new york,usa
1,2,"stockton, california, usa",18,stockton,california,usa
2,3,"moscow, yukon territory, russia",35,moscow,yukon territory,russia
3,4,"porto, v.n.gaia, portugal",17,porto,v.n.gaia,portugal
4,5,"farnborough, hants, united kingdom",35,farnborough,hants,united kingdom


<b>Ratings Dataset Pre-processing

In [32]:
print("Columns: ", list(ratings.columns))
ratings.head()

Columns:  ['User-ID', 'ISBN', 'Book-Rating']


Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [33]:
## Checking for null values
ratings.isnull().sum() 

User-ID        0
ISBN           0
Book-Rating    0
dtype: int64

In [34]:
## checking all ratings number or not
print(is_numeric_dtype(ratings['Book-Rating']))

True


In [35]:
## checking User-ID contains only number or not
print(is_numeric_dtype(ratings['User-ID']))

True


In [36]:
## checking ISBN
flag = 0
k =[]
reg = "[^A-Za-z0-9]"

for x in ratings['ISBN']:
    z = re.search(reg,x)    
    if z:
        flag = 1

if flag == 1:
    print("False")
else:
    print("True")

False


In [37]:
## removing extra characters from ISBN (from ratings dataset) existing in books dataset
bookISBN = books['ISBN'].tolist() 
reg = "[^A-Za-z0-9]" 
for index, row_Value in ratings.iterrows():
    z = re.search(reg, row_Value['ISBN'])    
    if z:
        f = re.sub(reg,"",row_Value['ISBN'])
        if f in bookISBN:
            ratings.at[index , 'ISBN'] = f

In [38]:
## Uppercasing all alphabets in ISBN
ratings['ISBN'] = ratings['ISBN'].str.upper()

In [39]:
## Drop duplicate rows
ratings.drop_duplicates(keep='last', inplace=True)
ratings.reset_index(drop=True, inplace=True)

In [40]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1149776 entries, 0 to 1149775
Data columns (total 3 columns):
 #   Column       Non-Null Count    Dtype 
---  ------       --------------    ----- 
 0   User-ID      1149776 non-null  int64 
 1   ISBN         1149776 non-null  object
 2   Book-Rating  1149776 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 26.3+ MB


In [41]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [42]:
ratings['User-ID'].value_counts().shape

(105283,)

In [43]:
ratings['User-ID'].unique().shape

(105283,)

In [44]:
# Lets keep the users who have rated atleast more than 200 books
x = ratings['User-ID'].value_counts() > 200

In [45]:
x[x].shape

(899,)

In [46]:
y =x[x].index

In [47]:
y

Int64Index([ 11676, 198711, 153662,  98391,  35859, 212898, 278418,  76352,
            110973, 235105,
            ...
            260183,  73681,  44296, 155916,   9856, 274808,  28634,  59727,
            268622, 188951],
           dtype='int64', length=899)

In [48]:
ratings= ratings[ratings['User-ID'].isin(y)]

In [49]:
ratings.head()

Unnamed: 0,User-ID,ISBN,Book-Rating
1456,277427,002542730X,10
1457,277427,0026217457,0
1458,277427,003008685X,8
1459,277427,0030615321,0
1460,277427,0060002050,0


In [50]:
ratings.shape

(526356, 3)

<h3><b>Merging of all three Tables

<b>Merging Books, Users and Rating Tables in One

In [51]:
dataset = pd.merge(books, ratings, on='ISBN', how='inner')
dataset = pd.merge(dataset, users, on='User-ID', how='inner')
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 487787 entries, 0 to 487786
Data columns (total 15 columns):
 #   Column               Non-Null Count   Dtype 
---  ------               --------------   ----- 
 0   ISBN                 487787 non-null  object
 1   Book-Title           487787 non-null  object
 2   Book-Author          487787 non-null  object
 3   Year-Of-Publication  487787 non-null  int32 
 4   Publisher            487787 non-null  object
 5   Image-URL-S          487787 non-null  object
 6   Image-URL-M          487787 non-null  object
 7   Image-URL-L          487784 non-null  object
 8   User-ID              487787 non-null  int64 
 9   Book-Rating          487787 non-null  int64 
 10  Location             487787 non-null  object
 11  Age                  487787 non-null  int32 
 12  City                 487787 non-null  object
 13  State                487787 non-null  object
 14  Country              487787 non-null  object
dtypes: int32(2), int64(2), object(11)


<b>Divide complete data on the basis of Implicit and Explicit ratings datasets

In [52]:
dataset.shape

(487787, 15)

In [53]:
## Explicit Ratings Dataset
#dataset1 = dataset[dataset['Book-Rating'] != 0]
#dataset1 = dataset1.reset_index(drop = True)
#dataset1.shape

In [54]:
## Implicit Ratings Dataset
#dataset2 = dataset[dataset['Book-Rating'] == 0]
#dataset2 = dataset2.reset_index(drop = True)
#dataset2.shape

In [55]:
#dataset1.head()

In [56]:
#dataset1['User-ID'].unique().shape

In [57]:
no_ratings= dataset.groupby('Book-Title')['Book-Rating'].count().reset_index()

In [58]:
no_ratings.head()
#print('Shape',no_ratings.shape)

Unnamed: 0,Book-Title,Book-Rating
0,"A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)",2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance for the 1990s,1
4,Clifford Visita El Hospital (Clifford El Gran Perro Colorado),1


In [59]:
no_ratings.rename(columns={'Book-Rating':'no_of_ratings'},inplace=True)

In [60]:
no_ratings.head()

Unnamed: 0,Book-Title,no_of_ratings
0,"A Light in the Storm: The Civil War Diary of Amelia Martin, Fenwick Island, Delaware, 1861 (Dear America)",2
1,Always Have Popsicles,1
2,Apple Magic (The Collector's series),1
3,Beyond IBM: Leadership Marketing and Finance for the 1990s,1
4,Clifford Visita El Hospital (Clifford El Gran Perro Colorado),1


In [61]:
finaldf=dataset.merge(no_ratings, on='Book-Title')

In [62]:
finaldf.head()

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating,Location,Age,City,State,Country,no_of_ratings
0,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0002005018.01.LZZZZZZZ.jpg,11676,8,"n/a, n/a, n/a",35,other,other,other,4
1,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0002005018.01.LZZZZZZZ.jpg,85526,0,"victoria, british columbia, canada",36,victoria,british columbia,canada,4
2,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0002005018.01.LZZZZZZZ.jpg,96054,0,"ottawa, ontario, canada",29,ottawa,ontario,canada,4
3,2005018,Clara Callan,Richard Bruce Wright,2001,HarperFlamingo Canada,http://images.amazon.com/images/P/0002005018.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0002005018.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0002005018.01.LZZZZZZZ.jpg,177458,0,"ottawa, ontario, canada",29,ottawa,ontario,canada,4
4,399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0399135782.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0399135782.01.LZZZZZZZ.jpg,11676,9,"n/a, n/a, n/a",35,other,other,other,111


In [63]:
finaldf.shape

(487787, 16)

In [64]:
#considering only those books which got atleast 50 ratings
finaldf1=finaldf[finaldf['no_of_ratings']>=50]

In [65]:
finaldf1

Unnamed: 0,ISBN,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L,User-ID,Book-Rating,Location,Age,City,State,Country,no_of_ratings
4,0399135782,The Kitchen God's Wife,Amy Tan,1991,Putnam Pub Group,http://images.amazon.com/images/P/0399135782.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0399135782.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0399135782.01.LZZZZZZZ.jpg,11676,9,"n/a, n/a, n/a",35,other,other,other,111
5,080410753X,The Kitchen God's Wife,Amy Tan,1992,Ivy Books,http://images.amazon.com/images/P/080410753X.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/080410753X.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/080410753X.01.LZZZZZZZ.jpg,11676,8,"n/a, n/a, n/a",35,other,other,other,111
6,080410753X,The Kitchen God's Wife,Amy Tan,1992,Ivy Books,http://images.amazon.com/images/P/080410753X.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/080410753X.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/080410753X.01.LZZZZZZZ.jpg,85526,0,"victoria, british columbia, canada",36,victoria,british columbia,canada,111
7,080410753X,The Kitchen God's Wife,Amy Tan,1992,Ivy Books,http://images.amazon.com/images/P/080410753X.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/080410753X.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/080410753X.01.LZZZZZZZ.jpg,110912,9,"milpitas, california, usa",36,milpitas,california,usa,111
8,080410753X,The Kitchen God's Wife,Amy Tan,1992,Ivy Books,http://images.amazon.com/images/P/080410753X.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/080410753X.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/080410753X.01.LZZZZZZZ.jpg,137688,0,"medford, new york, usa",46,medford,new york,usa,111
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
246220,0553586122,Reap the Wind,Iris Johansen,2002,Bantam Books,http://images.amazon.com/images/P/0553586122.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0553586122.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0553586122.01.LZZZZZZZ.jpg,63938,0,"northwest, iowa, usa",35,northwest,iowa,usa,50
246221,0553292447,Reap the Wind,Iris Johansen,1991,Bantam Books,http://images.amazon.com/images/P/0553292447.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0553292447.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0553292447.01.LZZZZZZZ.jpg,180348,9,"fredericksburg, virginia, usa",40,fredericksburg,virginia,usa,50
246222,0553586122,Reap the Wind,Iris Johansen,2002,Bantam Books,http://images.amazon.com/images/P/0553586122.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0553586122.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0553586122.01.LZZZZZZZ.jpg,77940,0,"melaka, melaka, malaysia",35,melaka,melaka,malaysia,50
246223,0553586122,Reap the Wind,Iris Johansen,2002,Bantam Books,http://images.amazon.com/images/P/0553586122.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0553586122.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0553586122.01.LZZZZZZZ.jpg,160541,5,"ashford, england, united kingdom",35,ashford,england,united kingdom,50


In [66]:
finaldf1.shape

(61925, 16)

In [67]:
finaldf1.drop_duplicates(['User-ID','Book-Title'],inplace=True)

In [68]:
finaldf1.shape

(59921, 16)

<h2><b>Recommendation Systems

In [69]:
bookName = input("Enter a book name: ")
number = int(input("Enter number of books to recommend: "))

# Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))

<h5><b> 1. Popularity Based (Top In whole collection)

In [70]:
def popularity_based(dataframe, n):
    if n >= 1 and n <= len(dataframe):
        data = pd.DataFrame(dataframe.groupby('ISBN')['Book-Rating'].count()).sort_values('Book-Rating', ascending=False).head(n)
        result = pd.merge(data, books, on='ISBN')
        return result
    return "Invalid number of books entered!!"

In [71]:
print("Top", number, "Popular books are: ")
popularity_based(finaldf1, number)

Top 5 Popular books are: 


Unnamed: 0,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,971880107,363,Wild Animus,Rich Shapero,2004,Too Far,http://images.amazon.com/images/P/0971880107.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0971880107.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0971880107.01.LZZZZZZZ.jpg
1,316666343,270,The Lovely Bones: A Novel,Alice Sebold,2002,"Little, Brown",http://images.amazon.com/images/P/0316666343.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0316666343.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0316666343.01.LZZZZZZZ.jpg
2,60928336,220,Divine Secrets of the Ya-Ya Sisterhood: A Novel,Rebecca Wells,1997,Perennial,http://images.amazon.com/images/P/0060928336.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0060928336.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0060928336.01.LZZZZZZZ.jpg
3,440214041,218,The Pelican Brief,John Grisham,1993,Dell,http://images.amazon.com/images/P/0440214041.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0440214041.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0440214041.01.LZZZZZZZ.jpg
4,385504209,215,The Da Vinci Code,Dan Brown,2003,Doubleday,http://images.amazon.com/images/P/0385504209.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0385504209.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0385504209.01.LZZZZZZZ.jpg


<h5><b>2. Popularity Based (Top In a given place)

In [72]:
def search_unique_places(dataframe, place):
    place = place.lower()

    if place in list(dataframe['City'].unique()):
        return dataframe[dataframe['City'] == place]
    elif place in list(dataframe['State'].unique()):
        return dataframe[dataframe['State'] == place]
    elif place in list(dataframe['Country'].unique()):
        return dataframe[dataframe['Country'] == place]
    else:
        return "Invalid Entry"

In [73]:
place = input("Enter the name of place: ")
data = search_unique_places(finaldf1, place)

if isinstance(data, pd.DataFrame):
    data = popularity_based(data, number)

data

Unnamed: 0,ISBN,Book-Rating,Book-Title,Book-Author,Year-Of-Publication,Publisher,Image-URL-S,Image-URL-M,Image-URL-L
0,051513628X,2,Key of Light (Key Trilogy (Paperback)),Nora Roberts,2003,Jove Books,http://images.amazon.com/images/P/051513628X.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/051513628X.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/051513628X.01.LZZZZZZZ.jpg
1,051513628X,2,Key of Light,Nora Roberts,2003,Jove Books,http://images.amazon.com/images/P/051513628X.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/051513628X.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/051513628X.01.LZZZZZZZ.jpg
2,0312284683,1,Faking It,Jennifer Crusie,2002,St. Martin's Press,http://images.amazon.com/images/P/0312284683.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0312284683.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0312284683.01.LZZZZZZZ.jpg
3,0312291639,1,The Nanny Diaries: A Novel,Emma McLaughlin,2003,St. Martin's Griffin,http://images.amazon.com/images/P/0312291639.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0312291639.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0312291639.01.LZZZZZZZ.jpg
4,0515128600,1,The Edge,Catherine Coulter,2000,Jove Books,http://images.amazon.com/images/P/0515128600.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0515128600.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0515128600.01.LZZZZZZZ.jpg
5,0515130966,1,Riptide,Catherine Coulter,2001,Jove Books,http://images.amazon.com/images/P/0515130966.01.THUMBZZZ.jpg,http://images.amazon.com/images/P/0515130966.01.MZZZZZZZ.jpg,http://images.amazon.com/images/P/0515130966.01.LZZZZZZZ.jpg


<b><h5>3. Books by same author, publisher of given book name

In [74]:
def printBook(k, n):
    z = k['Book-Title'].unique()
    for x in range(len(z)):
        print(z[x])
        if x >= n-1:
            break

In [75]:
def get_books(dataframe, name, n):
    print("\nBooks by same Author:\n")
    au = dataframe['Book-Author'].unique()

    data = dataset[dataset['Book-Title'] != name]

    if au[0] in list(data['Book-Author'].unique()):
        k2 = data[data['Book-Author'] == au[0]]
    k2 = k2.sort_values(by=['Book-Rating'])
    printBook(k2, n)

    print("\n\nBooks by same Publisher:\n")
    au = dataframe['Publisher'].unique()

    if au[0] in list(data['Publisher'].unique()):
        k2 = pd.DataFrame(data[data['Publisher'] == au[0]])
    k2=k2.sort_values(by=['Book-Rating']) 
    printBook(k2, n)

In [76]:
if bookName in list(finaldf1['Book-Title'].unique()):
    d = finaldf1[finaldf1['Book-Title'] == bookName]
    get_books(d, bookName, number)
else:
    print("Invalid Book Name!!")


Books by same Author:

Harry Potter and the Goblet of Fire (Book 4)
Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))
Harry Potter and the Chamber of Secrets (Book 2)
Harry Potter and the Sorcerer's Stone (Book 1)
Harry Potter and the Order of the Phoenix (Book 5)


Books by same Publisher:

Blind Flight
The Encounter (Animorphs , No 3)
The Visitor (Animorphs, No 2)
Amazing But True Sports Stories
Green Angel


<b><h5>4.Using Nearest Neighbours

In [77]:
finaldf1.shape

(59921, 16)

In [78]:
pivot=finaldf1.pivot_table(columns='User-ID',index='Book-Title', values='Book-Rating')

In [79]:
pivot

User-ID,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,,,,,,,,,,...,,,,,,0.0,,,,
1st to Die: A Novel,,,,,,,,,,,...,,,,,,,,,,
2nd Chance,,10.0,,,,,,,,,...,,,,0.0,,,,,0.0,
4 Blondes,,,,,,,,,,0.0,...,,,,,,,,,,
84 Charing Cross Road,,,,,,,,,,,...,,,,,,10.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,,,,7.0,,,,,7.0,,...,,,,,,0.0,,,,
You Belong To Me,,,,,,,,,,,...,,,,,,,,,,
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,,,,,0.0,,,,,0.0,...,,,,,,0.0,,,,
Zoya,,,,,,,,,,,...,,,,,,,,,,


In [80]:
pivot.fillna(0,inplace=True)

In [81]:
pivot

User-ID,254,2276,2766,2977,3363,3757,4017,4385,6242,6251,...,274004,274061,274301,274308,274808,275970,277427,277478,277639,278418
Book-Title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1984,9.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1st to Die: A Novel,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2nd Chance,0.0,10.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4 Blondes,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
84 Charing Cross Road,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,10.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Year of Wonders,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,7.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
You Belong To Me,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zen and the Art of Motorcycle Maintenance: An Inquiry into Values,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Zoya,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [82]:
from scipy.sparse import csr_matrix

In [83]:
pivot_sparse= csr_matrix(pivot)

In [84]:
type(pivot_sparse)

scipy.sparse._csr.csr_matrix

In [85]:
from sklearn.neighbors import NearestNeighbors
model = NearestNeighbors(algorithm= 'brute')

In [86]:
model.fit(pivot_sparse)

NearestNeighbors(algorithm='brute')

In [87]:
distance, suggestion = model.kneighbors(pivot.iloc[237,:].values.reshape(1,-1), n_neighbors=6 )

In [88]:
distance

array([[ 0.        , 69.5413546 , 69.69935437, 72.74613392, 76.83098333,
        77.3369252 ]])

In [89]:
suggestion

array([[237, 238, 240, 241, 184, 291]], dtype=int64)

In [90]:
pivot.iloc[241,:]

User-ID
254       9.0
2276      0.0
2766      0.0
2977      0.0
3363      0.0
         ... 
275970    9.0
277427    0.0
277478    0.0
277639    0.0
278418    0.0
Name: Harry Potter and the Sorcerer's Stone (Book 1), Length: 888, dtype: float64

In [91]:
for i in range(len(suggestion)):
    print(pivot.index[suggestion[i]])

Index(['Harry Potter and the Chamber of Secrets (Book 2)',
       'Harry Potter and the Goblet of Fire (Book 4)',
       'Harry Potter and the Prisoner of Azkaban (Book 3)',
       'Harry Potter and the Sorcerer's Stone (Book 1)', 'Exclusive',
       'Jacob Have I Loved'],
      dtype='object', name='Book-Title')


In [92]:
pivot.index[3]

'4 Blondes'

In [93]:
#keeping books name
book_names = pivot.index

In [94]:
book_names[2]

'2nd Chance'

In [95]:
np.where(pivot.index== '4 Blondes')[0][0]

3

In [96]:
book_names[3]

'4 Blondes'

<b>Find images

In [97]:
# finaldf['title'].value_counts()
imgs= np.where(finaldf1['Book-Title'] == "Harry Potter and the Chamber of Secrets (Book 2)")[0][0]

In [98]:
finaldf1.iloc[imgs]['Image-URL-M']

'http://images.amazon.com/images/P/0439064872.01.MZZZZZZZ.jpg'

In [99]:
book_name = []
for book_id in suggestion:
     book_name.append(pivot.index[book_id])

In [100]:
book_name[0]

Index(['Harry Potter and the Chamber of Secrets (Book 2)',
       'Harry Potter and the Goblet of Fire (Book 4)',
       'Harry Potter and the Prisoner of Azkaban (Book 3)',
       'Harry Potter and the Sorcerer's Stone (Book 1)', 'Exclusive',
       'Jacob Have I Loved'],
      dtype='object', name='Book-Title')

In [101]:
imgs_index=[]
for name in book_name[0]:
    imgs=np.where(finaldf1['Book-Title'] == name)[0][0]
    imgs_index.append(imgs)

In [102]:
for idx in imgs_index:
    url = finaldf1.iloc[idx]['Image-URL-M']
    print(url)

http://images.amazon.com/images/P/0439064872.01.MZZZZZZZ.jpg
http://images.amazon.com/images/P/0439139597.01.MZZZZZZZ.jpg
http://images.amazon.com/images/P/0439136350.01.MZZZZZZZ.jpg
http://images.amazon.com/images/P/0590353403.01.MZZZZZZZ.jpg
http://images.amazon.com/images/P/0446604232.01.MZZZZZZZ.jpg
http://images.amazon.com/images/P/0064403688.01.MZZZZZZZ.jpg


In [103]:
import pickle

In [104]:
pickle.dump(model,open('BRSmodel.pkl','wb'))
pickle.dump(book_names,open('book_names.pkl','wb'))
pickle.dump(finaldf1,open('finaldf1.pkl','wb'))
pickle.dump(pivot,open('pivot.pkl','wb'))

<b>Model testing

In [105]:
def recommend_book(book_name):
    book_id= np.where(pivot.index == book_name)[0][0]
    distance , suggestion = model.kneighbors(pivot.iloc[book_id,:].values.reshape(1,-1), n_neighbors=6)

    for i in range(len(suggestion)):
            books = pivot.index[suggestion[i]]
            for j in books:
                if j == book_name:
                        print(f"You searched '{book_name}'\n")
                        print("The recommended books are: \n")
                else:
                  print(j)

In [106]:
book_name= "Harry Potter and the Prisoner of Azkaban (Book 3)"
recommend_book(book_name)

You searched 'Harry Potter and the Prisoner of Azkaban (Book 3)'

The recommended books are: 

Harry Potter and the Goblet of Fire (Book 4)
Harry Potter and the Chamber of Secrets (Book 2)
Harry Potter and the Sorcerer's Stone (Book 1)
Harry Potter and the Order of the Phoenix (Book 5)
Tough Cookie


<b><h5>5.Based on cosine_similarity

In [107]:
from sklearn.metrics.pairwise import cosine_similarity

In [108]:
similarity_scores = cosine_similarity(pivot)

In [109]:
similarity_scores

array([[1.        , 0.07624004, 0.        , ..., 0.09387814, 0.04480685,
        0.03286937],
       [0.07624004, 1.        , 0.24424725, ..., 0.07424784, 0.16365457,
        0.15050832],
       [0.        , 0.24424725, 1.        , ..., 0.0432679 , 0.04617844,
        0.10992264],
       ...,
       [0.09387814, 0.07424784, 0.0432679 , ..., 1.        , 0.07085128,
        0.03898126],
       [0.04480685, 0.16365457, 0.04617844, ..., 0.07085128, 1.        ,
        0.13208788],
       [0.03286937, 0.15050832, 0.10992264, ..., 0.03898126, 0.13208788,
        1.        ]])

In [110]:
similarity_scores.shape

(743, 743)

In [111]:
np.where(pivot.index == "Harry Potter and the Prisoner of Azkaban (Book 3)")[0][0]

240

In [119]:
sorted(list(enumerate(similarity_scores[1])),key = lambda x:x[1],reverse = True)[1:6]

[(45, 0.3264281010124157),
 (398, 0.2863486231475148),
 (702, 0.28224417845567223),
 (426, 0.2663898357979779),
 (654, 0.26201292516529295)]

In [124]:
def recommend_books(book_name):
    index = np.where(pivot.index == book_name)[0][0]

    similar_items = sorted(list(enumerate(similarity_scores[index])),key = lambda x:x[1],reverse = True)[1:6]

    data = []
    for i in similar_items:

        item = []
        temp_df = books[books["Book-Title"] == pivot.index[i[0]]]
        item.extend(list(temp_df.drop_duplicates("Book-Title")["Book-Title"].values))
        item.extend(list(temp_df.drop_duplicates("Book-Title")["Book-Author"].values))
        item.extend(list(temp_df.drop_duplicates("Book-Title")["Image-URL-M"].values))

        data.append(item)

    return data


In [126]:
recommend_books("Harry Potter and the Prisoner of Azkaban (Book 3)")

[['Harry Potter and the Goblet of Fire (Book 4)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0439139597.01.MZZZZZZZ.jpg'],
 ['Harry Potter and the Chamber of Secrets (Book 2)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0439064872.01.MZZZZZZZ.jpg'],
 ['Harry Potter and the Order of the Phoenix (Book 5)',
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0439567610.01.MZZZZZZZ.jpg'],
 ["Harry Potter and the Sorcerer's Stone (Book 1)",
  'J. K. Rowling',
  'http://images.amazon.com/images/P/0590353403.01.MZZZZZZZ.jpg'],
 ["Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))",
  'J. K. Rowling',
  'http://images.amazon.com/images/P/059035342X.01.MZZZZZZZ.jpg']]