In [83]:
# Importing Libraries
import pandas as pd

In [84]:
# Adjusting pandas dataframe settings
pd.set_option('display.max_columns', 500)

In [85]:
# Reading Dataframes
books = pd.read_csv("../input/book-recommendation-dataset/Books.csv", low_memory='False')
ratings = pd.read_csv('../input/book-recommendation-dataset/Ratings.csv', low_memory='False')
users = pd.read_csv("../input/book-recommendation-dataset/Users.csv", low_memory='False')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [86]:
# Merging Dataframes
temp_df = books.merge(ratings, how='left', on='ISBN')
final_df = temp_df.merge(users, how='left', on='User-ID')
final_df.shape

(1032345, 12)

In [87]:
# Data Preprocessing
final_df.drop(['Image-URL-S', 'Image-URL-M','Image-URL-L'], axis=1, inplace=True)
final_df.dropna(inplace=True)

final_df['User-ID'] = final_df['User-ID'].astype('int')
final_df=final_df[final_df["Book-Rating"]>0]

In [88]:
# Analysis of Data

final_df.columns
final_df['User-ID'].nunique() # 40543 unique Readers
final_df['ISBN'].nunique() # 119945 unique ISBN number (books)
final_df['Book-Title'].nunique() # 109210 unique Books.
final_df['Book-Author'].nunique() # 50883 unique Author.
final_df['Publisher'].nunique() # 9749 unique Publisher.
final_df['Book-Rating'].nunique() # 10 unique Rating (1-10).

final_df.groupby('User-ID')['Book-Title'].agg('count').sort_values(ascending=False)

User-ID
98391     5689
153662    1845
235105    1020
171118     962
16795      959
          ... 
120886       1
120862       1
120861       1
120852       1
278852       1
Name: Book-Title, Length: 40543, dtype: int64

In [89]:
final_df.groupby('Book-Author')['ISBN'].agg('count').sort_values(ascending=False)

Book-Author
Stephen King         3488
Nora Roberts         2138
John Grisham         1670
James Patterson      1559
J. K. Rowling        1375
                     ... 
James Hawkins           1
James Henry Gray        1
James Henry Rubin       1
James Hoffman           1
Ã?Â?pictÃ?Â¨te          1
Name: ISBN, Length: 50883, dtype: int64

In [90]:
final_df.groupby('ISBN').agg({'Book-Title':'count'}).sort_values(by='Book-Title',ascending=False).head(100)

Unnamed: 0_level_0,Book-Title
ISBN,Unnamed: 1_level_1
0971880107,473
0316666343,473
0385504209,373
0312195516,227
059035342X,224
...,...
0385492081,82
0385486804,82
0449212602,81
0375700757,80


In [91]:
final_df.groupby('User-ID')['Book-Rating'].agg('count').sort_values(ascending=False)

User-ID
98391     5689
153662    1845
235105    1020
171118     962
16795      959
          ... 
120886       1
120862       1
120861       1
120852       1
278852       1
Name: Book-Rating, Length: 40543, dtype: int64

In [92]:
# We can calculate the ratings of the books and eliminate the rare books by determining a threshold which is 85 in this case by removing the main dataframe.
# Finally, we have common books which are rated more than 85.
rating_book = pd.DataFrame(final_df['Book-Title'].value_counts())
final_df['Book-Title'].value_counts().mean() # 2.4688

rare_books = rating_book[rating_book['Book-Title'] <= 85].index

common_books = final_df[~final_df["Book-Title"].isin(rare_books)]
common_books['Book-Title'].value_counts().mean() # 139.6589

139.65891472868216

In [93]:
# Creating User-Item Matrix Dataframe
user_book_matrix = common_books.pivot_table(index=['User-ID'], columns=['Book-Title'], values='Book-Rating')
user_book_matrix.shape

(8742, 129)

In [94]:
# If the correlations are quite similar, that means these books are similar.
book_name = 'Interview with the Vampire'

def book_to_ISBN(name):
    """
    This function gives us the ISBN number of the book for the name of the book which you want to learn ISBN number.
    :param name: the name of the book
    :return: ISBN; the books are identified by their respective ISBN
    """
    ISBN = final_df[final_df['Book-Title'] == name][['ISBN']].iloc[0,0]
    return ISBN

ISBN_no = book_to_ISBN(book_name)

book_df = user_book_matrix[book_name]
book_df.sort_values(ascending=False)

User-ID
259452    10.0
104399    10.0
222062    10.0
223190    10.0
223325    10.0
          ... 
278800     NaN
278836     NaN
278843     NaN
278844     NaN
278846     NaN
Name: Interview with the Vampire, Length: 8742, dtype: float64

In [95]:
# Let's look at the correlations between books with selected book and sort the best.
similar_to_book_name = user_book_matrix.corrwith(book_df)

corr_book_name = pd.DataFrame(similar_to_book_name, columns=['Correlation'])
corr_book_name.dropna(inplace=True)
corr_book_name.sort_values(by='Correlation',ascending=False).head(50)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


Unnamed: 0_level_0,Correlation
Book-Title,Unnamed: 1_level_1
She's Come Undone (Oprah's Book Club (Paperback)),1.0
The Beach House,1.0
"The Golden Compass (His Dark Materials, Book 1)",1.0
Suzanne's Diary for Nicholas,1.0
1st to Die: A Novel,1.0
Neverwhere,1.0
While I Was Gone,1.0
ANGELA'S ASHES,1.0
The Reader,1.0
Interview with the Vampire,1.0


In [96]:
# Creating recommendation list.
recommended_list = corr_book_name.reset_index().sort_values(by=('Correlation'),ascending=False)['Book-Title'].head(10).to_list()
recommended_list

["She's Come Undone (Oprah's Book Club (Paperback))",
 'The Beach House',
 'The Golden Compass (His Dark Materials, Book 1)',
 "Suzanne's Diary for Nicholas",
 '1st to Die: A Novel',
 'Neverwhere',
 'While I Was Gone',
 "ANGELA'S ASHES",
 'The Reader',
 'Interview with the Vampire']