In [1]:
from difflib import get_close_matches 
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

Importing Datasets

In [2]:
book_data = pd.read_csv("books.csv")
book_data.head()

Unnamed: 0,id,book_id,best_book_id,work_id,books_count,isbn,isbn13,authors,original_publication_year,original_title,...,ratings_count,work_ratings_count,work_text_reviews_count,ratings_1,ratings_2,ratings_3,ratings_4,ratings_5,image_url,small_image_url
0,1,2767052,2767052,2792775,272,439023483,9780439000000.0,Suzanne Collins,2008.0,The Hunger Games,...,4780653,4942365,155254,66715,127936,560092,1481305,2706317,https://images.gr-assets.com/books/1447303603m...,https://images.gr-assets.com/books/1447303603s...
1,2,3,3,4640799,491,439554934,9780440000000.0,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,...,4602479,4800065,75867,75504,101676,455024,1156318,3011543,https://images.gr-assets.com/books/1474154022m...,https://images.gr-assets.com/books/1474154022s...
2,3,41865,41865,3212258,226,316015849,9780316000000.0,Stephenie Meyer,2005.0,Twilight,...,3866839,3916824,95009,456191,436802,793319,875073,1355439,https://images.gr-assets.com/books/1361039443m...,https://images.gr-assets.com/books/1361039443s...
3,4,2657,2657,3275794,487,61120081,9780061000000.0,Harper Lee,1960.0,To Kill a Mockingbird,...,3198671,3340896,72586,60427,117415,446835,1001952,1714267,https://images.gr-assets.com/books/1361975680m...,https://images.gr-assets.com/books/1361975680s...
4,5,4671,4671,245494,1356,743273567,9780743000000.0,F. Scott Fitzgerald,1925.0,The Great Gatsby,...,2683664,2773745,51992,86236,197621,606158,936012,947718,https://images.gr-assets.com/books/1490528560m...,https://images.gr-assets.com/books/1490528560s...


In [3]:
rating_data = pd.read_csv("ratings.csv")
rating_data.head()

Unnamed: 0,book_id,user_id,rating
0,1,314,5
1,1,439,3
2,1,588,5
3,1,1169,4
4,1,1185,4


In [4]:
tag_data = pd.read_csv("book_tags.csv")
tag_data.head()

Unnamed: 0,goodreads_book_id,tag_id,count
0,1,30574,167697
1,1,11305,37174
2,1,11557,34173
3,1,8717,12986
4,1,33114,12716


In [5]:
book_tags = pd.read_csv("tags.csv")
book_tags.head()
book_tags.tail()

Unnamed: 0,tag_id,tag_name
34247,34247,Ｃhildrens
34248,34248,Ｆａｖｏｒｉｔｅｓ
34249,34249,Ｍａｎｇａ
34250,34250,ＳＥＲＩＥＳ
34251,34251,ｆａｖｏｕｒｉｔｅｓ


Drop Unnecessary Columns


In [6]:
book_data = book_data.drop(columns=['id', 'best_book_id', 'work_id', 'isbn', 'isbn13', 'title','work_ratings_count',
                                   'work_text_reviews_count', 'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5',
                                    'image_url','small_image_url'], axis=1)
book_data.head()

Unnamed: 0,book_id,books_count,authors,original_publication_year,original_title,language_code,average_rating,ratings_count
0,2767052,272,Suzanne Collins,2008.0,The Hunger Games,eng,4.34,4780653
1,3,491,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,eng,4.44,4602479
2,41865,226,Stephenie Meyer,2005.0,Twilight,en-US,3.57,3866839
3,2657,487,Harper Lee,1960.0,To Kill a Mockingbird,eng,4.25,3198671
4,4671,1356,F. Scott Fitzgerald,1925.0,The Great Gatsby,eng,3.89,2683664


Remove Nan value and null value

In [7]:
print(book_data.isnull().sum())

book_id                         0
books_count                     0
authors                         0
original_publication_year      21
original_title                585
language_code                1084
average_rating                  0
ratings_count                   0
dtype: int64


In [8]:
book_data = book_data.dropna()
book_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8405 entries, 0 to 9998
Data columns (total 8 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   book_id                    8405 non-null   int64  
 1   books_count                8405 non-null   int64  
 2   authors                    8405 non-null   object 
 3   original_publication_year  8405 non-null   float64
 4   original_title             8405 non-null   object 
 5   language_code              8405 non-null   object 
 6   average_rating             8405 non-null   float64
 7   ratings_count              8405 non-null   int64  
dtypes: float64(2), int64(3), object(3)
memory usage: 591.0+ KB


In [9]:
print(book_data.isnull().sum())

book_id                      0
books_count                  0
authors                      0
original_publication_year    0
original_title               0
language_code                0
average_rating               0
ratings_count                0
dtype: int64


Drop Duplicates

In [10]:
rating_data = rating_data.sort_values("user_id")
rating_data.drop_duplicates(subset =["user_id","book_id"], keep = False, inplace = True)
rating_data.head()

Unnamed: 0,book_id,user_id,rating
117889,1180,1,4
488112,4893,1,3
625717,6285,1,4
796318,8034,2,4
875008,8855,2,5


In [11]:
book_data.drop_duplicates(subset='original_title',keep=False,inplace=True)

In [12]:
book_tags.drop_duplicates(subset='tag_id',keep=False,inplace=True)

In [13]:
tag_data.drop_duplicates(subset=['tag_id','goodreads_book_id'],keep=False,inplace=True)

Content Based Recommendation

In [14]:
book_data["Content"] = book_data['original_title'] + ' ' + book_data['authors'] + ' ' + book_data['average_rating'].astype(str)

In [15]:
book_data = book_data.reset_index()
book_data.head()

Unnamed: 0,index,book_id,books_count,authors,original_publication_year,original_title,language_code,average_rating,ratings_count,Content
0,0,2767052,272,Suzanne Collins,2008.0,The Hunger Games,eng,4.34,4780653,The Hunger Games Suzanne Collins 4.34
1,1,3,491,"J.K. Rowling, Mary GrandPré",1997.0,Harry Potter and the Philosopher's Stone,eng,4.44,4602479,Harry Potter and the Philosopher's Stone J.K. ...
2,3,2657,487,Harper Lee,1960.0,To Kill a Mockingbird,eng,4.25,3198671,To Kill a Mockingbird Harper Lee 4.25
3,4,4671,1356,F. Scott Fitzgerald,1925.0,The Great Gatsby,eng,3.89,2683664,The Great Gatsby F. Scott Fitzgerald 3.89
4,5,11870085,226,John Green,2012.0,The Fault in Our Stars,eng,4.26,2346404,The Fault in Our Stars John Green 4.26


In [16]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(book_data['Content'])
tfidf_matrix.shape

(8175, 13024)

In [17]:
cosine_sim = cosine_similarity(tfidf_matrix)

In [18]:
cosine_sim

array([[1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 1.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 1.        , ..., 0.10729347, 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.10729347, ..., 1.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        1.        ]])

In [19]:
def get_title_from_index(index):
    return book_data[book_data.index==index]['original_title'].values[0]
def get_index_from_title(title):
    return book_data[book_data.original_title==title]['index'].values[0]

In [27]:
def recommendation(Book_name):
    Book_index = get_index_from_title(Book_name)
    similar_books = list(enumerate(cosine_sim[Book_index]))
    sorted_list = sorted(similar_books, key=lambda x:x[1], reverse=True)[1:]
    i=0
    print("Top ten similar books are:\n")
    for elements in sorted_list:
        print(get_title_from_index(elements[0]))
        i = i+1
        if i>10:
            break

In [28]:
book_list=book_data.original_title.to_list()

Getting The name on which suggestion is given

In [29]:
book_name = "The Great gotby" #original is: "The Great Gatsby"

Autocorrect

In [30]:
closest = get_close_matches(book_name,book_list)

In [31]:
Book_name = closest[0]

In [32]:
Book_name

'The Great Gatsby'

Results

In [33]:
if Book_name in book_list:
    recommendation(Book_name)
else:
    print("Wrong Book Name, Please put Correct Name")

Top ten similar books are:

Agent to the stars
Full Dark, No Stars
Stars of Fortune
Paper Towns
Looking for Alaska
Stars Above
The City and the Stars
An Abundance of Katherines
Number the Stars
The Stars Shine Down
The Other Woman
