In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from scipy.sparse import csr_matrix, coo_matrix
import numpy as np

%matplotlib inline

In [2]:
# Load the ratings file and give its three columns snake_case names.
ratings = pd.read_csv('BX-Book-Ratings.csv', sep=';', encoding='iso-8859-1')
ratings.columns = ['user_id', 'isbn', 'book_rating']

# Every column of the books file is read as a string.
books = pd.read_csv('BX-Books.csv', sep=';', encoding='iso-8859-1', dtype=str)

# Interactive check only: the result is discarded because this is not the
# last expression of the cell.
books["Book-Title"].nunique() == books["ISBN"].nunique()

# Build a title -> ISBN lookup (one ISBN is kept per title) and use it to
# give every row that shares a title the same identifier.
book_dict = books.set_index("Book-Title")["ISBN"].to_dict()
books['new_isbn'] = books["Book-Title"].apply(book_dict.__getitem__)

# Second interactive check; its result is discarded as well.
books["Book-Title"].nunique() == books["new_isbn"].nunique()
books['isbn'] = books['new_isbn']

# Remove the columns that are not used after this point.
books.drop(
    ['Image-URL-L', 'Image-URL-M', 'Image-URL-S',
     'Book-Author', 'Publisher', 'ISBN', 'new_isbn'],
    axis=1,
    inplace=True,
)

# Keep only ratings greater than zero, then attach them to the books.
newdf = ratings[ratings['book_rating'] > 0]
joined = pd.merge(books, newdf, on='isbn')
print(newdf.shape)

(433671, 3)


In [3]:
# Load goodreads_list_props.csv and discard rows that are exact duplicates.
bookinfo = pd.read_csv("goodreads_list_props.csv").drop_duplicates()

In [4]:
# Keep only the first row for each distinct isbn value.
books.drop_duplicates(subset=['isbn'], keep='first', inplace=True)

In [5]:
#!/usr/bin/env python
# -*- coding: utf-8 -*-

import sys


def is_isbn10_valid(isbn):
    """
    Check whether ``isbn`` is a valid ISBN-10.

    Checksum rule taken from:
    http://en.wikipedia.org/wiki/International_Standard_Book_Number

    Parameters:
        isbn: candidate value. Anything that is not a 10-character string
            is reported as invalid instead of raising.

    Returns:
        bool: True when the first nine characters are ASCII digits, the
        last character is an ASCII digit or an upper-case 'X', and the
        weighted sum (weights 10 down to 1, 'X' counting as 10) is
        divisible by 11.
    """
    if not isinstance(isbn, str) or len(isbn) != 10:
        return False

    # str.isdigit() is also True for characters such as '²' that int()
    # cannot parse, so membership in the ASCII digits is tested instead.
    ascii_digits = '0123456789'
    if any(ch not in ascii_digits for ch in isbn[:9]):
        return False
    last = isbn[9]
    if last != 'X' and last not in ascii_digits:
        return False

    total = sum((10 - i) * (10 if ch == 'X' else int(ch))
                for i, ch in enumerate(isbn))
    return total % 11 == 0


def is_isbn13_valid(isbn):
    """
    Check whether ``isbn`` is a valid ISBN-13.

    Checksum rule taken from:
    http://en.wikipedia.org/wiki/International_Standard_Book_Number

    Parameters:
        isbn: candidate value. Anything that is not a 13-character string
            of ASCII digits is reported as invalid instead of raising.

    Returns:
        bool: True when the check digit computed from the first twelve
        digits (alternating weights 1 and 3) equals the thirteenth digit.
    """
    if not isinstance(isbn, str) or len(isbn) != 13:
        return False

    # str.isdigit() is also True for characters such as '²' that int()
    # cannot parse, so membership in the ASCII digits is tested instead.
    if any(ch not in '0123456789' for ch in isbn):
        return False

    weighted = sum(int(digit) * (3 if idx % 2 else 1)
                   for idx, digit in enumerate(isbn[:12]))
    check = (10 - weighted % 10) % 10
    return check == int(isbn[-1])


def isbn13_to_isbn10(isbn13_str):
    """
    Convert an ISBN-13 string to its ISBN-10 form.

    Parameters:
        isbn13_str (str): the ISBN-13 to convert. Its own check digit is
            not verified here; only characters 3..11 are used.

    Returns:
        str or None: the ten-character ISBN-10 (nine digits followed by a
        freshly computed check character, 'X' standing for 10), or None
        when the input does not start with '978'. Only 978-prefixed
        ISBN-13s have an ISBN-10 equivalent, so converting any other
        prefix would produce an identifier for a different book.
    """
    if not isbn13_str.startswith('978'):
        return None

    core = isbn13_str[3:12]
    weighted = sum((10 - i) * int(ch) for i, ch in enumerate(core))
    # The check value lies in 0..10; 10 is written as 'X'.
    check_value = (11 - weighted % 11) % 11
    check_digit = 'X' if check_value == 10 else str(check_value)
    return core + check_digit


def isbn10_to_isbn13(isbn10_str):
    """
    Build the ISBN-13 that corresponds to an ISBN-10.

    The first nine characters of ``isbn10_str`` are prefixed with '978'
    and a new check digit is computed over those twelve digits using
    alternating weights 1 and 3. The ISBN-10 check character is ignored.
    """
    prefixed = '978' + isbn10_str[:9]
    total = 0
    for position, char in enumerate(prefixed):
        weight = 3 if position % 2 else 1
        total += int(char) * weight
    return prefixed + str((10 - total % 10) % 10)


def isbn_converter(isbn):
    """
    Convert an ISBN to the other format.

    A valid ISBN-10 is turned into an ISBN-13 and a valid ISBN-13 into an
    ISBN-10. Returns None when ``isbn`` is valid in neither format.
    """
    if is_isbn10_valid(isbn):
        return isbn10_to_isbn13(isbn)
    if is_isbn13_valid(isbn):
        return isbn13_to_isbn10(isbn)
    return None


# Command-line entry point: convert every argument and print the result.
# When this cell runs inside a notebook kernel, __name__ is "__main__" too
# and sys.argv holds the kernel's launch arguments, which is why those
# arguments get reported as bad ISBNs in the cell output.
if __name__ == "__main__":
    for isbn_str in sys.argv[1:]:
        the_result = isbn_converter(isbn_str)
        print(the_result if the_result else "Bad ISBN " + isbn_str)

Bad ISBN -f
Bad ISBN C:\Users\vijay\AppData\Roaming\jupyter\runtime\kernel-f0403c4b-f81b-4cf4-837e-adc1c052d77d.json


In [6]:
# Run every isbn of `books` through the converter; entries the converter
# cannot handle come back as None.
isbn13 = [isbn_converter(value) for value in books['isbn']]

In [7]:
# Store the converted identifiers next to the rows they were computed from.
books["isbn13"] = isbn13

In [8]:
# Remove, in place, the rows of both frames that have no isbn13 value.
for _frame in (books, bookinfo):
    _frame.dropna(subset=['isbn13'], inplace=True)

In [9]:
# Inner join on isbn13: only books present in both frames are kept.
# NOTE(review): both isbn13 columns need compatible dtypes for the keys to
# match -- confirm how goodreads_list_props.csv stores isbn13.
mergedinfo = pd.merge(bookinfo, books, how='inner', on='isbn13')

In [10]:
import re
def striphtml(data):
    """
    Remove HTML-style tags from ``data``.

    Every non-greedy ``<...>`` match is deleted. Because '.' does not match
    newlines here, a tag that spans several lines is left untouched.

    Parameters:
        data: the text to clean.

    Returns:
        str or None: the text without tags, or None when ``data`` is not
        text the pattern can be applied to (for example None or NaN).
    """
    p = re.compile('<.*?>')
    try:
        return p.sub('', data)
    except TypeError:
        # Only the "not a string" failure is treated as a missing value;
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return None

In [11]:
# Clean the description text: drop tags, trim surrounding whitespace, then
# remove left curly quotes, commas and straight double quotes.
mergedinfo['description'] = (
    mergedinfo['description']
    .apply(striphtml)
    .str.strip()
    .str.replace('“', '')
    .str.replace(',', '')
    .str.replace('"', '')
)

In [44]:
from nltk.corpus import stopwords
# ...
# Split every description on whitespace and drop English stop words.
# Rows without a usable description get None so the list stays aligned
# with the rows of `mergedinfo`.
# NOTE(review): the membership test is case-sensitive, so a capitalised
# word such as "The" is kept unless the stop-word list contains it in that
# exact form -- confirm whether that is intended.
filtereddesc = []
stops = set(stopwords.words("english"))
for desc in mergedinfo['description']:
    try:
        words = desc.split()
    except AttributeError:
        # Missing descriptions (None / NaN) have no .split(); any other
        # error is a real problem and is allowed to surface.
        filtereddesc.append(None)
        continue
    filtereddesc.append([word for word in words if word not in stops])

In [48]:
# Attach the per-row token lists (or None) as a new column.
mergedinfo["filtered_description"] = filtereddesc

In [80]:
# One token list per row; rows whose filtered description is None
# contribute an empty list.
wordlist = [
    list(descs) if descs is not None else []
    for descs in mergedinfo['filtered_description']
]

In [88]:
import gensim
# Load word vectors stored in the binary word2vec format from the file below.
model = gensim.models.KeyedVectors.load_word2vec_format(
    'GoogleNews-vectors-negative300.bin',
    binary=True,
)

In [83]:
# Genre labels that each description is scored against, in output order.
genres = [
    'Science',
    'Satire',
    'Drama',
    'Action',
    'Romance',
    'Mystery',
    'Horror',
    'Travel',
    'Children',
    'Religion',
    'History',
    'Biography',
    'Autobiography',
    'Fantasy',
]

In [94]:
# For every row, compute one score per genre: the mean similarity between
# the genre label and each description word the model knows. Rows without
# a filtered description get None so `scores` stays aligned with
# `mergedinfo`.
scores = []
for desc in mergedinfo['filtered_description']:
    if desc is None:
        scores.append(None)
        continue
    gscore = []
    for genre in genres:
        simsum = 0
        n = 0
        for word in desc:
            try:
                simsum = simsum + model.similarity(word, genre)
            except KeyError:
                # Out-of-vocabulary words are skipped; other errors are
                # no longer silently swallowed.
                continue
            n = n + 1
        # A description with no known words scores 0 for the genre.
        gscore.append(simsum / n if n != 0 else 0)
    scores.append(gscore)

In [109]:
# Spot-check: display the merged row at positional index 12.
mergedinfo.iloc[12]

book_name                                             Pride and Prejudice
author                                                        Jane Austen
rating                                                               4.24
votes                                                             2074236
description             It is a truth universally acknowledged that a ...
book_type                                                       Paperback
no_of_pages                                                           279
first_published                                     ['January 28th 1813']
isbn13                                                      9780679783268
genre                                                          Literature
link                    https://www.goodreads.com//book/show/1885.Prid...
Book-Title                  Pride and Prejudice (Modern Library Classics)
Year-Of-Publication                                                  2000
isbn                                  

In [110]:
# Genre scores at list position 12; one value per entry of `genres`, in
# the same order.
scores[12]

[0.043320644589633382,
 0.11083853228245849,
 0.073109114540767337,
 0.011299775784885288,
 0.10893565938360378,
 0.09275169855010712,
 0.080056073238774811,
 0.043229179516044519,
 0.044671671412237358,
 0.080341256226919594,
 0.099886692389653794,
 0.095638035395685814,
 0.076172551076481299,
 0.069401170797353723]