In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import re
from scipy.sparse import csr_matrix, coo_matrix
import numpy as np

%matplotlib inline

In [2]:
import pickle
# Load the book table; later cells read its 'description', 'isbn' and 'title' columns.
# NOTE(review): unpickling can execute arbitrary code, so only load this file from a trusted source.
bookinfo = pd.read_pickle("ibsn_features_full.pickle")

In [3]:


import sys


def is_isbn10_valid(isbn):
    """
    Tell whether ``isbn`` is a well-formed ISBN-10 with a correct checksum.

    The string must be exactly 10 characters: nine digits followed by a
    digit or an upper-case 'X' (which stands for the value 10).  The
    characters are weighted 10, 9, ..., 1 from left to right and the
    weighted sum must be divisible by 11.

    Algorithm reference:
    http://en.wikipedia.org/wiki/International_Standard_Book_Number
    """
    if len(isbn) != 10:
        return False

    body, last = isbn[0:9], isbn[-1]
    if not body.isdigit():
        return False
    if last != 'X' and not last.isdigit():
        return False

    total = 0
    for weight, char in zip(range(10, 0, -1), isbn):
        total += weight * (10 if char == 'X' else int(char))
    return total % 11 == 0


def is_isbn13_valid(isbn):
    """
    Tell whether ``isbn`` is a well-formed ISBN-13 with a correct check digit.

    The string must consist of exactly 13 digits.  The first twelve digits
    are weighted alternately 1 and 3; the check digit is the amount needed
    to bring that weighted sum up to a multiple of 10.

    Algorithm reference:
    http://en.wikipedia.org/wiki/International_Standard_Book_Number
    """
    if not (len(isbn) == 13 and isbn.isdigit() is True):
        return False

    weighted = 0
    for position, char in enumerate(isbn[:12]):
        weight = 3 if position % 2 else 1
        weighted += int(char) * weight

    expected = (10 - weighted % 10) % 10
    return expected == int(isbn[-1])


def isbn13_to_isbn10(isbn13_str):
    """
    Convert an ISBN-13 string to its ISBN-10 equivalent.

    The ISBN-10 is formed from digits 4-12 of the ISBN-13 (the part after
    the three-digit prefix) plus a freshly computed modulo-11 check
    character.  The ISBN-13 check digit itself is not verified here.

    Parameters
    ----------
    isbn13_str : str
        A 13-digit ISBN without separators.

    Returns
    -------
    str or None
        The 10-character ISBN-10, or None when the input does not start
        with '978'.  Only the 978 range has ISBN-10 counterparts; dropping
        any other prefix (e.g. 979) would fabricate an ISBN-10 that refers
        to a different book or to none at all.
    """
    if not isbn13_str.startswith('978'):
        return None

    core = isbn13_str[3:12]
    # Weights run 10, 9, ..., 2 over the nine core digits.
    weighted = sum((10 - i) * int(x) for i, x in enumerate(core))
    # The check value makes the full weighted sum divisible by 11; a value of
    # 10 is written as 'X'.
    check_value = (11 - weighted % 11) % 11
    check_char = 'X' if check_value == 10 else str(check_value)
    return core + check_char


def isbn10_to_isbn13(isbn10_str):
    """
    Convert an ISBN-10 string to ISBN-13.

    The ISBN-10 check character is dropped, '978' is prepended to the
    remaining nine digits, and a new check digit is computed with the
    alternating 1/3 weighting used by ISBN-13.
    """
    first_twelve = '978' + isbn10_str[:9]

    weighted = 0
    for idx, digit in enumerate(first_twelve):
        weight = 3 if idx % 2 else 1
        weighted += int(digit) * weight

    check_digit = (10 - weighted % 10) % 10
    return first_twelve + str(check_digit)


def isbn_converter(isbn):
    """
    Convert an ISBN to the other format.

    A valid ISBN-10 is converted to ISBN-13 and a valid ISBN-13 is
    converted to ISBN-10.  The ISBN-10 check is tried first.  Input that
    passes neither validity check yields None.
    """
    if is_isbn10_valid(isbn):
        return isbn10_to_isbn13(isbn)
    if is_isbn13_valid(isbn):
        return isbn13_to_isbn10(isbn)
    return None

In [4]:
import re
def striphtml(data):
    """
    Remove HTML/XML-style tags from a string.

    Every non-greedy ``<...>`` span is deleted; the text between tags is
    kept unchanged.

    Parameters
    ----------
    data : str
        Text that may contain markup.

    Returns
    -------
    str or None
        The text without tags, or None when ``data`` is not a string
        (for example None or NaN for a missing description).
    """
    try:
        return re.sub('<.*?>', '', data)
    except TypeError:
        # re raises TypeError for non-string input; treat it as "no text"
        # instead of swallowing every possible exception.
        return None

Download Google's pretrained word2vec model (`GoogleNews-vectors-negative300.bin`) and place it in the working directory before running the next cell.

In [5]:
import gensim
# Load pretrained word vectors from a binary word2vec-format file (relative path,
# so it is resolved against the current working directory).
# NOTE(review): presumably the standard GoogleNews 300-dimensional release — confirm the source.
model = gensim.models.KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)  

In [6]:
# Genre labels that each description is scored against; the order here fixes
# the order of the per-genre score columns built later.
genres = [
    'Science',
    'Satire',
    'Drama',
    'Action',
    'Romance',
    'Mystery',
    'Horror',
    'Travel',
    'Children',
    'Religion',
    'History',
    'Biography',
    'Autobiography',
    'Fantasy',
]

In [7]:
# Normalise the description text: strip HTML tags, trim surrounding whitespace,
# then delete commas and quote characters so the later whitespace split yields
# bare word tokens.  Both the opening and the closing curly quote are removed;
# a leftover closing quote would stay attached to the last quoted word.
bookinfo['description'] = (
    bookinfo['description']
    .apply(striphtml)
    .str.strip()
    .str.replace('“', '')
    .str.replace('”', '')
    .str.replace(',', '')
    .str.replace('"', '')
)

In [8]:
from nltk.corpus import stopwords

# Tokenise each description on whitespace and drop English stop words.
# NLTK's stop-word list is lower-case, so the membership test is done on the
# lower-cased word; otherwise capitalised forms such as "The" or "I" would
# never be filtered.  The kept tokens retain their original casing.
stops = set(stopwords.words("english"))
filtereddesc = []
for desc in bookinfo['description']:
    if isinstance(desc, str):
        filtereddesc.append(
            [word for word in desc.split() if word.lower() not in stops]
        )
    else:
        # Missing descriptions (None/NaN) are kept as None so row positions
        # still line up with bookinfo.
        filtereddesc.append(None)

In [9]:
# Store the filtered token lists on the frame, one entry per book.
bookinfo['filtered_description'] = filtereddesc

# Build a parallel list of token lists in which books without a description
# are represented by an empty list instead of None.
wordlist = [
    list(tokens) if tokens is not None else []
    for tokens in bookinfo['filtered_description']
]

In [10]:
# Score every book against every genre: the score is the mean word2vec
# similarity between the genre name and each description word the model knows.
# Books without a description get None; a genre with no scorable words gets 0.
scores = []
for desc in bookinfo['filtered_description']:
    if desc is None:
        scores.append(None)
        continue

    gscore = []
    for genre in genres:
        simsum = 0
        n = 0
        for word in desc:
            try:
                simsum = simsum + model.similarity(word, genre)
            except KeyError:
                # Word (or genre name) is not in the model's vocabulary;
                # skip it without hiding unrelated errors.
                continue
            n = n + 1
        gscore.append(simsum / n if n != 0 else 0)
    scores.append(gscore)

In [11]:
# Replace missing score rows (books without a description) with all-zero rows.
# The row width is taken from `genres` so it cannot drift out of sync with the
# number of score columns if the genre list is edited.
editedscores = [
    score if score is not None else [0] * len(genres)
    for score in scores
]

In [12]:
# One "<Genre>_Score" column per genre, in the same order as `genres`.
score_columns = [name + '_Score' for name in genres]
scoredf = pd.DataFrame(data=editedscores, columns=score_columns)

In [13]:
# Copy the identifiers by position, not by index label.  scoredf was built from
# a plain list and therefore has a fresh 0..n-1 index, while its rows follow the
# row order of bookinfo; assigning the Series directly would align on
# bookinfo's index labels and yield NaN or misplaced values whenever that index
# is not the default range.
scoredf['isbn'] = bookinfo['isbn'].values
scoredf['title'] = bookinfo['title'].values

In [14]:
# Sanity check: one row per book and 16 columns (14 genre scores + isbn + title).
scoredf.shape

(4959, 16)

In [15]:
# Persist the per-book genre scores. With pandas' default index=True the row
# index is also written, as the first (unnamed) CSV column.
scoredf.to_csv('book_features.csv')