In [1]:
import json
import queue
import re
from concurrent.futures import ThreadPoolExecutor

import requests
from tqdm.notebook import tqdm

In [2]:
# Read the raw cover annotations; the file is decoded explicitly as UTF-8.
with open('./data/book-test-set/book_covers.json', 'r', encoding="utf-8") as covers_file:
    book_covers_data = json.loads(covers_file.read())

In [3]:
class BookCover:
    """Annotations attached to a single book-cover image.

    Built from one JSON object that carries an ``image_name`` and a
    ``custom_annotations`` mapping with the keys
    ``pages_with_matching_images``, ``full_matching_images``,
    ``partial_matching_images`` and ``web_entities``.
    """

    def __init__(self, json_object):
        """Copy the annotation fields out of ``json_object``.

        Raises:
            KeyError: if ``image_name``, ``custom_annotations`` or any of the
                four annotation keys is missing.
        """
        annotations = json_object['custom_annotations']
        self.image_name = json_object['image_name']
        self.pages_with_matching_images = annotations['pages_with_matching_images']
        self.full_matching_images = annotations['full_matching_images']
        self.partial_matching_images = annotations['partial_matching_images']
        self.web_entities = annotations['web_entities']

    def get_possible_titles(self):
        """Return ``(description, score)`` pairs for the web entities.

        Entities with a missing or empty ``description`` are skipped because
        they cannot be used as a search title; a missing ``score`` is treated
        as ``0.0`` so the pair still sorts (last) instead of raising.
        """
        # TODO: make a better implementation
        return [
            (web_entity['description'], web_entity.get('score', 0.0))
            for web_entity in self.web_entities
            if web_entity.get('description')
        ]

    def to_dict(self):
        """Return the cover's fields as a plain, JSON-serialisable dict."""
        return {
            'image_name': self.image_name,
            'pages_with_matching_images': self.pages_with_matching_images,
            'full_matching_images': self.full_matching_images,
            'partial_matching_images': self.partial_matching_images,
            'web_entities': self.web_entities,
        }

    def __str__(self):
        """Pretty-printed JSON rendering of :meth:`to_dict`."""
        return json.dumps(self.to_dict(), indent=4)

    def __repr__(self):
        return self.__str__()

    def __dict__(self):
        """Alias of :meth:`to_dict`, kept so existing ``obj.__dict__()`` calls work.

        Defining ``__dict__`` as a method hides the usual instance attribute
        dictionary (``vars(obj)`` returns this bound method), so new code
        should call :meth:`to_dict` instead.
        """
        return self.to_dict()

In [4]:
# Wrap every raw entry under the 'book_covers' key in a BookCover, keeping file order.
book_covers = list(map(BookCover, book_covers_data['book_covers']))

In [15]:
class BooksAPIObj:
    """Pairs a book cover with Google Books search results for its likely titles.

    Constructing an instance performs network requests: it picks the
    highest-scoring candidate titles of ``book_cover`` (three by default) and
    queries the Google Books volumes endpoint once per title. The responses
    are stored in ``google_book_data``, keyed by the cleaned title.
    """

    # Seconds to wait on any single HTTP request. Without a timeout
    # requests.get can block indefinitely, stalling a worker thread.
    REQUEST_TIMEOUT = 30

    def __init__(self, book_cover):
        """Store ``book_cover`` and immediately fetch its Google Books data."""
        self.book_cover = book_cover
        self.google_book_data = self.get_google_book_data_for_book_cover()

    def get_images(self, urls):
        """Download every URL in ``urls`` and return the response bodies as bytes.

        The HTTP status is not checked, so the body of an error response is
        returned like any other. Network failures and timeouts propagate as
        ``requests`` exceptions.
        """
        return [
            requests.get(url, timeout=self.REQUEST_TIMEOUT).content
            for url in urls
        ]

    def download_partial_matching_images(self):
        """Downloads partial matching images from the internet"""
        return self.get_images(self.get_partial_matching_images())

    def download_full_matching_images(self):
        """Downloads full matching images from the internet"""
        return self.get_images(self.get_full_matching_images())

    def get_partial_matching_images(self):
        """Returns a list of urls of images that are partial matches"""
        return self.book_cover.partial_matching_images

    def get_full_matching_images(self):
        """Returns a list of urls of images that are full matches"""
        return self.book_cover.full_matching_images

    def get_pages_with_matching_images(self):
        """Returns a list of pages that have images that match the book cover"""
        return self.book_cover.pages_with_matching_images

    def clean_title(self, title):
        """Cleans the characters in the title to make it more suitable for searching.

        Drops every character that is not an ASCII letter, digit or space,
        then replaces each run of whitespace with a single ``+``.
        """
        cleaned = re.sub(r'[^0-9a-zA-Z ]', '', title)
        cleaned = re.sub(r'\s+', '+', cleaned)
        return cleaned

    def get_google_book_data(self, title):
        """Query the Google Books volumes endpoint for ``title``.

        ``title`` is interpolated into the URL as-is, so it should already be
        query-safe (see :meth:`clean_title`). Returns the decoded JSON body;
        at most three volumes are requested, ordered by relevance.
        """
        # NOTE(review): the ':' right after 'q=' looks unintentional (Google
        # Books field qualifiers are written like 'intitle:<text>'). It is
        # kept unchanged so previously collected results stay reproducible.
        url = f'https://www.googleapis.com/books/v1/volumes?q=:{title}&orderBy=relevance&maxResults=3'
        response = requests.get(url, timeout=self.REQUEST_TIMEOUT)
        return response.json()

    def get_most_popular_titles(self, num_titles=3):
        """Return the cleaned titles of the ``num_titles`` highest-scoring candidates."""
        titles = [
            (self.clean_title(title), score)
            for title, score in self.book_cover.get_possible_titles()
        ]
        # sorted() is stable, so equally scored titles keep their input order.
        titles = sorted(titles, key=lambda x: x[1], reverse=True)
        return [title for title, score in titles[:num_titles]]

    def get_google_book_data_for_titles(self, titles):
        """Gets the google book data for a list of titles, keyed by title"""
        return {title: self.get_google_book_data(title) for title in titles}

    def get_google_book_data_for_book_cover(self):
        """Gets the google book data for the book cover"""
        titles = self.get_most_popular_titles()
        return self.get_google_book_data_for_titles(titles)

    def _save_images(self, images, image_output_dir, prefix):
        """Write each bytes object in ``images`` to ``image_output_dir``.

        Files are named ``<prefix>_<image_name>_<index>.jpg``.
        """
        for i, image in enumerate(images):
            with open(f'{image_output_dir}/{prefix}_{self.book_cover.image_name}_{i}.jpg', 'wb') as f:
                f.write(image)

    def save_partial_matching_images(self, image_output_dir):
        """Downloads the partial matching images and saves them to the image output directory"""
        # The instance only holds URLs (on book_cover), never image bytes, so
        # the images have to be downloaded before they can be written.
        self._save_images(
            self.download_partial_matching_images(),
            image_output_dir,
            'partial_matching_image',
        )

    def write_to_file(self, filename):
        """Writes the object to ``filename`` as indented JSON"""
        with open(filename, 'w', encoding='utf-8') as f:
            json.dump(self.to_dict(), f, indent=4)

    def save_full_matching_images(self, image_output_dir):
        """Downloads the full matching images and saves them to the image output directory"""
        self._save_images(
            self.download_full_matching_images(),
            image_output_dir,
            'full_matching_image',
        )

    def to_dict(self):
        """Return the cover's fields and the Google Books data as a plain dict."""
        return {
            # BookCover exposes its serialiser under the name __dict__().
            'book_cover': self.book_cover.__dict__(),
            'google_book_data': self.google_book_data,
        }

    def __str__(self):
        """Pretty-printed JSON rendering of :meth:`to_dict`."""
        return json.dumps(self.to_dict(), indent=4)

    def __repr__(self):
        return self.__str__()

    def __dict__(self):
        """Alias of :meth:`to_dict`, kept so existing ``obj.__dict__()`` calls work.

        Defining ``__dict__`` as a method hides the usual instance attribute
        dictionary, so new code should call :meth:`to_dict` instead.
        """
        return self.to_dict()

In [16]:
# Build one BooksAPIObj per cover. Each constructor performs blocking HTTP
# requests, so the work is spread over a small thread pool.
results_list = []
with tqdm(total=len(book_covers)) as pbar, ThreadPoolExecutor(max_workers=8) as executor:
    # executor.map yields results in input order on this thread, so a plain
    # list is enough and results_list[i] corresponds to book_covers[i].
    for api_obj in executor.map(BooksAPIObj, book_covers):
        results_list.append(api_obj)
        pbar.update(1)

  0%|          | 0/480 [00:00<?, ?it/s]

In [17]:
# Spot-check the first result; printing goes through BooksAPIObj.__str__.
sample_result = results_list[0]
print(sample_result)

{
    "book_cover": {
        "image_name": "user1433_1806_book_1_3.jpg",
        "pages_with_matching_images": [
            "https://books.disney.com/book/lightning-thief-the/",
            "https://foreveryoungadult.com/2013/05/31/book-vs-movie-the-lightning-thief/",
            "https://www.bookseriesrecaps.com/what-happened-in-the-lightning-thief/",
            "https://www.walmart.com/ip/The-Lightning-Thief-Paperback-9780786838653/4406976",
            "https://ppld.org/book-reviews/lightning-thief-3",
            "https://ppld.org/book-reviews/lightning-thief-6",
            "https://ppld.org/book-reviews/lightning-thief-2",
            "https://boysbookblog.wordpress.com/2011/02/10/opening-lines-stories-that-grab-you-at-go/",
            "https://time.com/collection/100-best-ya-books/6084700/the-lightning-thief/",
            "https://fb2bookfree.com/adventure/3374-the-lightning-thief.html"
        ],
        "full_matching_images": [],
        "partial_matching_images": [
    

In [19]:
# Turn every result into a plain dict so the whole batch can be serialised.
integrated_book_covers = [api_result.__dict__() for api_result in results_list]

print('Writing to file...')
# Some book titles contain non-ASCII characters: write them verbatim
# (ensure_ascii=False) into a file that is explicitly opened as UTF-8.
with open(r'./data/book-test-set/integrated_api_responses.json', 'w', encoding='utf-8') as json_out:
    json.dump(integrated_book_covers, json_out, ensure_ascii=False, indent=4)
print('Done!')

Writing to file...
Done!


In [7]:
# Smoke test on a single cover: constructing BooksAPIObj triggers the
# Google Books lookups for that cover.
first_book_cover = book_covers[0]
test = BooksAPIObj(book_cover=first_book_cover)

{'Percy+Jackson+the+Olympians+The+Lightning+Thief': {'kind': 'books#volumes', 'totalItems': 963, 'items': [{'kind': 'books#volume', 'id': 'FFTJDYx_ZiEC', 'etag': 'tuem6tsBRWQ', 'selfLink': 'https://www.googleapis.com/books/v1/volumes/FFTJDYx_ZiEC', 'volumeInfo': {'title': 'Lightning Thief, The (Percy Jackson and the Olympians, Book 1)', 'authors': ['Rick Riordan'], 'publisher': 'Disney Electronic Content', 'publishedDate': '2009-05-02', 'description': "Percy Jackson is a good kid, but he can't seem to focus on his schoolwork or control his temper. And lately, being away at boarding school is only getting worse-Percy could have sworn his pre-algebra teacher turned into a monster and tried to kill him.", 'industryIdentifiers': [{'type': 'ISBN_13', 'identifier': '9781423131892'}, {'type': 'ISBN_10', 'identifier': '1423131894'}], 'readingModes': {'text': True, 'image': False}, 'pageCount': 400, 'printType': 'BOOK', 'categories': ['Juvenile Fiction'], 'averageRating': 4, 'ratingsCount': 838

In [11]:
# Show the single-cover result as pretty-printed JSON (BooksAPIObj.__str__).
print(str(test))


{
    "book_cover": {
        "image_name": "user1433_1806_book_1_3.jpg",
        "pages_with_matching_images": [
            "https://books.disney.com/book/lightning-thief-the/",
            "https://foreveryoungadult.com/2013/05/31/book-vs-movie-the-lightning-thief/",
            "https://www.bookseriesrecaps.com/what-happened-in-the-lightning-thief/",
            "https://www.walmart.com/ip/The-Lightning-Thief-Paperback-9780786838653/4406976",
            "https://ppld.org/book-reviews/lightning-thief-3",
            "https://ppld.org/book-reviews/lightning-thief-6",
            "https://ppld.org/book-reviews/lightning-thief-2",
            "https://boysbookblog.wordpress.com/2011/02/10/opening-lines-stories-that-grab-you-at-go/",
            "https://time.com/collection/100-best-ya-books/6084700/the-lightning-thief/",
            "https://fb2bookfree.com/adventure/3374-the-lightning-thief.html"
        ],
        "full_matching_images": [],
        "partial_matching_images": [
    