# Books Metadata

In this notebook, we acquire book metadata from the Open Library APIs, looking each book up by its ISBN.

In [None]:
from pathlib import Path
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
import time

In [None]:
# Every input and output file of this notebook lives under ./metadata
# (relative to the working directory).
metadata_root = Path("metadata")
books_data_path = metadata_root.joinpath("books.csv")

## Data Clean Up

The first step is to clean the ISBN data by dropping empty ISBNs, restoring leading zeros, and removing duplicates. At the end of this section, we will have a clean list of unique ISBNs.

In [None]:
# Read the ISBN column as text. Left to dtype inference, an all-digit column is
# parsed as a number, which discards leading zeros and, when some rows are
# blank, becomes float64 (so values would later stringify as "439023483.0").
books_df = pd.read_csv(books_data_path, dtype={"ISBN": str})
books_df

In [None]:
# Keep only rows that have an ISBN. `.copy()` makes the result an independent
# frame so the column assignment below cannot be mistaken for a write to a view.
books_df = books_df.dropna(subset=["ISBN"]).copy()

isbn_values = books_df["ISBN"]
if pd.api.types.is_float_dtype(isbn_values):
    # A numeric column that contained blanks is parsed as float64; go through
    # int64 first so the text form is "439023483" rather than "439023483.0".
    isbn_values = isbn_values.astype("int64")

# Replace the column outright instead of `.loc[:, "ISBN"] = ...`: the `.loc`
# form writes into the existing (possibly numeric) column in place, which is
# deprecated for incompatible dtypes such as str-into-int.
books_df["ISBN"] = isbn_values.astype(str)
books_df

In [None]:
# Left-pad every ISBN with zeros up to 10 characters; longer values are unchanged.
books_df = books_df.assign(ISBN=books_df["ISBN"].str.zfill(10))
books_df

In [None]:
# Collapse the ISBN column to its distinct values (duplicates removed).
isbn_list = pd.unique(books_df["ISBN"])
print(f"Number of unique ISBNs: {len(isbn_list)}")
isbn_list

## Acquire Books Metadata

In [None]:
# Root URL of the Open Library site; the fetch functions in this notebook
# build their request URLs by appending a path to it.
BASE_URL = "https://openlibrary.org"

### Book Data

In [None]:
def fetch_books_data(isbn_list, batch_size=64, delay=5, max_retries=20, timeout=30):
    """Fetch edition details for a collection of ISBNs, one HTTP request per batch.

    Each batch is sent to ``{BASE_URL}/api/books`` with ``format=json`` and
    ``jscmd=details``; the JSON object returned for every batch is merged into
    a single dictionary.

    Args:
        isbn_list: Sliceable sequence of ISBNs (each is formatted with ``str``).
        batch_size: Number of ISBNs sent per request.
        delay: Seconds to pause after every batch. Also the base of the linear
            back-off between retries (``delay * attempt``).
        max_retries: Maximum number of attempts per batch before it is skipped.
        timeout: Seconds to wait for the server before the attempt counts as
            failed. Without a timeout a stalled connection would block forever.

    Returns:
        dict: Union of all successfully fetched batch responses. Batches that
        failed ``max_retries`` times are absent (a message is printed for each).
    """
    books_data = {}

    for i in tqdm(range(0, len(isbn_list), batch_size), desc="Fetching Books"):
        batch_isbns = isbn_list[i : i + batch_size]
        # f-string formatting tolerates non-string ISBNs (e.g. numpy scalars).
        bibkeys = ",".join(f"ISBN:{isbn}" for isbn in batch_isbns)
        params = {
            "bibkeys": bibkeys,
            "format": "json",
            "jscmd": "details",
        }
        retry_count = 0
        while retry_count < max_retries:
            try:
                response = requests.get(
                    f"{BASE_URL}/api/books", params=params, timeout=timeout
                )
                response.raise_for_status()
                data = response.json()
                books_data.update(data)
                break
            # ValueError covers a non-JSON body on requests versions whose
            # JSON decode error is not a RequestException subclass.
            except (requests.exceptions.RequestException, ValueError) as e:
                retry_count += 1
                print(f"error at index {i} (attempt {retry_count}): {e}")
                if retry_count == max_retries:
                    print(f"giving up on batch at index {i}.")
                else:
                    # Linear back-off: wait longer after each failed attempt.
                    time.sleep(delay * retry_count)
        # Pause between batches to stay polite to the API.
        time.sleep(delay)

    return books_data

In [None]:
# Download the details for every cleaned ISBN, then persist the raw payload
# as pretty-printed UTF-8 JSON.
books_data = fetch_books_data(isbn_list)
with (metadata_root / "books_data.json").open("w", encoding="utf-8") as out_file:
    json.dump(books_data, out_file, ensure_ascii=False, indent=4)

### Work Data

In [None]:
# Reload the saved books payload so this section can be run without re-fetching.
books_data = json.loads(
    (metadata_root / "books_data.json").read_text(encoding="utf-8")
)
print(len(books_data))

In [None]:
def extract_work_keys(books_data):
    """Collect the distinct work keys referenced by the fetched book entries.

    Args:
        books_data: Mapping of bib key to book entry; work references are read
            from ``entry["details"]["works"]``, each a dict with a ``"key"``.

    Returns:
        list: Unique, non-empty work keys in sorted order, so repeated runs
        process works in the same sequence. Entries without ``"details"`` or
        ``"works"`` (or with a null value there) contribute nothing.
    """
    work_keys = set()
    for book_info in books_data.values():
        # `.get(...) or {}` tolerates both a missing key and an explicit null.
        details = book_info.get("details") or {}
        for work in details.get("works") or []:
            work_key = work.get("key")
            if work_key:
                work_keys.add(work_key)
    return sorted(work_keys)

In [None]:
def fetch_works_data(work_keys, batch_size=32, delay=1, max_retries=10, timeout=30):
    """Fetch the JSON record of every work key, one HTTP request per key.

    Each record is requested from ``{BASE_URL}{key}.json``.

    Args:
        work_keys: Sliceable sequence of work keys (path-like strings that are
            appended to ``BASE_URL``).
        batch_size: Number of keys per progress-bar step; requests are still
            issued one key at a time.
        delay: Seconds to pause after every key and between retries.
        max_retries: Maximum number of attempts per key before it is skipped.
        timeout: Seconds to wait for the server before the attempt counts as
            failed. Without a timeout a stalled connection would block forever.

    Returns:
        dict: Mapping of record key to the fetched record. The record's own
        ``"key"`` field is used when present, otherwise the requested key.
        Keys that failed ``max_retries`` times are absent.
    """
    works_data = {}
    for i in tqdm(range(0, len(work_keys), batch_size), desc="Fetching Works"):
        for key in work_keys[i : i + batch_size]:
            url = f"{BASE_URL}{key}.json"
            retry_count = 0
            while retry_count < max_retries:
                try:
                    response = requests.get(url, timeout=timeout)
                    response.raise_for_status()
                    data = response.json()
                    # Fall back to the requested key so that one record without
                    # a "key" field cannot abort the run and lose all results.
                    works_data[data.get("key", key)] = data
                    break
                # ValueError covers a non-JSON body on requests versions whose
                # JSON decode error is not a RequestException subclass.
                except (requests.exceptions.RequestException, ValueError) as e:
                    retry_count += 1
                    print(f"Error fetching URL {url} (attempt {retry_count}): {e}")
                    if retry_count == max_retries:
                        print(f"Giving up on URL {url} after {max_retries} attempts.")
                    else:
                        time.sleep(delay)

            # Pause between requests to stay polite to the API.
            time.sleep(delay)

    return works_data

In [None]:
# Gather the distinct work keys referenced by the fetched books and report how many there are.
work_keys = extract_work_keys(books_data)
print(len(work_keys))

In [None]:
# Download every referenced work record, then persist the result as
# pretty-printed UTF-8 JSON.
works_data = fetch_works_data(work_keys)
with (metadata_root / "works_data.json").open("w", encoding="utf-8") as out_file:
    json.dump(works_data, out_file, ensure_ascii=False, indent=4)

### Author Data

In [None]:
# Reload the saved books payload so this section can be run on its own.
books_data = json.loads(
    (metadata_root / "books_data.json").read_text(encoding="utf-8")
)
print(len(books_data))

In [None]:
# Reload the saved works payload so this section can be run on its own.
works_data = json.loads(
    (metadata_root / "works_data.json").read_text(encoding="utf-8")
)
print(len(works_data))

In [None]:
def extract_author_keys(books_data, works_data):
    """Collect the distinct author keys referenced by book and work records.

    Args:
        books_data: Mapping of bib key to book entry. Author references are
            read from ``entry["details"]["authors"]`` (the same ``"details"``
            sub-record that ``extract_work_keys`` reads works from) and, for
            backward compatibility, from a top-level ``entry["authors"]``.
            Each reference is a dict with a ``"key"``.
        works_data: Mapping of work key to work record. Author references are
            read from ``record["authors"]``, each shaped like
            ``{"author": {"key": ...}}``.

    Returns:
        list: Unique, non-empty author keys in sorted order, so repeated runs
        process authors in the same sequence.
    """
    author_keys = set()

    # Extract from books. The previous version only looked at the top level
    # of each entry, so authors nested under "details" were never collected.
    for book_info in books_data.values():
        details = book_info.get("details") or {}
        for source in (details, book_info):
            for author in source.get("authors") or []:
                author_key = author.get("key")
                if author_key:
                    author_keys.add(author_key)

    # Extract from works.
    for work_info in works_data.values():
        for author in work_info.get("authors") or []:
            author_ref = author.get("author") or {}
            # Skip references that are not dicts instead of crashing on `.get`.
            author_key = author_ref.get("key") if isinstance(author_ref, dict) else None
            if author_key:
                author_keys.add(author_key)

    return sorted(author_keys)

In [None]:
def fetch_authors_data(author_keys, batch_size=32, delay=0.1, max_retries=3, timeout=30):
    """Fetch the JSON record of every author key, one HTTP request per key.

    Each record is requested from ``{BASE_URL}{key}.json``.

    Args:
        author_keys: Sliceable sequence of author keys (path-like strings that
            are appended to ``BASE_URL``).
        batch_size: Number of keys per progress-bar step; requests are still
            issued one key at a time.
        delay: Seconds to pause after every key and between retries.
        max_retries: Maximum number of attempts per key before it is skipped.
        timeout: Seconds to wait for the server before the attempt counts as
            failed. Without a timeout a stalled connection would block forever.

    Returns:
        dict: Mapping of record key to the fetched record. The record's own
        ``"key"`` field is used when present, otherwise the requested key.
        Keys that failed ``max_retries`` times are absent.
    """
    authors_data = {}
    for i in tqdm(range(0, len(author_keys), batch_size), desc="Fetching Authors"):
        for key in author_keys[i : i + batch_size]:
            url = f"{BASE_URL}{key}.json"
            retry_count = 0
            while retry_count < max_retries:
                try:
                    response = requests.get(url, timeout=timeout)
                    response.raise_for_status()
                    data = response.json()
                    # Fall back to the requested key so that one record without
                    # a "key" field cannot abort the run and lose all results.
                    authors_data[data.get("key", key)] = data
                    break
                # ValueError covers a non-JSON body on requests versions whose
                # JSON decode error is not a RequestException subclass.
                except (requests.exceptions.RequestException, ValueError) as e:
                    retry_count += 1
                    print(f"Error fetching URL {url} (attempt {retry_count}): {e}")
                    if retry_count == max_retries:
                        print(f"Giving up on URL {url} after {max_retries} attempts.")
                    else:
                        time.sleep(delay)
            # Pause between requests to stay polite to the API.
            time.sleep(delay)
    return authors_data

In [None]:
# Gather the distinct author keys referenced by the books and works and report how many there are.
author_keys = extract_author_keys(books_data, works_data)
print(len(author_keys))

In [None]:
# Download every referenced author record, then persist the result as
# pretty-printed UTF-8 JSON.
authors_data = fetch_authors_data(author_keys)
with (metadata_root / "authors_data.json").open("w", encoding="utf-8") as out_file:
    json.dump(authors_data, out_file, ensure_ascii=False, indent=4)