# Books Metadata

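In this notebook, we fetch book metadata from the Open Library API, looking up each book by its ISBN. The book records returned for the ISBNs are then used to fetch the related works and authors.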

In [None]:
from pathlib import Path
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
import time

In [None]:
# Local run: the data folder is addressed relative to the working directory,
# one level up.
metadata_root = Path("..") / "data"
books_data_path = metadata_root.joinpath("books.csv")

In [None]:
# for running on colab
# The import is guarded so that running every cell top to bottom also works
# outside Colab: when google.colab is unavailable this cell does nothing and
# whatever paths were configured before it stay in effect.
try:
    from google.colab import drive
except ImportError:
    pass
else:
    drive.mount('/content/gdrive')

    # Build the path from the absolute mount point so it does not depend on
    # the notebook's current working directory.
    metadata_root = Path("/content/gdrive") / "MyDrive" / "DIS Project"
    books_data_path = metadata_root / "books.csv"

Mounted at /content/gdrive


## Data Clean Up

The first step is to clean the ISBN data: drop rows with a missing ISBN, restore the leading zeros of ISBNs shorter than 10 characters, and remove duplicates. At the end of this section, we will have a clean list of unique ISBNs.

In [None]:
# Read the ISBN column as text so pandas never infers a numeric type for it
# (numeric parsing would strip leading zeros or turn values into floats).
books_df = pd.read_csv(books_data_path, dtype={"ISBN": str})
books_df

Unnamed: 0,ISBN,book_id
0,0002005018,1
1,0374157065,3
2,0399135782,5
3,0440234743,18
4,0452264464,19
...,...,...
16594,786914041,248348
16595,62117378,247944
16596,1905294964,248214
16597,1937007588,247154


In [None]:
# Drop rows without an ISBN first (casting NaN to str would produce "nan"),
# then make every remaining ISBN a string. `assign` returns a new frame, which
# avoids writing through `.loc` into the frame derived from `dropna`.
books_df = books_df.dropna(subset=["ISBN"])
books_df = books_df.assign(ISBN=books_df["ISBN"].astype(str))
books_df

Unnamed: 0,ISBN,book_id
0,0002005018,1
1,0374157065,3
2,0399135782,5
3,0440234743,18
4,0452264464,19
...,...,...
16594,786914041,248348
16595,62117378,247944
16596,1905294964,248214
16597,1937007588,247154


In [None]:
# Left-pad ISBNs shorter than 10 characters with zeros; longer values are left
# as they are. `assign` builds a new frame instead of writing through `.loc`
# into a frame that was derived from another one.
books_df = books_df.assign(ISBN=books_df["ISBN"].str.zfill(10))
books_df

Unnamed: 0,ISBN,book_id
0,0002005018,1
1,0374157065,3
2,0399135782,5
3,0440234743,18
4,0452264464,19
...,...,...
16594,0786914041,248348
16595,0062117378,247944
16596,1905294964,248214
16597,1937007588,247154


In [None]:
# Collapse repeated ISBNs into the array of distinct values used for fetching.
isbn_list = pd.unique(books_df["ISBN"])
print(f"Number of unique ISBNs: {len(isbn_list)}")
isbn_list

Number of unique ISBNs: 15540


array(['0002005018', '0374157065', '0399135782', ..., '0062117378',
       '1905294964', '1937007588'], dtype=object)

## Acquire Books Metadata

In [None]:
# Root of the Open Library site. The fetch functions below append either the
# "/api/books" endpoint or a record key followed by ".json" to this prefix.
BASE_URL = "https://openlibrary.org"

### Book Data

In [None]:
def fetch_books_data(isbn_list, batch_size=192, delay=1, max_retries=20, timeout=60):
    """Fetch book details for many ISBNs in batched requests.

    ISBNs are sent to ``{BASE_URL}/api/books`` in groups of ``batch_size`` with
    ``jscmd=details``; the JSON object returned for every batch is merged into
    a single dictionary.

    Args:
        isbn_list: Sequence of ISBN values. It is only sliced and iterated, so
            a list or a NumPy array both work.
        batch_size: Number of ISBNs sent in one request.
        delay: Base pause in seconds. It is slept after every batch and,
            multiplied by the attempt number, between retries of a batch.
        max_retries: Attempts per batch before the batch is skipped.
        timeout: Passed to ``requests.get`` so a stalled connection raises and
            is retried instead of blocking the whole run.

    Returns:
        dict: The merged API responses, keyed however the API keys them
        (presumably ``"ISBN:<isbn>"`` -- verify against a real response).
        Batches that still fail after ``max_retries`` attempts are missing.
    """
    books_data = {}
    endpoint = f"{BASE_URL}/api/books"

    for i in tqdm(range(0, len(isbn_list), batch_size), desc="Fetching Books"):
        batch_isbns = isbn_list[i : i + batch_size]
        params = {
            "bibkeys": ",".join(f"ISBN:{isbn}" for isbn in batch_isbns),
            "format": "json",
            "jscmd": "details",
        }
        for attempt in range(1, max_retries + 1):
            try:
                response = requests.get(endpoint, params=params, timeout=timeout)
                response.raise_for_status()
                data = response.json()
                books_data.update(data)
                break
            # ValueError covers a body that is not valid JSON on requests
            # versions where that error is not a RequestException subclass.
            except (requests.exceptions.RequestException, ValueError) as e:
                print(f"error at index {i} (attempt {attempt}): {e}")
                if attempt == max_retries:
                    print(f"giving up on batch at index {i}.")
                else:
                    # Linear back-off: wait longer after each failed attempt.
                    time.sleep(delay * attempt)
        # Pause between batches regardless of the outcome.
        time.sleep(delay)

    return books_data

In [None]:
# Create the output folder before the slow download so the results cannot be
# lost to a missing directory when it is time to save them.
(metadata_root / "open_library_data").mkdir(parents=True, exist_ok=True)
books_data = fetch_books_data(isbn_list)
with open(metadata_root / "open_library_data" / "books_data.json", "w", encoding="utf-8") as f:
    json.dump(books_data, f, ensure_ascii=False, indent=4)

Fetching Books: 100%|██████████| 81/81 [22:45<00:00, 16.86s/it]


### Work Data

In [None]:
# Load the book records saved to disk by the previous section.
books_data = json.loads(
    (metadata_root / "open_library_data" / "books_data.json").read_text(encoding="utf-8")
)
print(len(books_data))

15488


In [None]:
def extract_work_keys(books_data):
    """Collect the distinct work keys referenced by the fetched book records.

    Args:
        books_data: Mapping whose values are book records. Each record may
            carry a ``"details"`` dict with a ``"works"`` list of dicts that
            have a ``"key"`` entry. Records missing any of these levels are
            skipped instead of raising.

    Returns:
        list: Unique, non-empty work keys in sorted order, so the result (and
        any fetch order derived from it) is the same on every run.
    """
    work_keys = set()
    for book_info in books_data.values():
        details = book_info.get("details") or {}
        for work in details.get("works") or []:
            work_key = work.get("key")
            if work_key:
                work_keys.add(work_key)
    # Sorting removes the run-to-run ordering differences of a plain set.
    return sorted(work_keys)

In [None]:
def fetch_works_data(work_keys, batch_size=32, delay=1, max_retries=20, timeout=60):
    """Download the JSON record of every work key, one request per key.

    Each key is appended to ``BASE_URL`` with a ``.json`` suffix and fetched
    individually. ``batch_size`` only groups keys for the progress bar; it
    does not reduce the number of requests.

    Args:
        work_keys: Sequence of work keys (path-like strings such as the ones
            produced by ``extract_work_keys``).
        batch_size: Number of keys per progress-bar step.
        delay: Seconds slept after every key and between retries of a key.
        max_retries: Attempts per key before it is skipped.
        timeout: Passed to ``requests.get`` so a stalled connection raises and
            is retried instead of blocking the whole run.

    Returns:
        dict: Decoded JSON payloads keyed by the payload's own ``"key"`` field,
        or by the requested key when the payload has no such field. Keys that
        still fail after ``max_retries`` attempts are missing.
    """
    works_data = {}
    for i in tqdm(range(0, len(work_keys), batch_size), desc="Fetching Works"):
        for key in work_keys[i : i + batch_size]:
            url = f"{BASE_URL}{key}.json"
            for attempt in range(1, max_retries + 1):
                try:
                    response = requests.get(url, timeout=timeout)
                    response.raise_for_status()
                    data = response.json()
                # ValueError covers a body that is not valid JSON on requests
                # versions where that error is not a RequestException subclass.
                except (requests.exceptions.RequestException, ValueError) as e:
                    print(f"Error fetching URL {url} (attempt {attempt}): {e}")
                    if attempt == max_retries:
                        print(f"Giving up on URL {url} after {max_retries} attempts.")
                    else:
                        time.sleep(delay)
                else:
                    # Do not let one unexpected payload abort a long run: fall
                    # back to the requested key when the record has no "key".
                    record_key = data.get("key", key) if isinstance(data, dict) else key
                    works_data[record_key] = data
                    break

            # Pause between keys regardless of the outcome.
            time.sleep(delay)

    return works_data

In [None]:
# Gather the distinct work keys referenced by the loaded book records and
# report how many there are.
work_keys = extract_work_keys(books_data)
print(len(work_keys))

12357


In [None]:
# Create the output folder before the long download so the results cannot be
# lost to a missing directory when it is time to save them.
(metadata_root / "open_library_data").mkdir(parents=True, exist_ok=True)
works_data = fetch_works_data(work_keys)
with open(metadata_root / "open_library_data" / "works_data.json", "w", encoding="utf-8") as f:
    json.dump(works_data, f, ensure_ascii=False, indent=4)

Fetching Works:  65%|██████▍   | 250/387 [2:42:40<1:28:34, 38.79s/it]

Error fetching URL https://openlibrary.org/works/OL796597W.json (attempt 1): 503 Server Error: Service Unavailable for url: https://openlibrary.org/works/OL796597W.json
Error fetching URL https://openlibrary.org/works/OL796597W.json (attempt 2): 503 Server Error: Service Unavailable for url: https://openlibrary.org/works/OL796597W.json


Fetching Works: 100%|██████████| 387/387 [4:10:44<00:00, 38.87s/it]


### Author Data

In [None]:
# Load the book and work records saved to disk by the earlier sections.
books_data = json.loads(
    (metadata_root / "open_library_data" / "books_data.json").read_text(encoding="utf-8")
)
print(len(books_data))

works_data = json.loads(
    (metadata_root / "open_library_data" / "works_data.json").read_text(encoding="utf-8")
)
print(len(works_data))

15488
12357


In [None]:
def extract_author_keys(books_data, works_data):
    """Collect the distinct author keys referenced by book and work records.

    Args:
        books_data: Mapping whose values are book records. Author references
            are read from ``record["details"]["authors"]``, a list of dicts
            with a ``"key"`` entry.
        works_data: Mapping whose values are work records. Author references
            are read from ``record["authors"]``, a list of dicts whose
            ``"author"`` entry is either a dict with a ``"key"`` or the key
            itself.

    Records missing any of these levels (and work records that are not dicts)
    are skipped instead of raising.

    Returns:
        list: Unique, non-empty author keys in sorted order, so the result
        (and any fetch order derived from it) is the same on every run.
    """
    author_keys = set()

    # Author references attached to the book records.
    for book_info in books_data.values():
        details = book_info.get("details") or {}
        for author in details.get("authors") or []:
            author_key = author.get("key")
            if author_key:
                author_keys.add(author_key)

    # Author references attached to the work records.
    for work_info in works_data.values():
        if not isinstance(work_info, dict):
            continue
        for author in work_info.get("authors") or []:
            author_ref = author.get("author", {})
            # The reference is either {"key": ...} or the bare key value.
            author_key = author_ref.get("key") if isinstance(author_ref, dict) else author_ref
            if author_key:
                author_keys.add(author_key)

    # Sorting removes the run-to-run ordering differences of a plain set.
    return sorted(author_keys)

In [None]:
def fetch_authors_data(author_keys, batch_size=32, delay=1, max_retries=20, timeout=60):
    """Download the JSON record of every author key, one request per key.

    Each key is appended to ``BASE_URL`` with a ``.json`` suffix and fetched
    individually. ``batch_size`` only groups keys for the progress bar; it
    does not reduce the number of requests.

    Args:
        author_keys: Sequence of author keys (path-like strings such as the
            ones produced by ``extract_author_keys``).
        batch_size: Number of keys per progress-bar step.
        delay: Seconds slept after every key and between retries of a key.
        max_retries: Attempts per key before it is skipped.
        timeout: Passed to ``requests.get`` so a stalled connection raises and
            is retried instead of blocking the whole run.

    Returns:
        dict: Decoded JSON payloads keyed by the payload's own ``"key"`` field,
        or by the requested key when the payload has no such field. Keys that
        still fail after ``max_retries`` attempts are missing.
    """
    authors_data = {}
    for i in tqdm(range(0, len(author_keys), batch_size), desc="Fetching Authors"):
        for key in author_keys[i : i + batch_size]:
            url = f"{BASE_URL}{key}.json"
            for attempt in range(1, max_retries + 1):
                try:
                    response = requests.get(url, timeout=timeout)
                    response.raise_for_status()
                    data = response.json()
                # ValueError covers a body that is not valid JSON on requests
                # versions where that error is not a RequestException subclass.
                except (requests.exceptions.RequestException, ValueError) as e:
                    print(f"Error fetching URL {url} (attempt {attempt}): {e}")
                    if attempt == max_retries:
                        print(f"Giving up on URL {url} after {max_retries} attempts.")
                    else:
                        time.sleep(delay)
                else:
                    # Do not let one unexpected payload abort a long run: fall
                    # back to the requested key when the record has no "key".
                    record_key = data.get("key", key) if isinstance(data, dict) else key
                    authors_data[record_key] = data
                    break
            # Pause between keys regardless of the outcome.
            time.sleep(delay)
    return authors_data

In [None]:
# Gather the distinct author keys referenced by the loaded book and work
# records and report how many there are.
author_keys = extract_author_keys(books_data, works_data)
print(len(author_keys))

6555


In [None]:
# Create the output folder before the long download so the results cannot be
# lost to a missing directory when it is time to save them.
(metadata_root / "open_library_data").mkdir(parents=True, exist_ok=True)
authors_data = fetch_authors_data(author_keys)
with open(metadata_root / "open_library_data" / "authors_data.json", "w", encoding="utf-8") as f:
    json.dump(authors_data, f, ensure_ascii=False, indent=4)

Fetching Authors:  89%|████████▉ | 183/205 [2:02:09<15:10, 41.37s/it]