# Books Metadata

In this notebook, we will acquire books data from their ISBNs using the APIs.

In [1]:
from pathlib import Path
import requests
import numpy as np
import pandas as pd
from tqdm import tqdm
import json
import time

In [2]:
metadata_root = Path("./metadata")
books_data_path = metadata_root / "books.csv"

## Data Clean Up

First step is to clean ISBNs data by adding leading zeros and drop empty and duplicate ISBNs. And the end of this section, we will have a clean list of ISBNs.

In [3]:
books_df = pd.read_csv(books_data_path)
books_df

Unnamed: 0,ISBN,book_id
0,0002005018,1
1,0374157065,3
2,0399135782,5
3,0440234743,18
4,0452264464,19
...,...,...
16594,786914041,248348
16595,62117378,247944
16596,1905294964,248214
16597,1937007588,247154


In [4]:
books_df = books_df.dropna(subset=["ISBN"])
books_df.loc[:, "ISBN"] = books_df["ISBN"].astype(str)
books_df

Unnamed: 0,ISBN,book_id
0,0002005018,1
1,0374157065,3
2,0399135782,5
3,0440234743,18
4,0452264464,19
...,...,...
16594,786914041,248348
16595,62117378,247944
16596,1905294964,248214
16597,1937007588,247154


In [5]:
books_df.loc[:, "ISBN"] = books_df["ISBN"].str.zfill(10)
books_df

Unnamed: 0,ISBN,book_id
0,0002005018,1
1,0374157065,3
2,0399135782,5
3,0440234743,18
4,0452264464,19
...,...,...
16594,0786914041,248348
16595,0062117378,247944
16596,1905294964,248214
16597,1937007588,247154


In [6]:
isbn_list = books_df["ISBN"].unique()
print(f"Number of unique ISBNs: {len(isbn_list)}")
isbn_list

Number of unique ISBNs: 15540


array(['0002005018', '0374157065', '0399135782', ..., '0062117378',
       '1905294964', '1937007588'], dtype=object)

## Acquire Books Metadata

In [7]:
BASE_URL = "https://openlibrary.org"

### Book Data

In [12]:
def fetch_books_data(isbn_list, batch_size=64, delay=5, max_retries=20):
    books_data = {}

    for i in tqdm(range(0, len(isbn_list), batch_size), desc="Fetching Books"):
        batch_isbns = isbn_list[i : i + batch_size]
        bibkeys = ",".join(["ISBN:" + isbn for isbn in batch_isbns])
        params = {
            "bibkeys": bibkeys,
            "format": "json",
            "jscmd": "details",
        }
        retry_count = 0
        while retry_count < max_retries:
            try:
                response = requests.get(f"{BASE_URL}/api/books", params=params)
                response.raise_for_status()
                data = response.json()
                books_data.update(data)
                break
            except requests.exceptions.RequestException as e:
                retry_count += 1
                print(f"error at index {i} (attempt {retry_count}): {e}")
                if retry_count == max_retries:
                    print(f"giving up on batch at index {i}.")
                else:
                    time.sleep(delay * retry_count)
        time.sleep(delay)

    return books_data

In [13]:
books_data = fetch_books_data(isbn_list)
with open(metadata_root / "books_data.json", "w", encoding="utf-8") as f:
    json.dump(books_data, f, ensure_ascii=False, indent=4)

Fetching Books:  39%|███▉      | 95/243 [20:10<1:14:14, 30.10s/it]

error at index 6080 (attempt 1): 504 Server Error: Gateway Time-out for url: https://openlibrary.org/api/books?bibkeys=ISBN%3A0449006581%2CISBN%3A038547167X%2CISBN%3A0060913983%2CISBN%3A0874411033%2CISBN%3A0345434471%2CISBN%3A1551669412%2CISBN%3A0451205448%2CISBN%3A0452284295%2CISBN%3A0425190749%2CISBN%3A0451197550%2CISBN%3A159184021X%2CISBN%3A0743442652%2CISBN%3A1575667223%2CISBN%3A1575667215%2CISBN%3A038081918X%2CISBN%3A0380816830%2CISBN%3A0425184315%2CISBN%3A0380814390%2CISBN%3A1551666189%2CISBN%3A0743219333%2CISBN%3A0671786601%2CISBN%3A0505524228%2CISBN%3A0380782340%2CISBN%3A0505524279%2CISBN%3A0312981848%2CISBN%3A0515133868%2CISBN%3A1551668270%2CISBN%3A0373226055%2CISBN%3A0821765221%2CISBN%3A0446403830%2CISBN%3A074342347X%2CISBN%3A0671729454%2CISBN%3A0671007718%2CISBN%3A0446364819%2CISBN%3A8478886451%2CISBN%3A8478885196%2CISBN%3A8478884955%2CISBN%3A8478884459%2CISBN%3A0449208648%2CISBN%3A044918336X%2CISBN%3A0393000567%2CISBN%3A0553211668%2CISBN%3A0394550846%2CISBN%3A0679737383%2CI

Fetching Books:  51%|█████     | 124/243 [34:15<56:14, 28.35s/it]  

error at index 7936 (attempt 1): 504 Server Error: Gateway Time-out for url: https://openlibrary.org/api/books?bibkeys=ISBN%3A0590254766%2CISBN%3A0590553232%2CISBN%3A0060953691%2CISBN%3A0553275569%2CISBN%3A0060195703%2CISBN%3A0553289586%2CISBN%3A0684809001%2CISBN%3A0380400634%2CISBN%3A0345424646%2CISBN%3A0451193237%2CISBN%3A0679403612%2CISBN%3A0505524090%2CISBN%3A0505523892%2CISBN%3A0743471393%2CISBN%3A1401301231%2CISBN%3A050552354X%2CISBN%3A0425190641%2CISBN%3A0505523752%2CISBN%3A0446611360%2CISBN%3A0451524764%2CISBN%3A0553379658%2CISBN%3A1580050751%2CISBN%3A0465014909%2CISBN%3A1551667940%2CISBN%3A0140295569%2CISBN%3A0452278120%2CISBN%3A0316107298%2CISBN%3A0517266555%2CISBN%3A0553801880%2CISBN%3A0399142185%2CISBN%3A0425137562%2CISBN%3A1581346085%2CISBN%3A0553575953%2CISBN%3A0399139818%2CISBN%3A1558747435%2CISBN%3A0380815575%2CISBN%3A0517595338%2CISBN%3A1578561256%2CISBN%3A0671681567%2CISBN%3A3596150981%2CISBN%3A0812515595%2CISBN%3A0515135739%2CISBN%3A0590484125%2CISBN%3A0743202694%2CI

Fetching Books:  64%|██████▍   | 156/243 [50:02<31:28, 21.71s/it]  

error at index 9984 (attempt 1): 504 Server Error: Gateway Time-out for url: https://openlibrary.org/api/books?bibkeys=ISBN%3A189205101X%2CISBN%3A000648302X%2CISBN%3A0060952520%2CISBN%3A0671673645%2CISBN%3A0689855443%2CISBN%3A0553276182%2CISBN%3A0446363073%2CISBN%3A0451128389%2CISBN%3A0061032034%2CISBN%3A0441552765%2CISBN%3A0451180763%2CISBN%3A968150108X%2CISBN%3A0345423879%2CISBN%3A0441338518%2CISBN%3A0345336615%2CISBN%3A080501084X%2CISBN%3A0679435824%2CISBN%3A0380802457%2CISBN%3A0553272225%2CISBN%3A0300089023%2CISBN%3A0440802458%2CISBN%3A0312969414%2CISBN%3A0743403428%2CISBN%3A0140274286%2CISBN%3A006017773X%2CISBN%3A0743203984%2CISBN%3A0765342510%2CISBN%3A0671795740%2CISBN%3A0553274465%2CISBN%3A0821773682%2CISBN%3A0517559501%2CISBN%3A0671704591%2CISBN%3A1551665824%2CISBN%3A0671020366%2CISBN%3A0671020374%2CISBN%3A0394800893%2CISBN%3A0812544595%2CISBN%3A0156006219%2CISBN%3A0060953713%2CISBN%3A0394580702%2CISBN%3A0380785188%2CISBN%3A0060392991%2CISBN%3A0688077080%2CISBN%3A0451450647%2CI

Fetching Books:  65%|██████▍   | 157/243 [51:42<1:04:46, 45.19s/it]

error at index 10048 (attempt 1): 504 Server Error: Gateway Time-out for url: https://openlibrary.org/api/books?bibkeys=ISBN%3A0312978448%2CISBN%3A0066214769%2CISBN%3A0380977478%2CISBN%3A0425166112%2CISBN%3A0425182010%2CISBN%3A0345462270%2CISBN%3A0804119716%2CISBN%3A0743243978%2CISBN%3A0515120448%2CISBN%3A0451408586%2CISBN%3A0440241111%2CISBN%3A0312977689%2CISBN%3A0380820706%2CISBN%3A0449006840%2CISBN%3A1551669625%2CISBN%3A0671739727%2CISBN%3A0671739751%2CISBN%3A0312987625%2CISBN%3A0261102362%2CISBN%3A0451523202%2CISBN%3A185326007X%2CISBN%3A0786889179%2CISBN%3A0425186865%2CISBN%3A0590409174%2CISBN%3A0373790961%2CISBN%3A0525946500%2CISBN%3A067187750X%2CISBN%3A0671690582%2CISBN%3A0671026305%2CISBN%3A0671523686%2CISBN%3A8420633119%2CISBN%3A0091891965%2CISBN%3A096463161X%2CISBN%3A0064406970%2CISBN%3A3453866797%2CISBN%3A3612275127%2CISBN%3A039914868X%2CISBN%3A0743436601%2CISBN%3A0330255320%2CISBN%3A0940322153%2CISBN%3A0441007430%2CISBN%3A0312319754%2CISBN%3A8466300651%2CISBN%3A014044131X%2C

Fetching Books:  65%|██████▌   | 158/243 [53:05<1:19:51, 56.37s/it]

error at index 10112 (attempt 1): 504 Server Error: Gateway Time-out for url: https://openlibrary.org/api/books?bibkeys=ISBN%3A0345450884%2CISBN%3A0345434870%2CISBN%3A0060976624%2CISBN%3A0399135812%2CISBN%3A0345453743%2CISBN%3A0140276238%2CISBN%3A0330487612%2CISBN%3A2070364232%2CISBN%3A0345457374%2CISBN%3A0451199669%2CISBN%3A0380816822%2CISBN%3A0743428617%2CISBN%3A0439425220%2CISBN%3A0091842050%2CISBN%3A0380795981%2CISBN%3A0425135772%2CISBN%3A0452283221%2CISBN%3A0786868414%2CISBN%3A0375411089%2CISBN%3A0385336195%2CISBN%3A0515087491%2CISBN%3A0440209862%2CISBN%3A0679451463%2CISBN%3A0671024809%2CISBN%3A0684854848%2CISBN%3A0609810022%2CISBN%3A3426610361%2CISBN%3A3404139178%2CISBN%3A3442444365%2CISBN%3A0515132039%2CISBN%3A0671014676%2CISBN%3A0553568167%2CISBN%3A0425173739%2CISBN%3A0060934794%2CISBN%3A0399149309%2CISBN%3A059033123X%2CISBN%3A0679737898%2CISBN%3A0452285186%2CISBN%3A3404920481%2CISBN%3A0553213458%2CISBN%3A0399142703%2CISBN%3A067003004X%2CISBN%3A0285636464%2CISBN%3A0684847817%2C

Fetching Books:  65%|██████▌   | 159/243 [54:22<1:27:36, 62.58s/it]

error at index 10176 (attempt 1): 504 Server Error: Gateway Time-out for url: https://openlibrary.org/api/books?bibkeys=ISBN%3A006103083X%2CISBN%3A1559587032%2CISBN%3A0062513591%2CISBN%3A0515129240%2CISBN%3A0316809063%2CISBN%3A0440979757%2CISBN%3A0553493981%2CISBN%3A0345434684%2CISBN%3A0671695320%2CISBN%3A0140250913%2CISBN%3A0434009202%2CISBN%3A2277241202%2CISBN%3A0525944001%2CISBN%3A0451197968%2CISBN%3A8484505294%2CISBN%3A0385334389%2CISBN%3A0886771404%2CISBN%3A0345348672%2CISBN%3A0441865038%2CISBN%3A0812531108%2CISBN%3A0380721457%2CISBN%3A0452272319%2CISBN%3A0451202511%2CISBN%3A0505522659%2CISBN%3A091581112X%2CISBN%3A0886779898%2CISBN%3A0060081597%2CISBN%3A0743412311%2CISBN%3A1561380911%2CISBN%3A0140431179%2CISBN%3A3423071516%2CISBN%3A0743422740%2CISBN%3A0451408756%2CISBN%3A3257233396%2CISBN%3A0425103552%2CISBN%3A0553572024%2CISBN%3A2264029951%2CISBN%3A2253153265%2CISBN%3A2226133895%2CISBN%3A0060534044%2CISBN%3A2266120166%2CISBN%3A0446612588%2CISBN%3A0751502014%2CISBN%3A0312970234%2C

Fetching Books:  66%|██████▌   | 160/243 [55:44<1:34:49, 68.55s/it]

error at index 10240 (attempt 1): 504 Server Error: Gateway Time-out for url: https://openlibrary.org/api/books?bibkeys=ISBN%3A8838910170%2CISBN%3A0812930991%2CISBN%3A0446612766%2CISBN%3A045121157X%2CISBN%3A0399151540%2CISBN%3A0771099975%2CISBN%3A1560974273%2CISBN%3A1853260037%2CISBN%3A0812523466%2CISBN%3A0515131660%2CISBN%3A044100928X%2CISBN%3A0590452568%2CISBN%3A0399231420%2CISBN%3A0312289197%2CISBN%3A0440220394%2CISBN%3A0316845221%2CISBN%3A0770426182%2CISBN%3A2266023039%2CISBN%3A0140233903%2CISBN%3A0449210677%2CISBN%3A0425190838%2CISBN%3A0684833778%2CISBN%3A068803036X%2CISBN%3A0765343266%2CISBN%3A0765345013%2CISBN%3A0810909650%2CISBN%3A0385508719%2CISBN%3A2070406962%2CISBN%3A0812536037%2CISBN%3A0385487622%2CISBN%3A078688200X%2CISBN%3A031600023X%2CISBN%3A0440235448%2CISBN%3A8433969528%2CISBN%3A0679428879%2CISBN%3A0553802038%2CISBN%3A1551669552%2CISBN%3A0743223551%2CISBN%3A0425193993%2CISBN%3A0451165551%2CISBN%3A3423205903%2CISBN%3A0061056928%2CISBN%3A0811818276%2CISBN%3A0609809636%2C

Fetching Books:  66%|██████▋   | 161/243 [56:59<1:36:23, 70.54s/it]

error at index 10304 (attempt 1): 504 Server Error: Gateway Time-out for url: https://openlibrary.org/api/books?bibkeys=ISBN%3A0091867908%2CISBN%3A0671769227%2CISBN%3A1551665972%2CISBN%3A055357289X%2CISBN%3A0394712587%2CISBN%3A0441029205%2CISBN%3A0006551971%2CISBN%3A0061092096%2CISBN%3A0671648152%2CISBN%3A0865714088%2CISBN%3A0743477316%2CISBN%3A0804104549%2CISBN%3A0380791854%2CISBN%3A0892966777%2CISBN%3A0446377651%2CISBN%3A0192816209%2CISBN%3A3548600808%2CISBN%3A0552150738%2CISBN%3A0140250832%2CISBN%3A0786918047%2CISBN%3A0312962444%2CISBN%3A0345329821%2CISBN%3A1561483176%2CISBN%3A0679767223%2CISBN%3A0451933028%2CISBN%3A0312983395%2CISBN%3A0380727617%2CISBN%3A0374373620%2CISBN%3A067172052X%2CISBN%3A0886775914%2CISBN%3A0449130703%2CISBN%3A1572460733%2CISBN%3A2205054252%2CISBN%3A1563891387%2CISBN%3A0330482203%2CISBN%3A0886777992%2CISBN%3A0812504577%2CISBN%3A0743236009%2CISBN%3A0345288238%2CISBN%3A0060528370%2CISBN%3A0060921056%2CISBN%3A014062063X%2CISBN%3A0140390197%2CISBN%3A0552141275%2C

Fetching Books:  67%|██████▋   | 162/243 [58:23<1:40:35, 74.51s/it]

error at index 10368 (attempt 1): 504 Server Error: Gateway Time-out for url: https://openlibrary.org/api/books?bibkeys=ISBN%3A0452279623%2CISBN%3A0452279615%2CISBN%3A078601475X%2CISBN%3A0441010652%2CISBN%3A0590403605%2CISBN%3A067976657X%2CISBN%3A1577780728%2CISBN%3A0345459407%2CISBN%3A0451456688%2CISBN%3A0515137200%2CISBN%3A014130636X%2CISBN%3A0786889780%2CISBN%3A0440941407%2CISBN%3A0446678112%2CISBN%3A0425062333%2CISBN%3A067974066X%2CISBN%3A0515136298%2CISBN%3A0758201346%2CISBN%3A0505524899%2CISBN%3A0373790376%2CISBN%3A0515134899%2CISBN%3A0553257382%2CISBN%3A067153288X%2CISBN%3A0440235081%2CISBN%3A017443460X%2CISBN%3A0345350480%2CISBN%3A0385007515%2CISBN%3A0349101787%2CISBN%3A849550152X%2CISBN%3A0345440056%2CISBN%3A038097407X%2CISBN%3A0394909674%2CISBN%3A0375413006%2CISBN%3A0886775132%2CISBN%3A0312252943%2CISBN%3A0140096930%2CISBN%3A0060977035%2CISBN%3A0451457307%2CISBN%3A0671870807%2CISBN%3A1400034639%2CISBN%3A0671654160%2CISBN%3A0380730375%2CISBN%3A0399527257%2CISBN%3A050552337X%2C

Fetching Books:  67%|██████▋   | 163/243 [1:00:05<1:50:15, 82.69s/it]

error at index 10432 (attempt 1): 504 Server Error: Gateway Time-out for url: https://openlibrary.org/api/books?bibkeys=ISBN%3A2253097829%2CISBN%3A0451518829%2CISBN%3A3453025237%2CISBN%3A0679740392%2CISBN%3A0752808753%2CISBN%3A0140081135%2CISBN%3A0385495757%2CISBN%3A0671682873%2CISBN%3A1853262218%2CISBN%3A3404129660%2CISBN%3A3492234429%2CISBN%3A0451526570%2CISBN%3A0373078110%2CISBN%3A3453115937%2CISBN%3A006447352X%2CISBN%3A0345417062%2CISBN%3A067972575X%2CISBN%3A0679416463%2CISBN%3A0732257476%2CISBN%3A184149058X%2CISBN%3A0380820560%2CISBN%3A078691551X%2CISBN%3A0786915528%2CISBN%3A0140315977%2CISBN%3A0425067947%2CISBN%3A0345375599%2CISBN%3A0312876637%2CISBN%3A0515121959%2CISBN%3A0060548258%2CISBN%3A0553565214%2CISBN%3A0140071075%2CISBN%3A0140441212%2CISBN%3A0007170866%2CISBN%3A0876120796%2CISBN%3A3257230885%2CISBN%3A0517565188%2CISBN%3A2253006920%2CISBN%3A8420466034%2CISBN%3A3522128001%2CISBN%3A3499158124%2CISBN%3A1551669250%2CISBN%3A3492107001%2CISBN%3A3453861434%2CISBN%3A0449214923%2C

Fetching Books:  67%|██████▋   | 164/243 [1:01:53<1:59:03, 90.43s/it]

error at index 10496 (attempt 1): 504 Server Error: Gateway Time-out for url: https://openlibrary.org/api/books?bibkeys=ISBN%3A2070360105%2CISBN%3A0684848465%2CISBN%3A0312317751%2CISBN%3A0373709366%2CISBN%3A0785270493%2CISBN%3A0399142487%2CISBN%3A0060008725%2CISBN%3A0553094416%2CISBN%3A0743249925%2CISBN%3A0140620621%2CISBN%3A0192834622%2CISBN%3A0060188952%2CISBN%3A0380974991%2CISBN%3A0679781498%2CISBN%3A0060176660%2CISBN%3A0375400516%2CISBN%3A0399148612%2CISBN%3A0525943749%2CISBN%3A0553211919%2CISBN%3A0553212443%2CISBN%3A0451452062%2CISBN%3A0812510011%2CISBN%3A0140366733%2CISBN%3A0330349678%2CISBN%3A0441011632%2CISBN%3A0449910555%2CISBN%3A325721846X%2CISBN%3A0575600306%2CISBN%3A0451526775%2CISBN%3A0440224640%2CISBN%3A0671600419%2CISBN%3A0671876236%2CISBN%3A0061093580%2CISBN%3A0316921173%2CISBN%3A0393057658%2CISBN%3A074346446X%2CISBN%3A1569472424%2CISBN%3A0140238719%2CISBN%3A0451407636%2CISBN%3A3442350247%2CISBN%3A0812532538%2CISBN%3A0930289528%2CISBN%3A3404123689%2CISBN%3A0375718850%2C

Fetching Books:  68%|██████▊   | 165/243 [1:03:25<1:58:05, 90.84s/it]

error at index 10560 (attempt 1): 504 Server Error: Gateway Time-out for url: https://openlibrary.org/api/books?bibkeys=ISBN%3A0440210917%2CISBN%3A1551667126%2CISBN%3A1580050298%2CISBN%3A0609607383%2CISBN%3A0310205719%2CISBN%3A0425126870%2CISBN%3A0425180433%2CISBN%3A0812969650%2CISBN%3A0061091227%2CISBN%3A0345425480%2CISBN%3A044661372X%2CISBN%3A0671701606%2CISBN%3A0440215730%2CISBN%3A0786882573%2CISBN%3A0717283194%2CISBN%3A0156465116%2CISBN%3A0517219018%2CISBN%3A0393324826%2CISBN%3A0385121679%2CISBN%3A8408022938%2CISBN%3A1558532765%2CISBN%3A8437600685%2CISBN%3A0451159411%2CISBN%3A0446518344%2CISBN%3A0836252926%2CISBN%3A0380776162%2CISBN%3A0312979088%2CISBN%3A0805425500%2CISBN%3A1571312471%2CISBN%3A0590306731%2CISBN%3A0553382241%2CISBN%3A0571198775%2CISBN%3A0973047305%2CISBN%3A0671644467%2CISBN%3A1571458735%2CISBN%3A0380820692%2CISBN%3A0061020621%2CISBN%3A0380775271%2CISBN%3A0451207394%2CISBN%3A0740713922%2CISBN%3A0664224199%2CISBN%3A0671020781%2CISBN%3A1883473004%2CISBN%3A0345369343%2C

Fetching Books:  68%|██████▊   | 166/243 [1:04:43<1:51:27, 86.85s/it]

error at index 10624 (attempt 1): 504 Server Error: Gateway Time-out for url: https://openlibrary.org/api/books?bibkeys=ISBN%3A078686804X%2CISBN%3A0553111132%2CISBN%3A0373790147%2CISBN%3A0452283507%2CISBN%3A0345433300%2CISBN%3A0345326784%2CISBN%3A0786863900%2CISBN%3A0152012397%2CISBN%3A0060234814%2CISBN%3A0670873616%2CISBN%3A0425178900%2CISBN%3A0446523232%2CISBN%3A0743205383%2CISBN%3A0345295250%2CISBN%3A1558599177%2CISBN%3A1551666804%2CISBN%3A0140441476%2CISBN%3A0671722565%2CISBN%3A0786926945%2CISBN%3A0312141440%2CISBN%3A0374128715%2CISBN%3A0743451716%2CISBN%3A0140174664%2CISBN%3A0441788386%2CISBN%3A0064430189%2CISBN%3A0345353889%2CISBN%3A081120118X%2CISBN%3A067121148X%2CISBN%3A840149186X%2CISBN%3A050552421X%2CISBN%3A0449002551%2CISBN%3A0886777348%2CISBN%3A0884860884%2CISBN%3A0060928832%2CISBN%3A0451201159%2CISBN%3A0671577786%2CISBN%3A0375702695%2CISBN%3A8401462231%2CISBN%3A1558533834%2CISBN%3A0679740244%2CISBN%3A075640049X%2CISBN%3A0380893002%2CISBN%3A0385326335%2CISBN%3A0451198646%2C

Fetching Books:  71%|███████   | 173/243 [1:12:02<1:07:28, 57.83s/it]

error at index 11072 (attempt 1): 504 Server Error: Gateway Time-out for url: https://openlibrary.org/api/books?bibkeys=ISBN%3A0142437298%2CISBN%3A0805072454%2CISBN%3A1606841750%2CISBN%3A1400078776%2CISBN%3A006057299X%2CISBN%3A0399170863%2CISBN%3A0723247706%2CISBN%3A0399156577%2CISBN%3A0312361270%2CISBN%3A1400034205%2CISBN%3A156512569X%2CISBN%3A0345478169%2CISBN%3A0743406567%2CISBN%3A1400044618%2CISBN%3A159514322X%2CISBN%3A0446698873%2CISBN%3A0760704066%2CISBN%3A1423118243%2CISBN%3A1416554955%2CISBN%3A0060885459%2CISBN%3A0375821821%2CISBN%3A0394709306%2CISBN%3A044022800X%2CISBN%3A1857983416%2CISBN%3A0385340117%2CISBN%3A0698113578%2CISBN%3A0618485228%2CISBN%3A1408852594%2CISBN%3A0099595818%2CISBN%3A1400078431%2CISBN%3A1594489866%2CISBN%3A0312315732%2CISBN%3A0374384738%2CISBN%3A0061240273%2CISBN%3A0440240964%2CISBN%3A0425190455%2CISBN%3A045146379X%2CISBN%3A0312380828%2CISBN%3A145162445X%2CISBN%3A0743297334%2CISBN%3A0440226708%2CISBN%3A1551111721%2CISBN%3A044101268X%2CISBN%3A0553384104%2C

Fetching Books:  75%|███████▍  | 182/243 [1:20:00<46:07, 45.37s/it]  

error at index 11648 (attempt 1): 504 Server Error: Gateway Time-out for url: https://openlibrary.org/api/books?bibkeys=ISBN%3A0375724427%2CISBN%3A0312932804%2CISBN%3A1936365537%2CISBN%3A0060935782%2CISBN%3A0441014739%2CISBN%3A0811216012%2CISBN%3A0586064176%2CISBN%3A0743482778%2CISBN%3A0670011940%2CISBN%3A033041335X%2CISBN%3A0062128027%2CISBN%3A0373772327%2CISBN%3A0374187614%2CISBN%3A0765326361%2CISBN%3A148234873X%2CISBN%3A0312947054%2CISBN%3A0060931728%2CISBN%3A1421501708%2CISBN%3A0553584502%2CISBN%3A0618101365%2CISBN%3A0763628115%2CISBN%3A014044789X%2CISBN%3A142311339X%2CISBN%3A1596437138%2CISBN%3A0345373944%2CISBN%3A1622660757%2CISBN%3A1423166000%2CISBN%3A1442402008%2CISBN%3A1400047463%2CISBN%3A080509461X%2CISBN%3A0143036661%2CISBN%3A0006174434%2CISBN%3A0802720854%2CISBN%3A0062311077%2CISBN%3A0385739168%2CISBN%3A0752815393%2CISBN%3A0316133191%2CISBN%3A1442408626%2CISBN%3A0141183721%2CISBN%3A0316098868%2CISBN%3A0670062278%2CISBN%3A0385340990%2CISBN%3A1607060760%2CISBN%3A0006480101%2C

Fetching Books:  76%|███████▌  | 184/243 [1:22:05<52:03, 52.95s/it]

error at index 11776 (attempt 1): 504 Server Error: Gateway Time-out for url: https://openlibrary.org/api/books?bibkeys=ISBN%3A0439846692%2CISBN%3A0553807056%2CISBN%3A0803740166%2CISBN%3A0142400017%2CISBN%3A1595541209%2CISBN%3A1600963943%2CISBN%3A030711838X%2CISBN%3A1595143971%2CISBN%3A1594480001%2CISBN%3A1451668201%2CISBN%3A0767915305%2CISBN%3A0670018864%2CISBN%3A059306173X%2CISBN%3A0062024035%2CISBN%3A0803734735%2CISBN%3A0553381660%2CISBN%3A067002497X%2CISBN%3A0345534182%2CISBN%3A0399231900%2CISBN%3A1594480036%2CISBN%3A1557091552%2CISBN%3A1455521191%2CISBN%3A1442402326%2CISBN%3A0553813153%2CISBN%3A0060856262%2CISBN%3A0060735414%2CISBN%3A0099471426%2CISBN%3A0316159417%2CISBN%3A0061969559%2CISBN%3A0060577371%2CISBN%3A0451475321%2CISBN%3A1596060204%2CISBN%3A0143039970%2CISBN%3A0823404706%2CISBN%3A1846558913%2CISBN%3A0743231511%2CISBN%3A1579125743%2CISBN%3A0192834401%2CISBN%3A0373775490%2CISBN%3A1439148503%2CISBN%3A031601477X%2CISBN%3A1416914293%2CISBN%3A0618680004%2CISBN%3A1616960922%2C

Fetching Books:  78%|███████▊  | 189/243 [1:25:59<44:09, 49.07s/it]  

error at index 12096 (attempt 1): 504 Server Error: Gateway Time-out for url: https://openlibrary.org/api/books?bibkeys=ISBN%3A0486270645%2CISBN%3A076534078X%2CISBN%3A0446520594%2CISBN%3A1860498809%2CISBN%3A0060510862%2CISBN%3A0571215165%2CISBN%3A0739461192%2CISBN%3A0440241022%2CISBN%3A159463467X%2CISBN%3A1563893150%2CISBN%3A159038783X%2CISBN%3A0805080481%2CISBN%3A0689875347%2CISBN%3A069401298X%2CISBN%3A0330369032%2CISBN%3A0439895979%2CISBN%3A0689711735%2CISBN%3A1596434872%2CISBN%3A031609739X%2CISBN%3A1416505016%2CISBN%3A1416949658%2CISBN%3A0099409968%2CISBN%3A1563892049%2CISBN%3A0446675539%2CISBN%3A0312642989%2CISBN%3A0718149122%2CISBN%3A0062285505%2CISBN%3A038553714X%2CISBN%3A1847679447%2CISBN%3A0345487133%2CISBN%3A0670869392%2CISBN%3A0609602195%2CISBN%3A0062279874%2CISBN%3A0778313654%2CISBN%3A0143123238%2CISBN%3A0060724552%2CISBN%3A044024160X%2CISBN%3A0843955287%2CISBN%3A0553376055%2CISBN%3A0152047387%2CISBN%3A1400043468%2CISBN%3A0061340634%2CISBN%3A0553583182%2CISBN%3A0345495160%2C

Fetching Books:  79%|███████▊  | 191/243 [1:27:55<43:37, 50.34s/it]

error at index 12224 (attempt 1): HTTPSConnectionPool(host='openlibrary.org', port=443): Max retries exceeded with url: /api/books?bibkeys=ISBN%3A1400033411%2CISBN%3A1560254556%2CISBN%3A1906427135%2CISBN%3A1569314063%2CISBN%3A0571219357%2CISBN%3A0316201545%2CISBN%3A076534825X%2CISBN%3A1566195764%2CISBN%3A0307588653%2CISBN%3A0613026667%2CISBN%3A0007251866%2CISBN%3A0064408671%2CISBN%3A081299860X%2CISBN%3A0812974611%2CISBN%3A0399159371%2CISBN%3A0007173040%2CISBN%3A0804138141%2CISBN%3A1612130291%2CISBN%3A1423102282%2CISBN%3A045122986X%2CISBN%3A0778313077%2CISBN%3A0671035975%2CISBN%3A0062224859%2CISBN%3A0375435484%2CISBN%3A0316080845%2CISBN%3A0340893605%2CISBN%3A0312591837%2CISBN%3A0552562521%2CISBN%3A0312938810%2CISBN%3A0312650094%2CISBN%3A0316115002%2CISBN%3A0765326574%2CISBN%3A125000621X%2CISBN%3A0451528182%2CISBN%3A140121813X%2CISBN%3A0064410471%2CISBN%3A042523567X%2CISBN%3A0394839730%2CISBN%3A039586786X%2CISBN%3A054506046X%2CISBN%3A0385513534%2CISBN%3A0575058080%2CISBN%3A0060775971%2CI

Fetching Books:  82%|████████▏ | 200/243 [1:34:02<25:07, 35.07s/it]

error at index 12800 (attempt 1): 504 Server Error: Gateway Time-out for url: https://openlibrary.org/api/books?bibkeys=ISBN%3A1301949825%2CISBN%3A1582406723%2CISBN%3A0345418484%2CISBN%3A0140283307%2CISBN%3A0192840398%2CISBN%3A0515144665%2CISBN%3A1599906953%2CISBN%3A0755358783%2CISBN%3A0062219685%2CISBN%3A0226320618%2CISBN%3A0486414248%2CISBN%3A1416927832%2CISBN%3A0142408654%2CISBN%3A0441016154%2CISBN%3A000712774X%2CISBN%3A0739380338%2CISBN%3A0743474171%2CISBN%3A0451219368%2CISBN%3A031623480X%2CISBN%3A1585673927%2CISBN%3A0156031663%2CISBN%3A1416547037%2CISBN%3A0385504225%2CISBN%3A1400067111%2CISBN%3A1847442404%2CISBN%3A067122350X%2CISBN%3A0060531258%2CISBN%3A0061974587%2CISBN%3A0671655973%2CISBN%3A1423100050%2CISBN%3A0099366614%2CISBN%3A0590396439%2CISBN%3A0060734574%2CISBN%3A0385682824%2CISBN%3A0195007778%2CISBN%3A0756403014%2CISBN%3A1595141987%2CISBN%3A0575090855%2CISBN%3A0451461037%2CISBN%3A1593081197%2CISBN%3A0385538510%2CISBN%3A1903436575%2CISBN%3A0451236718%2CISBN%3A1563894459%2C

Fetching Books:  84%|████████▍ | 204/243 [1:38:25<36:43, 56.51s/it]

error at index 13056 (attempt 1): 504 Server Error: Gateway Time-out for url: https://openlibrary.org/api/books?bibkeys=ISBN%3A0452285453%2CISBN%3A9799822939%2CISBN%3A0486422453%2CISBN%3A1600102379%2CISBN%3A141654707X%2CISBN%3A0446615862%2CISBN%3A0739352350%2CISBN%3A0803738552%2CISBN%3A1595144358%2CISBN%3A0758216432%2CISBN%3A0385520697%2CISBN%3A0297854399%2CISBN%3A0425217299%2CISBN%3A0793551617%2CISBN%3A141651693X%2CISBN%3A0552152978%2CISBN%3A078684907X%2CISBN%3A1501132938%2CISBN%3A0393064646%2CISBN%3A0812993292%2CISBN%3A0198245971%2CISBN%3A0822210789%2CISBN%3A0670026328%2CISBN%3A0062224077%2CISBN%3A1400068312%2CISBN%3A0778329860%2CISBN%3A1402262272%2CISBN%3A0553213873%2CISBN%3A0446556815%2CISBN%3A1401341705%2CISBN%3A0062414216%2CISBN%3A0394298667%2CISBN%3A006218850X%2CISBN%3A067001821X%2CISBN%3A0451229738%2CISBN%3A0312377045%2CISBN%3A0312990960%2CISBN%3A057507681X%2CISBN%3A1476731535%2CISBN%3A0307268195%2CISBN%3A0307593312%2CISBN%3A044657922X%2CISBN%3A0486455599%2CISBN%3A1595542779%2C

Fetching Books:  88%|████████▊ | 214/243 [1:45:32<19:27, 40.26s/it]

error at index 13696 (attempt 1): 504 Server Error: Gateway Time-out for url: https://openlibrary.org/api/books?bibkeys=ISBN%3A0061935085%2CISBN%3A0586203192%2CISBN%3A0446694851%2CISBN%3A0449000621%2CISBN%3A0449010880%2CISBN%3A0380817934%2CISBN%3A0679776192%2CISBN%3A0743484274%2CISBN%3A1451693567%2CISBN%3A0743470125%2CISBN%3A1593080646%2CISBN%3A0140620192%2CISBN%3A0310210062%2CISBN%3A1400033721%2CISBN%3A0316726192%2CISBN%3A0552154288%2CISBN%3A0061992259%2CISBN%3A0380821214%2CISBN%3A034541800X%2CISBN%3A0142406252%2CISBN%3A1595144919%2CISBN%3A0778322858%2CISBN%3A0312949804%2CISBN%3A0802142443%2CISBN%3A0446696137%2CISBN%3A039474067X%2CISBN%3A0812536363%2CISBN%3A0449000753%2CISBN%3A0803731531%2CISBN%3A0062248162%2CISBN%3A0099268701%2CISBN%3A0385532458%2CISBN%3A9993911550%2CISBN%3A0140449183%2CISBN%3A0062012037%2CISBN%3A0198320043%2CISBN%3A1416599398%2CISBN%3A0545424925%2CISBN%3A0670026336%2CISBN%3A0099549344%2CISBN%3A0439925509%2CISBN%3A0375869026%2CISBN%3A142150331X%2CISBN%3A0312424442%2C

Fetching Books: 100%|██████████| 243/243 [1:58:10<00:00, 29.18s/it]


### Work Data

In [18]:
with open(metadata_root / "books_data.json", "r", encoding="utf-8") as f:
    books_data = json.load(f)
print(len(books_data))

15488


In [19]:
def extract_work_keys(books_data):
    work_keys = set()
    for book_info in books_data.values():
        if "works" in book_info["details"] and book_info["details"]["works"]:
            for work in book_info["details"]["works"]:
                work_key = work.get("key")
                if work_key:
                    work_keys.add(work_key)
    return list(work_keys)

In [24]:
def fetch_works_data(work_keys, batch_size=32, delay=0.25):
    works_data = {}
    for i in tqdm(range(0, len(work_keys), batch_size), desc="Fetching Works"):
        batch_keys = work_keys[i : i + batch_size]
        urls = [f"{BASE_URL}{key}.json" for key in batch_keys]
        try:
            for url in urls:
                response = requests.get(url)
                response.raise_for_status()
                data = response.json()
                works_data[data["key"]] = data
                time.sleep(delay)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching works batch starting at index {i}: {e}")
    return works_data

In [25]:
work_keys = extract_work_keys(books_data)
print(len(work_keys))

12357


In [26]:
works_data = fetch_works_data(work_keys)
with open(metadata_root / "works_data.json", "w", encoding="utf-8") as f:
    json.dump(works_data, f, ensure_ascii=False, indent=4)

Fetching Works:   1%|          | 2/387 [01:08<3:40:33, 34.37s/it]


KeyboardInterrupt: 

### Author Data

In [None]:
def extract_author_keys(books_data, works_data):
    author_keys = set()
    # Extract from books
    for book_info in books_data.values():
        if "authors" in book_info and book_info["authors"]:
            for author in book_info["authors"]:
                author_key = author.get("key")
                if author_key:
                    author_keys.add(author_key)
    # Extract from works
    for work_info in works_data.values():
        if "authors" in work_info and work_info["authors"]:
            for author in work_info["authors"]:
                author_key = author.get("author", {}).get("key")
                if author_key:
                    author_keys.add(author_key)
    return list(author_keys)

In [None]:
def fetch_authors_data(author_keys, batch_size=32, delay=0.1):
    authors_data = {}
    for i in tqdm(range(0, len(author_keys), batch_size), desc="Fetching Authors"):
        batch_keys = author_keys[i : i + batch_size]
        urls = [f"{BASE_URL}{key}.json" for key in batch_keys]
        try:
            for url in urls:
                response = requests.get(url)
                response.raise_for_status()
                data = response.json()
                authors_data[data["key"]] = data
                time.sleep(delay)
        except requests.exceptions.RequestException as e:
            print(f"Error fetching authors batch starting at index {i}: {e}")
    return authors_data

In [None]:
author_keys = extract_author_keys(books_data, works_data)
authors_data = fetch_authors_data(author_keys, batch_size=192, delay=2)
with open(metadata_root / "authors_data.json", "w", encoding="utf-8") as f:
    json.dump(authors_data, f, ensure_ascii=False, indent=4)