In [1]:
import os
import requests
import pandas as pd
import asyncio
import time

from json.decoder import JSONDecodeError
from concurrent.futures import ThreadPoolExecutor

In [2]:
authors = [
    "J.K. Rowling",
    "Leo Tolstoy",
    "Fyodor Dostoevsky",
    "Jordan Peterson",
    "Carl Jung",
    "Viktor Frankl",
    "Arthur Schopenhauer",
    "Friedrich Nietzsche",
    "Jocko Willink",
    "Earnest Becker",
    "Bell Hooks",
    "Esther Perel",
    "Simone De Beauvoir",
    "Johann Wolfgang von Goethe",
    "William Shakespeare",
    "Logan Ury",
    "Emil Cioran",
    "Aleksandr Solzhenitsyn",
    "Jean Paul Sarte",
    "Soren Kierkegaard",
    "Immanuel Kant",
    "Mark Manson",
    "John Gottman",
    "Albert Camus",
    "J.R.R. Tolkien",
    "C.S. Lewis",
]

In [3]:
formatted_authors = [author.replace(' ', "%") for author in authors]
formatted_authors

['J.K.%Rowling',
 'Leo%Tolstoy',
 'Fyodor%Dostoevsky',
 'Jordan%Peterson',
 'Carl%Jung',
 'Viktor%Frankl',
 'Arthur%Schopenhauer',
 'Friedrich%Nietzsche',
 'Jocko%Willink',
 'Earnest%Becker',
 'Bell%Hooks',
 'Esther%Perel',
 'Simone%De%Beauvoir',
 'Johann%Wolfgang%von%Goethe',
 'William%Shakespeare',
 'Logan%Ury',
 'Emil%Cioran',
 'Aleksandr%Solzhenitsyn',
 'Jean%Paul%Sarte',
 'Soren%Kierkegaard',
 'Immanuel%Kant',
 'Mark%Manson',
 'John%Gottman',
 'Albert%Camus',
 'J.R.R.%Tolkien',
 'C.S.%Lewis']

In [4]:
def helper(author):
    params = {
        'q': author
    }
    response = requests.get("https://openlibrary.org/search/authors.json", params=params)
    data = response.json()
    docs = data.get('docs')

    res = {
        'author_id': [],
        'name': [],
        'top_subjects': [],
        'work_count': [],
        'ratings_average': [],
        'ratings_sortable': [] ,
        'ratings_count': [],
        'ratings_count_1': [],
        'ratings_count_2': [],
        'ratings_count_3': [],
        'ratings_count_4': [],
        'ratings_count_5': [],
        'want_to_read_count': [],
        'already_read_count': [],
        'currently_reading_count': [],
        'readinglog_count': [],
    }
    
    for doc in docs:
        res['author_id'].append(doc.get('key'))
        res['name'].append(doc.get('name'))
        res['top_subjects'].append(doc.get('top_subjects'))
        res['work_count'].append(doc.get('work_count', 0))
        res['ratings_average'].append(doc.get('ratings_average', 0))
        res['ratings_sortable'].append(doc.get('ratings_sortable', 0))
        res['ratings_count'].append(doc.get('ratings_count', 0))
        res['ratings_count_1'].append(doc.get('ratings_count_1', 0))
        res['ratings_count_2'].append(doc.get('ratings_count_2', 0))
        res['ratings_count_3'].append(doc.get('ratings_count_3', 0))
        res['ratings_count_4'].append(doc.get('ratings_count_4', 0))
        res['ratings_count_5'].append(doc.get('ratings_count_5', 0))
        res['want_to_read_count'].append(doc.get('want_to_read_count', 0))
        res['already_read_count'].append(doc.get('already_read_count', 0))
        res['currently_reading_count'].append(doc.get('currently_reading_count', 0))
        res['readinglog_count'].append(doc.get('readinglog_count', 0))

    res_df = pd.DataFrame(res)
    return res_df

    

with ThreadPoolExecutor() as exe:
    author_data = pd.concat(list(exe.map(helper, formatted_authors)), ignore_index=True, axis=0)

In [5]:
author_data

Unnamed: 0,author_id,name,top_subjects,work_count,ratings_average,ratings_sortable,ratings_count,ratings_count_1,ratings_count_2,ratings_count_3,ratings_count_4,ratings_count_5,want_to_read_count,already_read_count,currently_reading_count,readinglog_count
0,OL14732426A,J.K. Rowling,,0.0,0.000000,2.047372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,OL14534632A,J. K. J.K. Rowling,,0.0,0.000000,2.047372,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,OL23919A,J. K. Rowling,"[Children's fiction, Fantasy fiction, Wizards,...",890.0,4.226277,4.192404,3151.0,157.0,76.0,411.0,760.0,1747.0,34729.0,5516.0,2741.0,42986.0
3,OL738926A,Leo Tolstoy,"[Services for, Viol, Trials (Rape), Services a...",7.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,OL12785803A,Leo Tolstoy,,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
421,OL12495518A,C.S. Lewis | C. S. &#xB8E8;&#xC774;&#xC2A4;,,2.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
422,OL11489024A,LEWIS C.S.,,6.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
423,OL13211749A,Lewis C.S.,,2.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
424,OL9219678A,"Lewis,C.S.",,2.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
author_data.dtypes

author_id                   object
name                        object
top_subjects                object
work_count                 float64
ratings_average            float64
ratings_sortable           float64
ratings_count              float64
ratings_count_1            float64
ratings_count_2            float64
ratings_count_3            float64
ratings_count_4            float64
ratings_count_5            float64
want_to_read_count         float64
already_read_count         float64
currently_reading_count    float64
readinglog_count           float64
dtype: object

In [7]:
author_data.to_csv('./data/author.csv')

# use the author id to search more works from the author

In [8]:
author_data['author_id']

0      OL14732426A
1      OL14534632A
2         OL23919A
3        OL738926A
4      OL12785803A
          ...     
421    OL12495518A
422    OL11489024A
423    OL13211749A
424     OL9219678A
425    OL12514322A
Name: author_id, Length: 426, dtype: object

In [9]:
# async def run():
#    conn = await asyncpg.connect(db_url)
#    values = await conn.fetch('''SELECT ... FROM ... WHERE ...;''')
#    await conn.close()
#    return values

# @app.route('/')
# def test():
#     loop = asyncio.get_event_loop()
#     res = loop.run_until_complete(run())
#     return json.dumps([dict(r) for r in res]) 

# if __name__ == '__main__':
#     app.run()

In [10]:
def get_batches(l: list, n: int):
    """
    splits the large list, dataframe, array
    into manageable chunks or batches with size n
    for later concurrent batch processing
    """
    
    # looping till length l
    for i in range(0, len(l), n): 
        yield l[i:i + n]

In [11]:
def get_sizes(author_id: str) -> tuple[str, int] | None:
    try:
        response = requests.get(f"https://openlibrary.org/authors/{author_id}/works.json")
        data = response.json()
        size = data.get('size')
        print(size)

        return (author_id, size)
    except JSONDecodeError as e:
        print(f"{e} occured with author id {author_id}. Response returned status code: {response.status_code}.")
        print(f"Response headers and body is the ff. {data}, {response.headers}")
        return None

In [12]:
author_ids = author_data['author_id'].tolist()

# divide list into batches of size 32 for batch concurrent
# processing to avoid 429 too many requests errors
author_ids_batches = get_batches(author_ids, 32)
# next(author_ids_batches)

author_ids_sizes_batches = []
for author_ids_batch in author_ids_batches:
    # concurrently process the batch
    with ThreadPoolExecutor() as exe:
        author_ids_sizes_batch = list(exe.map(get_sizes, author_ids_batch))
    
    # close executor and place delay 
    exe.shutdown()
    time.sleep(3)

    # save result in final list
    author_ids_sizes_batches.append(author_ids_sizes_batch)

11
1

1
2
1
3
1
3
12
1
3
17
9
411
41
0
1
1
1
1
1
1
10
2
4
1
3274
0
1
915
0
1
1
1
1
1
2
2
1
2
3
1
1
1
4
30
0
1
4
2
1
2
1
9
1
16
1
2265
1
718
1
74
1
1
1
1
1
1
1
1
2
1
4
2
12
2
2
104
1
1
1
1
1
1
1
1
0
1
2
514
583
3
2724
1
0
11
1
1

1
1
1
1
1
1
1
1
1
1
1
3
1
1
1
4
2
1
1
1
1
1
1
1
1
75
1
1
11

1
2
1
4
11
9
9
1
4
1
302
2
32
67
1
0
1
1
6
1
2685
118
1
1
4
312
1
35
1027
7
11
1
1

1
1
2
5
1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
1
2
1
2
1
27
1
1
1
11

1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
7
2
1
2
10649
6436
15
1
170
1
1
2
2
2
1
2
6
1
2
1
1
1
1
2
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
2
1
1
1
1
1
1
1
1
2
1
1
1
0
1
1
2
1
3
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
1
20
1
1
1
2
1
123
27
1
59
1
1
1
1
1
1
1
1
1
1
0
0
4
6
10
11
5
26
1
1
0
1
1
1
2
10
1
1
312
1847
1
1
1069
347
1
1
1
1
2
1
1
1
1
1
1
1
1
6
1
1
1
4
25
76
1
3
1
1
1
1
1
1
5
13
1
95
0
453
1
4
0
0
0
0
0
0
0
0
0
0
1
0
0
0
0
0
0
1
1
1
1
0
1
1
8
1
1
3
1
1
1122
314
1
1
1
1
1
1
0
3
1
1


In [13]:
author_ids_sizes_batches

[[('OL14732426A', 3),
  ('OL14534632A', 1),
  ('OL23919A', 411),
  ('OL738926A', 3),
  ('OL12785803A', 12),
  ('OL14183059A', 1),
  ('OL14163185A', 17),
  ('OL173132A', 1),
  ('OL13633317A', 1),
  ('OL14420613A', 41),
  ('OL14034712A', 1),
  ('OL13681693A', 2),
  ('OL14360467A', 1),
  ('OL14431373A', 9),
  ('OL12256006A', 1),
  ('OL12959022A', 3),
  ('OL11490779A', 1),
  ('OL14675995A', 1),
  ('OL13677863A', 1),
  ('OL26783A', 3274),
  ('OL2662483A', 0),
  ('OL13592249A', 1),
  ('OL12634132A', 10),
  ('OL13554925A', 1),
  ('OL11442030A', 1),
  ('OL12292966A', 1),
  ('OL13542837A', 0),
  ('OL14550908A', 2),
  ('OL14447153A', 4),
  ('OL14439655A', 1),
  ('OL12506504A', 0),
  ('OL13214190A', 915)],
 [('OL13280086A', 2),
  ('OL13632639A', 2),
  ('OL14485152A', 4),
  ('OL12771857A', 1),
  ('OL13593411A', 1),
  ('OL14536603A', 1),
  ('OL13091422A', 1),
  ('OL22242A', 2265),
  ('OL13350643A', 2),
  ('OL13591819A', 1),
  ('OL13503617A', 3),
  ('OL13737591A', 1),
  ('OL13593589A', 1),
  ('OL234

In [24]:
def get_works(author_id_size: list[str, int]) -> pd.DataFrame:
    author_id, size = author_id_size
    offset = 50

    # 411 % 50 = 11, then 411 - 11 = 400
    # 0 to 400 + 1 with increment of 50
    stop = size - (size % offset)

    # [0, 50, 100, 150, ..., 400]
    paginators = list(range(0, stop + 1, offset))
    print(f"paginators: {paginators}")

    res = {
        'author_id': [],
        'title': [],
        'desc': [],
        'subjects': [],
        'date_created': [],
        'date_modified': [] ,
    }

    try:
        # paginate 
        for i in paginators:
            # what I want to do here is if size is too big parition api requests per batch
            # through `limit` parameter or offset to paginate
            # "https://openlibrary.org/authors/{author_id}/works.json" 0 to 49
            # next offset=50
            # "https://openlibrary.org/authors/{author_id}/works.json?offset=50" 50 to 99 
            # next offset=100
            # "https://openlibrary.org/authors/{author_id}/works.json?offset=100" 100 to 149 
            # ...
            # "https://openlibrary.org/authors/{author_id}/works.json?offset=350" 350 to 399
            # next offset=400
            # "https://openlibrary.org/authors/{author_id}/works.json?offset=400" 400 to 410
            # so what I want is indeces [0, 50, 100, 150, ..., 400] derived from size 411
            # 411 % 50 = 11
            # 450 % 50 = 0
            # if 411 we want to stop our pagination at 400, so 411 - (411 % 50)
            # if 93 we want to stop our pagination at 50, so 93 - (93 % 50)
            # if 20 we want to stop our pagination at 0, so 20 - (20 % 50)
            # 0 to (0 + 1) * 50 or 50 exclusively
            # 1 to (1 + 1) * 50 or 100 exclusively
            # 400 to (400 + 1)
            params = {
                'offset': i
            }
            response = requests.get(f"https://openlibrary.org/authors/{author_id}/works.json", params=params)
            data = response.json()
            entries = data.get('entries')

            for entry in entries:
                key = entry.get('key')
                title = entry.get('title')
                subjects = entry.get('subjects')
                
                # descrioption, created, and last modified can be just a string or a dictionary
                desc = entry.get('description', {}).get('value') if type(entry.get('description')) == dict else entry.get('description')
                created = entry.get('created', {}).get('value') if type(entry.get('created')) == dict else entry.get('created')
                last_modified = entry.get('last_modified', {}).get('value') if type(entry.get('last_modified')) == dict else entry.get('last_modified')

                res['author_id'].append(key)
                res['title'].append(title)
                res['desc'].append(desc)
                res['subjects'].append(subjects)
                res['date_created'].append(created)
                res['date_modified'].append(last_modified)
        
        res_df = pd.DataFrame(res)

        return res_df
    except AttributeError as e:
        print(f"{e} occured. {entry}")
        return

In [None]:
author_works_batches = []
for author_ids_sizes_batch in author_ids_sizes_batches:
    with ThreadPoolExecutor() as exe:
        author_works_batch = list(exe.map(get_works, author_ids_sizes_batch))

    # close executor
    exe.shutdown()
    time.sleep(3)

    # save author works batch
    author_works_batches.extend(author_works_batch)

paginators: [0]
paginators: [0]
paginators: [0, 50, 100, 150, 200, 250, 300, 350, 400]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 850, 900, 950, 1000, 1050, 1100, 1150, 1200, 1250, 1300, 1350, 1400, 1450, 1500, 1550, 1600, 1650, 1700, 1750, 1800, 1850, 1900, 1950, 2000, 2050, 2100, 2150, 2200, 2250, 2300, 2350, 2400, 2450, 2500, 2550, 2600, 2650, 2700, 2750, 2800, 2850, 2900, 2950, 3000, 3050, 3100, 3150, 3200, 3250]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0]
paginators: [0, 50, 100, 150, 200, 250, 300, 350, 400, 450, 500, 550, 600, 650, 700, 750, 800, 

In [32]:
author_works_flat = [x for xs in author_works_batches for x in xs]

In [34]:
author_works = pd.concat(author_works_flat, ignore_index=True, axis=0)
author_works

Unnamed: 0,author_id,title,desc,subjects,date_created,date_modified
0,/works/OL42301477W,Harry Potter and the deathly hallows,,,2024-12-28T08:13:14.530095,2024-12-28T08:14:25.236582
1,/works/OL41942543W,Harry Potter and the Deathly Hallows,,"[Harry Potter (Fictitious character), death ea...",2024-11-19T22:38:30.091489,2024-11-19T22:45:00.045478
2,/works/OL41905393W,Harry Potter og Ildbegeret (Norwegian Edition),,,2024-10-06T07:13:26.729829,2024-10-06T07:13:26.729829
3,/works/OL40716312W,Fantastic Beasts and Where to Find Them,,,2024-09-26T18:00:16.313120,2024-09-26T18:00:16.313120
4,/works/OL30668340W,"Short Stories from Hogwarts of Heroism, Hardsh...",‘Minerva was the Roman goddess of warriors and...,"[Harry Potter, Magic, fiction, Wizards, fictio...",2022-11-30T09:45:38.966290,2024-12-23T03:48:41.671222
...,...,...,...,...,...,...
40144,/works/OL31553567W,Dieu au banc des accusés nouvelle édition revu...,,,2022-12-07T21:09:32.833872,2022-12-07T21:09:32.833872
40145,/works/OL31553564W,RÉFLEXIONS SUR LES PSAUMES (NVELLE ÉD),,,2022-12-07T21:09:31.465439,2022-12-07T21:09:31.465439
40146,/works/OL36758782W,The Horse and His Boy,,,2023-09-29T01:06:51.138787,2023-09-29T01:06:51.138787
40147,/works/OL24482596W,The Silver Chair - C. S. Lewis,,,2021-05-15T20:27:01.887502,2021-05-15T20:27:01.887502


In [35]:
author_works.to_csv('./data/author_works.csv')

In [None]:
# # SuperFastPython.com
# # example of getting results for tasks as they are completed
# from time import sleep
# from random import random
# from concurrent.futures import ThreadPoolExecutor
# from concurrent.futures import as_completed

# # task function to be executed in the thread pool
# def task(task_id):
#     # generate a random value between 0 and 1
#     value = random()
#     # sleep for some number of seconds
#     sleep(value * 10)
#     # return a result
#     return f'Task={task_id}: {value}'

# # protect the entry point
# if __name__ == '__main__':
#     # start the thread pool
#     with ThreadPoolExecutor(10) as exe:
#         # submit tasks and collect futures
#         futures = [exe.submit(task, i) for i in range(10)]
#         # process task results as they are available
#         for future in as_completed(futures):
#             # retrieve the result
#             result = future.result()
#             # report the result
#             print(result)