## Inspect Data

You can perform a volumes search by sending an HTTP GET request to the following URI:


https://www.googleapis.com/books/v1/volumes?q=search+terms
This request has a single required parameter:

q - Search for volumes that contain this text string. There are special keywords you can specify in the search terms to search in particular fields, such as:
- intitle: Returns results where the text following this keyword is found in the title.
- inauthor: Returns results where the text following this keyword is found in the author.
- inpublisher: Returns results where the text following this keyword is found in the publisher.
- subject: Returns results where the text following this keyword is listed in the category list of the volume.
- isbn: Returns results where the text following this keyword is the ISBN number.
- lccn: Returns results where the text following this keyword is the Library of Congress Control Number.
- oclc: Returns results where the text following this keyword is the Online Computer Library Center number.

### Request
Here is an example of searching for Daniel Keyes' "Flowers for Algernon":


GET https://www.googleapis.com/books/v1/volumes?q=flowers+inauthor:keyes&key=yourAPIKey

## Get Authors

In [None]:
import requests

# Define the search parameters
category = 'fiction'
start_index = 0
max_results = 100000 

# API URL
url = f"https://openlibrary.org/search.json?q=subject:{category}&start={start_index}&limit={max_results}&fields=author_name"

# Send the GET request
response = requests.get(url)
data = response.json()
authors = []

# Check if 'docs' is in the response
if 'docs' in data:
    # Print the total number of books found
    print(f"Total books found: {data['num_found']}")
    
    # Iterate through the books and print their details
    for idx, book in enumerate(data['docs']):
        author = book.get('author_name', ['No author available'])
        if len(author) == 1:
            authors.append(author[0])
        else:
            pass
else:
    print("No books found or 'docs' key is missing in the response.")

# Filter out duplicate authors
authors_filtered = set(authors)

# Store the filtered authors in a text file
data_path = "../data/authors_filtered.txt"
with open(data_path, 'w') as f:
    for author in authors_filtered:
        f.write(f"{author}\n")

## Scraping Google Books

In [4]:
import requests
import json 


API_KEY = "AIzaSyDSSplnRCPbL_k_ukQovrz6-lOH94RA13U"
# https://developers.google.com/books/docs/v1/using#APIKey

authors_path = "../data/authors_filtered.txt"
with open(authors_path, 'r') as f:
    authors = [line.strip() for line in f.readlines()]

In [7]:

def setting_api_params(author, start_index=0, language='en', country='US', max_results=40):
    params = {
        'q': f'inauthor:{author}',
        'startIndex': start_index,
        'langRestrict': language,
        'country': country,
        'maxResults': max_results,
        'printType': 'books',
        'fields': '''totalItems,items(
                id,
                volumeInfo(
                    title,subtitle,authors,publisher,publishedDate,description,
                    pageCount,mainCategory,categories,averageRating,
                    ratingsCount,language,previewLink,
                    imageLinks(thumbnail)
                ),
                saleInfo(
                    country,saleability,
                    listPrice(amount,currencyCode),
                    buyLink
                )
                    )'''.replace('\n', '').replace(' ', ''),
                    'key': API_KEY,
            }
    
    return params


In [None]:
def calculate_text_overlap(text1, text2, word_threshold=5):
    """
    Calculate what percentage of text1 appears in text2 using sliding window approach.
    """
    words1 = text1.lower().split()
    words2_text = ' '.join(text2.lower().split())
    
    if len(words1) < word_threshold:
        return 0.0
    
    matches = 0
    i = 0
    while i <= len(words1) - word_threshold:
        window = ' '.join(words1[i:i + word_threshold])
        if window in words2_text:
            matches += word_threshold
            i += word_threshold  # Skip ahead properly
        else:
            i += 1
    
    return min(matches / len(words1), 1.0)  # Cap at 100%

def find_similar_descriptions(books, similarity_threshold=0.8, min_description_length=50):
    """
    Find books with descriptions where 80%+ of text appears in another description.
    Returns indices of books to remove.
    """
    descriptions = []
    valid_indices = []
    
    # Extract descriptions and track valid indices
    for idx, book in enumerate(books):
        desc = book.get('volumeInfo', {}).get('description', '').strip()
        if len(desc) >= min_description_length:
            descriptions.append(desc)
            valid_indices.append(idx)
    
    to_remove = set()
    
    for i, desc1 in enumerate(descriptions):
        if valid_indices[i] in to_remove:
            continue
            
        for j, desc2 in enumerate(descriptions):
            if i != j and valid_indices[j] not in to_remove:
                # Check if desc1 is mostly contained in desc2
                overlap1_in_2 = calculate_text_overlap(desc1, desc2)
                overlap2_in_1 = calculate_text_overlap(desc2, desc1)
                
                if overlap1_in_2 >= similarity_threshold or overlap2_in_1 >= similarity_threshold:
                    # High overlap detected - remove both books
                    max_overlap = max(overlap1_in_2, overlap2_in_1)
                    to_remove.add(valid_indices[i])
                    to_remove.add(valid_indices[j])
                    print(f"Removing both books {valid_indices[i]} and {valid_indices[j]}: {max_overlap:.1%} overlap")
                    break
    
    return list(to_remove)

def deduplicate_by_description_similarity(books, similarity_threshold=0.8):
    """
    Remove books with highly similar descriptions.
    """
    indices_to_remove = find_similar_descriptions(books, similarity_threshold)
    return [book for idx, book in enumerate(books) if idx not in indices_to_remove]

def enhanced_deduplicate_books(books, verbose=False):
    """
    Enhanced deduplication combining title and description similarity.
    """
    # First deduplicate by titles
    books = deduplicate_books_by_shortest_title(books, verbose)
    
    if verbose:
        print(f"\nAfter title deduplication: {len(books)} books")
    
    # Then deduplicate by description similarity
    books = deduplicate_by_description_similarity(books, similarity_threshold=0.8)
    
    if verbose:
        print(f"After description deduplication: {len(books)} books")
    
    return books

# Modified version of your existing function
def deduplicate_books_by_shortest_title(books, verbose=False):
    """
    Deduplicate books by their shortest title.
    """
    titles = [book['volumeInfo'].get('title', '').strip().lower() for book in books]

    to_remove = []
    for idx_outer, title in enumerate(titles):
        if idx_outer in to_remove:
            continue
        # this checks for duplicates and skips if none found
        temp_no_outer = titles[:idx_outer] + titles[idx_outer+1:]
        if any(title in temp_no_outer[idx_temp] for idx_temp in range(len(temp_no_outer))):
            if verbose:
                print(f'Duplicate found: {title}')
            for idx_inner in range(len(titles)):
                if idx_inner == idx_outer:
                    continue
                if titles[idx_outer] in titles[idx_inner]:
                    if verbose:
                        print(f'  "{titles[idx_inner]}" contains "{titles[idx_outer]}"')
                    if ((len(titles[idx_outer]) < len(titles[idx_inner])) and (len(titles[idx_outer]) > 3)) or (len(titles[idx_outer]) == len(titles[idx_inner])):
                        to_remove.append(idx_inner)
                        continue
        else:
            if verbose:
                print('No duplicate found')

    return [books[idx] for idx in range(len(books)) if idx not in to_remove]

def get_filtered_books(author, iterations=3):
    """
    Get a list of filtered book IDs for a specific author with enhanced deduplication.
    """
    # total items is unreliable, using an euristic of 3 iterations to gather more results per author
    iteration = 0
    filtered_books = []
    while iteration < iterations:
        print(f"Fetching books for {author}, iteration {iteration + 1}")
        url = f"https://www.googleapis.com/books/v1/volumes"
        params = setting_api_params(author, start_index=iteration * 40)
        response = requests.get(url, params=params)
        books = response.json().get('items', [])
        
        # Get the current iteration length
        current_iteration_length = len(books)

        # Filter books
        books[:] = [book for book in books if 
                    book['volumeInfo'].get('authors', []) == [author]
                    and book['volumeInfo'].get('language', '') == 'en'
                    and book['volumeInfo'].get('title', '') != ''
                    and len(book['volumeInfo'].get('description', '')) > 100
                    and book['id']
                    ]
        
        filtered_books.extend(books)

        if current_iteration_length < 40:
            break
        iteration += 1

    # Enhanced deduplication after all iterations
    books_output = enhanced_deduplicate_books(filtered_books, verbose=True)
    ids = [books_output[idx]['id'] for idx in range(len(books_output))]
    
    return books_output, ids

In [610]:
# Test
books, ids = get_filtered_books("Stanislaw Lem", 10)
descriptions = [book['volumeInfo'].get('description', '').strip().lower() for book in books]

Fetching books for Stanislaw Lem, iteration 1
Fetching books for Stanislaw Lem, iteration 2
Fetching books for Stanislaw Lem, iteration 3
Fetching books for Stanislaw Lem, iteration 4
Fetching books for Stanislaw Lem, iteration 5
No duplicate found
No duplicate found
Duplicate found: the truth and other stories
  "the truth and other stories" contains "the truth and other stories"
No duplicate found
No duplicate found
No duplicate found
Duplicate found: the star diaries
  "the star diaries" contains "the star diaries"
No duplicate found
No duplicate found
Duplicate found: his master's voice
  "his master's voice" contains "his master's voice"
No duplicate found
No duplicate found
Duplicate found: the invincible
  "the invincible" contains "the invincible"
Duplicate found: tales of pirx the pilot
  "more tales of pirx the pilot" contains "tales of pirx the pilot"
No duplicate found
No duplicate found
No duplicate found
No duplicate found
No duplicate found
Duplicate found: the seventh v

In [628]:
authors

['Estelle Laure',
 'Diana Velez',
 'Brod, Max',
 'Kate Alcott',
 'Kirby Larson',
 'Sam Siciliano',
 'George Crenshaw',
 'Jennifer Haymore',
 'Reyna Grande',
 'Maureen Tan',
 'Sharon Ihle',
 'James Ellroy',
 'Randy Ribay',
 'W. R. Philbrick',
 'Jaclyn Reding',
 'Kate Seredy',
 'Marvin Harris',
 'Barry Unsworth',
 'Corey R. Tabor',
 'Harriette Louisa Simpson Arnow',
 'Smart Kids Publishing',
 'Tim Maughan',
 'Cheryl Brooks',
 'Trevanian',
 'Jackie French',
 'Confucius',
 'Steve Sheinkin',
 'Gillian Cross',
 'Edwin Corley',
 'Kathy Reichs',
 'Kate Duke',
 'Kiley Roache',
 'Gregg Loomis',
 'Mariama Bâ',
 'William Gibson',
 'Scott Reintgen',
 'Emily Blackwood',
 'Wei Hui',
 'Carla M. Pacis',
 'Margaret Mahy',
 'Barbara Kyle',
 'Deb Kastner',
 'Pedro Calderón de la Barca',
 'Elma Shaw',
 'Holly Hobbie',
 'Tesni Morgan',
 'Barbara Kaye',
 'Nicholas Adams',
 'Shirley Damsgaard',
 'Michelle Good',
 'Kirstin Cronn-Mills',
 'Adam Rubin',
 'Paige McKenzie',
 'Jean Warmbold',
 'Annika Thor',
 'Dani

In [None]:
import os

metadata_path = "../data/books_scraping_metadata.json"
os.makedirs(os.path.dirname(metadata_path), exist_ok=True)

if not os.path.exists(metadata_path):
    metadata = {"ids": [], "books": [], "authors": [], "authors_ids": []}
    with open(metadata_path, "w", encoding="utf-8") as f:
        json.dump(metadata, f, indent=4, ensure_ascii=False)
else:
    # If file exists, just load it
    with open(metadata_path, "r", encoding="utf-8") as f:
        metadata = json.load(f)


n_authors_to_process = 3
try: 
    start_index = metadata['authors_ids'][-1] + 1
except IndexError:
    start_index = 0
stop_index = start_index + n_authors_to_process
cur_index = start_index


for author_idx, author in enumerate(authors[start_index:stop_index]):
    print(f"Processing author: {author}")
    books, ids = get_filtered_books(author, 10)
    print(f"Found {len(books)} unique books for author: {author}\n")

    metadata["ids"].extend(ids)
    metadata["books"].extend(books)
    metadata["authors"].extend([author])
    metadata["authors_ids"].extend([cur_index])
    cur_index += 1


with open(metadata_path, "w", encoding="utf-8") as f:
    json.dump(metadata, f, ensure_ascii=False, indent=4)



Processing author: Estelle Laure
Fetching books for Estelle Laure, iteration 1
Fetching books for Estelle Laure, iteration 2
Found 11 unique books for author: Estelle Laure

Processing author: Diana Velez
Fetching books for Diana Velez, iteration 1
Fetching books for Diana Velez, iteration 2
Fetching books for Diana Velez, iteration 3
Fetching books for Diana Velez, iteration 4
Found 2 unique books for author: Diana Velez



In [29]:
results = []
for idx in range(len(books_list)):
    books_output = enhanced_deduplicate_books(books_list[idx], verbose=True)
    ids = [books_output[idx]['id'] for idx in range(len(books_output))]
    results.extend(books_output)

No duplicate found
No duplicate found
No duplicate found
No duplicate found
No duplicate found
No duplicate found
No duplicate found
No duplicate found
No duplicate found
Duplicate found: city of hooks and scars
  "city of hooks and scars-city of villains, book 2" contains "city of hooks and scars"
No duplicate found

After title deduplication: 10 books
After description deduplication: 10 books
No duplicate found
No duplicate found

After title deduplication: 2 books
After description deduplication: 2 books


In [27]:
books_output = enhanced_deduplicate_books(books_list, verbose=True)
ids = [books_output[idx]['id'] for idx in range(len(books_output))]

TypeError: list indices must be integers or slices, not str

In [None]:
books_list

[[{'id': 'YuAMBgAAQBAJ',
   'volumeInfo': {'title': 'This Raging Light',
    'authors': ['Estelle Laure'],
    'publisher': 'Houghton Mifflin Harcourt',
    'publishedDate': '2015-12-22',
    'description': '“A funny, heartwrenching, and soulful” debut novel about family, friends, and first love from the acclaimed author of Mayhem and But Then I Came Back (Bustle). Lucille Bennett is pushed into adulthood after her mom decides to take a break from parenting, from responsibility, from Lucille and her little sister, Wren. Left to cover for her absentee parents, Lucille thinks, “Wren and Lucille. Lucille and Wren. I will do whatever I have to. No one will pull us apart.” Now is not the time for level-headed Lucille to fall in love. But love—messy, inconvenient love—is what she’s about to experience when she falls for Digby Jones, her best friend’s brother. With blazing longing that builds to a fever pitch, Estelle Laure’s soulful debut will keep readers hooked and hoping until the very la

In [None]:
temp_file_path = '../data_scraping/data/temp_raw_books_manual__2025-08-27T10:54:58+00:00.json'

with open(temp_file_path, 'r') as f:
    books_list = json.load(f).get('raw_books_list', [])
    
with open(temp_file_path, 'r') as f:
    metadata = json.load(f).get('metadata', {})

In [39]:
books_list[2]

[]

In [1]:
import boto3

In [None]:
client = boto3.client('s3', region_name='us-east-1')
