In [3]:
import requests
import pandas as pd
import time
categories = [
    "fiction", "historical fiction", "mystery", "thriller", "romance", "fantasy",
    "romantsy", "science fiction", "horror", "young adult", "nonfiction", "memoir",
    "autobiography", "history", "biography", "humor", "historical-fiction", "gay",
    "lgbt", "queer", "paranormal", "historical romance", "contemporary", "classic",
    "comics", "manga"
]
def get_books_by_category(category, max_books=2000):
    books = []
    page = 1
    total = 0
    print(f":libros: Getting books for category: {category}")
    while total < max_books:
        url = f"https://openlibrary.org/search.json?q={category}&language=eng&page={page}"
        response = requests.get(url)
        if response.status_code != 200:
            print(f"Error en página {page} para {category}")
            break
        data = response.json()
        docs = data.get("docs", [])
        if not docs:
            break
        for doc in docs:
            books.append({
                "category": category,
                "title": doc.get("title"),
                "original_title": doc.get("title"),  # no field for original_title
                "authors": ", ".join(doc.get("author_name", [])),
                "first_publish_year": doc.get("first_publish_year"),
                "language": ", ".join(doc.get("language", [])) if doc.get("language") else None,
                "edition_count": doc.get("edition_count"),
                "cover_id": doc.get("cover_i"),
                "cover_url": f"https://covers.openlibrary.org/b/id/{doc.get('cover_i')}-L.jpg" if doc.get("cover_i") else None,
                "isbn": ", ".join(doc.get("isbn", [])) if doc.get("isbn") else None,

            })
            total += 1
            if total >= max_books:
                break
        page += 1
        time.sleep(1)  # Evitar ser bloqueado
    return books

In [4]:
all_books = []
for cat in categories:
    books = get_books_by_category(cat, max_books=2000)
    all_books.extend(books)
df = pd.DataFrame(all_books)
# Eliminar duplicados por ISBN (o título si quieres ampliar)
df.drop_duplicates(subset=["isbn", "title"], inplace=True)
# Guardar
df.to_csv("openlibrary_books_english.csv", index=False)
print(f"Total de libros únicos recolectados: {len(df)}")

:libros: Getting books for category: fiction
:libros: Getting books for category: historical fiction
:libros: Getting books for category: mystery
:libros: Getting books for category: thriller
:libros: Getting books for category: romance
:libros: Getting books for category: fantasy
:libros: Getting books for category: romantsy
:libros: Getting books for category: science fiction
:libros: Getting books for category: horror
:libros: Getting books for category: young adult
:libros: Getting books for category: nonfiction
:libros: Getting books for category: memoir
:libros: Getting books for category: autobiography
:libros: Getting books for category: history
:libros: Getting books for category: biography
:libros: Getting books for category: humor
:libros: Getting books for category: historical-fiction
:libros: Getting books for category: gay
:libros: Getting books for category: lgbt
:libros: Getting books for category: queer
:libros: Getting books for category: paranormal
:libros: Getting b

In [8]:
API_KEY = "AIzaSyCmfH7t1IJB2kf4OoOarapigXMxCBtEJn0"

In [9]:
def search_google_books(title, author=None):
    query = f"{title}"
    if author:
        query += f"+inauthor:{author}"
    url = f"https://www.googleapis.com/books/v1/volumes?q={query}&langRestrict=en&key={API_KEY}"

    response = requests.get(url)
    if response.status_code != 200:
        return None

    data = response.json()
    if "items" not in data:
        return None

    book_info = data["items"][0]["volumeInfo"]
    return {
        "google_title": book_info.get("title"),
        "google_authors": ", ".join(book_info.get("authors", [])),
        "publisher": book_info.get("publisher"),
        "publishedDate": book_info.get("publishedDate"),
        "description": book_info.get("description"),
        "pageCount": book_info.get("pageCount"),
        "categories": ", ".join(book_info.get("categories", [])),
        "averageRating": book_info.get("averageRating"),
        "ratingsCount": book_info.get("ratingsCount"),
        "google_language": book_info.get("language"),
        "isbn_13": next((id["identifier"] for id in book_info.get("industryIdentifiers", []) if id["type"] == "ISBN_13"), None),
        "isbn_10": next((id["identifier"] for id in book_info.get("industryIdentifiers", []) if id["type"] == "ISBN_10"), None),
    }

In [10]:
from tqdm import tqdm
import pandas as pd
import time

df = pd.read_csv("openlibrary_books_english.csv")

google_data = []
batch_size = 100

for i, (_, row) in enumerate(tqdm(df.iterrows(), total=len(df))):
    title = row["title"]
    author = row["authors"].split(",")[0] if pd.notnull(row["authors"]) else None
    gb_info = search_google_books(title, author)
    google_data.append(gb_info)
    
    # Guardar cada 100 libros
    if (i + 1) % batch_size == 0 or (i + 1) == len(df):
        temp_df = pd.DataFrame(google_data)
        partial_df = pd.concat([df.iloc[:len(google_data)].reset_index(drop=True), temp_df], axis=1)
        partial_df.to_csv(f"books_with_google_data_up_to_{i+1}.csv", index=False)
        print(f"Guardado parcial: {i+1} libros")
    
    time.sleep(0.2)  # pequeña pausa para no abusar (aunque tengas API key)


  0%|          | 100/31017 [02:10<11:23:34,  1.33s/it]

Guardado parcial: 100 libros


  1%|          | 200/31017 [04:21<10:49:38,  1.26s/it]

Guardado parcial: 200 libros


  1%|          | 300/31017 [06:30<11:01:16,  1.29s/it]

Guardado parcial: 300 libros


  1%|▏         | 399/31017 [08:37<11:02:04,  1.30s/it]


AttributeError: 'NoneType' object has no attribute 'keys'

In [1]:
import requests
from bs4 import BeautifulSoup
import re
import time

headers = {
    "User-Agent": "Mozilla/5.0"
}

def search_goodreads(title, author=None):
    query = f"{title} {author}" if author else title
    search_url = f"https://www.goodreads.com/search?q={requests.utils.quote(query)}"
    
    search_resp = requests.get(search_url, headers=headers)
    if search_resp.status_code != 200:
        return None
    
    search_soup = BeautifulSoup(search_resp.text, "lxml")
    
    # Encuentra el primer resultado (enlace al libro)
    result_link = search_soup.select_one("a.bookTitle")
    if not result_link:
        return None
    
    book_url = "https://www.goodreads.com" + result_link.get("href")
    book_resp = requests.get(book_url, headers=headers)
    if book_resp.status_code != 200:
        return None
    
    book_soup = BeautifulSoup(book_resp.text, "lxml")

    # ⭐ Rating
    try:
        rating = book_soup.select_one("span[itemprop='ratingValue']").get_text(strip=True)
    except:
        rating = None

    # 🗳️ Ratings count
    try:
        rating_count_text = book_soup.select_one("meta[itemprop='ratingCount']")["content"]
        ratings_count = int(rating_count_text.replace(",", ""))
    except:
        ratings_count = None

    # 📖 Description
    try:
        desc_tag = book_soup.select_one("div#description span[style]")
        description = desc_tag.get_text(strip=True) if desc_tag else None
    except:
        description = None

    return {
        "goodreads_url": book_url,
        "averageRating": rating,
        "ratingsCount": ratings_count,
        "description_gr": description
    }


In [2]:
import pandas as pd
from tqdm import tqdm

df = pd.read_csv("openlibrary_books_english.csv")

goodreads_data = []
for _, row in tqdm(df.head(10000).iterrows(), total=10000):  # empieza con head(50)
    title = row["title"]
    author = row["authors"].split(",")[0] if pd.notnull(row["authors"]) else None
    gr_data = search_goodreads(title, author)
    goodreads_data.append(gr_data)
    time.sleep(2)  # para evitar bloqueo

gr_df = pd.DataFrame(goodreads_data)
df_combined = pd.concat([df.head(10000).reset_index(drop=True), gr_df], axis=1)

df_combined.to_csv("books_with_goodreads.csv", index=False)


  0%|          | 20/10000 [03:58<33:04:41, 11.93s/it]


KeyboardInterrupt: 