In [None]:
# Mount Google Drive at /content/drive; the dataset CSVs used by this
# notebook are read from paths under that mount point.
from google.colab import drive
drive.mount('/content/drive')

In [13]:
import pandas as pd

# Read the original Goodreads dataset from the mounted Drive folder.
df = pd.read_csv(
    "/content/drive/MyDrive/Inteligencia Artificial I/datasets/goodreads_ds.csv"
)

# Preview five rows drawn with a fixed seed, keeping only the identifying columns.
sample_df = df.sample(n=5, random_state=42).loc[
    :, ["Book Id", "Title", "Author", "isbn", "isbn13"]
]
display(sample_df)


Unnamed: 0,Book Id,Title,Author,isbn,isbn13
8663,33438,Peach Cobbler Murder (Hannah Swensen #7),Joanne Fluke,0758201559,9780758201553
483,1537,The Oedipus Plays of Sophocles: Oedipus the Ki...,Sophocles/Paul Roche,0452011671,9780452011670
8403,32329,Air Gear Vol. 3 (Air Gear #3),Oh! Great/Â§ßÊöÆ Á∂≠‰∫∫,0345492803,9780345492807
6382,24006,Rule #1: The Simple Strategy for Successful In...,Phil Town,0307336131,9780307336132
1844,6544,The Scarpetta Collection: All That Remains / C...,Patricia Cornwell,074325581X,9780743255813


In [14]:
import requests

def _has_text(value):
    """Return True when value is neither None/NaN nor a blank string."""
    return bool(pd.notna(value)) and str(value).strip() != ""


def _extract_description(payload):
    """Pull the description text out of an Open Library JSON record, or None."""
    if not isinstance(payload, dict):
        return None
    description = payload.get("description")
    # The field is either a plain string or a {"value": "..."} mapping.
    if isinstance(description, dict):
        description = description.get("value")
    return description or None


def _get_json(url, params=None, timeout=5):
    """GET url and return the decoded JSON body, or None on any failure."""
    try:
        response = requests.get(url, params=params, timeout=timeout)
        if response.status_code != 200:
            return None
        return response.json()
    except (requests.RequestException, ValueError):
        # Best-effort lookup: network errors, timeouts and malformed JSON all
        # mean "no description available" rather than aborting the whole run.
        return None


def fetch_description(isbn, title=None, author=None):
    """
    Fetch a book description from the Open Library API.

    The edition is looked up by ISBN first. If that yields no description
    (or no ISBN is given), the function falls back to a title + author search
    and reads the description of the first matching record.

    Parameters
    ----------
    isbn : str or NaN/None
        ISBN used for the direct lookup; skipped when missing or blank.
    title : str, optional
        Book title used for the fallback search; skipped when missing or blank.
    author : str, optional
        Author name appended to the search query when present.

    Returns
    -------
    str or None
        The description text, or None when nothing was found or a request failed.
    """
    base_url = "https://openlibrary.org"

    # 1) Direct lookup by ISBN.
    if _has_text(isbn):
        isbn_text = str(isbn).strip()
        description = _extract_description(_get_json(f"{base_url}/isbn/{isbn_text}.json"))
        if description:
            return description

    # 2) Fallback: search by title and author.
    if _has_text(title):
        # Missing values (None/NaN) are dropped instead of being searched as "nan".
        query = " ".join(str(part).strip() for part in (title, author) if _has_text(part))
        # Passing the query through `params` URL-encodes it; characters such as
        # '#' or '&' in a title would otherwise cut the query string short.
        search = _get_json(f"{base_url}/search.json", params={"q": query})
        docs = search.get("docs") if isinstance(search, dict) else None
        if docs:
            first = docs[0]
            key = first.get("key") if isinstance(first, dict) else None
            if key:
                return _extract_description(_get_json(f"{base_url}{key}.json"))

    return None


In [16]:
# Draw ten rows with a fixed seed, keeping only the identifying columns.
sample_df = df.sample(n=10, random_state=42)[["Book Id", "Title", "Author", "isbn", "isbn13"]]

# Add a column holding the description fetched for each sampled book
# (one lookup per row, in row order).
sample_df["description"] = [
    fetch_description(isbn, title, author)
    for isbn, title, author in zip(
        sample_df["isbn"], sample_df["Title"], sample_df["Author"]
    )
]

display(sample_df.loc[:, ["Title", "Author", "description"]])


Unnamed: 0,Title,Author,description
8663,Peach Cobbler Murder (Hannah Swensen #7),Joanne Fluke,"With The Cookie Jar, Hannah Swensen has a mout..."
483,The Oedipus Plays of Sophocles: Oedipus the Ki...,Sophocles/Paul Roche,
8403,Air Gear Vol. 3 (Air Gear #3),Oh! Great/Â§ßÊöÆ Á∂≠‰∫∫,
6382,Rule #1: The Simple Strategy for Successful In...,Phil Town,
1844,The Scarpetta Collection: All That Remains / C...,Patricia Cornwell,
398,The Histories,Herodotus/Carolyn Dewald/Robin Waterfield,Herotodus recounts the incidents preceding and...
1118,The Subterraneans,Jack Kerouac,The Subterraneans is a 1958 novella by Beat Ge...
9951,Artful Paper Dolls: New Ways to Play with a Tr...,Terry Taylor,
8758,La falsificadora,Jennifer Crusie/Marina Mariasch,
3445,The Guardian,Jane Hamilton,Story of Holden Caufield with his idiosyncrasi...


In [17]:
# Boolean mask: True for rows where a description was retrieved.
mask = sample_df["description"].notna()

# Count the descriptions obtained for the sample. The 38% figure came from a
# run that sampled 100 books; because of execution time, running with sample
# sizes below 50 is suggested -- coverage stays between 35% and 40% throughout.
num_total = sample_df.shape[0]
num_with_desc = mask.sum()
coverage = num_with_desc / num_total * 100

print(f"Descriptions found: {num_with_desc}/{num_total} ({coverage:.2f}%)")


Descriptions found: 4/10 (40.00%)


Razones del tiempo de ejecución prolongado

El tiempo de ejecución elevado en la obtención de descripciones mediante la API de Open Library puede explicarse por los siguientes factores:

Número de solicitudes HTTP:
Cada libro requiere al menos una consulta a la API. En aquellos casos en los que el ISBN no permite un acceso directo, se recurre a una búsqueda por título y autor, lo que implica realizar dos o incluso tres solicitudes por libro. En términos prácticos, 100 libros pueden generar aproximadamente 200 solicitudes HTTP.

Latencia de red y servidor:
Cada solicitud presenta una latencia inherente (tiempo de transmisión en la red más el tiempo de respuesta del servidor). Incluso suponiendo que el servidor responda en un intervalo de 0.5 a 1 segundo, la acumulación de miles de solicitudes genera un tiempo total significativo.

Ejecución secuencial:
El proceso se implementó de manera secuencial: Python espera la finalización de cada solicitud antes de iniciar la siguiente. Esto incrementa el tiempo total de ejecución, ya que no se explotan técnicas de concurrencia o paralelización.

Limitaciones de la API:
Open Library es un servicio público cuya infraestructura está diseñada para un uso humano interactivo, no para grandes volúmenes de extracción automática de datos. Esto implica que las respuestas no son inmediatas y que el sistema puede aplicar mecanismos de limitación (throttling) si se excede un umbral de solicitudes.



✅ Ejecución realizada: 115 segundos para 100 libros
→ Equivale a 1.15 segundos/libro

📊 Para 11,127 libros:

$$11{,}127 \times 1.15\ \text{segundos} = 12{,}796.05\ \text{segundos}$$

⏳ Estimación final:
≈ 3.6 horas en total

In [None]:
# import pandas as pd
# import requests
# import time

# # Cargar DS
# df = pd.read_csv("/content/drive/MyDrive/Inteligencia Artificial I/datasets/goodreads_ds.csv")

# # Buscar descripción
# def fetch_description(isbn, title=None, author=None):
#     base_url = "https://openlibrary.org"

#     # ISBN
#     if pd.notna(isbn):
#         url = f"{base_url}/isbn/{isbn}.json"
#         try:
#             r = requests.get(url, timeout=5)
#             if r.status_code == 200:
#                 data = r.json()
#                 if "description" in data:
#                     if isinstance(data["description"], dict):
#                         return data["description"].get("value", None)
#                     return data["description"]
#         except:
#             pass

#     # Autor y titulo
#     if title:
#         query = f"{title} {author if author else ''}"
#         url = f"{base_url}/search.json?q={query}"
#         try:
#             r = requests.get(url, timeout=5)
#             if r.status_code == 200:
#                 data = r.json()
#                 if "docs" in data and len(data["docs"]) > 0:
#                     doc = data["docs"][0]
#                     if "key" in doc:
#                         work_url = f"{base_url}{doc['key']}.json"
#                         wr = requests.get(work_url, timeout=5)
#                         if wr.status_code == 200:
#                             work_data = wr.json()
#                             if "description" in work_data:
#                                 if isinstance(work_data["description"], dict):
#                                     return work_data["description"].get("value", None)
#                                 return work_data["description"]
#         except:
#             pass

#     return None

# # Tamaño de los trozos (para no tardar tanto en ejecución entre cada guardado de los datos)
# batch_size = 500
# save_path = "/content/drive/MyDrive/Inteligencia Artificial I/datasets/transformed_goodreads_ds.csv"

#
# for start in range(0, len(df), batch_size):
#     end = min(start + batch_size, len(df))
#     print(f"Processing books {start} to {end}...")

#     # Trozo actual
#     df.loc[start:end-1, "description"] = df.iloc[start:end].apply(
#         lambda row: fetch_description(row["isbn"], row["Title"], row["Author"]), axis=1
#     )

#     # Guardar
#     df.to_csv(save_path, index=False)

#     # Small pause to be polite (optional)
#     time.sleep(2)

# # Mensaje de finalización
# print("Completado!")
# print(f"Archivo guardado en: {save_path}")


In [18]:
# import pandas as pd
# import requests
# import time
# import os

# # Path
# original_path = "/content/drive/MyDrive/Inteligencia Artificial I/datasets/goodreads_ds.csv"
# save_path = "/content/drive/MyDrive/Inteligencia Artificial I/datasets/transformed_goodreads_ds.csv"

# # ----------------------------------------------------------------------------
# # Step 1: Load base dataset (all 11,127 books)
# # ----------------------------------------------------------------------------
# df = pd.read_csv(original_path)
# print(f"üìñ Loaded original dataset with {len(df)} rows.")

# # If progress file exists, merge descriptions into full dataset
# if os.path.exists(save_path):
#     df_progress = pd.read_csv(save_path)
#     print(f"üîÑ Loaded progress file with {len(df_progress)} rows.")

#     # Align by index (assuming same row order in both files)
#     if len(df) == len(df_progress):
#         df["description"] = df_progress.get("description")
#         df["description_status"] = df_progress.get("description_status")
#         print("‚úÖ Merged progress into full dataset.")
#     else:
#         print("‚ö†Ô∏è Warning: Progress file row count does not match original dataset.")
# else:
#     # Initialize empty columns if no progress file exists
#     df["description"] = None
#     df["description_status"] = "unchecked"
#     print("üÜï No progress file found, starting fresh.")

# # ----------------------------------------------------------------------------
# # Step 2: Fetch description function
# # ----------------------------------------------------------------------------
# def fetch_description(isbn, title=None, author=None):
#     base_url = "https://openlibrary.org"

#     if pd.not


üîÑ Loaded saved progress...
‚úÖ Resumed run is complete! All results saved at: /content/drive/MyDrive/Inteligencia Artificial I/datasets/transformed_goodreads_ds.csv


In [20]:
# import pandas as pd
# import requests
# import time
# import os

# # Paths
# original_path = "/content/drive/MyDrive/Inteligencia Artificial I/datasets/goodreads_ds.csv"
# save_path = "/content/drive/MyDrive/Inteligencia Artificial I/datasets/transformed_goodreads_ds.csv"

# # ----------------------------------------------------------------------------
# # Step 1: Load base dataset (all 11,127 books)
# # ----------------------------------------------------------------------------
# df = pd.read_csv(original_path)
# print(f"üìñ Loaded original dataset with {len(df)} rows.")

# # If progress file exists, merge descriptions into full dataset
# if os.path.exists(save_path):
#     df_progress = pd.read_csv(save_path)
#     print(f"üîÑ Loaded progress file with {len(df_progress)} rows.")

#     # Align by index (assuming same row order in both files)
#     if len(df) == len(df_progress):
#         df["description"] = df_progress.get("description")
#         df["description_status"] = df_progress.get("description_status")
#         print("‚úÖ Merged progress into full dataset.")
#     else:
#         print("‚ö†Ô∏è Warning: Progress file row count does not match original dataset.")
# else:
#     # Initialize empty columns if no progress file exists
#     df["description"] = None
#     df["description_status"] = "unchecked"
#     print("üÜï No progress file found, starting fresh.")

# # ----------------------------------------------------------------------------
# # Step 2: Fetch description function
# # ----------------------------------------------------------------------------
# def fetch_description(isbn, title=None, author=None):
#     base_url = "https://openlibrary.org"

#     if pd.notna(isbn):
#         url = f"{base_url}/isbn/{isbn}.json"
#         try:
#             r = requests.get(url, timeout=5)
#             if r.status_code == 200:
#                 data = r.json()
#                 if "description" in data:
#                     if isinstance(data["description"], dict):
#                         return data["description"].get("value", None)
#                     return data["description"]
#         except:
#             pass

#     if title:
#         query = f"{title} {author if author else ''}"
#         url = f"{base_url}/search.json?q={query}"
#         try:
#             r = requests.get(url, timeout=5)
#             if r.status_code == 200:
#                 data = r.json()
#                 if "docs" in data and len(data["docs"]) > 0:
#                     doc = data["docs"][0]
#                     if "key" in doc:
#                         work_url = f"{base_url}{doc['key']}.json"
#                         wr = requests.get(work_url, timeout=5)
#                         if wr.status_code == 200:
#                             work_data = wr.json()
#                             if "description" in work_data:
#                                 if isinstance(work_data["description"], dict):
#                                     return work_data["description"].get("value", None)
#                                 return work_data["description"]
#         except:
#             pass

#     return None

# # ----------------------------------------------------------------------------
# # Step 3: Resume only on unchecked rows
# # ----------------------------------------------------------------------------
# batch_size = 250
# remaining_indices = df[df["description_status"] == "unchecked"].index

# if len(remaining_indices) == 0:
#     print("üéâ All rows have already been processed!")
# else:
#     for start in range(0, len(remaining_indices), batch_size):
#         batch_idx = remaining_indices[start:start+batch_size]
#         print(f"üìö Processing {len(batch_idx)} books (rows {batch_idx.min()}‚Äì{batch_idx.max()})...")

#         for i in batch_idx:
#             row = df.loc[i]
#             desc = fetch_description(row["isbn"], row["Title"], row["Author"])
#             df.at[i, "description"] = desc
#             df.at[i, "description_status"] = "found" if desc else "not_found"

#         # Save progress
#         df.to_csv(save_path, index=False)
#         print(f"‚úÖ Saved progress up to row {batch_idx.max()}")
#         time.sleep(2)

#     print("üéâ Resume run complete! All results saved at:", save_path)


üìñ Loaded original dataset with 11127 rows.
üîÑ Loaded progress file with 11127 rows.
‚úÖ Merged progress into full dataset.
üéâ All rows have already been processed!


In [27]:
import pandas as pd

# Load the dataset that was enriched with Open Library descriptions.
df = pd.read_csv("/content/drive/MyDrive/Inteligencia Artificial I/datasets/transformed_goodreads_ds.csv")

# Total number of books in the dataset.
total_books = len(df)

# Number of books that have a description. Only the description column is
# checked, so the figure matches what the printed label says (a book found via
# the title/author fallback counts even if it has no ISBN).
books_with_desc = int(df["description"].notna().sum())

# Coverage percentage; guarded so an empty dataset reports 0% instead of dividing by zero.
coverage = (books_with_desc / total_books) * 100 if total_books else 0.0

print(f"📚 Total books: {total_books}")
print(f"📝 Books with description: {books_with_desc}")
print(f"✅ Coverage: {coverage:.2f}%")


üìö Total books with genre: 11127
üìù Books with description: 5997
‚úÖ Coverage: 53.90%


In [28]:
import pandas as pd

def save_books_with_descriptions(input_path, output_path):
    """
    Write the rows of a CSV that have a non-null ``description`` to a new CSV.

    Parameters
    ----------
    input_path : str
        CSV file to read; must contain a ``description`` column.
    output_path : str
        Destination CSV, written without the index column.

    Returns
    -------
    None
        The number of rows written is reported on stdout.
    """
    books = pd.read_csv(input_path)

    # Keep only the rows whose description is present.
    has_description = books["description"].notna()
    described = books.loc[has_description]

    described.to_csv(output_path, index=False)

    print(f"‚úÖ Guardados {len(described)} libros a: {output_path}")


# Export run: read the enriched dataset and write the subset of books that
# have a description to a separate CSV.
input_file = "/content/drive/MyDrive/Inteligencia Artificial I/datasets/transformed_goodreads_ds.csv"
output_file = "/content/drive/MyDrive/Inteligencia Artificial I/datasets/bookreads_w_descriptions_ds.csv"

save_books_with_descriptions(input_path=input_file, output_path=output_file)


‚úÖ Guardados 5997 libros a: /content/drive/MyDrive/Inteligencia Artificial I/datasets/bookreads_w_descriptions_ds.csv
