In [1]:
import requests
from pathlib import Path
from datetime import datetime

# Directory that holds the ingestion bookkeeping files; created if missing.
CTRL = Path("control")
CTRL.mkdir(exist_ok=True)

# CSV log with one row per registration, and a plain-text list of registered book ids.
LOG_CSV   = CTRL / "control_log.csv"
DOWN_TXT  = CTRL / "downloaded_books.txt"

# Delimiters used to split a downloaded text into header, body and footer.
START_MARKER = "*** START OF THE PROJECT GUTENBERG EBOOK"
END_MARKER   = "*** END OF THE PROJECT GUTENBERG EBOOK"

def already_downloaded(book_id: int) -> bool:
    """Return True when ``book_id`` is listed as a line in ``DOWN_TXT``.

    A missing ``DOWN_TXT`` file means nothing has been registered yet, so the
    result is False in that case.
    """
    if not DOWN_TXT.exists():
        return False
    registered_ids = DOWN_TXT.read_text(encoding="utf-8").splitlines()
    return str(book_id) in registered_ids

def download_book(book_id: int, base: str = "datalake") -> tuple[Path, Path] | None:
    """Download a Project Gutenberg book and store its header and body separately.

    The plain-text edition is fetched from gutenberg.org and split at
    ``START_MARKER`` / ``END_MARKER``: text before the start marker is the
    header, text between the markers is the body, and anything after the end
    marker is discarded. Both parts are stripped and written as UTF-8 to
    ``<base>/<YYYYMMDD>/<HH>/<book_id>.header.txt`` and
    ``<base>/<YYYYMMDD>/<HH>/<book_id>.body.txt``, where the date and hour are
    the local ingestion time.

    Args:
        book_id: Project Gutenberg numeric id of the book.
        base: Root directory of the datalake.

    Returns:
        ``(header_path, body_path)`` on success, or ``None`` when the book is
        already listed as downloaded or the markers are not found in order.

    Raises:
        requests.RequestException: On network failure, timeout or an HTTP
            error status.
    """
    if already_downloaded(book_id):
        print(f"[SKIP] {book_id} was already downloaded")
        return None

    # Partition by INGESTION date and hour, captured before the request starts.
    now = datetime.now()
    base_dir = Path(base) / now.strftime("%Y%m%d") / now.strftime("%H")

    url = f"https://www.gutenberg.org/cache/epub/{book_id}/pg{book_id}.txt"
    resp = requests.get(url, timeout=30)
    resp.raise_for_status()
    text = resp.text

    # partition() never raises, so a text whose END marker only appears before
    # the START marker is reported as an error instead of failing on unpacking.
    header, start_found, remainder = text.partition(START_MARKER)
    body, end_found, _footer = remainder.partition(END_MARKER)
    if not (start_found and end_found):
        print(f"[ERROR] Marcadores no encontrados en el libro {book_id}")
        return None
    # NOTE(review): START_MARKER looks like a line prefix, so whatever follows
    # it on the marker line stays at the start of `body` - confirm intended.

    # Create the partition directory only once there is content to write, so
    # failed downloads do not leave empty directories behind.
    base_dir.mkdir(parents=True, exist_ok=True)

    header_path = base_dir / f"{book_id}.header.txt"
    body_path = base_dir / f"{book_id}.body.txt"

    header_path.write_text(header.strip(), encoding="utf-8")
    body_path.write_text(body.strip(), encoding="utf-8")

    print(f"[OK] {book_id} -> {header_path} / {body_path}")
    return header_path, body_path



In [2]:
#Code for the Control Log
from pathlib import Path
from datetime import datetime
import csv

def register_in_control(book_id: int, header_path: Path, body_path: Path, state="OK"):
    """Record a download attempt in the control log and, on success, mark it as downloaded.

    A row ``book_id, date, header_file, body_file, state`` is appended to
    ``LOG_CSV`` (the column header is written first when the file does not
    exist yet). Only when ``state`` is ``"OK"`` is ``book_id`` also appended
    to ``DOWN_TXT``; recording failed attempts there would make
    ``already_downloaded`` skip them forever, so they could never be retried.

    Args:
        book_id: Id of the book being registered.
        header_path: Location of the stored header; converted with ``str()``,
            so placeholder strings such as ``"-"`` are accepted.
        body_path: Location of the stored body; same conversion as above.
        state: Outcome label written to the log, ``"OK"`` by default.
    """
    # Decide about the header before opening: append mode creates the file.
    needs_header = not LOG_CSV.exists()
    with open(LOG_CSV, "a", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        if needs_header:
            writer.writerow(["book_id", "date", "header_file", "body_file", "state"])
        writer.writerow([
            book_id,
            datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
            str(header_path),
            str(body_path),
            state
        ])

    if state != "OK":
        # Logged for traceability only; the id stays out of DOWN_TXT so a
        # later run attempts the download again.
        print(f"[CONTROL] Registrado {book_id} en control_log.csv con estado {state}")
        return

    with open(DOWN_TXT, "a", encoding="utf-8") as f:
        f.write(f"{book_id}\n")

    print(f"[CONTROL] Registrado {book_id} en control_log.csv y downloaded_books.txt")



In [None]:
def download_multiple(ids: list[int]):
    """Download and register every book in ``ids``, continuing past failures.

    Each id is handed to ``download_book``, which itself reports and skips
    books that are already listed as downloaded. A successful download is
    recorded through ``register_in_control``. An exception raised while
    downloading one book is reported and the loop moves on, so a single bad
    id or network error does not abort the rest of the batch.

    Args:
        ids: Project Gutenberg book ids to process, in order.
    """
    for bid in ids:
        try:
            # Returns None both for already-downloaded books and for texts
            # without usable markers; in either case there is nothing to register.
            res = download_book(bid)
        except Exception as exc:
            print(f"[ERROR] Failed downloading {bid}: {exc}")
            continue
        if res:
            header_path, body_path = res
            register_in_control(bid, header_path, body_path)


# Download and register three sample books by id; ids already listed as downloaded are skipped.
download_multiple([1342, 11, 84])


[SKIP] 1342 was already downloaded
[SKIP] 11 was already downloaded
[SKIP] 84 was already downloaded


In [3]:
import re, json
from collections import defaultdict

# Root directory scanned for "*.body.txt" files when building the index.
DATALAKE_PATH = Path("datalake")

# Output directory for the saved index; created if it does not exist yet.
DATAMARTS_PATH = Path("datamarts")
DATAMARTS_PATH.mkdir(exist_ok=True)

def tokenize(text: str) -> list[str]:
    """Split ``text`` into lowercase word tokens.

    The text is lowercased and every run of regex word characters (letters,
    digits and underscore) becomes one token, in order of appearance.
    Everything else acts as a separator and is dropped.
    """
    return re.findall(r'\b\w+\b', text.lower())

def build_inverted_index() -> dict[str, set[int]]:
    """Build a word -> book-id index from every body file under ``DATALAKE_PATH``.

    All ``*.body.txt`` files are visited recursively. The book id is the part
    of the file name before the first dot, parsed as an integer, and every
    token produced by ``tokenize`` is mapped to the set of ids it occurs in.

    Returns:
        A ``defaultdict(set)`` keyed by token.
    """
    postings = defaultdict(set)
    for path in DATALAKE_PATH.rglob("*.body.txt"):
        # "<book_id>.body.txt" -> stem "<book_id>.body" -> "<book_id>"
        doc_id = int(path.stem.split(".")[0])
        for token in tokenize(path.read_text(encoding="utf-8")):
            postings[token].add(doc_id)
    return postings


def save_inverted_index(index: dict[str, set[int]]):
    """Write the inverted index as JSON to ``DATAMARTS_PATH / "inverted_index.json"``.

    Args:
        index: Mapping from word to the set of book ids containing it.
    """
    json_path = DATAMARTS_PATH / "inverted_index.json"
    # Recreate the output directory if it was removed since it was first created.
    json_path.parent.mkdir(parents=True, exist_ok=True)
    # Sets are not JSON-serializable; sorting (rather than list()) makes each
    # posting list deterministic, so repeated runs produce the same file.
    serializable_index = {word: sorted(ids) for word, ids in index.items()}
    with open(json_path, "w", encoding="utf-8") as f:
        json.dump(serializable_index, f, ensure_ascii=False, indent=2)
    print(f"[OK] Inverted index saved at {json_path}")


In [4]:
# End-to-end run: download a couple of books, rebuild the inverted index over
# the whole datalake, save it, and look up a sample word.
if __name__ == "__main__":

    book_ids = [43, 56]

    for book_id in book_ids:
        # Skip ids that are already listed in the control files.
        if already_downloaded(book_id):   
            print(f"[SKIP] {book_id} was already downloaded")
            continue

        try:
            res = download_book(book_id)
            if res:
                header_file, body_file = res
                register_in_control(book_id, header_file, body_file)
            else:
                # download_book returned None: log the attempt as FAILED with
                # "-" placeholders instead of file paths.
                register_in_control(book_id, "-", "-", state="FAILED")
        except Exception as e:
            # Any exception raised during the download is reported and logged
            # as FAILED so the loop continues with the next id.
            print(f"[ERROR] Failed downloading {book_id}: {e}")
            register_in_control(book_id, "-", "-", state="FAILED")

    # The index is rebuilt from every body file found in the datalake, not
    # only from the books downloaded above.
    inverted_index = build_inverted_index()
    print("[OK] Inverted index built")

    save_inverted_index(inverted_index)

    # Sample query: tokens are stored lowercase, so lowercase the search term too.
    word = "love"
    results = inverted_index.get(word.lower(), set())
    print(f"Results for '{word}': {results}")

[OK] 43 -> datalake\20251002\16\43.header.txt / datalake\20251002\16\43.body.txt
[CONTROL] Registrado 43 en control_log.csv y downloaded_books.txt
[OK] 56 -> datalake\20251002\16\56.header.txt / datalake\20251002\16\56.body.txt
[CONTROL] Registrado 56 en control_log.csv y downloaded_books.txt
[OK] Inverted index built
[OK] Inverted index saved at datamarts\inverted_index.json
Results for 'love': {11, 43, 15, 84, 1342}
