
# 📦 00_insumo · Generador de **events.ndjson** (datos sintéticos)

Este notebook genera un **archivo NDJSON** con eventos web sintéticos para alimentar el pipeline **BRONCE → PLATA → ORO**.

Incluye:
- Parámetros reproducibles (`date`, `n`, `seed`).
- Simulación de sesiones con navegación y compras.
- **Inyección opcional de errores** (líneas mal formadas, timestamps fuera de día, valores inválidos).
- Escritura con límite de tamaño (`MAX_SIZE_BYTES`).
- Salida final en `data/drops/<YYYY-MM-DD>/events.ndjson`.


## 📦 Importaciones

In [None]:

import json
import random
from datetime import datetime, timedelta, timezone
from typing import List, Dict, Any, Iterable, Tuple, Optional
from pathlib import Path

# Working base directory for the notebook (persistent)
BASE_DIR = Path('/mnt/data')
# Create the base directory up front; no error if it already exists.
BASE_DIR.mkdir(parents=True, exist_ok=True)
print("BASE_DIR:", BASE_DIR)



## ⚙️ Configuración (autosuficiente)

Reemplaza a `configs.get_data_config`. Puedes ajustar los parámetros según tus necesidades.


In [None]:

# Paths and limits
MAX_SIZE_BYTES = 256 * 1024  # 256 KB for the demo; raise it if you want larger files

# Universe of visitable paths
LOOK_SITE = ["/", "/productos", "/carrito", "/checkout", "/blog", "/contacto", "/about"]
# Purchase funnel: maps each page to the next page on the way to checkout.
PIPELINE_MAKE_PURCHASE = {
    "/": "/productos",
    "/productos": "/carrito",
    "/carrito": "/checkout",
    "/checkout": "/",  # cycle reset
}

# Purchase-frequency control
RATE_MAKE_PURCHASE = 25  # 25% chance of starting the purchase path from '/'
VALID_DEVICES = ["mobile", "desktop", "tablet"]
VALID_REFERRERS = ["direct", "google", "facebook"]
VALID_USERS = [f"u{i}" for i in range(1, 21)]  # 20 possible users



## 🧰 Utilidad de escritura (`ensure_dir`)

Equivalente a `utils.files.ensure_dir`: asegura el directorio y devuelve la ruta final del archivo.


In [None]:

def ensure_dir(path_dirs: str, file_name: str) -> str:
    """Make sure a directory under ``BASE_DIR`` exists and return a file path in it.

    Args:
        path_dirs: Directory path joined onto ``BASE_DIR``; missing parents
            are created as well.
        file_name: Name of the file to place inside that directory.

    Returns:
        The path of ``file_name`` inside the ensured directory, as a string.
        The file itself is not created.
    """
    target_dir = BASE_DIR.joinpath(path_dirs)
    target_dir.mkdir(parents=True, exist_ok=True)
    return str(target_dir.joinpath(file_name))


## 🧠 Lógica de generación de eventos

In [None]:

def parse_args(date_str: Optional[str] = None, events_n: int = 150, seed: int = 17) -> Dict[str, Any]:
    """Bundle the generation parameters into a dict.

    Args:
        date_str: Target day as ``YYYY-MM-DD``. When omitted (``None``), the
            current local date is used.
        events_n: Number of generation iterations to run.
        seed: Seed for the random number generator.

    Returns:
        A dict with the keys ``"date"``, ``"n"`` and ``"seed"``.
    """
    if date_str is None:
        # Resolve "today" at call time. A `datetime.now()` expression placed in
        # the signature is evaluated only once, when the function is defined,
        # so a long-running kernel would keep handing out a stale date.
        date_str = datetime.now().date().isoformat()
    return {"date": date_str, "n": events_n, "seed": seed}

def iso(dt: datetime) -> str:
    """Render *dt* as a UTC timestamp of the form ``YYYY-MM-DDTHH:MM:SSZ``.

    The datetime is converted to UTC before formatting, so aware datetimes in
    any zone produce the equivalent UTC wall-clock time.
    """
    as_utc = dt.astimezone(timezone.utc)
    return f"{as_utc:%Y-%m-%dT%H:%M:%SZ}"

def make_purchase(current: datetime, session: Dict[str, Any], rng: random.Random):
    """Move a session one step forward along the purchase funnel.

    Args:
        current: Clock value before this step.
        session: Session dict; its ``"path"``, ``"user_id"`` and ``"device"``
            entries are read.
        rng: Random generator used to pick the 5-30 second delay.

    Returns:
        A three-item list ``[new_time, event, finish_session]``. ``event``
        points at the page that follows the session's current path in
        ``PIPELINE_MAKE_PURCHASE`` and uses the current path as referrer.
        ``finish_session`` is True only when the session was on ``/carrito``.
    """
    next_time = current + timedelta(seconds=rng.randint(5, 30))
    origin = session["path"]
    step_event = {
        "ts": iso(next_time),
        "user_id": session["user_id"],
        "path": PIPELINE_MAKE_PURCHASE[origin],
        "referrer": origin,
        "device": session["device"],
    }
    return [next_time, step_event, origin == '/carrito']

def update_session(sessions, user_id, new_session_data):
    """Merge new data into the first session that belongs to *user_id*.

    Args:
        sessions: List of session dicts, each carrying a ``"user_id"`` key.
        user_id: Identifier of the session to update.
        new_session_data: Mapping whose entries overwrite the session's own.

    Returns:
        True when a matching session was found and updated in place,
        False when no session has that ``user_id``.
    """
    for candidate in sessions:
        if candidate["user_id"] == user_id:
            # Only the first match is touched, mirroring a "find then update".
            candidate.update(new_session_data.copy())
            return True
    return False

def choose_action(user: str, current: datetime, device: str, rng: random.Random):
    """Decide what a user sitting on the home page does next.

    A roll from 1 to 100 selects one of three outcomes:
    at or below ``RATE_MAKE_PURCHASE`` the user enters the purchase funnel
    and the session stays open; above 95 the user leaves without producing
    an event; otherwise the user visits a random page from ``LOOK_SITE`` and
    the session ends.

    Args:
        user: User identifier written into the event.
        current: Clock value before this action.
        device: Device label written into the event.
        rng: Random generator driving the delay, the roll and the page choice.

    Returns:
        A three-item list ``[new_time, event_or_None, finished_session]``.
    """
    next_time = current + timedelta(seconds=rng.randint(5, 30))
    roll = rng.randint(1, 100)

    if roll <= RATE_MAKE_PURCHASE:
        destination, session_over = PIPELINE_MAKE_PURCHASE["/"], False
    elif roll > 95:
        # The user simply walks away: no event, session closed.
        return [next_time, None, True]
    else:
        destination, session_over = rng.choice(LOOK_SITE), True

    action_event = {
        "ts": iso(next_time),
        "user_id": user,
        "path": destination,
        "referrer": "/",
        "device": device,
    }
    return [next_time, action_event, session_over]

def generate_session(current: datetime, rng: random.Random):
    """Create the landing event of a new session on the home page.

    The user is drawn uniformly from ``VALID_USERS``; device and referrer are
    drawn with fixed weights from ``VALID_DEVICES`` and ``VALID_REFERRERS``.

    Args:
        current: Clock value before the session starts.
        rng: Random generator driving the delay and all three draws.

    Returns:
        A two-item list ``[new_time, event]`` where ``event`` has path ``"/"``.
    """
    started_at = current + timedelta(seconds=rng.randint(5, 30))
    user = rng.choice(VALID_USERS)
    (device,) = rng.choices(VALID_DEVICES, weights=[55, 38, 7], k=1)
    (ref,) = rng.choices(VALID_REFERRERS, weights=[40, 35, 8], k=1)
    landing_event = {
        "ts": iso(started_at),
        "user_id": user,
        "path": "/",
        "referrer": ref,
        "device": device,
    }
    return [started_at, landing_event]


### 🧪 Inyección de errores controlados

In [None]:

def inject_error(event: Dict[str, Any], rng: random.Random) -> Any:
    """Corrupt *event* with one randomly chosen error kind.

    Args:
        event: Event dict to corrupt. It is modified in place, so pass a copy
            when the original must stay intact.
        rng: Random generator used to pick the error kind and its values.

    Returns:
        The same (now corrupted) dict, or a plain string when the chosen
        error kind is a line that is not JSON at all.
    """
    error_type = rng.choice([
        "missing_field",
        "bad_timestamp_format",
        "bad_values",
        "not_json",
        "timestamp_out_of_day",
        "empty_user_id",
    ])

    if error_type == "missing_field":
        field = rng.choice(["referrer", "device", "path"])
        event.pop(field, None)

    elif error_type == "bad_timestamp_format":
        event["ts"] = "03-01-2025 10:15:00"

    elif error_type == "bad_values":
        event["device"] = rng.choice(["toaster", "phon3", "desk-top", ""])  # not valid devices
        # Referrers that are expected to be normalized to None or discarded downstream.
        event["referrer"] = rng.choice([None, "(not set)", "   ", "file://local", "http://malformed"])
        event["path"] = rng.choice(["productos", "checkout", "//double-slash", ""])  # missing leading "/", etc.

    elif error_type == "not_json":
        # Return a non-JSON line; the caller writes strings to the NDJSON as-is.
        return "NOT_JSON_LINE this is a broken log line"

    elif error_type == "timestamp_out_of_day":
        event["ts"] = "2024-01-04T00:00:00Z"

    elif error_type == "empty_user_id":
        event["user_id"] = ""

    return event


## 🔁 Generación de eventos válidos (con sesiones)

In [None]:

def generate_valid_events(date_str: str, n: int, rng: random.Random) -> List[Any]:
    """Simulate user sessions for one day and return the resulting events.

    The loop runs *n* times. Each iteration draws a candidate landing event;
    if its user has no open session, a new session starts with that event.
    Otherwise the open session advances: from the home page via
    ``choose_action``, from any other page via ``make_purchase``. An iteration
    can add zero, one or two items, so the output length is not necessarily
    *n*.

    Args:
        date_str: Day to simulate, as ``YYYY-MM-DD``.
        n: Number of simulation iterations.
        rng: Random generator; the same seed reproduces the same output.

    Returns:
        A list of event dicts in generation order. About 10% of iterations
        also add a corrupted item, which may be a dict or a non-JSON string.
    """
    y, m, d = map(int, date_str.split("-"))
    start = datetime(y, m, d, 0, 0, 0, tzinfo=timezone.utc)

    # The first event lands somewhere within the first three hours of the day.
    current = start + timedelta(minutes=rng.randint(0, 180))
    events: List[Any] = []
    # Open sessions, one dict per user, holding that user's latest event.
    sessions: List[Dict[str, Any]] = []

    for _ in range(n):
        current, event = generate_session(current, rng)
        # Match on the exact user id. A substring test would confuse users
        # whose ids share a prefix (e.g. "u1" with "u10").
        exist_session = next(
            (s for s in sessions if s["user_id"] == event["user_id"]), None
        )

        # 10% chance of injecting a corrupted copy of the candidate event.
        if rng.random() < 0.10:
            corrupted = inject_error(event.copy(), rng)
            events.append(corrupted)

        if exist_session is None:
            events.append(event)
            sessions.append(event.copy())
        elif exist_session["path"] == "/":
            current, event2, finished_session = choose_action(
                exist_session["user_id"], current, exist_session["device"], rng
            )
            if event2 is not None:
                events.append(event2)
            if finished_session:
                sessions.remove(exist_session)
            else:
                update_session(sessions, event2["user_id"], event2)
        else:
            current, event2, finished_session = make_purchase(current, exist_session, rng)
            # 60% of the time the funnel step happens; otherwise the user
            # abandons and the session is dropped without an event.
            if rng.random() < 0.60:
                events.append(event2)
                if finished_session:
                    sessions.remove(exist_session)
                else:
                    update_session(sessions, event2["user_id"], event2)
            else:
                sessions.remove(exist_session)

    return events


## 💾 Escritura NDJSON con límite de tamaño

In [None]:

def write_ndjson_limited(path: str, lines: List[str], max_bytes: int):
    """Write lines to *path*, one per row, without exceeding a byte budget.

    Writing stops before the first line that would push the file past
    *max_bytes*; later lines are not written even if they would fit.

    Args:
        path: Destination file; it is created or truncated.
        lines: Text lines without trailing newline.
        max_bytes: Maximum file size in bytes, newlines included.

    Returns:
        The number of bytes written (UTF-8 encoded, newlines included).
    """
    written = 0
    # newline="\n" turns off platform newline translation, so the byte count
    # computed here matches what actually lands on disk on every platform.
    with open(path, "w", encoding="utf-8", newline="\n") as f:
        for line in lines:
            record = line + "\n"
            size = len(record.encode("utf-8"))
            if written + size > max_bytes:
                break
            f.write(record)
            written += size
    return written


## 🚀 Ejecución (`main`) y vista previa

In [None]:

def main():
    """Generate the demo events file and return its path.

    Runs 500 generation iterations with seed 42, serializes the result as
    NDJSON under ``data/drops/<date>/events.ndjson``, caps the file at
    ``MAX_SIZE_BYTES`` and prints the output path and size.

    Returns:
        The path of the generated file, as a string.
    """
    args = parse_args(events_n=500, seed=42)
    rng = random.Random(args["seed"])

    out_path = ensure_dir(f"data/drops/{args['date']}", "events.ndjson")
    generated = generate_valid_events(args["date"], args["n"], rng)

    # Strings are kept verbatim (they simulate broken, non-JSON lines);
    # everything else is serialized as one JSON document per line.
    lines = [
        item if isinstance(item, str) else json.dumps(item, ensure_ascii=False)
        for item in generated
    ]

    written_bytes = write_ndjson_limited(out_path, lines, MAX_SIZE_BYTES)
    size_kb = written_bytes / 1024
    print(f"‚úî Archivo generado: {out_path} ({size_kb:.2f} KB)")
    return out_path

# Run the demo
out_path = main()

# Preview the first 10 lines of the generated file
from itertools import islice
print("\nPrimeras 10 l√≠neas:\n")
with open(out_path, 'r', encoding='utf-8') as fh:
    for preview_line in islice(fh, 10):
        print(preview_line.rstrip())
