In [None]:
%pip install transformers mistralai lakefs docx2pdf slack-sdk -q

In [None]:
import os
import io
import json
import base64
import requests

from mistralai import Mistral
from lakefs.client import Client
from lakefs import Repository
from transformers import AutoTokenizer

In [None]:
# — lakeFS
# Connection settings plus the repository and commit to process.
# All five values are required: the client, repository and ref built from them
# are used unconditionally, so a missing variable raises KeyError right here
# (naming the variable) instead of turning into a None id further down.
LAKEFS_HOST     = os.environ["LAKEFS_HOST"]
LAKEFS_USERNAME = os.environ["LAKEFS_USERNAME"]
LAKEFS_PASSWORD = os.environ["LAKEFS_PASSWORD"]
LAKEFS_REPO     = os.environ["lakefs_repository"]
LAKEFS_COMMIT   = os.environ["lakefs_commit"]

In [None]:
# — Mistral OCR
# API key and endpoint URL for the OCR service. Both are mandatory: a missing
# variable raises KeyError when this cell runs.
OCR_API_KEY = os.environ["MISTRAL_AI_API_KEY"]
OCR_URL = os.environ["MISTRAL_AI_URL"]

In [None]:
# — TWYD API
# Host, key and topic are needed by every upload/association request, so they
# are read strictly: a missing variable raises KeyError here rather than
# producing requests to "https://None/..." after OCR has already been run.
# TWYD_URL is the bare host; the "https://" scheme is added where it is used.
TWYD_URL = os.environ["TWYD_API_URL"]
TWYD_KEY = os.environ["TWYD_API_KEY"]
TWYD_TOPIC_ID = os.environ["topic_id"]
# Optional: model whose tokenizer is used to count tokens.
TEXT_EMBEDDER = os.environ.get("embedding_model", "nomic-ai/nomic-embed-text-v1")
# Upper bound for the computed chunk size.
# NOTE(review): presumably the embedder's context window — confirm.
MAX_TOKENS = 8192

In [None]:
# Initialize clients
# Mistral SDK client, used for file uploads and signed URLs.
ocr_client = Mistral(OCR_API_KEY)

# lakeFS: authenticate, open the repository, then pin the ref to the commit
# whose objects will be processed.
lakefs_client = Client(host=LAKEFS_HOST, username=LAKEFS_USERNAME, password=LAKEFS_PASSWORD)
repo = Repository(repository_id=LAKEFS_REPO, client=lakefs_client)
ref = repo.ref(LAKEFS_COMMIT)

In [None]:
# TWYD headers & tokenizer
# Bearer-token header sent with every TWYD request.
twyd_headers = dict(Authorization=f"Bearer {TWYD_KEY}")

# Tokenizer of the embedding model, used only to count tokens.
# NOTE(review): trust_remote_code=True allows code shipped in the model
# repository named by TEXT_EMBEDDER to run at load time — confirm it is needed
# and that TEXT_EMBEDDER only ever points at a trusted repository.
tokenizer = AutoTokenizer.from_pretrained(TEXT_EMBEDDER, trust_remote_code=True)

In [None]:
# 🔧 Helper Functions

def get_signed_url(data: bytes, name: str) -> str:
    """Send a document's bytes to Mistral and hand back a signed URL for it.

    Args:
        data: Raw file content to upload.
        name: File name the upload is registered under.

    Returns:
        The ``url`` attribute of the signed-URL response for the uploaded file.
    """
    file_spec = {"file_name": name, "content": data}
    uploaded = ocr_client.files.upload(file=file_spec, purpose="ocr")
    return ocr_client.files.get_signed_url(file_id=uploaded.id).url

def run_ocr(name: str, url: str, timeout=(10, 600)) -> dict:
    """POST a document URL to the Mistral OCR endpoint and return the parsed JSON.

    Rate limiting is reported, not retried: an HTTP 429 is converted into a
    RuntimeError with an explicit message; any other error status is raised
    by ``raise_for_status``.

    Args:
        name: Document name sent along with the request.
        url: URL the OCR service should fetch the document from.
        timeout: ``requests`` timeout — a number of seconds or a
            ``(connect, read)`` tuple. Without it a stalled endpoint would
            block the whole run indefinitely.

    Returns:
        The decoded JSON body of the OCR response.

    Raises:
        RuntimeError: The service answered with HTTP 429.
        requests.HTTPError: The service answered with another error status.
        requests.Timeout: No response arrived within ``timeout``.
    """
    headers = {
        "Authorization": f"Bearer {OCR_API_KEY}",
        "Content-Type": "application/json"
    }
    payload = {
        "model": "mistral-ocr-2503",
        # NOTE(review): "id" carries a model-alias-looking value next to
        # "model"; left as-is — confirm what the endpoint expects here.
        "id":    "mistral-ocr-latest",
        "document": {
            "type":         "document_url",
            "document_url": url,
            "document_name": name,
        },
        "include_image_base64": True
    }
    resp = requests.post(OCR_URL, json=payload, headers=headers, timeout=timeout)
    # Checked before raise_for_status so rate limiting gets its own message.
    if resp.status_code == 429:
        raise RuntimeError("Mistral AI API key rate limit exceeded (HTTP 429)")
    resp.raise_for_status()
    return resp.json()

def extract_markdown(ocr_json: dict) -> str:
    """Merge the markdown of every OCR page into a single document.

    Pages are taken from ``ocr_json["pages"]`` (no pages when the key is
    absent). A page without a ``"markdown"`` key contributes an empty string.
    Page texts are separated by one blank line.
    """
    page_texts = []
    for page in ocr_json.get("pages", []):
        page_texts.append(page.get("markdown", ""))
    return "\n\n".join(page_texts)

def calculate_optimal_chunk_size(total_tokens: int, max_tokens: "int | None" = None) -> int:
    """Choose a chunk size that splits the text into an odd number of equal parts.

    Tries 1, 3, 5, ..., 33 parts and returns the per-part size for the first
    (smallest) count whose size is strictly below the limit. If even 33 parts
    are too large, the limit itself is returned.

    Args:
        total_tokens: Token count of the whole document.
        max_tokens: Exclusive upper bound for the chunk size; defaults to the
            module-level ``MAX_TOKENS`` when omitted.

    Returns:
        The chunk size in tokens, never less than 1 so that an empty or tiny
        document does not produce a zero chunk size.
    """
    limit = MAX_TOKENS if max_tokens is None else max_tokens
    for chunks in range(1, 34, 2):
        # Floor division gives the same result as truncating the float quotient.
        chunk_size = total_tokens // chunks
        if chunk_size < limit:
            return max(1, chunk_size)
    return limit

def load_to_twyd(name: str, markdown: str, timeout=(10, 600)):
    """Upload a markdown document to TWYD and attach it to the configured topic.

    Steps:
      1. POST the markdown as ``<name without extension>.md`` to the upload
         endpoint and read the new file's ``id`` from the response.
      2. Tokenize the text to derive a chunk size and a 5% chunk overlap.
      3. PUT the splitter settings to associate the file with ``TWYD_TOPIC_ID``.

    Args:
        name: Original file name; its extension is replaced by ``.md``.
        markdown: Document text to upload.
        timeout: ``requests`` timeout — a number of seconds or a
            ``(connect, read)`` tuple — applied to both HTTP calls so a
            stalled server cannot block the run indefinitely.

    Raises:
        requests.HTTPError: Either call returned an error status.
        requests.Timeout: No response arrived within ``timeout``.
        RuntimeError: The upload response contained no ``id``.
    """
    # 1) Upload markdown as a file
    files_payload = {
        'file': (
            f"{os.path.splitext(name)[0]}.md",
            io.BytesIO(markdown.encode("utf-8")),
            'text/markdown'
        )
    }
    upload_resp = requests.post(
        f"https://{TWYD_URL}/api/files/upload",
        files=files_payload,
        headers=twyd_headers,
        timeout=timeout
    )
    upload_resp.raise_for_status()
    file_id = upload_resp.json().get("id")
    if not file_id:
        # Message (Spanish): "No 'id' was returned when uploading the file to TWYD"
        raise RuntimeError("No se devolvió 'id' al subir el archivo a TWYD")

    # 2) Compute tokens & chunk size
    total_tokens = len(tokenizer.tokenize(markdown))
    chunk_size   = calculate_optimal_chunk_size(total_tokens)
    json_body = {
        # Markdown headings first, then paragraph/line breaks and list markers.
        "separators": [
            "# ", "## ", "### ", "#### ", "##### ",
            "\n\n", "\n", "> ", "- ", "* ", "---"
        ],
        "isSeparatorRegex": False,
        "chunkSize": chunk_size,
        "chunkOverlap": int(chunk_size * 0.05),
        "keepSeparator": True,
        "addStartIndex": False,
        "stripWhitespace": True
    }

    # 3) Associate file to topic
    assoc_resp = requests.put(
        f"https://{TWYD_URL}/api/topics/{TWYD_TOPIC_ID}/add/{file_id}",
        headers={**twyd_headers, "Content-Type": "application/json"},
        json=json_body,
        timeout=timeout
    )
    assoc_resp.raise_for_status()

In [None]:
# ▶️ Single‐loop ETL → OCR → TWYD + status.json
# Run summary: which files went through and which failed (with the error text).
status = dict(
    repository=LAKEFS_REPO,
    commit=LAKEFS_COMMIT,
    processed=[],
    errors=[],
)

for obj in ref.objects(prefix=""):
    key = obj.path
    # Only PDF and DOCX objects are handled; everything else is skipped.
    if key.lower().endswith((".pdf", ".docx")):
        name = os.path.basename(key)
        try:
            # 1) Extract: read the object's raw bytes from the lakeFS ref
            with ref.object(path=key).reader(mode="rb") as r:
                data = r.read()

            # 2) Transform: upload for OCR, run it, collect the markdown
            signed = get_signed_url(data, name)
            ocr_json = run_ocr(name, signed)
            md_text = extract_markdown(ocr_json)

            # 3) Load: push the markdown into TWYD
            load_to_twyd(name, md_text)
        except Exception as e:
            # One failing file is recorded and the loop carries on with the rest.
            status["errors"].append({"file": name, "error": str(e)})
        else:
            status["processed"].append(name)

# 4) Persist status.json for the Slack reporter
with open("status.json", "w", encoding="utf-8") as f:
    json.dump(status, f, indent=2)

print(
    f"✅ ETL+TWYD done: {len(status['processed'])} succeeded, {len(status['errors'])} failed."
)