# 🦖 THE LAW EATER V7: Massive Legal Ingestion (Requests Edition)

**Mission:** Ingest thousands of Indonesian Law PDFs into Qdrant with **High Quality Context Injection**.
**Updates in V7:**
- **Pure Requests:** Uses the standard `requests` library instead of the OpenAI client to bypass library-specific connection issues.
- **SSL Verification Disabled:** OpenAI requests are sent with `verify=False`, so SSL errors are ignored if Colab's proxy is interfering.
- **Connectivity Test:** Verifies connection before starting.

## 🚀 Setup
We use Colab for its high bandwidth and processing power.

In [None]:
# Install dependencies
!pip install -q qdrant-client langchain langchain-community langchain-text-splitters pypdf tqdm tiktoken requests tenacity

In [None]:
# Authentication & Configuration
from getpass import getpass
import os

from google.colab import userdata


def _load_setting(name, prompt, hidden=True):
    """Return the Colab secret *name*, prompting for it when unavailable.

    Each setting is resolved on its own, so one missing secret never forces
    the user to re-enter the others.

    Args:
        name: Key passed to ``userdata.get``.
        prompt: Text shown when the value has to be typed in.
        hidden: When True the value is read with ``getpass`` so that it is
            not echoed into the notebook output.

    Returns:
        The value with surrounding whitespace removed.
    """
    try:
        value = userdata.get(name)
    except Exception:  # any failure to read the secret falls back to a prompt
        value = None
    if not value:
        ask = getpass if hidden else input
        value = ask(prompt)
    return value.strip()


QDRANT_URL = _load_setting("QDRANT_URL", "Enter Qdrant URL: ", hidden=False)
QDRANT_API_KEY = _load_setting("QDRANT_API_KEY", "Enter Qdrant API Key: ")

# OpenAI Key
OPENAI_API_KEY = _load_setting("OPENAI_API_KEY", "Enter OpenAI API Key: ")

os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# TARGET COLLECTION (Main Legal DB)
COLLECTION_NAME = "legal_unified"
VECTOR_SIZE = 1536  # OpenAI Standard

In [None]:
# 🧪 OPENAI CONNECTIVITY TEST (Pure Requests)
import requests


def test_openai_connection():
    """Probe the OpenAI embeddings endpoint with a single tiny request.

    Prints the outcome. Returns True only for an HTTP 200 reply; any other
    status, or any exception raised while sending or decoding, yields False
    instead of propagating.
    """
    endpoint = "https://api.openai.com/v1/embeddings"
    auth_headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }
    payload = {"input": "Test connection", "model": "text-embedding-3-small"}

    print("üß™ Testing OpenAI Connection via requests...")
    try:
        # verify=False skips TLS certificate checks to bypass SSL issues.
        reply = requests.post(
            endpoint, headers=auth_headers, json=payload, timeout=10, verify=False
        )
        if reply.status_code != 200:
            print(f"‚ùå OpenAI Connection FAILED! Status: {reply.status_code}")
            print(reply.text)
            return False

        embedding = reply.json()["data"][0]["embedding"]
        print("‚úÖ OpenAI Connection SUCCESS! Vector length:", len(embedding))
        return True
    except Exception as e:
        print(f"‚ùå OpenAI Connection ERROR: {e}")
        return False


# Run the probe once and report; a failure only warns, it does not stop the notebook.
if test_openai_connection():
    print("\nüöÄ Connection looks good! Proceeding...")
else:
    print("\n‚ö†Ô∏è WARNING: Connection test failed. The script might not work.")

In [None]:
# ⏰ WAKE UP & CONNECTION CHECK
import time

# Poll the Qdrant URL until it answers with HTTP 200, waiting between attempts.
print(f"üîå Connecting to {QDRANT_URL}...")

max_retries = 10
for i in range(max_retries):
    try:
        response = requests.get(QDRANT_URL, timeout=10)
        if response.status_code == 200:
            print("‚úÖ Server is AWAKE and responding!")
            print(f"   Version: {response.json().get('version', 'Unknown')}")
            break
        # A non-200 reply is a failed attempt too: report it and wait like
        # any other failure instead of retrying immediately and silently.
        failure = f"HTTP {response.status_code}"
    except Exception as e:
        # Connection errors, timeouts and undecodable bodies all count as
        # "not ready yet"; this cell must never raise.
        failure = e
    print(f"   üí§ Waiting for server to wake up... ({i + 1}/{max_retries}) - {failure}")
    time.sleep(5)
else:
    # Reached only when no attempt hit `break`.
    print("‚ùå Server did not wake up. Check URL or Fly.io status.")

In [None]:
# 🧠 CORE LOGIC INJECTION (The Butcher System)
import re

# --- CONSTANTS ---

# Boilerplate that carries no legal content. All line-anchored entries use
# re.MULTILINE so that ^ and $ apply to every line of the document.
NOISE_PATTERNS = [
    # Page footer of the form "Halaman <n> dari <m>".
    re.compile(r"^Halaman\s+\d+\s+dari\s+\d+", re.IGNORECASE | re.MULTILINE),
    # "Salinan sesuai dengan aslinya ..." up to the end of that line.
    re.compile(
        r"^Salinan sesuai dengan aslinya.*?(?=\n)",
        re.IGNORECASE | re.MULTILINE | re.DOTALL,
    ),
    # Stand-alone "PRESIDEN REPUBLIK INDONESIA" header line.
    re.compile(r"^PRESIDEN REPUBLIK INDONESIA\s*\n", re.IGNORECASE | re.MULTILINE),
    # Page markers written as "- 12 -".
    re.compile(r"^\s*-\s*\d+\s*-\s*$", re.MULTILINE),
    # Runs of three or more newlines.
    re.compile(r"\n{3,}", re.MULTILINE),
    # Lines holding nothing but a number.
    re.compile(r"^\s*\d+\s*$", re.MULTILINE),
]

# Recognised document types; group 1 is the type as written in the text.
LEGAL_TYPE_PATTERN = re.compile(
    r"(UNDANG-UNDANG|PERATURAN PEMERINTAH|KEPUTUSAN PRESIDEN|PERATURAN MENTERI|QANUN|PERATURAN DAERAH|PERATURAN KEPALA)",
    re.IGNORECASE,
)

# Upper-cased document type -> short label.
LEGAL_TYPE_ABBREV = {
    "UNDANG-UNDANG": "UU",
    "PERATURAN PEMERINTAH": "PP",
    "KEPUTUSAN PRESIDEN": "Keppres",
    "PERATURAN MENTERI": "Permen",
    "QANUN": "Qanun",
    "PERATURAN DAERAH": "Perda",
    "PERATURAN KEPALA": "Perkep",
}

# "NOMOR 11", "NOMOR 5A", "NOMOR 12/2020": group 1 is the part before any
# "/" or "-" suffix.
NUMBER_PATTERN = re.compile(r"NOMOR\s+(\d+[A-Z]?)(?:[/-]\d+)?", re.IGNORECASE)
# "TAHUN 2020": group 1 is the four-digit year.
YEAR_PATTERN = re.compile(r"TAHUN\s+(\d{4})", re.IGNORECASE)
# Text after "TENTANG" up to the first of "DENGAN RAHMAT", "Menimbang",
# "Mengingat" or the end of the input (no MULTILINE here, so $ is the end).
TOPIC_PATTERN = re.compile(
    r"TENTANG\s+(.+?)(?=DENGAN RAHMAT|Menimbang|Mengingat|$)", re.IGNORECASE | re.DOTALL
)

# One article: group 1 is the number, group 2 the body, which runs until the
# next line starting with "Pasal <n>", "BAB" or "Penjelasan", or the end.
PASAL_PATTERN = re.compile(
    r"^Pasal\s+(\d+[A-Z]?)\s*(.+?)(?=^Pasal\s+\d+|^BAB\s+|^Penjelasan|\Z)",
    re.IGNORECASE | re.MULTILINE | re.DOTALL,
)
# Chapter heading: group 1 is the chapter number, group 2 the title line.
BAB_PATTERN = re.compile(
    r"^BAB\s+([IVX]+|[A-Z]+|\d+)\s+(.+?)(?=\n|$)", re.IGNORECASE | re.MULTILINE
)
# One "(n)" paragraph inside an article: group 1 is n, group 2 the body up to
# the next "(n)" marker or the end of the input. The terminator is \Z, not $:
# under re.MULTILINE a $ matches at every line end, which would cut each body
# off after its first line.
AYAT_PATTERN = re.compile(
    r"(?:^|\n)\s*\((\d+)\)\s*(.+?)(?=(?:^|\n)\s*\(\d+\)|\Z)", re.MULTILINE | re.DOTALL
)

# --- CLASSES ---


class LegalCleaner:
    """Remove PDF boilerplate and normalise whitespace, keeping line breaks."""

    @staticmethod
    def _noise_replacement(match):
        """Return the text that replaces one noise match.

        A match made only of newlines (a run of blank lines) becomes a single
        paragraph break, so the text on either side is not glued together.
        Every other noise match is deleted.
        """
        matched = match.group(0)
        if matched and not matched.strip("\n"):
            return "\n\n"
        return ""

    def clean(self, text: str) -> str:
        """Return *text* without noise and with tidy whitespace.

        Newlines are preserved: PASAL_PATTERN, BAB_PATTERN and AYAT_PATTERN
        anchor on line starts, so flattening the text to one line would stop
        them from matching anywhere but the very beginning.

        Args:
            text: Raw document text; a falsy value yields "".

        Returns:
            The cleaned text, stripped of leading and trailing whitespace.
        """
        if not text:
            return ""
        cleaned = text
        for pattern in NOISE_PATTERNS:
            cleaned = pattern.sub(self._noise_replacement, cleaned)
        # Collapse horizontal whitespace only; [^\S\n] is "whitespace except \n".
        cleaned = re.sub(r"[^\S\n]+", " ", cleaned)
        # Drop the single space left around line breaks so headings start at
        # column 0, where the ^-anchored patterns expect them.
        cleaned = re.sub(r" ?\n ?", "\n", cleaned)
        # Noise removal can leave new blank-line runs; cap them at one blank line.
        cleaned = re.sub(r"\n{3,}", "\n\n", cleaned)
        # Canonical spelling and spacing for article references ("pasal  5" -> "Pasal 5").
        cleaned = re.sub(
            r"Pasal\s+(\d+[A-Z]?)", r"Pasal \1", cleaned, flags=re.IGNORECASE
        )
        return cleaned.strip()


class LegalMetadataExtractor:
    """Read the document type, number, year and topic out of a law's text."""

    def extract(self, text: str) -> dict:
        """Return a metadata dict built from the first match of each pattern.

        ``type``, ``number``, ``year`` and ``topic`` are always present and
        default to ``"UNKNOWN"``. ``type_abbrev`` is added only when a legal
        type is recognised. The topic has its whitespace collapsed and is
        capped at 200 characters.
        """
        meta = dict.fromkeys(("type", "number", "year", "topic"), "UNKNOWN")

        found = LEGAL_TYPE_PATTERN.search(text)
        if found is not None:
            legal_type = found.group(1).upper()
            # Unmapped types fall back to the full upper-cased name.
            meta.update(
                type=legal_type,
                type_abbrev=LEGAL_TYPE_ABBREV.get(legal_type, legal_type),
            )

        for field, pattern in (("number", NUMBER_PATTERN), ("year", YEAR_PATTERN)):
            found = pattern.search(text)
            if found is not None:
                meta[field] = found.group(1)

        found = TOPIC_PATTERN.search(text)
        if found is not None:
            collapsed = re.sub(r"\s+", " ", found.group(1).strip())
            meta["topic"] = collapsed[:200]

        return meta


class LegalStructureParser:
    """Build a chapter (BAB) -> article (Pasal) outline of a document."""

    def parse(self, text: str) -> dict:
        """Return ``{"batang_tubuh": [...]}`` with one entry per BAB heading.

        Each entry holds the chapter ``number``, its ``title`` and a ``pasal``
        list of ``{"number": ...}`` dicts for the articles found between this
        heading and the next one (or the end of the text for the last one).
        """
        headings = list(BAB_PATTERN.finditer(text))
        # A chapter ends where the next heading starts; the last one runs to
        # the end of the text.
        section_ends = [heading.start() for heading in headings[1:]] + [len(text)]

        chapters = []
        for heading, section_end in zip(headings, section_ends):
            body = text[heading.end():section_end]
            # Simple pasal extraction for context finding
            articles = [
                {"number": found.group(1)} for found in PASAL_PATTERN.finditer(body)
            ]
            chapters.append(
                {
                    "number": heading.group(1),
                    "title": heading.group(2).strip(),
                    "pasal": articles,
                }
            )
        return {"batang_tubuh": chapters}


class LegalChunker:
    """Split a law into per-article chunks, each prefixed with a context line."""

    # Rough characters-per-token ratio used to turn the token budget into a
    # character budget; with the default budget this gives 3000 characters.
    CHARS_PER_TOKEN = 3

    def __init__(self, max_pasal_tokens=1000):
        """Store the approximate token budget above which an article is split."""
        self.max_pasal_tokens = max_pasal_tokens

    def chunk(self, text: str, metadata: dict, structure: dict) -> list:
        """Return the list of chunk dicts for one document.

        Args:
            text: Cleaned document text.
            metadata: Document metadata; copied into every chunk.
            structure: Outline used to look up the chapter of each article.

        Returns:
            One chunk per article. An article longer than the character
            budget is split into one chunk per "(n)" paragraph when such
            paragraphs are found. A document without any article becomes a
            single chunk (e.g. short decree).
        """
        pasal_matches = list(PASAL_PATTERN.finditer(text))
        if not pasal_matches:
            context = self._build_context(metadata)
            return [self._create_chunk(text, context, metadata)]

        max_chars = self.max_pasal_tokens * self.CHARS_PER_TOKEN
        chunks = []
        for match in pasal_matches:
            pasal_num = match.group(1)
            pasal_text = match.group(2).strip()
            bab_context = self._find_bab_for_pasal(structure, pasal_num)
            context = self._build_context(metadata, bab_context, f"Pasal {pasal_num}")

            pieces = []
            if len(pasal_text) > max_chars:
                pieces = self._split_by_ayat(pasal_text)
            if not pieces:
                # Short article, or a long one without "(n)" paragraphs.
                pieces = [pasal_text]

            chunks.extend(
                self._create_chunk(piece, context, metadata, pasal_num)
                for piece in pieces
            )
        return chunks

    def _split_by_ayat(self, pasal_text, pattern=None):
        """Split an article body at its "(n)" paragraph markers.

        Each paragraph body is the slice between one marker and the next, so
        no text is lost even if the pattern's own body group stops early.
        Text in front of the first marker is kept as a leading piece.

        Args:
            pasal_text: Body of one article.
            pattern: Compiled pattern whose group 1 is the paragraph number
                and group 2 starts the paragraph body; defaults to
                AYAT_PATTERN.

        Returns:
            A list of strings, empty when no marker is found.
        """
        if pattern is None:
            pattern = AYAT_PATTERN
        markers = list(pattern.finditer(pasal_text))
        if not markers:
            return []

        pieces = []
        preamble = pasal_text[: markers[0].start()].strip()
        if preamble:
            pieces.append(preamble)

        body_ends = [marker.start() for marker in markers[1:]] + [len(pasal_text)]
        for marker, body_end in zip(markers, body_ends):
            body = pasal_text[marker.start(2):body_end].strip()
            pieces.append(f"Ayat ({marker.group(1)})\n{body}")
        return pieces

    def _build_context(self, meta, bab=None, pasal=None):
        """Return the "[CONTEXT: ...]" header line for a chunk.

        Missing metadata falls back to "UNK" / "?"; the chapter and article
        labels are appended only when given.
        """
        parts = [
            meta.get("type_abbrev", "UNK"),
            f"NO {meta.get('number', '?')}",
            f"TAHUN {meta.get('year', '?')}",
            f"TENTANG {meta.get('topic', 'UNK')}",
        ]
        if bab:
            parts.append(bab)
        if pasal:
            parts.append(pasal)
        return f"[CONTEXT: {' - '.join(parts)}]"

    def _create_chunk(self, content, context, meta, pasal_num=None):
        """Return a chunk dict: context + content as ``text``, plus metadata."""
        chunk_text = f"{context}\n\n{content}"
        c = {"text": chunk_text, "has_context": True}
        c.update(meta)
        if pasal_num:
            c["pasal_number"] = pasal_num
        return c

    def _find_bab_for_pasal(self, structure, pasal_num):
        """Return "BAB <number> - <title>" for the first chapter listing the
        article, or None when no chapter lists it."""
        for bab in structure.get("batang_tubuh", []):
            for p in bab.get("pasal", []):
                if p.get("number") == pasal_num:
                    return f"BAB {bab.get('number')} - {bab.get('title', '')}"
        return None


# Printed once every definition above in this cell has executed.
print("‚úÖ Nuzantara Legal Engine Loaded Successfully!")

In [None]:
# 📥 DOWNLOAD DATA
!gdown 1Lx4y9TQ45uBUyvNzeHiHinxo_k_WOMmm -O /content/nuzantara_laws.zip

# 📦 Unzip
import os
import zipfile

ZIP_PATH = "/content/nuzantara_laws.zip"
EXTRACT_DIR = "/content/nuzantara_laws"

# Defined unconditionally: the ingestion code globs SOURCE_DIR, and leaving it
# unset when the archive is missing would surface later as a NameError rather
# than as the message printed below.
SOURCE_DIR = EXTRACT_DIR

if os.path.exists(ZIP_PATH):
    print("üìÇ Extracting...")
    with zipfile.ZipFile(ZIP_PATH, "r") as zip_ref:
        zip_ref.extractall(EXTRACT_DIR)
    print("‚úÖ Extraction complete!")
else:
    print("‚ùå Zip file not found!")

In [None]:
# 💾 Qdrant Connection & RE-INIT
from qdrant_client import QdrantClient
from qdrant_client.http import models

# prefer_grpc=False forces HTTP to avoid some SSL issues.
client = QdrantClient(
    url=QDRANT_URL, api_key=QDRANT_API_KEY, timeout=60, prefer_grpc=False
)

print(
    f"‚ö†Ô∏è WIPING and Recreating collection '{COLLECTION_NAME}' for HIGH QUALITY INGESTION..."
)

# Drop the collection if it exists, then create it empty with cosine distance.
# Any failure is reported and the notebook carries on.
try:
    if client.collection_exists(COLLECTION_NAME):
        client.delete_collection(COLLECTION_NAME)
        print(f"   Deleted existing collection '{COLLECTION_NAME}'")

    client.create_collection(
        collection_name=COLLECTION_NAME,
        vectors_config=models.VectorParams(
            size=VECTOR_SIZE,
            distance=models.Distance.COSINE,
        ),
    )
except Exception as e:
    print(f"‚ùå Error recreating collection: {e}")
    print("   Trying to proceed... maybe it was already deleted/created.")
else:
    print("‚úÖ Collection wiped and ready!")

In [None]:
# 🚀 EXECUTION LOOP (The Butcher)
import glob
from tqdm.notebook import tqdm
from langchain_community.document_loaders import PyPDFLoader
from tenacity import retry, stop_after_attempt, wait_exponential
import requests

# Initialize Engine Components
cleaner = LegalCleaner()
extractor = LegalMetadataExtractor()
parser = LegalStructureParser()
chunker = LegalChunker()

# Find all PDFs recursively. The extension is compared case-insensitively so
# that files named "*.PDF" are not skipped, and the list is sorted so that
# every run processes the files in the same order.
pdf_files = sorted(
    path
    for path in glob.glob(f"{SOURCE_DIR}/**/*", recursive=True)
    if path.lower().endswith(".pdf") and os.path.isfile(path)
)
print(f"üìö Found {len(pdf_files)} PDFs to ingest")


# PURE REQUESTS EMBEDDING FUNCTION
@retry(
    stop=stop_after_attempt(5),
    wait=wait_exponential(multiplier=1, min=2, max=10),
    # Surface the last real error to the caller instead of tenacity's
    # RetryError wrapper, whose message hides the API response.
    reraise=True,
)
def robust_embed(texts):
    """Embed *texts* with OpenAI's ``text-embedding-3-small`` via plain requests.

    The call is attempted up to five times with exponential back-off.

    Args:
        texts: List of strings to embed.

    Returns:
        One embedding per input, taken from the response's ``data`` list in
        the order the API returned it. An empty input returns ``[]`` without
        calling the API.

    Raises:
        RuntimeError: The API answered with a non-200 status on the final
            attempt. Errors raised by ``requests`` itself propagate as-is.
    """
    if not texts:
        return []

    url = "https://api.openai.com/v1/embeddings"
    headers = {
        "Authorization": f"Bearer {OPENAI_API_KEY}",
        "Content-Type": "application/json",
    }
    # Replace newlines in text to avoid issues
    clean_texts = [t.replace("\n", " ") for t in texts]

    data = {"input": clean_texts, "model": "text-embedding-3-small"}

    # NOTE(review): verify=False turns off TLS certificate checks on a request
    # that carries the API key; kept because this notebook deliberately
    # bypasses SSL verification.
    response = requests.post(url, headers=headers, json=data, timeout=20, verify=False)
    if response.status_code != 200:
        raise RuntimeError(f"OpenAI API Error {response.status_code}: {response.text}")

    return [d["embedding"] for d in response.json()["data"]]


import uuid  # needed only for the point IDs generated in this loop

BATCH_SIZE = 1  # REDUCED TO 1 FOR STABILITY
EMBED_MINI_BATCH = 5  # ULTRA SMALL BATCHES (5 chunks)

# For each PDF: validate, load, clean, extract metadata, chunk, embed, then
# upload the resulting points. A failure in one file skips only that file.
for i in tqdm(range(0, len(pdf_files), BATCH_SIZE), desc="Batch Processing"):
    batch_files = pdf_files[i : i + BATCH_SIZE]
    batch_points = []

    for pdf_file in batch_files:
        try:
            print(f"Processing: {os.path.basename(pdf_file)}")

            # 0. Check PDF Header (Magic Bytes) to avoid corrupt files
            with open(pdf_file, "rb") as f:
                header = f.read(4)
            if header != b"%PDF":
                print(
                    f"‚ö†Ô∏è Skipping invalid PDF (bad header): {os.path.basename(pdf_file)}"
                )
                continue

            # 1. Load
            loader = PyPDFLoader(pdf_file)
            pages = loader.load()
            raw_text = "\n".join(p.page_content for p in pages)

            if len(raw_text) < 100:
                print("   Skipping (too short/scanned)")
                continue

            # 2. Clean
            cleaned_text = cleaner.clean(raw_text)

            # 3. Metadata
            meta = extractor.extract(cleaned_text)
            if meta["type"] == "UNKNOWN":
                meta["topic"] = os.path.basename(pdf_file)

            # 4. Structure
            structure = parser.parse(cleaned_text)

            # 5. Chunk (Butcher)
            chunks = chunker.chunk(cleaned_text, meta, structure)

            # 6. Prepare for Qdrant
            if chunks:
                print(f"   Embedding {len(chunks)} chunks...")
                # Embed in MINI-BATCHES to avoid network overload
                texts = [c["text"] for c in chunks]
                vectors = []

                # Process mini-batches
                for k in range(0, len(texts), EMBED_MINI_BATCH):
                    mini_batch_texts = texts[k : k + EMBED_MINI_BATCH]
                    try:
                        vectors.extend(robust_embed(mini_batch_texts))
                    except Exception as e:
                        print(f"   ‚ùå Failed to embed batch {k}: {e}")
                        raise  # Re-raise to skip this file

                if len(vectors) != len(chunks):
                    raise RuntimeError(
                        f"Got {len(vectors)} embeddings for {len(chunks)} chunks"
                    )

                doc_key = (
                    f"{meta.get('type_abbrev', 'UNK')}-{meta.get('number')}"
                    f"-{meta.get('year')}"
                )
                for j, (chunk, vector) in enumerate(zip(chunks, vectors)):
                    # Qdrant accepts only unsigned integers or UUIDs as point
                    # IDs, so the ID is a UUID derived from file path + chunk
                    # index; the human-readable key is kept in the payload.
                    readable_id = re.sub(
                        r"[^a-zA-Z0-9_-]", "_", f"{doc_key}_chunk_{j}"
                    )
                    point_id = str(uuid.uuid5(uuid.NAMESPACE_URL, f"{pdf_file}#{j}"))

                    batch_points.append(
                        models.PointStruct(
                            id=point_id,
                            vector=vector,
                            payload={**chunk, "chunk_id": readable_id},
                        )
                    )

        except Exception as e:
            print(f"‚ùå Error processing {os.path.basename(pdf_file)}: {e}")
            print("   ‚è≠Ô∏è SKIPPING FILE due to repeated errors.")
            continue  # Explicitly continue to next file

    # Upload Batch
    if batch_points:
        try:
            client.upsert(collection_name=COLLECTION_NAME, points=batch_points)
            print(f"   ‚úÖ Uploaded {len(batch_points)} points.")
        except Exception as e:
            print(f"‚ùå Upload failed: {e}")

print("‚úÖ Ingestion Complete!")