# 🦖 THE LAW EATER: Massive PDF Ingestion (OpenAI Edition)

**Mission:** Ingest thousands of Indonesian Law PDFs into Qdrant.
**Model:** `text-embedding-3-small` (1536 dimensions) - **Compatible with Nuzantara Main DB**.
**Target:** Qdrant Cloud (`legal_unified` collection).
**Bonus:** Exports raw text to Drive for the "Conversation Factory".

## 🚀 Setup
We use Colab for its high bandwidth and processing power (PDF parsing).

In [None]:
# Install dependencies (quietly): the Qdrant client, the LangChain packages
# imported by the cells below (community loaders/vector stores, OpenAI
# embeddings, text splitters), pypdf for PDF parsing, and tqdm for progress bars.
# NOTE(review): gdown is invoked in the download cell but is not installed
# here — presumably it is preinstalled on Colab; confirm.
!pip install -q qdrant-client langchain langchain-community langchain-openai langchain-text-splitters pypdf tqdm

In [None]:
# Authentication & Configuration
from getpass import getpass
import os

from google.colab import userdata


def _read_secret(name, prompt, hidden=True):
    """Return the Colab secret *name*, prompting the user when it is unavailable.

    Args:
        name: Key of the secret in the Colab secrets panel.
        prompt: Text shown when falling back to interactive entry.
        hidden: When True the fallback uses ``getpass`` so the value is not
            echoed into (and saved with) the notebook output.

    Returns:
        The secret value as a string.
    """
    try:
        value = userdata.get(name)
    except Exception:
        # userdata.get raises instead of returning None (e.g. secret missing
        # or notebook access not granted), so a plain `or` fallback is not enough.
        value = None
    if value:
        return value
    return getpass(prompt) if hidden else input(prompt)


QDRANT_URL = _read_secret('QDRANT_URL', "Enter Qdrant URL: ", hidden=False)
QDRANT_API_KEY = _read_secret('QDRANT_API_KEY', "Enter Qdrant API Key: ")

# The OpenAI key is read from Colab secrets (or typed in) - never hard-coded -
# and exported so OpenAI-backed libraries can pick it up from the environment.
OPENAI_API_KEY = _read_secret('OPENAI_API_KEY', "Enter OpenAI API Key: ")
os.environ["OPENAI_API_KEY"] = OPENAI_API_KEY

# TARGET COLLECTION (Main Legal DB)
COLLECTION_NAME = "legal_unified"
VECTOR_SIZE = 1536  # Must match the embedding dimensions used for ingestion

In [None]:
# 📥 DOWNLOAD DATA (direct from the shared link)
# gdown fetches the zip archive identified by the ID below and writes it to
# /content/nuzantara_laws.zip, the path the unzip step reads as ZIP_PATH.
!gdown 1Lx4y9TQ45uBUyvNzeHiHinxo_k_WOMmm -O /content/nuzantara_laws.zip

# 📦 Unzip & set up paths
import os
import zipfile

ZIP_PATH = "/content/nuzantara_laws.zip"
EXTRACT_DIR = "/content/nuzantara_laws"
TEXT_OUTPUT_DIR = "/content/nuzantara_laws_text"

if not os.path.exists(ZIP_PATH):
    # No archive: warn, but still point at the usual extraction folder in case
    # the PDFs were placed there by other means.
    print("‚ùå Zip file not found! Check the download step.")
    SOURCE_DIR = "/content/nuzantara_laws" # Attempt anyway
else:
    print(f"üì¶ Found zip: {ZIP_PATH}")
    if os.path.exists(EXTRACT_DIR):
        # An existing folder is treated as a completed earlier extraction.
        print("‚úÖ Already extracted.")
    else:
        print("üìÇ Extracting...")
        with zipfile.ZipFile(ZIP_PATH, 'r') as archive:
            archive.extractall(EXTRACT_DIR)
        print("‚úÖ Extraction complete!")
    SOURCE_DIR = EXTRACT_DIR

# Folder that receives one raw-text export per ingested PDF.
text_dir_missing = not os.path.exists(TEXT_OUTPUT_DIR)
if text_dir_missing:
    os.makedirs(TEXT_OUTPUT_DIR)
    print(f"‚úÖ Created text output directory: {TEXT_OUTPUT_DIR}")

In [None]:
# 🧠 The Engine: OpenAI Embeddings
from langchain_openai import OpenAIEmbeddings

print("‚è≥ Initializing OpenAI Embeddings...")
# Request exactly VECTOR_SIZE dimensions: the Qdrant collection is created with
# that same constant, so the two can never drift apart.
embeddings = OpenAIEmbeddings(
    model="text-embedding-3-small",
    dimensions=VECTOR_SIZE,
)
print(f"‚úÖ Embeddings Ready ({VECTOR_SIZE} dims)")

In [None]:
# üìÑ Processing Logic
from langchain_community.document_loaders import PyPDFLoader
# ROBUST IMPORT for Text Splitter
try:
    from langchain_text_splitters import RecursiveCharacterTextSplitter
except ImportError:
    try:
        from langchain.text_splitter import RecursiveCharacterTextSplitter
    except ImportError:
        from langchain_community.text_splitter import RecursiveCharacterTextSplitter

import re
from pathlib import Path

# Chunking policy applied to every PDF: chunks of up to 1000 characters with
# 200 characters of overlap; separator candidates are listed from paragraph
# breaks down to the empty string.
_CHUNK_SEPARATORS = ["\n\n", "\n", ". ", " ", ""]
_CHUNK_LIMITS = {"chunk_size": 1000, "chunk_overlap": 200}

text_splitter = RecursiveCharacterTextSplitter(
    separators=_CHUNK_SEPARATORS,
    **_CHUNK_LIMITS,
)

# First four-digit run starting with 19 or 20 is taken as the year.
_YEAR_RE = re.compile(r'(19|20)\d{2}')
_DIGITS_RE = re.compile(r'\d+')

# Checked in order against the upper-cased filename. A token only counts when
# it is not embedded in a longer word, so names such as "KEPPRES" or
# "SUPPLEMENT" are no longer mistaken for "PP".
_DOC_TYPE_PATTERNS = (
    (re.compile(r'(?<![A-Z])UU(?![A-Z])'), "UNDANG_UNDANG"),
    (re.compile(r'(?<![A-Z])PP(?![A-Z])'), "PERATURAN_PEMERINTAH"),
    (re.compile(r'(?<![A-Z])PERPRES(?![A-Z])'), "PERATURAN_PRESIDEN"),
)


def extract_metadata_from_filename(filename):
    """Guess law metadata from a filename such as 'UU_1_2023.pdf'.

    Args:
        filename: Bare file name (no directory) of the PDF.

    Returns:
        A dict with keys ``source`` (the filename), ``type``, ``year`` and
        ``number``. Values that cannot be detected stay ``"UNKNOWN"``;
        detected year and number are returned as digit strings.
    """
    meta = {"source": filename, "type": "UNKNOWN", "year": "UNKNOWN", "number": "UNKNOWN"}

    # Try to find year (4 digits)
    year_match = _YEAR_RE.search(filename)
    if year_match:
        meta['year'] = year_match.group(0)

    # Law number: the first run of digits that is not the run holding the year.
    for digits in _DIGITS_RE.finditer(filename):
        holds_year = (
            year_match is not None
            and digits.start() <= year_match.start()
            and year_match.end() <= digits.end()
        )
        if not holds_year:
            meta['number'] = digits.group(0)
            break

    # Try to find type (case-insensitive, whole token only)
    upper_name = filename.upper()
    for pattern, label in _DOC_TYPE_PATTERNS:
        if pattern.search(upper_name):
            meta['type'] = label
            break

    return meta

def process_pdf(file_path):
    """Load one PDF, export its raw text, and return metadata-enriched chunks.

    Args:
        file_path: Path of the PDF to ingest.

    Returns:
        The list of chunks produced by ``text_splitter`` for the PDF's pages,
        each carrying the filename-derived metadata plus a ``filename`` key.
        An empty list is returned when the PDF yields fewer than 100
        characters of text (likely a scanned/image PDF) or when any step
        raises; the error is printed rather than propagated so one bad file
        does not stop a bulk run.

    Side effects:
        Writes ``<stem>.txt`` with the extracted text into ``TEXT_OUTPUT_DIR``.
        NOTE(review): the export name depends only on the file stem, so PDFs
        with the same name in different folders overwrite each other's export.
    """
    try:
        pdf_path = Path(file_path)
        pages = PyPDFLoader(file_path).load()
        page_texts = [page.page_content for page in pages]

        # Check if scanned (empty text)
        if len("".join(page_texts).strip()) < 100:
            print(f"‚ö†Ô∏è SKIPPING {pdf_path.name}: Likely scanned/image PDF")
            return []

        # SAVE RAW TEXT FOR FACTORY
        # Pages are separated by a blank line so the last word of one page is
        # not fused with the first word of the next.
        txt_path = os.path.join(TEXT_OUTPUT_DIR, pdf_path.stem + ".txt")
        with open(txt_path, "w", encoding="utf-8") as f:
            f.write("\n\n".join(page_texts))

        # Split
        chunks = text_splitter.split_documents(pages)

        # Enrich Metadata
        file_meta = extract_metadata_from_filename(pdf_path.name)
        for chunk in chunks:
            chunk.metadata.update(file_meta)
            chunk.metadata['filename'] = pdf_path.name

        return chunks

    except Exception as e:
        print(f"‚ùå ERROR processing {file_path}: {e}")
        return []

In [None]:
# 💾 Qdrant Connection
from qdrant_client import QdrantClient
from qdrant_client.http import models

# INCREASED TIMEOUT to handle Fly.io wake-up / latency
client = QdrantClient(
    url=QDRANT_URL,
    api_key=QDRANT_API_KEY,
    timeout=60,  # Increased from default to 60s
)

# FORCE RECREATE (TABULA RASA) as requested.
# WARNING: this deletes every point already stored in the collection.
# Explicit delete + create replaces the deprecated `recreate_collection`.
print(f"‚ö†Ô∏è WIPING and Recreating collection '{COLLECTION_NAME}'...")
if client.collection_exists(COLLECTION_NAME):
    client.delete_collection(COLLECTION_NAME)
client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=models.VectorParams(size=VECTOR_SIZE, distance=models.Distance.COSINE),
)
print("‚úÖ Collection wiped and ready for fresh ingestion!")

In [None]:
# 🚀 EXECUTION LOOP
import glob
from tqdm.notebook import tqdm
from langchain_community.vectorstores import Qdrant

# Find all PDFs (sorted so batch numbers refer to the same files on every run)
pdf_files = sorted(glob.glob(f"{SOURCE_DIR}/**/*.pdf", recursive=True))
print(f"üìö Found {len(pdf_files)} PDFs to ingest")

# Initialize VectorStore wrapper
qdrant_store = Qdrant(
    client=client,
    collection_name=COLLECTION_NAME,
    embeddings=embeddings
)

# Batch Process
BATCH_SIZE = 10 # Process 10 PDFs at a time

# Files belonging to batches whose upload raised, kept so they can be retried.
failed_uploads = []

for i in tqdm(range(0, len(pdf_files), BATCH_SIZE), desc="Batch Processing"):
    batch_files = pdf_files[i:i+BATCH_SIZE]
    batch_docs = []

    for pdf_file in batch_files:
        batch_docs.extend(process_pdf(pdf_file))

    # Nothing to upload when every PDF in the batch was skipped or failed to parse.
    if not batch_docs:
        continue

    # Upload to Qdrant; a failing batch is reported and recorded, then the run continues.
    try:
        qdrant_store.add_documents(batch_docs)
    except Exception as e:
        print(f"‚ùå FAILED to upload batch: {e}")
        failed_uploads.extend(batch_files)
    else:
        print(f"‚úÖ Uploaded {len(batch_docs)} chunks from batch {i//BATCH_SIZE + 1}")

if failed_uploads:
    print(f"{len(failed_uploads)} PDFs were in batches that failed to upload:")
    for failed_path in failed_uploads:
        print(f"  - {failed_path}")