In [None]:
# Install dependencies
!pip install torch transformers sentence-transformers faiss-cpu accelerate

In [None]:
# GPU runtimes only: install the GPU build of FAISS
!pip install faiss-gpu

# 1. DATA PREPROCESSING + CLAUSE-AWARE CHUNKING

In [None]:
import re
from typing import List, Dict

def clean_text(text: str) -> str:
    """
    Remove formatting artefacts but preserve clause numbering.

    Page stamps of the form "<number> DEC 2025" are deleted, every run of
    whitespace (including newlines) is collapsed to a single space, and
    leading/trailing whitespace is trimmed.

    Args:
        text: Raw document text.

    Returns:
        The cleaned text as a single line with single-space separators.
    """
    # Remove page numbers if any (customise if needed). This runs *before*
    # whitespace is collapsed so that the gap left behind by a removed stamp
    # is normalised too instead of surviving as a double space.
    text = re.sub(r'\b\d+\s*DEC\s*2025\b', '', text)

    # Collapse all whitespace runs, newlines included, to one space.
    text = re.sub(r'\s+', ' ', text)

    return text.strip()


def clause_aware_chunking(text: str, min_chunk_chars: int = 50) -> List[Dict[str, str]]:
    """
    Split document by Regulation and clause markers.

    A regulation starts at a number followed by a dot and whitespace
    ("1. Scope", "12. Leave"). Inside each regulation, every parenthesised
    word marker ("(i)", "(ii)", "(a)") starts a new clause. Text that
    precedes the first regulation number is not emitted.

    Args:
        text: Document text to split.
        min_chunk_chars: Fragments shorter than this many characters (after
            stripping) are discarded, e.g. a bare heading such as "1. Scope".
            Defaults to 50.

    Returns:
        Chunks in document order. Each chunk is a dict with the keys
        "regulation" (the regulation number as a string), "clause" (the
        marker without parentheses, or "main" when the fragment does not
        start with a marker) and "text" (the stripped fragment).
    """
    # A regulation number must begin a token: the lookbehind rejects positions
    # in the middle of a number ("12. " must not be cut into "1" + "2. ") and
    # inside dotted sub-numbers ("3.12. " stays within regulation 3).
    regulation_boundary = re.compile(r'(?<![\w.])(?=\d+\.\s)')
    regulation_number = re.compile(r'(\d+)\.')

    # NOTE: any parenthesised word is treated as a clause marker, so an
    # abbreviation in parentheses also starts a new clause.
    clause_boundary = re.compile(r'(?=\(\w+\))')
    clause_marker = re.compile(r'\((\w+)\)')

    chunks: List[Dict[str, str]] = []

    for reg in regulation_boundary.split(text):
        reg = reg.strip()

        # Skips empty pieces and any preamble before the first regulation.
        reg_match = regulation_number.match(reg)
        if not reg_match:
            continue

        reg_number = reg_match.group(1)

        for clause in clause_boundary.split(reg):
            clause = clause.strip()
            if not clause or len(clause) < min_chunk_chars:  # ignore tiny fragments
                continue

            marker = clause_marker.match(clause)

            chunks.append({
                "regulation": reg_number,
                "clause": marker.group(1) if marker else "main",
                "text": clause
            })

    return chunks
