## 1) Setup
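The pipeline is designed to run with **stdlib only**. Optional packages (BeautifulSoup, langdetect, NLTK) are used when installed; without them the pipeline falls back to stdlib implementations (and reports the language as `unknown`).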

In [None]:
from __future__ import annotations

import re
import math
import json
import unicodedata
from dataclasses import dataclass, asdict
from html.parser import HTMLParser
from pathlib import Path
from typing import Iterable, List, Dict, Tuple, Optional

# Optional dependencies
try:
    from bs4 import BeautifulSoup  # type: ignore
except Exception:
    BeautifulSoup = None

try:
    from langdetect import detect as lang_detect  # type: ignore
except Exception:
    lang_detect = None

print("✅ Imports loaded (optional deps are optional).")

## 2) Core utilities
This section defines the preprocessing steps and a single `preprocess_text()` entrypoint.

In [None]:
# Runs of horizontal whitespace (tab, CR, form feed, vertical tab, space).
# "\n" is deliberately not in the class so line structure survives collapsing.
_RE_MULTISPACE = re.compile(r"[\t\r\f\v ]+")
# Three or more consecutive newlines, i.e. more than one blank line in a row.
_RE_MULTINEWLINE = re.compile(r"\n{3,}")
# Invisible characters: U+200B..U+200D (zero-width space / non-joiner / joiner)
# and U+FEFF (byte-order mark / zero-width no-break space).
_RE_ZERO_WIDTH = re.compile(r"[\u200B-\u200D\uFEFF]")


def normalize_unicode(text: str) -> str:
    """Return *text* converted to Unicode Normalization Form KC (NFKC).

    NFKC folds compatibility characters into their canonical equivalents, so
    full-width letters/digits, ligatures (e.g. "ﬁ" -> "fi") and combining
    sequences end up in one consistent representation. Note that this can
    change glyphs, not just byte sequences.
    See Unicode Normalization Forms: https://unicode.org/reports/tr15/
    """
    return unicodedata.normalize("NFKC", text)


def normalize_whitespace(text: str) -> str:
    """Canonicalize whitespace while keeping the line structure of *text*.

    Steps, in order: unify line endings to "\\n", delete zero-width
    characters, collapse runs of horizontal whitespace to a single space,
    trim every line, squeeze three or more newlines down to two, and finally
    strip the whole string.
    """
    # Windows (CRLF) first, then any remaining bare CR (old Mac) endings.
    unified = text.replace("\r\n", "\n").replace("\r", "\n")
    visible = _RE_ZERO_WIDTH.sub("", unified)
    # Newlines are not part of the collapsed class, so lines stay separate.
    single_spaced = _RE_MULTISPACE.sub(" ", visible)
    trimmed_lines = [segment.strip() for segment in single_spaced.split("\n")]
    # Allow at most one blank line between paragraphs.
    compact = _RE_MULTINEWLINE.sub("\n\n", "\n".join(trimmed_lines))
    return compact.strip()


class _HTMLStripper(HTMLParser):
    """Stdlib fallback that extracts the visible text of an HTML document.

    Behaviour:
    - Text inside ``<script>``, ``<style>`` and ``<noscript>`` is discarded,
      matching what the BeautifulSoup branch of ``strip_html`` removes.
    - A newline is emitted at ``<br>`` and around block-level elements so
      that adjacent paragraphs/list items do not run together
      (``<p>a</p><p>b</p>`` must not become ``"ab"``).
    - Inline elements (``<b>``, ``<a>``, ...) add no separator.

    Usage: ``feed()`` the markup, call ``close()`` to flush any text the
    parser is still buffering, then read ``get_text()``.
    """

    # Elements whose text content is never shown to the reader.
    _SKIP_TAGS = frozenset({"script", "style", "noscript"})
    # Elements that start/end a visual line; a newline is emitted at both edges.
    _BLOCK_TAGS = frozenset({
        "address", "article", "aside", "blockquote", "dd", "div", "dl", "dt",
        "fieldset", "figcaption", "figure", "footer", "form",
        "h1", "h2", "h3", "h4", "h5", "h6",
        "header", "hr", "li", "main", "nav", "ol", "p", "pre", "section",
        "table", "td", "th", "tr", "ul",
    })

    def __init__(self):
        super().__init__()
        self._chunks: List[str] = []
        # > 0 while inside at least one element from _SKIP_TAGS.
        self._skip_depth = 0

    def handle_starttag(self, tag, attrs) -> None:
        if tag in self._SKIP_TAGS:
            self._skip_depth += 1
        elif self._skip_depth == 0 and (tag == "br" or tag in self._BLOCK_TAGS):
            # <br> is handled here only (it has no meaningful end tag), so a
            # self-closing <br/> yields exactly one newline.
            self._chunks.append("\n")

    def handle_endtag(self, tag) -> None:
        if tag in self._SKIP_TAGS:
            # Tolerate stray closing tags in malformed markup.
            if self._skip_depth > 0:
                self._skip_depth -= 1
        elif self._skip_depth == 0 and tag in self._BLOCK_TAGS:
            self._chunks.append("\n")

    def handle_data(self, data: str) -> None:
        if data and self._skip_depth == 0:
            self._chunks.append(data)

    def get_text(self) -> str:
        """Return all text collected so far as a single string."""
        return "".join(self._chunks)


def strip_html(text: str) -> str:
    """Strip HTML tags from *text* and return the remaining text.

    Text that contains no ``<`` or no ``>`` is returned untouched. When
    BeautifulSoup is installed it is used (dropping ``script``/``style``/
    ``noscript`` and separating text nodes with newlines); otherwise the
    stdlib ``_HTMLStripper`` is used. Parsing is best-effort: if the parser
    raises, the input is returned unchanged.
    """
    if "<" not in text or ">" not in text:
        return text

    try:
        if BeautifulSoup is not None:
            soup = BeautifulSoup(text, "html.parser")
            # Remove script/style
            for tag in soup(["script", "style", "noscript"]):
                tag.decompose()
            return soup.get_text(separator="\n")

        stripper = _HTMLStripper()
        stripper.feed(text)
        # feed() may hold back trailing data it cannot classify yet (e.g. a
        # tail containing "&", or an unterminated construct). close() forces
        # that buffer to be processed; without it the tail is silently lost.
        stripper.close()
        return stripper.get_text()
    except Exception:
        # If HTML parsing fails, return text as-is
        return text


def dedupe_consecutive_lines(text: str) -> str:
    """Remove repeated non-blank lines that directly follow each other.

    Lines are compared after stripping trailing whitespace. Blank lines are
    always kept and do not reset the comparison, so a line that repeats the
    previous non-blank line is dropped even when blank lines sit in between.
    The result is stripped of leading/trailing whitespace.
    """
    kept: List[str] = []
    previous_content = None  # last non-blank line that was kept
    for raw in text.split("\n"):
        line = raw.rstrip()
        if not line:
            # Blank line: keep it, but leave previous_content untouched.
            kept.append(line)
            continue
        if line == previous_content:
            continue
        kept.append(line)
        previous_content = line
    return "\n".join(kept).strip()


def drop_noise_lines(
    text: str,
    *,
    min_chars: int = 3,
    min_alpha_ratio: float = 0.25,
    max_punct_ratio: float = 0.35,
    drop_all_caps_short: bool = True,
) -> str:
    """Drop lines that look like boilerplate or junk rather than prose.

    A non-blank line (after stripping) is removed when any of these holds:
    - it is shorter than ``min_chars``;
    - fewer than ``min_alpha_ratio`` of its characters are alphabetic;
    - more than ``max_punct_ratio`` of its characters are Unicode punctuation;
    - ``drop_all_caps_short`` is set and the line is all upper-case, 8-25
      characters long and contains a space (typical banner/heading noise).

    Blank lines are always kept. The surviving lines are re-joined and passed
    through ``normalize_whitespace``.
    """

    def worth_keeping(line: str) -> bool:
        stripped = line.strip()
        if not stripped:
            return True
        size = len(stripped)  # >= 1 here, so safe as a divisor
        if size < min_chars:
            return False

        letters = sum(1 for ch in stripped if ch.isalpha())
        marks = sum(1 for ch in stripped if unicodedata.category(ch).startswith("P"))
        if letters / size < min_alpha_ratio:
            return False
        if marks / size > max_punct_ratio:
            return False

        shouting = (
            drop_all_caps_short
            and stripped.isupper()
            and 8 <= size <= 25
            and " " in stripped
        )
        return not shouting

    survivors = [ln for ln in text.split("\n") if worth_keeping(ln)]
    return normalize_whitespace("\n".join(survivors))


# Prefer NLTK sentence tokenizer if available; fall back to regex.
try:
    from nltk.tokenize import sent_tokenize as _nltk_sent_tokenize  # type: ignore
except Exception:  # NLTK not installed or misconfigured
    _nltk_sent_tokenize = None

# Regex fallback is lightweight and has known limitations (abbreviations, ellipses, mid-sentence quotes).
# It matches the whitespace between sentences: the whitespace must follow ".", "!" or "?"
# (lookbehind) and be followed by an ASCII upper-case letter, a digit or a quote (lookahead).
# Because only the whitespace is consumed, re.split() keeps the terminating punctuation.
_RE_SENT_SPLIT = re.compile(r"(?<=[.!?])\s+(?=[A-Z0-9\"'])")


def split_sentences(text: str) -> List[str]:
    """Split *text* into sentences; newlines always act as hard boundaries.

    The text is whitespace-normalized first. Each non-empty line is then
    split with NLTK's ``sent_tokenize`` when it is available, otherwise with
    the lightweight ``_RE_SENT_SPLIT`` regex (good enough for chunking, not
    perfect).

    NLTK can be importable while its Punkt model data is not installed; in
    that case the tokenizer raises ``LookupError`` on first use. That is
    treated the same as "NLTK not available" and the regex fallback is used,
    so the pipeline keeps working instead of crashing.

    Returns an empty list for empty/whitespace-only input.
    """
    text = normalize_whitespace(text)
    if not text:
        return []

    paragraphs = [para.strip() for para in text.split("\n")]
    paragraphs = [para for para in paragraphs if para]

    if _nltk_sent_tokenize is not None:
        try:
            sentences: List[str] = []
            for para in paragraphs:
                sentences.extend(_nltk_sent_tokenize(para))
        except LookupError:
            # Tokenizer data missing: fall through to the regex splitter.
            pass
        else:
            return sentences

    # Regex-based fallback.
    parts: List[str] = []
    for para in paragraphs:
        parts.extend(_RE_SENT_SPLIT.split(para))

    return [p.strip() for p in parts if p.strip()]


def chunk_sentences(
    sentences: List[str],
    *,
    max_chars: int = 1800,
    overlap: int = 1,
) -> List[str]:
    """Greedily pack sentences into space-joined chunks of at most *max_chars*.

    Args:
        sentences: Sentences in reading order; blank entries are ignored.
        max_chars: Target upper bound for the length of each chunk.
        overlap: Number of trailing sentences of a finished chunk that are
            repeated at the start of the next one (0 disables overlap).

    Returns:
        The list of non-empty chunks.

    Notes:
        - A single sentence longer than ``max_chars`` is never split; it
          becomes its own (oversized) chunk.
        - Overlap is best-effort: carried-over sentences are dropped, oldest
          first, when keeping them would push the new chunk past
          ``max_chars``. This keeps chunks within the limit and prevents a
          chunk from repeating the whole previous chunk.
    """
    if not sentences:
        return []

    def joined_len(parts: List[str]) -> int:
        # Length of " ".join(parts) without building the string.
        return sum(len(p) for p in parts) + max(0, len(parts) - 1)

    chunks: List[str] = []
    current: List[str] = []
    current_len = 0

    for s in sentences:
        s = s.strip()
        if not s:
            continue

        # +1 accounts for the space that joins s to the existing content.
        if current and current_len + 1 + len(s) > max_chars:
            chunks.append(" ".join(current))
            carry = current[-overlap:] if overlap > 0 else []
            # Trim the overlap until it fits together with the new sentence.
            while carry and joined_len(carry) + 1 + len(s) > max_chars:
                carry = carry[1:]
            current = carry
            current_len = joined_len(current)

        # Count the joining space whenever the chunk already has content,
        # including content carried over as overlap.
        current_len += len(s) + (1 if current else 0)
        current.append(s)

    if current:
        chunks.append(" ".join(current))

    # Safety: remove empty
    return [c for c in chunks if c]


# E-mail addresses: local part, "@", domain labels and a 2-63 character final label.
_RE_EMAIL = re.compile(r"\b[A-Z0-9._%+-]+@[A-Z0-9.-]+\.[A-Z0-9-]{2,63}\b", re.IGNORECASE)
# Phone-like digit groups with optional country/area code. Deliberately loose;
# callers are expected to filter matches by digit count.
_RE_PHONE = re.compile(r"\b(?:\+?\d{1,3}[-. ]?)?(?:\(?\d{2,4}\)?[-. ]?)?\d{3,4}[-. ]?\d{3,4}\b")
# http(s) URLs. The match runs greedily over everything that is not whitespace,
# an angle bracket or a quote, and must END on a character that is not sentence
# punctuation or a closing bracket. This keeps dots, "?" and "," inside the URL
# (host, path, query string) while leaving a trailing "." or ")" in the text.
# (The previous lazy pattern stopped at the first "." and so matched only
# "https://example" out of "https://example.com".)
_RE_URL = re.compile(r"\bhttps?://[^\s<>\"']*[^\s<>\"'.,;:!?)\]}]", re.IGNORECASE)
# Candidate card numbers: 13-19 contiguous digits, or four groups of four
# digits separated by a space or hyphen. Candidates still need a Luhn check.
_RE_CREDIT_CARD = re.compile(r"\b(?:\d{13,19}|\d{4}(?:[ -]\d{4}){3})\b")


def _luhn_ok(number: str) -> bool:
    """Return True if the digits in *number* form a Luhn-valid card number.

    Non-digit characters (spaces, hyphens, ...) are ignored. Only sequences
    of 13 to 19 digits are considered; anything else is rejected.
    """
    digits = [int(ch) for ch in re.findall(r"\d", number)]
    if not 13 <= len(digits) <= 19:
        return False

    total = 0
    # Walk from the check digit (rightmost) leftwards; every second digit is
    # doubled, and a two-digit product is reduced to its digit sum (d - 9).
    for offset, value in enumerate(reversed(digits)):
        if offset % 2 == 1:
            value *= 2
            if value > 9:
                value -= 9
        total += value
    return total % 10 == 0


def redact_pii(text: str) -> Tuple[str, Dict[str, List[str]]]:
    """Replace e-mail addresses, URLs, card numbers and phone numbers in *text*.

    Each detected value is replaced by a numbered placeholder such as
    ``[EMAIL_1]``, ``[URL_2]``, ``[CARD_1]`` or ``[PHONE_3]``.

    Order matters: card numbers are redacted BEFORE phone numbers. The phone
    pattern is loose enough to match (parts of) a card number, so running it
    first would label cards as phones, leave stray digit groups behind and
    skip the Luhn validation entirely.

    Returns:
        ``(redacted_text, entities)`` where ``entities`` maps each kind that
        was found ("email", "phone", "url", "card") to a list holding one
        ``"[REDACTED]"`` marker per hit. Raw values are never returned.
    """
    entities: Dict[str, List[str]] = {"email": [], "phone": [], "url": [], "card": []}

    def repl_factory(kind: str):
        # Unconditional replacement: record the hit, emit a numbered placeholder.
        def _repl(m: re.Match) -> str:
            entities[kind].append(m.group(0))
            return f"[{kind.upper()}_{len(entities[kind])}]"
        return _repl

    def card_repl(m: re.Match) -> str:
        # Credit cards: only digit runs that pass the Luhn checksum count.
        val = m.group(0)
        if not _luhn_ok(val):
            return val
        entities["card"].append(val)
        return f"[CARD_{len(entities['card'])}]"

    def phone_repl(m: re.Match) -> str:
        # Phone regex can overmatch; keep conservative by only replacing
        # matches with enough digits.
        val = m.group(0)
        digits = re.sub(r"\D", "", val)
        if len(digits) < 9:
            return val
        entities["phone"].append(val)
        return f"[PHONE_{len(entities['phone'])}]"

    try:
        text = _RE_EMAIL.sub(repl_factory("email"), text)
        text = _RE_URL.sub(repl_factory("url"), text)
        text = _RE_CREDIT_CARD.sub(card_repl, text)
        text = _RE_PHONE.sub(phone_repl, text)
    except Exception:
        # Best-effort: keep the result of the passes that did succeed. Note
        # that text and entities may then be only partially redacted/filled.
        pass

    # Do not return raw PII values to avoid retaining sensitive data in memory.
    # Instead, return only non-sensitive placeholders for detected entities.
    safe_entities = {k: ["[REDACTED]" for _ in v] for k, v in entities.items() if v}
    return text, safe_entities


def detect_language(text: str) -> str:
    """Return the language code reported by langdetect, or ``"unknown"``.

    ``"unknown"`` is returned when langdetect is not installed or when
    detection raises. Only the first 4000 characters are inspected.
    """
    if lang_detect is None:
        return "unknown"

    # Cap the amount of text handed to the detector.
    sample = text[:4000]
    try:
        detected = lang_detect(sample)
    except Exception:
        return "unknown"
    return detected


def _estimate_syllables(word: str) -> int:
    """
    Estimate the number of syllables in *word* (English-oriented heuristic).

    The word is lower-cased and reduced to the letters a-z; every maximal run
    of vowels (a, e, i, o, u, y) counts as one syllable, and a trailing "e"
    is treated as silent when at least one other syllable exists. Returns 0
    when no a-z letters remain, otherwise at least 1.

    WARNING: Good enough for gating, not research-grade, and unreliable for
    non-English text.
    """
    letters = re.sub(r"[^a-z]", "", word.lower())
    if not letters:
        return 0

    vowel_groups = len(re.findall(r"[aeiouy]+", letters))
    # silent e
    if vowel_groups > 1 and letters.endswith("e"):
        vowel_groups -= 1
    return max(1, vowel_groups)


@dataclass
class TextStats:
    """Summary statistics for one text, as filled in by ``compute_stats``."""

    # Language code from detect_language(); "unknown" when detection is unavailable or fails.
    language: str
    # len() of the analysed text.
    char_count: int
    # Number of ASCII-letter words found (optionally with one apostrophe part, e.g. "don't").
    word_count: int
    # Number of sentences returned by split_sentences(); 0 for empty text.
    sentence_count: int
    # word_count divided by the sentence count (divisor is at least 1).
    avg_sentence_len_words: float
    # Mean number of characters per word (0.0 when there are no words).
    avg_word_len_chars: float
    # Share of words with >= 3 estimated syllables or >= 10 characters.
    long_word_ratio: float
    # Approximate Flesch Reading Ease score; higher means easier to read.
    flesch_reading_ease: float


def compute_stats(text: str) -> TextStats:
    """
    Build a ``TextStats`` record for *text*, including Flesch Reading Ease.

    Words are runs of ASCII letters (optionally with one apostrophe part);
    sentences come from ``split_sentences``. All ratios guard against
    division by zero by using a divisor of at least 1.

    WARNING: Syllable estimation and sentence splitting are English-focused
    heuristics, so the results may be unreliable for non-English text.
    """
    language = detect_language(text)
    tokens = re.findall(r"[A-Za-z]+(?:'[A-Za-z]+)?", text)
    sents = split_sentences(text)

    n_words = len(tokens)
    n_sents = len(sents)
    word_divisor = max(1, n_words)

    words_per_sentence = n_words / max(1, n_sents)
    chars_per_word = sum(map(len, tokens)) / word_divisor

    # One syllable estimate per word, shared by the long-word ratio and FRE.
    syllable_counts = [_estimate_syllables(tok) for tok in tokens]
    n_long = sum(
        1
        for tok, syl in zip(tokens, syllable_counts)
        if syl >= 3 or len(tok) >= 10
    )
    long_ratio = n_long / word_divisor
    syllables_per_word = sum(syllable_counts) / word_divisor

    # Flesch Reading Ease (FRE), standard coefficients:
    #   FRE = 206.835 − 1.015*(words/sentences) − 84.6*(syllables/words)
    # See e.g. https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests
    # Sentence splitting and syllable counting are heuristic, so the score is
    # an approximation.
    fre_base = 206.835  # Base score of the FRE formula.
    fre_sentence_weight = 1.015  # Penalty per word of average sentence length.
    fre_syllable_weight = 84.6  # Penalty per average syllable per word.
    fre = fre_base - fre_sentence_weight * words_per_sentence - fre_syllable_weight * syllables_per_word

    return TextStats(
        language=language,
        char_count=len(text),
        word_count=n_words,
        sentence_count=n_sents,
        avg_sentence_len_words=float(words_per_sentence),
        avg_word_len_chars=float(chars_per_word),
        long_word_ratio=float(long_ratio),
        flesch_reading_ease=float(fre),
    )


def requires_llm_simplification(
    stats: TextStats,
    *,
    min_words: int = 30,
    max_words: int = 4000,
    fre_threshold: float = 60.0,
) -> bool:
    """
    Decide whether a text is worth sending to the LLM for simplification.

    Returns True only when the word count lies within
    ``[min_words, max_words]`` and the Flesch Reading Ease score is below
    ``fre_threshold`` (a lower score means harder text). Texts that are too
    short are skipped; texts above ``max_words`` are also rejected because
    they are expected to be chunked earlier in the pipeline.
    """
    size_ok = min_words <= stats.word_count <= max_words
    if not size_ok:
        return False
    # Already easy enough to read -> no LLM pass needed.
    return stats.flesch_reading_ease < fre_threshold


def preprocess_text(
    text: str,
    *,
    strip_html_input: bool = True,
    redact: bool = True,
    drop_noise: bool = True,
    dedupe_lines: bool = True,
    max_chunk_chars: int = 1800,
    chunk_overlap: int = 1,
) -> Dict[str, object]:
    """Run the full cleaning pipeline on *text* and return all artefacts.

    Pipeline: optional HTML stripping -> Unicode NFKC -> whitespace
    normalization -> optional consecutive-line dedupe -> optional noise-line
    removal -> optional PII redaction -> statistics, sentence split, chunking.

    Args:
        text: Raw input; a falsy value is treated as the empty string.
        strip_html_input: Remove HTML markup first.
        redact: Replace detected PII with placeholders.
        drop_noise: Remove junk-looking lines.
        dedupe_lines: Remove consecutively repeated lines.
        max_chunk_chars: Target maximum chunk length in characters.
        chunk_overlap: Number of sentences repeated between adjacent chunks.

    Returns:
        Dict with keys ``text_original``, ``text_clean``, ``pii``, ``stats``
        (a plain dict), ``sentences``, ``chunks`` and ``send_to_llm``.

    Raises:
        ValueError: If the chunking parameters are out of range.
    """
    # Validate chunking parameters
    if max_chunk_chars <= 0:
        raise ValueError("max_chunk_chars must be positive")
    if chunk_overlap < 0:
        raise ValueError("chunk_overlap cannot be negative")
    if chunk_overlap >= max_chunk_chars:
        raise ValueError("chunk_overlap must be less than max_chunk_chars")

    original = text or ""

    # --- cleaning -----------------------------------------------------------
    cleaned = strip_html(original) if strip_html_input else original
    cleaned = normalize_whitespace(normalize_unicode(cleaned))
    if dedupe_lines:
        cleaned = dedupe_consecutive_lines(cleaned)
    if drop_noise:
        cleaned = drop_noise_lines(cleaned)

    # --- PII ----------------------------------------------------------------
    if redact:
        cleaned, redacted_entities = redact_pii(cleaned)
    else:
        redacted_entities = {}

    # --- analysis -----------------------------------------------------------
    stats = compute_stats(cleaned)
    sentences = split_sentences(cleaned)
    chunks = chunk_sentences(sentences, max_chars=max_chunk_chars, overlap=chunk_overlap)
    # NOTE(review): the gate is evaluated on the statistics of the WHOLE text,
    # so its max_words limit also applies to documents that were chunked
    # above - confirm that this is the intended behaviour for long inputs.
    send_to_llm = requires_llm_simplification(stats)

    return {
        "text_original": original,
        "text_clean": cleaned,
        "pii": redacted_entities,
        "stats": asdict(stats),
        "sentences": sentences,
        "chunks": chunks,
        "send_to_llm": send_to_llm,
    }


print('✅ Preprocessing utilities defined.')

## 3) Demo on repo sample
This tries to load `data/samples/sample_en.txt` and runs the pipeline on it; if the file is missing, a short inline demo text is used instead.

In [None]:
# Demo: run the pipeline on the repo sample, or on a built-in text when the
# sample file is not present (path is relative to the working directory).
sample_path = Path('data/samples/sample_en.txt')
if sample_path.exists():
    raw_text = sample_path.read_text(encoding='utf-8')
    print(f'✅ Loaded sample: {sample_path} ({len(raw_text)} chars)')
else:
    # Inline fallback; it contains an e-mail address and a URL so that the
    # PII redaction step has something to act on.
    raw_text = (
        'Make sure the area is safe, especially if you plan on walking home at night. '
        'It’s a good idea to use the buddy system. '
        'Contact us at support@example.com or visit https://example.com for details.'
    )
    print('ℹ️ Using inline demo text (sample file not found).')

# Smaller chunks than the default (1800) so chunking is visible on short samples.
result = preprocess_text(raw_text, max_chunk_chars=900, chunk_overlap=1)

print('--- STATS ---')
print(json.dumps(result['stats'], indent=2))
print('--- PII ---')
print(json.dumps(result['pii'], indent=2))
print('--- SEND TO LLM? ---')
print(result['send_to_llm'])

# Only the first 600 characters of the cleaned text are shown.
print('--- CLEAN (preview) ---')
print(result['text_clean'][:600])

# Show at most five chunks, each truncated to 160 characters; the trailing
# "..." is printed unconditionally, even when the chunk was not truncated.
print('--- CHUNKS ---')
for i, c in enumerate(result['chunks'][:5], 1):
    print(f'[{i}] {len(c)} chars: {c[:160]}...')

## 4) How to use with your LLM notebook
In your model notebook, call `preprocess_text(text)` first. Then send `result['chunks']` to the LLM one chunk at a time (or merge a few) depending on your context window.

Recommended pattern:
- if `send_to_llm == False`: skip simplification or apply minimal rule-based changes
- else: send each chunk to the LLM and then join results