In [1]:
!pip -q install pymupdf

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m39.7 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
from collections import Counter

def detect_header_footer(doc, min_count=3):
    headers, footers = [], []

    for page in doc:
        text = page.get_text("text")
        lines = [l.strip() for l in text.split("\n") if l.strip()]
        if not lines:
            continue

        # ambil kandidat header/footer
        headers.extend(lines[:2])
        footers.extend(lines[-2:])

    # Hitung frekuensi
    header_counts = Counter(headers)
    footer_counts = Counter(footers)

    # Ambil yang paling sering, tapi hanya kalau frekuensi >= min_count
    header = None
    if header_counts:
        top_header, h_count = header_counts.most_common(1)[0]
        if h_count >= min_count:
            header = top_header

    footer = None
    if footer_counts:
        top_footer, f_count = footer_counts.most_common(1)[0]
        if f_count >= min_count:
            footer = top_footer

    return header, footer

## Load Data

In [34]:
import fitz  # PyMuPDF
import os
# Path PDF
pdf_path = "/content/drive/MyDrive/ML/RAG_COVID/06 REV-05_Pedoman_P2_COVID-19_13_Juli_2020_compressed.pdf"

# Buka PDF
doc_3 = fitz.open(pdf_path)

# Jumlah halaman
total_pages = len(doc_3)

print(f"📄 Total halaman: {total_pages}")
# --- hasil deteksi otomatis ---
DETECTED_HEADER, DETECTED_FOOTER = detect_header_footer(doc_3)
detect_header_footer(doc_3)

📄 Total halaman: 214


('PEDOMAN PENCEGAHAN DAN PENGENDALIAN',
 'CORONAVIRUS DISEASE (COVID-19) REVISI KE-5')

In [31]:
# Path PDF
pdf_path = "/content/drive/MyDrive/ML/RAG_COVID/04 Rencana Operasi Penanganan COVID-19 — Kementerian Kesehatan RI.pdf"

# Buka PDF
doc_1 = fitz.open(pdf_path)

# Jumlah halaman
total_pages = len(doc_1)

print(f"📄 Total halaman: {total_pages}")

# --- hasil deteksi otomatis ---
DETECTED_HEADER, DETECTED_FOOTER = detect_header_footer(doc_1)
detect_header_footer(doc_1)


📄 Total halaman: 324


('Rencana Operasi Penanggulangan Covid -19 Bidang Kesehatan di Indonesia–Revisi 1',
 'Pemkab')

In [37]:
# Path PDF
pdf_path = "/content/drive/MyDrive/ML/RAG_COVID/05 Surat Edaran Satgas COVID-19 No. 25 Tahun 2022 — Protokol Kesehatan Perjalanan Luar Negeri.pdf"

# Buka PDF
doc_2 = fitz.open(pdf_path)

# Jumlah halaman
total_pages = len(doc_2)

print(f"📄 Total halaman: {total_pages}")

📄 Total halaman: 10


## Process Document

In [9]:
import re

# ---- Helper: normalisasi spasi ----
SPACE_RUN_RE = re.compile(r'[ \t\u00A0\u2000-\u200B\u202F\u205F\u3000]+')

def collapse_spaces(s: str) -> str:
    if not s:
        return s
    # hilangkan zero-width, samakan NBSP -> spasi biasa, lalu collapse
    s = s.replace('\u200b', '').replace('\xa0', ' ')
    return SPACE_RUN_RE.sub(' ', s).strip()


### Ekstract Pdf

In [10]:
import re, fitz
from bs4 import BeautifulSoup
from bs4.element import Tag
from typing import List, Optional
from typing import Tuple

# --- regex pembantu (pastikan sudah didefinisikan di atas file) ---
# _LEADS_LIST_RE  = re.compile(r'^(?:\d+(?:\s*\.\s*\d+)*\.|\d+\)|\(\d+\)|[A-Za-z]\.|[A-Za-z]\)|\([A-Za-z]\))\s+')
# _SUBHEAD_NUM_RE = re.compile(r'^\d+(?:\s*\.\s*\d+)+\s+\S')   # 1.1 … / 2.3.4 …
# _SUBHEAD_LET_RE = re.compile(r'^[A-Z]\.\s+\S')               # A. … / B. …

_SUBHEAD_NUM_RE = re.compile(r'^\d+(?:\s*\.\s*\d+)+\s+\S')  # 1.1 … / 2. 3. …
_SUBHEAD_LET_RE = re.compile(r'^[A-Z]\.\s+\S')             # A. … / B. …
_LEADS_LIST_RE  = re.compile(
    r'^(?:\d+(?:\s*\.\s*\d+)*\.|\d+\)|\(\d+\)|[A-Za-z]\.|[A-Za-z]\)|\([A-Za-z]\))\s+'
)

def _is_titlecase_like(s: str) -> bool:
    w = [t for t in re.split(r'\s+', s.strip()) if t]
    if not (1 <= len(w) <= 10): return False
    good = sum(1 for t in w if (t.lower() not in {"dan","atau","yang","untuk","pada","dari","di","ke"} and t[:1].isupper()))
    return good >= max(1, len(w)-2)


# ==== Config default ====
TOP_PCT, BOT_PCT = 0.06, 0.06   # potong header/footer relatif tinggi halaman
LINE_TOL, PARA_GAP = 2.5, 18.0  # toleransi baris + jarak antar paragraf

def _attr(tag, key, default=""):
    return (tag.attrs or {}).get(key, default) if isinstance(tag, Tag) else default

def _css_num(style: str, key: str) -> Optional[float]:
    m = re.search(rf"{key}\s*:\s*(-?\d+(?:\.\d+)?)", style or "", re.I)
    return float(m.group(1)) if m else None

# --- helper untuk deteksi heading saat ekstraksi ---
def _css_has_bold(style: str) -> bool:
    s = style or ""
    return (("font-weight" in s and re.search(r"font-weight\s*:\s*(bold|[6-9]\d{2})", s, re.I)) or
            ("font:" in s and re.search(r"\b(bold)\b", s, re.I)))

def _uppercase_ratio(t: str) -> float:
    letters = [c for c in t if c.isalpha()]
    return (sum(1 for c in letters if c.isupper()) / len(letters)) if letters else 0.0

_PAT_BAB = re.compile(r'^(?:Bab|BAB|Chapter)\s+[A-Za-z0-9IVXLC.\- ]+$')

def extract_lines_with_meta(page: fitz.Page,
                            top_pct=TOP_PCT, bot_pct=BOT_PCT,
                            line_tol=LINE_TOL, para_gap=PARA_GAP) -> List[dict]:
    soup = BeautifulSoup(page.get_text("html"), "html.parser")
    for sup in soup.find_all("sup"): sup.insert_before("^"); sup.unwrap()
    for sub in soup.find_all("sub"): sub.insert_before("_"); sub.unwrap()

    H = next((_css_num(_attr(d,"style",""),"height")
              for d in soup.find_all("div") if "height:" in _attr(d,"style","")), None) \
        or float(page.rect.height)
    W = float(page.rect.width)
    top_cut, bot_cut = H*top_pct, H*(1-bot_pct)

    spans=[]
    for el in soup.find_all(True):
        s=_attr(el,"style","") or ""
        if ("top:" not in s) or ("left:" not in s): continue
        if el.find(True, attrs={"style": re.compile(r'\btop\s*:.*\bleft\s*:', re.I)}): continue
        t=el.get_text(" ", strip=True)
        if not t: continue
        top=_css_num(s,"top"); left=_css_num(s,"left") or 0.0
        if top is None or top<=top_cut or top>=bot_cut: continue
        fs=_css_num(s,"font-size") or 0.0; bold=_css_has_bold(s)
        spans.append((top,left,t,fs,bold))

    spans.sort(key=lambda x:(x[0],x[1]))

    lines, buf, cur_top=[],[],None
    def flush():
        if not buf: return
        buf.sort(key=lambda x:x[1])
        parts=[t for _,_,t,_,_ in buf]
        text=" ".join(parts)
        text=re.sub(r"\s+([,.;:)\]])", r"\1", text)
        text=re.sub(r"([\[(])\s+", r"\1", text)
        left=min(b[1] for b in buf)
        right=max(b[1]+max(1.0,len(b[2])) for b in buf)
        sizes=[b[3] for b in buf if b[3]>0]
        size_max=max(sizes) if sizes else 0.0
        size_avg=(sum(sizes)/len(sizes)) if sizes else 0.0
        bold_any=any(b[4] for b in buf)
        centered=abs(((left+right)/2.0)-(W/2.0))<=0.08*W
        lines.append({"text":text.strip(),"top":buf[0][0],
                      "left":left,"right":right,
                      "size_max":size_max,"size_avg":size_avg,
                      "bold_any":bold_any,"centered":centered,
                      "pretag":None})
        buf.clear()

    for top,left,t,fs,b in spans:
        if cur_top is None: cur_top=top
        if abs(top-cur_top)>line_tol:
            flush()
            if top-cur_top>para_gap:
                lines.append({"text":"", "top":cur_top, "left":0, "right":0,
                              "size_max":0, "size_avg":0, "bold_any":False,
                              "centered":False, "pretag":None})
            cur_top=top
        buf.append((top,left,t,fs,b))
    flush()

    body_sizes=sorted([ln["size_avg"] for ln in lines if len(ln["text"])>20 and ln["size_avg"]>0])
    body_size=body_sizes[len(body_sizes)//2] if body_sizes else 0.0

    # --- heuristik HEADING ---
    # === HEADING & SUBHEADING (WAJIB BOLD) ===
    for ln in lines:
        t = ln["text"]
        if not t:
            continue

        # heading/subheading wajib bold
        if not ln["bold_any"]:
            continue

        is_listish = bool(_LEADS_LIST_RE.match(t))
        no_trail   = not t.endswith((".", ",", ";", ":"))
        shortish   = len(t.split()) <= 10
        caps_ok    = _uppercase_ratio(t) >= 0.80

        # skala relatif thd body
        size_big = (body_size > 0 and (ln["size_max"] >= body_size*1.25 or ln["size_avg"] >= body_size*1.18))
        size_mid = (body_size > 0 and body_size*1.06 <= ln["size_avg"] < body_size*1.25)

        # 1) SUBHEADING: list-ish yang bold (angka/huruf di depan)
        if is_listish and no_trail:
            if _SUBHEAD_NUM_RE.match(t) or _SUBHEAD_LET_RE.match(t) or _is_titlecase_like(t):
                if size_mid or size_big or ln["centered"]:
                    ln["pretag"] = "SUBHEADING"
            continue  # list-ish tidak boleh jadi HEADING

        # 2) HEADING: non-list, bold, tampak seperti judul
        if _PAT_BAB.match(t) and no_trail:
            ln["pretag"] = "HEADING"; continue

        if no_trail and (size_big or ln["centered"] or caps_ok) and (shortish or caps_ok or ln["centered"]):
            ln["pretag"] = "HEADING"



    return [ln for ln in lines if ln["text"]]

def _merge_bullets_meta(lines: List[dict]) -> List[dict]:
    out=[]; i=0
    while i<len(lines):
        if lines[i]["text"].strip() in {"•","-","*"} and i+1<len(lines):
            merged=dict(lines[i+1]); merged["text"]=(lines[i]["text"]+" "+lines[i+1]["text"]).strip()
            merged["pretag"]=None
            out.append(merged); i+=2
        else:
            out.append(lines[i]); i+=1
    return out

def _merge_consecutive_headings_meta(lines: List[dict]) -> List[dict]:
    out=[]; i=0
    while i<len(lines):
        cur=lines[i]
        if cur.get("pretag")=="HEADING" and i+1<len(lines):
            nxt=lines[i+1]
            if nxt.get("pretag")=="HEADING" and abs(nxt["top"]-cur["top"])<=PARA_GAP and \
               (nxt["size_avg"]==0 or cur["size_avg"]==0 or 0.75<= (nxt["size_avg"]/max(1e-6,cur["size_avg"])) <=1.33):
                cur2=dict(cur); cur2["text"]=(cur["text"].rstrip()+" "+nxt["text"].lstrip()).strip()
                out.append(cur2); i+=2; continue
        out.append(cur); i+=1
    return out

def _merge_consecutive_subheadings_meta(lines_meta: List[dict]) -> List[dict]:
    out=[]; i=0
    while i < len(lines_meta):
        cur = lines_meta[i]
        if cur.get("pretag") == "SUBHEADING" and i+1 < len(lines_meta):
            nxt = lines_meta[i+1]
            if nxt.get("pretag") == "SUBHEADING" and abs(nxt["top"]-cur["top"]) <= PARA_GAP and \
               (nxt["size_avg"]==0 or cur["size_avg"]==0 or 0.75 <= (nxt["size_avg"]/max(1e-6,cur["size_avg"])) <= 1.33):
                cur2 = dict(cur); cur2["text"] = (cur["text"].rstrip()+" "+nxt["text"].lstrip()).strip()
                out.append(cur2); i += 2; continue
        out.append(cur); i += 1
    return out

def process_page(page: fitz.Page) -> List[Tuple[str, str]]:
    rows = extract_lines_with_meta(page)            # <-- versi with_meta
    rows = _merge_bullets_meta(rows)
    rows = _merge_consecutive_headings_meta(rows)
    rows = _merge_consecutive_subheadings_meta(rows)  # <-- tambahkan

    tagged = []
    for ln in rows:
        if ln.get("pretag") == "HEADING":
            tagged.append(("HEADING", ln["text"]))
        elif ln.get("pretag") == "SUBHEADING":
            tagged.append(("SUBHEADING", ln["text"]))   # <-- tag baru
        else:
            tagged.append(parse_classified( classify_line(collapse_spaces(ln["text"])) ))
    # optional: tagged = merge_list_continuations(tagged)
    return tagged



In [11]:

def parse_classified(s: str) -> Tuple[str, str]:
    """
    Parse hasil string yang sudah diklasifikasikan oleh classify_line.

    Format input yang diharapkan: "[TAG] isi_teks"
    Contoh:
        "[LETTERED 1] A. Kasus konfirmasi"
        "[LETTERED 2] a. Kasus konfirmasi"
        "[NUMBERED] 3.2.2. Kasus probable"
        "[HEADING] BAB I PENDAHULUAN"
        "[PARAGRAPH] Ini isi paragraf"

    Output:
        Tuple (tag, text)
        - tag  : kategori hasil klasifikasi (misal "LETTERED 1", "PARAGRAPH")
        - text : konten teks asli tanpa label

    Catatan:
        Regex disini sudah diupgrade agar mendukung tag dengan spasi dan angka
        (contoh: "LETTERED 1", "LETTERED 2") selain tag standar huruf besar
        (contoh: "PARAGRAPH", "HEADING").
    """
    # m = re.match(r"^\[([A-Z0-9_ ]+)\]\s*(.*)$", s)
    m = re.match(r"^\[([A-Z0-9_ .()\-/]+)\]\s*(.*)$", s)
    return (m.group(1).strip(), m.group(2).strip()) if m else ("PARAGRAPH", s.strip())

## Tagging

In [12]:

def classify_line(line:str)->str:
    original=line.strip()
    # plain=_strip_emphasis(original)
    plain=re.sub(r"\s*\.\s*(?=\d)", ".", original)
    if DETECTED_HEADER and DETECTED_HEADER in plain: return "[HEADER]"
    if DETECTED_FOOTER and DETECTED_FOOTER in plain: return "[FOOTER]"
    if re.match(r"^\d{1,3}$", plain): return "[PAGE_NUMBER] "+original
    # if re.match(r"^(Bab|BAB|Chapter)\s+[A-Za-z0-9IVXLC\s\.\-]+$", plain): return "[HEADING] "+original
    # if re.match(r"^[A-Z0-9][A-Z0-9\s\-\.,:/()]{3,}$", plain) and 5<len(plain)<120: return "[HEADING] "+original

    # if re.match(r"^\d+\.$", plain) or re.match(r"^\d+(?:\.\d+)*\.?\s+[A-Z].*", plain): return "[NUMBERED] "+original
    # if (
    #     re.match(r"^\d+\.$", plain)                              # "1."
    #     or re.match(r"^\d+\.\s+[A-Z].*", plain)                  # "1. Judul"
    #     or re.match(r"^\d+(?:\.\d+)+\.?\s+[A-Z].*", plain)       # "1.2 Judul" / "1.2.3. Judul"
    # ):
    #     return "[NUMBERED] " + original
    # ===== NUMBERED =====
    tag_num = _classify_numbered(plain, original)
    if tag_num:
        return tag_num

    # LETTERED (baru)
    tag_let = _classify_lettered(plain, original)
    if tag_let:
        return tag_let

    STOPWORDS_BODY = {"DAN", "ATAU", "YANG", "DENGAN", "BELUM", "UNTUK"}

    # if re.match(r"^(Bab|BAB|Chapter)\s+[A-Za-z0-9IVXLC\s\.\-]+$", plain):
    #     return "[HEADING] " + original

    # ===== HEADING =====
    # Bab/Chapter ...
    if re.match(r"^(Bab|BAB|Chapter)\s+[A-Za-z0-9IVXLC\s.\-]+$", plain):
        return "[HEADING] " + original

    # Title Case heading (contoh kasusmu)
    # if _is_titlecase_heading(plain):
    #     return "[HEADING] " + original

    # ALL-CAPS heading (asal tidak diakhiri tanda kalimat)
    if re.match(r"^[A-Z0-9][A-Z0-9\s\-\.,:/()]{3,}$", plain) and 5 < len(plain) < 120 \
       and not plain.endswith((".", ",", ";", ":", "?", "!")):
        return "[HEADING] " + original


    # ALL CAPS candidate
    if re.match(r"^[A-Z0-9][A-Z0-9\s\-\.,:/()]{3,}$", plain) and 5 < len(plain) < 120:
        # 1) Jangan kalau ada stopword khas body
        tokens = set(plain.split())
        if tokens & STOPWORDS_BODY:
            return "[PARAGRAPH] " + original

        # 2) Jangan kalau diakhiri tanda baca (.,;:)
        if plain.endswith((".", ",", ";", ":")):
            return "[PARAGRAPH] " + original

        return "[HEADING] " + original

    return "[PARAGRAPH] "+ original


In [13]:
import re

_DOT_SPACES = re.compile(r'\s*\.\s*')  # normalisasi "1 . 2" -> "1.2"

# negative guards
_THOUSANDS_RE = re.compile(r'^\d{1,3}(?:\.\d{3})+(?=\s|[,;.)]|$)')     # 271.000.000 ...
_DECIMAL_RE   = re.compile(r'^\d+\.\d{1,2}(?=\s|[,;.)]|$)')            # 3.14 ...
_DOTDATE_RE   = re.compile(r'^\d{1,2}\.\d{1,2}\.\d{2,4}(?=\s|[,;.)]|$)')  # 31.12.2020

def _classify_numbered(plain: str, original: str, max_first_seg_len: int = 3):
    """
    ANGKA saja:
      - L7: (1) Judul
      - L5: 1) Judul
      - L3.depth: 1 / 1. / 1.2 / 1.2.3. [judul opsional]
    """
    s = plain.strip()
    if not s:
        return None

    s = _DOT_SPACES.sub('.', s)

    # --- yang paling spesifik dulu ---
    if re.match(r'^\(\d+\)\s*', s):
        return f"[NUMBERED 7] " + original
    if re.match(r'^\d+\)\s*', s):
        return f"[NUMBERED 5] " + original

    # --- NEGATIVE GUARDS (hindari false positive) ---
    if _THOUSANDS_RE.match(s):          # 271.000.000 ...
        return None
    if _DOTDATE_RE.match(s):            # 31.12.2020 / 01.01.20
        return None
    # desimal (3.14) — izinkan jika memang diakhiri titik "3.14."
    # if _DECIMAL_RE.match(s) and not re.match(r'^\d+\.\d{1,2}\.', s):
    #     return None
    # ✅ versi yang diubah (lebih cerdas mengenali heading vs desimal biasa)
    if _DECIMAL_RE.match(s):
        after = s.split(' ', 1)[1] if ' ' in s else ''
        if not after[:1].isupper():  # hanya tolak jika setelahnya bukan huruf kapital
            return None

    # --- Aturan utama: 1 / 1. / 1.2 / 1.2.3. (+ judul opsional) ---
    m = re.match(
        r"""^
            (?P<num>\d+(?:\.\d+)*)
            (?:
                \.\s*      |   # trailing titik
                \s+        |   # atau ada spasi -> ada judul
                $              # atau EOL (hanya "1"/"1.2")
            )
            (?P<title>.*)?
        $""",
        s, re.VERBOSE
    )
    if m:
        num = m.group('num')
        first = num.split('.', 1)[0]
        if len(first) > max_first_seg_len:   # 2020.01.02 (tahun) ditolak
            return None
        depth = num.count('.') + 1
        return f"[NUMBERED 3.{depth}] " + original

    return None


In [14]:
import re

def _classify_lettered(plain: str, original: str):
    """
    Deteksi penomoran huruf:
      L8: (a) Judul    (juga (A))
      L6: a) Judul
      L4: a. Judul
      L2: A. Judul   (juga A) Judul)
    Spasi setelah penanda opsional untuk varian 'a)' / '(a)'.
    """
    s = plain.strip()
    if not s:
        return None

    # --- paling spesifik dulu ---
    # (a) atau (A)
    if re.match(r'^\([a-zA-Z]\)\s*', s):
        return "[LETTERED 8] " + original

    # a)  (huruf kecil + ')')
    if re.match(r'^[a-z]\)\s*', s):
        return "[LETTERED 6] " + original

    # a.  (huruf kecil + '.')
    if re.match(r'^[a-z]\.\s*', s):
        return "[LETTERED 4] " + original

    # A)  (huruf besar + ')') → kita samakan ke level 2 (subbab)
    if re.match(r'^[A-Z]\)\s*', s):
        return "[LETTERED 2] " + original

    # A.  (huruf besar + '.')
    if re.match(r'^[A-Z]\.\s*', s):
        return "[LETTERED 2] " + original

    # (opsional) terima "a Judul" / "A Judul" tanpa '.' / ')'
    # uncomment kalau di dokumenmu ada:
    # if re.match(r'^[a-z]\s+\S+', s): return "[LETTERED 4] " + original
    # if re.match(r'^[A-Z]\s+\S+', s): return "[LETTERED 2] " + original

    return None


In [18]:
start_page = 11
end_page = 22
doc = doc_1

# NEW: bangun kamus TOC (opsional)
# Misal daftar isi ada di halaman 2–3. Kalau tidak ada, set None saja.

for p in range(start_page, end_page+1):
    tagged = process_page(doc[p-1])
    print(f"\n=== Halaman {p} ===")
    for tag, txt in tagged:
        print(f"{tag:10s} | {txt}")



=== Halaman 11 ===
HEADING    | Bab 1 Pendahuluan
NUMBERED 3.2 | 1.1 Latar belakang
PARAGRAPH  | Rencana Operasi adalah rencana yang dibuat/disusun dalam rangka pelaksanaan operasi
PARAGRAPH  | Tanggap Darurat Bencana. Rencana operasi ini disusun oleh Komando Tanggap Darurat dengan
PARAGRAPH  | mempertimbangkan rencana kontingensi dan hasil kaji cepat. Rencana operasi merupakan dokumen
PARAGRAPH  | perencanaan dalam penanggulangan bencana yang dibuat pada masa tanggap darurat untuk
PARAGRAPH  | menangani dampak yang ditimbulkan oleh suatu bencana. Dokumen ini bersifat dinamis sehingga
PARAGRAPH  | dapat diubah untuk menyesuaikan perkembangan situasi tanggap darurat bencana.
PARAGRAPH  | Dengan adanya infeksi Corona Virus sub tipe baru (COVID-2) yang kemudian ditetapkan sebagai
PARAGRAPH  | pandemi COVID-19 oleh Dirjen WHO dikarenakan penyebaran COVID-19 yang meluas di dunia
PARAGRAPH  | sejak awal tahun 2020, Indonesia telah menyusun Rencana Operasi Penanganan COVID-19 Bidang
PARAGRAP

## Cross document

In [19]:
import re
from typing import List, Tuple
from collections import Counter


import re
from typing import List, Tuple, Optional

def _num3_depth(tag: str) -> Optional[int]:
    m = re.match(r"^NUMBERED\s+3\.(\d+)$", tag)
    return int(m.group(1)) if m else None



# ==== join helper (pakai punyamu bila sudah ada) ====
def _default_join_wrap(a: str, b: str) -> str:
    if a and a[-1] in "-\u2010\u2011\u2012\u2013\u2014":
        return a.rstrip("-\u2010\u2011\u2012\u2013\u2014") + b.lstrip()
    if a and not a.endswith((" ", "\n")) and b and not b.startswith((" ", "\n", ".", ",", ";", ":", ")", "}", "]")):
        return a + " " + b
    return a + b

try:
    _join_wrap
except NameError:
    _join_wrap = _default_join_wrap

# ==== merge HEADING (dalam halaman) ====
def _is_upperish(s: str, min_ratio: float = 0.8) -> bool:
    s = s.strip()
    if not s:
        return False
    letters = [c for c in s if c.isalpha()]
    if not letters:
        return False
    ratio = sum(1 for c in letters if c.isupper()) / len(letters)
    return ratio >= min_ratio

_pat_bab = re.compile(r'^(?:Bab|BAB|Chapter)\s+[A-Za-z0-9IVXLC.\- ]+$')

def merge_heading_items(items: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """Gabungkan run of HEADING (all-caps) & 'BAB/Chapter ...' + judul berikutnya, dalam 1 halaman."""
    out, i = [], 0
    while i < len(items):
        tag, txt = items[i]
        if tag == "HEADING":
            acc = txt.strip()
            j = i + 1

            # Case A: ‘BAB ...’ + judul di baris berikutnya
            if _pat_bab.match(acc) and j < len(items) and items[j][0] == "HEADING":
                nxt = items[j][1].strip()
                if 2 <= len(nxt) <= 120 and nxt[0].isupper():
                    acc = acc + " " + nxt
                    j += 1

            # Case B: deretan HEADING all-caps
            while j < len(items) and items[j][0] == "HEADING" and _is_upperish(items[j][1]) and len(items[j][1].strip()) <= 120:
                acc = acc + " " + items[j][1].strip()
                j += 1

            out.append(("HEADING", acc))
            i = j
        else:
            out.append(items[i])
            i += 1
    return out

# ==== serap semua PARAGRAPH setelah list item ====
def _is_prev_anchor(tag: str) -> bool:
    return  tag == "PARAGRAPH" or tag.startswith("NUMBERED") or tag.startswith("LETTERED")

def merge_trailing_paragraphs_absorb_all(
    items: List[Tuple[str, str]],
    join_fn=_join_wrap,   # pakai helper join kamu
) -> List[Tuple[str, str]]:
    def is_anchor(tag: str) -> bool:
        # HEADING ikut dianggap anchor; LETTERED level apa pun
        return tag.startswith("NUMBERED") or tag == "BULLET" or tag == "HEADING" or tag.startswith("LETTERED")

    out: List[Tuple[str, str]] = []
    i, n = 0, len(items)

    while i < n:
        tag, txt = items[i]

        # Case 1: setelah anchor → serap semua PARAGRAPH berikutnya
        if is_anchor(tag):
            acc = txt
            j = i + 1
            while j < n and items[j][0] == "PARAGRAPH":
                acc = join_fn(acc, items[j][1])
                j += 1
            out.append((tag, acc))
            i = j
            continue

        # Case 2: tidak ada anchor di atasnya dan ini PARAGRAPH →
        # gabung semua PARAGRAPH berurutan jadi satu PARAGRAPH
        if tag == "PARAGRAPH":
            acc = txt
            j = i + 1
            while j < n and items[j][0] == "PARAGRAPH":
                acc = join_fn(acc, items[j][1])
                j += 1
            out.append(("PARAGRAPH", acc))
            i = j
            continue

        # Case 3: tag lain → pass-through
        out.append((tag, txt))
        i += 1

    return out


# ==== MAIN ====
def process_document(doc, start_page: int, end_page: int,
                     drop_repeating_headings: bool = True) -> List[Tuple[str, str]]:
    """
    - process_page per halaman
    - merge list-continuations (intra-page)
    - merge heading lines (intra-page)  ← NEW
    - filter HEADER/FOOTER/PAGE_NUMBER & drop heading berulang
    - merge lintas halaman untuk paragraf pembuka & (opsional) HEADING→HEADING
    - final pass: serap semua PARAGRAPH setelah NUMBERED/LETTERED*/BULLET
    """
    pages_tagged: List[List[Tuple[str, str]]] = []
    hcounter = Counter()

    for p in range(start_page, end_page + 1):
        tagged = process_page(doc[p - 1])                 # -> List[(TAG, text)]
        tagged = merge_list_continuations(tagged)         # lanjutan di dalam halaman
        tagged = merge_heading_items(tagged)              # ← NEW: gabung deretan HEADING di halaman
        pages_tagged.append(tagged)
        hcounter.update([txt for tag, txt in tagged if tag == "HEADING"])

    repeated_headers = set()
    if drop_repeating_headings:
        repeated_headers = {h for h, c in hcounter.items() if c >= 2 and len(h) <= 120}

    def _filter(items: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
        return [
            (t, x) for (t, x) in items
            if t not in {"HEADER", "FOOTER", "PAGE_NUMBER"}
            and not (t == "HEADING" and x in repeated_headers)
        ]

    all_tagged: List[Tuple[str, str]] = []

    for cur in map(_filter, pages_tagged):
        if not cur:
            continue
        if not all_tagged:
            all_tagged.extend(cur)
            continue

        # ===== (opsional) gabung HEADING→HEADING di batas halaman =====
        last_tag, last_txt = all_tagged[-1]
        if last_tag == "HEADING" and cur and cur[0][0] == "HEADING":
            all_tagged[-1] = ("HEADING", _join_wrap(all_tagged[-1][1], cur[0][1].strip()))
            cur = cur[1:]
            if not cur:
                continue

        # ===== gabung lintas halaman: serap PARAGRAPH pembuka =====
        j = 0
        while j < len(cur) and cur[j][0] == "PARAGRAPH" and _is_prev_anchor(last_tag):
            nxt = cur[j][1].strip()
            all_tagged[-1] = (last_tag, _join_wrap(all_tagged[-1][1], nxt))
            j += 1
        all_tagged.extend(cur[j:])

    # ===== FINAL: serap semua PARAGRAPH setelah item list =====

    # all_tagged = demote_heading_continuations(all_tagged)          # turunkan "HEADING" yang lanjut kalimat
    all_tagged = demote_paren_number_continuations(all_tagged)     # tangani "(Covid-" + "19)"
    all_tagged = retag_numeric_from_text(all_tagged)
    # all_tagged = normalize_single_level_numbered(all_tagged)       # <-- penting: 2. 3. 4. jadi 3.1
    # all_tagged = relevel_numeric_dot_strict_stack(all_tagged)
    # 3) relevel anak '1.' di dalam parent numerik (tanpa mengubah parent ke anak)
    all_tagged = relevel_numeric_dot_parent(all_tagged)

    all_tagged = merge_trailing_paragraphs_absorb_all(all_tagged, join_fn=_join_wrap)
    return all_tagged


In [20]:
import re
from typing import List, Tuple

# jika belum ada
try:
    _TERMINATOR_RE
except NameError:
    _TERMINATOR_RE = re.compile(r"[.!?]\s*$")

try:
    _pat_bab
except NameError:
    _pat_bab = re.compile(r'^(?:Bab|BAB|Chapter)\s+[A-Za-z0-9IVXLC.\- ]+$')

def _is_upperish(s: str, min_ratio: float = 0.85) -> bool:
    t = s.strip()
    letters = [c for c in t if c.isalpha()]
    if not letters: return False
    return sum(1 for c in letters if c.isupper()) / len(letters) >= min_ratio

def demote_heading_continuations(items: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """
    Jika sebuah HEADING muncul setelah NUMBERED/LETTERED/PARAGRAPH
    yg *tidak* berakhir titik/tanda akhir kalimat, dan HEADING tsb
    bukan ALL-CAPS atau 'BAB/Chapter ...', turunkan ke PARAGRAPH.
    Tujuannya agar bisa diserap sebagai lanjutan item sebelumnya.
    """
    out: List[Tuple[str, str]] = []
    for i, (tag, txt) in enumerate(items):
        if tag == "HEADING" and out:
            prev_tag, prev_txt = out[-1]
            prev_is_anchor = (
                prev_tag.startswith("NUMBERED")
                or prev_tag.startswith("LETTERED")
                or prev_tag == "PARAGRAPH"
            )
            if prev_is_anchor and not _TERMINATOR_RE.search(prev_txt) and not prev_txt.rstrip().endswith(":"):
                t = txt.strip()
                # jangan demote judul jelas (ALL-CAPS/BAB/Chapter)
                if not _is_upperish(t, 0.9) and not _pat_bab.match(t):
                    out.append(("PARAGRAPH", txt))
                    continue
        out.append((tag, txt))
    return out

import re
from typing import List, Tuple

_HYPH_RE = re.compile(r"[-\u2010-\u2014]\s*$")  # -, ‐, -, ‒, –
_NUMPAREN_HEAD = re.compile(r"^\d+\)\b")        # "19)" di awal baris

def demote_paren_number_continuations(items: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """
    Jika sebuah baris bertag NUMBERED 5 (format '1)') datang setelah baris sebelumnya
    berakhir tanda hubung dan masih ada kurung buka yang belum tertutup, anggap itu
    lanjutan kalimat (mis. '(Covid-19) ...') → turunkan ke PARAGRAPH.
    """
    out: List[Tuple[str, str]] = []
    for i, (tag, txt) in enumerate(items):
        if i > 0 and tag == "NUMBERED 5" and _NUMPAREN_HEAD.match(txt):
            prev_tag, prev_txt = out[-1]
            prev = prev_txt.rstrip()
            # konteks: sebelumnya anchor/teks biasa, berakhir hyphen, dan ada '(' yang belum tertutup
            if (prev_tag.startswith("NUMBERED") or prev_tag.startswith("LETTERED")
                or prev_tag in {"PARAGRAPH", "HEADING"}):
                if _HYPH_RE.search(prev) and prev.rfind("(") > prev.rfind(")"):
                    out.append(("PARAGRAPH", txt))
                    continue
        out.append((tag, txt))
    return out


import re
from typing import List, Tuple

# 1) Normalisasi: "N. " (ada SPASI setelah titik) => selalu NUMBERED 3.1
#    (mis. "2.\tUU ..." juga kena karena \s+)
_SINGLE_NUM_RE = re.compile(r'^\d+\.\s+')

def normalize_single_level_numbered(items: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    out = []
    for tag, txt in items:
        if tag.startswith("NUMBERED 3.") and _SINGLE_NUM_RE.match(txt):
            out.append(("NUMBERED 3.1", txt))
        else:
            out.append((tag, txt))
    return out

# util level absolut (sesuai hirarki yang kamu pakai)
import re
from typing import List, Tuple, Optional

# --- Normalisasi & retag numeric dari teks ---
_DOT_SPACES    = re.compile(r'\s*\.\s*')                       # "3 . 1 . 2" -> "3.1.2"
_MULTI_NUM_RE  = re.compile(r'^(?P<num>\d+(?:\.\d+)+)\.?\s')   # "3.1. ..." / "3.1.2 ..."
_SINGLE_NUM_RE = re.compile(r'^\d+\.\s+')                      # "1. ..." (segmen tunggal)

# optional guards
_THOUSANDS_RE = re.compile(r'^\d{1,3}(?:\.\d{3})+(?=\s|[,;.)]|$)')
_DOTDATE_RE   = re.compile(r'^\d{1,2}\.\d{1,2}\.\d{2,4}(?=\s|[,;.)]|$)')
_DECIMAL_RE   = re.compile(r'^\d+\.\d+(?=\s|[,;.)]|$)')

def retag_numeric_from_text(items: List[Tuple[str,str]]) -> List[Tuple[str,str]]:
    """Pastikan depth NUMBERED 3.x sesuai jumlah segmen angka pada teks depan."""
    out = []
    for tag, txt in items:
        if not tag.startswith("NUMBERED 3."):
            out.append((tag, txt)); continue

        s = _DOT_SPACES.sub('.', txt.strip())

        if _THOUSANDS_RE.match(s) or _DOTDATE_RE.match(s) or (_DECIMAL_RE.match(s) and not re.match(r'^\d+\.\d+\.', s)):
            out.append((tag, txt)); continue  # bukan outline

        m = _MULTI_NUM_RE.match(s)
        if m:
            depth = m.group('num').count('.') + 1   # "3.1" -> 2, "3.1.2" -> 3
            out.append((f"NUMBERED 3.{depth}", txt))
            continue

        if _SINGLE_NUM_RE.match(s):
            out.append(("NUMBERED 3.1", txt))
            continue

        out.append((tag, txt))
    return out

# --- Relevel ketat: pakai parent_depth saja (bukan stack top anak) ---
def _abs_level(tag: str) -> Optional[int]:
    if tag.startswith("HEADING"): return 1
    if tag.startswith("LETTERED "):
        try: return int(tag.split()[1])         # 2/4/6/8
        except: return None
    if tag.startswith("NUMBERED 3."):
        try:
            d = int(tag.split()[1].split(".")[1])
            return 3 + (d - 1)                  # 3.1→3, 3.2→4, 3.3→5 ...
        except: return None
    if tag == "NUMBERED 5": return 5
    if tag == "NUMBERED 7": return 7
    return None

def relevel_numeric_dot_parent(items: List[Tuple[str,str]]) -> List[Tuple[str,str]]:
    """
    Naikkan 'NUMBERED 3.1' jadi '3.(p+1)' hanya bila ada parent numerik aktif 'NUMBERED 3.p' (p>=2).
    Parent TIDAK berubah ketika mempromosikan anak.
    Reset parent saat ketemu anchor dengan level <= parent_level atau jenis lain (HEADING/LETTERED/5/7).
    """
    out: List[Tuple[str,str]] = []
    parent_depth: Optional[int] = None      # p dari 3.p (>=2)
    parent_level: Optional[int] = None      # level absolut parent

    def abs_of(p: int) -> int: return 3 + (p - 1)

    for tag, txt in items:
        cur_level = _abs_level(tag)

        # reset parent jika melintasi section lain
        if parent_level is not None and cur_level is not None and cur_level <= parent_level:
            parent_depth = None; parent_level = None
        if tag.startswith(("HEADING", "LETTERED")) or tag in {"NUMBERED 5", "NUMBERED 7"}:
            parent_depth = None; parent_level = None

        if tag.startswith("NUMBERED 3."):
            d = int(tag.split()[1].split(".")[1])
            if d >= 2:
                # ini parent numerik asli (mis. 3.1, 2.3, 1.2.1 → 3.3 di taggingmu)
                parent_depth = d
                parent_level = abs_of(d)
                out.append((tag, txt))
            else:
                # d == 1 → "1."
                if parent_depth is not None:
                    newd = parent_depth + 1
                    out.append((f"NUMBERED 3.{newd}", txt))
                    # PENTING: JANGAN ubah parent_depth di sini
                else:
                    out.append((tag, txt))
        else:
            out.append((tag, txt))

    return out

# --- tambahkan di bawah parse_classified() ---
_TERMINATOR_RE = re.compile(r"[.!?]\s*$")
_NEW_ITEM_RE   = re.compile(r"^(?:[-*•]\s+|\d+(?:\.\d+)*\.?\s+|[A-Za-z]\.\s+)")

def _looks_like_new_item(s: str) -> bool:
    return bool(_NEW_ITEM_RE.match(s.strip()))

def merge_list_continuations(tagged: List[Tuple[str, str]]) -> List[Tuple[str, str]]:
    """
    Gabungkan kelanjutan baris untuk NUMBERED/LETTERED/BULLET
    yang salah terklasifikasi menjadi PARAGRAPH di baris berikutnya.
    """
    out, i = [], 0
    while i < len(tagged):
        tag, txt = tagged[i]
        if tag == "BULLET" or tag.startswith("LETTERED") or tag.startswith("NUMBERED"):
            j, merged = i + 1, txt
            while j < len(tagged):
                n_tag, n_txt = tagged[j]
                if n_tag != "PARAGRAPH": break
                if _looks_like_new_item(n_txt) or n_txt.startswith(("BAB ","Bab ","CHAPTER","Chapter")):
                    break
                # jika kalimat sudah "tutup" dan baris lanjutan diawali huruf besar → hentikan
                if _TERMINATOR_RE.search(merged) and n_txt[:1].isupper():
                    break
                merged = _join_wrap(merged, n_txt)
                j += 1
            out.append((tag, merged))
            i = j
        else:
            out.append((tag, txt))
            i += 1
    return out

In [26]:
start_page = 19
end_page = 19
doc = doc_3

merged = process_document(doc, start_page, end_page)
for tag, txt in merged:
    print(f"{tag:10s} | {txt}")

PARAGRAPH  | Penanganan Corona Virus Disease 2019 (COVID-19). Sampai saat ini, situasi COVID-19 di tingkat global maupun nasional masih dalam risiko sangat tinggi. Selama pengembangan vaksin masih dalam proses, dunia dihadapkan pada kenyataan untuk mempersiapkan diri hidup berdampingan dengan COVID-19. Oleh karenanya diperlukan pedoman dalam upaya pencegahan dan pengendalian COVID-19 untuk memberikan panduan bagi petugas kesehatan agar tetap sehat, aman, dan produktif, dan seluruh penduduk Indonesia mendapatkan pelayanan yang sesuai standar. Pedoman pencegahan dan pengendalian COVID-19 disusun berdasarkan rekomendasi WHO yang disesuaikan dengan perkembangan pandemi COVID-19, dan ketentuan peraturan perundang-undangan yang berlaku.
NUMBERED 3.2 | 1. 2. Tujuan Pedoman
NUMBERED 3.3 | 1. 2. 1. Tujuan Umum Melaksanakan pencegahan dan pengendalian COVID-19 di Indonesia.
NUMBERED 3.3 | 1. 2. 2. Tujuan Khusus
LETTERED 4 | a. Memahami strategi dan indikator penanggulangan
LETTERED 4 | b. Melaks

## JSON

In [29]:
import re

# ---------- util ----------
_SPACE = re.compile(r"[ \t\u00A0\u2000-\u200B\u202F\u205F\u3000]+")
def norm(s: str) -> str:
    if not s: return ""
    s = s.replace("\u200b", "").replace("\xa0", " ")
    return _SPACE.sub(" ", s).strip()

# ambil marker + isi (jangan hapus di 'text', tapi sediakan 'body' tanpa marker)
# urutan cek penting: desimal dulu
_MARK_DEC   = re.compile(r"^\s*(\d+(?:\s*\.\s*\d+){1,}\.?)\s+")
_MARK_CAP   = re.compile(r"^\s*([A-Z]\.)\s+")
_MARK_NUM   = re.compile(r"^\s*(\d+\.)\s+")
_MARK_LOW   = re.compile(r"^\s*([a-z]\.)\s+")
_MARK_NUMC  = re.compile(r"^\s*(\d+\))\s+")
_MARK_LOWC  = re.compile(r"^\s*([a-z]\))\s+")
_MARK_NUMP  = re.compile(r"^\s*(\(\s*\d+\s*\))\s+")
_MARK_LOWP  = re.compile(r"^\s*(\(\s*[a-z]\s*\))\s+")

def split_marker(s: str):
    t = s or ""
    for rx in (_MARK_DEC, _MARK_CAP, _MARK_NUM, _MARK_LOW, _MARK_NUMC, _MARK_LOWC, _MARK_NUMP, _MARK_LOWP):
        m = rx.match(t)
        if m:
            marker = norm(m.group(1))
            body   = norm(t[m.end():])
            return marker, body
    return "", norm(t)

# ---------- level dari TAG saja ----------
_num3_re     = re.compile(r"^NUMBERED\s+3\.(\d+)$")
_lettered_re = re.compile(r"^LETTERED\s+(\d+)$")

def _level_from_tag_only(tag: str, stack_levels: list[int]) -> int | None:
    """
    Hirarki dari TAG:
      1: HEADING / LETTERED 2 (A.)
      2: NUMBERED 3.1  -> '1.' (heading)
      3: NUMBERED 3.p (p>=2) -> '1.1', '1.1.1' ... (heading desimal)
      4: NUMBERED 3.1  -> '1.' (LIST) jika ADA level-3 aktif di stack
      5: LETTERED 4     -> 'a.'
      6: NUMBERED 5     -> '1)'
      7: LETTERED 6     -> 'a)'
      8: NUMBERED 7     -> '(1)'
      9: LETTERED 8     -> '(a)'
    """
    tag = (tag or "").strip()

    if tag.startswith("HEADING"):
        return 1

    m = _num3_re.match(tag)
    if m:
        p = int(m.group(1))
        if p == 1:
            return 4 if any(lv == 3 for lv in stack_levels) else 2
        else:
            # depth-aware: 1.2 -> lvl 3, 1.2.1 -> lvl 4, 1.2.1.1 -> lvl 5, ...
            return min(9, 3 + (p - 2))

    if tag == "NUMBERED 5":  # 1)
        return 6
    if tag == "NUMBERED 7":  # (1)
        return 8

    lm = _lettered_re.match(tag)
    if lm:
        n = int(lm.group(1))
        if n == 2: return 1  # A. (jika ada)
        if n == 4: return 5  # a.
        if n == 6: return 7  # a)
        if n == 8: return 9  # (a)

    return None

def build_tree_from_tags_h9_preserve(merged):
    """
    merged: list[(tag, text_asli_dengan_marker)]
    Output node:
      - type, level
      - text  : TEKS UTUH (marker dipertahankan)
      - marker: penanda (mis. '1.', 'a.', '(1)')
      - body  : isi tanpa marker (buat embedding RAG)
      - tag_src: tag asli
    Tidak ada auto-promote LETTERED 4 -> 7/9; biarkan jadi sibling bila level tidak cocok.
    """
    root = {"type": "ROOT", "children": []}
    stack: list[tuple[int, dict]] = []  # (level_abs, node)

    def node_type_for_level(level: int) -> str:
        if level <= 3: return "HEADING"
        return {
            4: "NUMBERED",           # 1. (list)
            5: "LETTERED",           # a.
            6: "NUMBERED_PAREN",     # 1)
            7: "LETTERED_PAREN",     # a)
            8: "NUMBERED_INPAREN",   # (1)
            9: "LETTERED_INPAREN",   # (a)
        }[level]

    def push(level: int, tag_src: str, raw_text: str):
        # keluar dari level >= level baru
        while stack and stack[-1][0] >= level:
            stack.pop()
        parent = stack[-1][1] if stack else root

        text_full = norm(raw_text)
        marker, body = split_marker(text_full)

        node = {
            "type": node_type_for_level(level),
            "tag": f"HIER {level}",
            "level": level,
            "tag_src": tag_src,
            "marker": marker,        # ← marker disimpan
            # "body": body,            # ← isi tanpa marker
            "text": text_full,       # ← TEKS UTUH (marker tetap ada)
            "children": []
        }
        parent["children"].append(node)
        stack.append((level, node))

    for tag, raw_text in merged:
        stack_lv = [lv for lv, _ in stack]
        level = _level_from_tag_only(tag, stack_lv)

        if level is None:
            # paragraf biasa (tanpa struktur)
            text_full = norm(raw_text)
            marker, body = split_marker(text_full)
            parent = stack[-1][1] if stack else root
            parent["children"].append({
                "type": "PARAGRAPH",
                "tag": "HIER 0",
                "level": 0,
                "tag_src": tag,
                "marker": marker,
                "body": body,
                "text": text_full,
                "children": []
            })
            continue

        push(level, tag, raw_text)

    return root


In [22]:
import json

In [40]:
# === pakai ===
start_page = 1
end_page = 10
doc        = doc_2

merged = process_document(doc, start_page, end_page)
# perbaiki hirarki angka bertitik dulu
# merged = relevel_numeric_dot_smart(merged)
# (opsional) jika kamu punya fungsi relevel_numeric_dot, panggil di sini agar 1.,2.,3. di bawah 3.1 dinaikkan levelnya:
# merged = relevel_numeric_dot(merged)

tree = build_tree_from_tags_h9_preserve(merged)
print(json.dumps(tree["children"], ensure_ascii=False, indent=2))

[
  {
    "type": "HEADING",
    "tag": "HIER 1",
    "level": 1,
    "tag_src": "HEADING",
    "marker": "",
    "text": "SATUAN TUGAS PENANGANAN COVID-19 SURAT EDARAN NOMOR 25 TAHUN 2022 TENTANG PROTOKOL KESEHATAN PERJALANAN LUAR NEGERI PADA MASA PANDEMI CORONA VIRUS DISEASE 2019 (COVID-19)",
    "children": []
  },
  {
    "type": "HEADING",
    "tag": "HIER 1",
    "level": 1,
    "tag_src": "LETTERED 2",
    "marker": "A.",
    "text": "A. Latar Belakang",
    "children": [
      {
        "type": "HEADING",
        "tag": "HIER 2",
        "level": 2,
        "tag_src": "NUMBERED 3.1",
        "marker": "1.",
        "text": "1. Bahwa dalam rangka menindaklanjuti perkembangan situasi persebaran virus SARS-CoV-2 pada berbagai negara di dunia dan hasil evaluasi lintas sektoral maka diperlukan penyesuaian mekanisme pengendalian terhadap perjalanan luar negeri.",
        "children": []
      },
      {
        "type": "HEADING",
        "tag": "HIER 2",
        "level": 2,
        "t

In [36]:
out_path = "/content/drive/MyDrive/ML/RAG_COVID/doc_01-15_171.json"   # ubah nama sesuai kebutuhan
with open(out_path, "w", encoding="utf-8") as f:
    # simpan anak ROOT saja biar bersih
    json.dump(tree["children"], f, ensure_ascii=False, indent=2)

print("Saved:", out_path)

Saved: /content/drive/MyDrive/ML/RAG_COVID/doc_01-15_171.json


In [39]:
out_path = "/content/drive/MyDrive/ML/RAG_COVID/doc_02-1_10.json"   # ubah nama sesuai kebutuhan
with open(out_path, "w", encoding="utf-8") as f:
    # simpan anak ROOT saja biar bersih
    json.dump(tree["children"], f, ensure_ascii=False, indent=2)

print("Saved:", out_path)

Saved: /content/drive/MyDrive/ML/RAG_COVID/doc_02-1_10.json


In [33]:
out_path = "/content/drive/MyDrive/ML/RAG_COVID/doc_03-11_22.json"   # ubah nama sesuai kebutuhan
with open(out_path, "w", encoding="utf-8") as f:
    # simpan anak ROOT saja biar bersih
    json.dump(tree["children"], f, ensure_ascii=False, indent=2)

print("Saved:", out_path)

Saved: /content/drive/MyDrive/ML/RAG_COVID/doc_03-11_22.json


kalau misal kita punya 3 document json yg berbeda:

"/content/drive/MyDrive/ML/RAG_COVID/doc_01-15_171.json -> 06 REV-05_Pedoman_P2_COVID-19_13_Juli_2020_compressed

/content/drive/MyDrive/ML/RAG_COVID/doc_02-1_10.json -> 05 Surat Edaran Satgas COVID-19 No. 25 Tahun 2022 — Protokol Kesehatan Perjalanan Luar Negeri

/content/drive/MyDrive/ML/RAG_COVID/doc_03-11_317.json -> 04 Rencana Operasi Penanganan COVID-19 — Kementerian Kesehatan RI

## Chromadb

In [None]:
!pip install chromadb sentence-transformers rapidfuzz

In [None]:
# 0) Install (kalau perlu):
# pip install chromadb sentence-transformers rapidfuzz

import json, re, os, uuid
from pathlib import Path
from itertools import chain
from rapidfuzz import fuzz
import chromadb
from chromadb.utils import embedding_functions

# ===== Helpers =====
SPACE_RUN_RE = re.compile(r'[ \t\u00A0\u2000-\u200B\u202F\u205F\u3000]+')
def collapse_spaces(s: str) -> str:
    if not s: return s
    s = s.replace('\u200b', '').replace('\xa0', ' ')
    return SPACE_RUN_RE.sub(' ', s).strip()

def walk_nodes(node, path, out):
    """Flatten tree -> list of {id, text, path, tag, level}."""
    text = collapse_spaces(node.get("text", "") or "")
    if text:
        out.append({
            "id": str(uuid.uuid4()),
            "text": text,
            "path": " > ".join(path + [text[:40]]),  # ringkas path untuk metadata
            "tag": node.get("tag"),
            "level": node.get("level"),
        })
    for ch in node.get("children", []) or []:
        walk_nodes(ch, path + [text[:40]] if text else path, out)

# ===== 1) Load & Flatten =====
json_path = "/content/doc_tree_1-10.json"   # ganti jika lokasi berbeda
data = json.loads(Path(json_path).read_text(encoding="utf-8"))

flat = []
for root in data:
    walk_nodes(root, path=[collapse_spaces(root.get("text","")[:40])], out=flat)

# (Opsional) buang duplikat konten identik
seen = set(); unique = []
for r in flat:
    key = (r["tag"], r["text"])
    if key in seen:
        continue
    seen.add(key)
    unique.append(r)
flat = unique

# (Opsional) filter node terlalu pendek
flat = [r for r in flat if len(r["text"]) >= 40]

# ===== 2) Index di Chroma =====
persist_dir = "./chroma_covid_idx"
client = chromadb.PersistentClient(path=persist_dir)

embed_fn = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="intfloat/multilingual-e5-base"  # bagus untuk multi-bahasa
)

col = client.get_or_create_collection(
    name="covid_kebijakan_id",
    embedding_function=embed_fn,
    metadata={"source": "dokumen_kebijakan_covid"}
)

# Upsert
col.upsert(
    ids=[r["id"] for r in flat],
    documents=[r["text"] for r in flat],
    metadatas=[{"path": r["path"], "tag": r["tag"], "level": r["level"]} for r in flat]
)

# ===== 3) Retrieval helper =====
def expand_query(q):
    q = collapse_spaces(q.lower())
    # sinonim sederhana
    variants = {
        "pengertian": ["pengertian", "definisi", "apa itu", "arti"],
        "coronavirus": ["coronavirus", "virus corona", "corona virus", "sars-cov-2"]
    }
    toks = q.split()
    outs = set([q])
    # kombinasi ringan
    for p in variants["pengertian"]:
        for c in variants["coronavirus"]:
            outs.add(f"{p} {c}")
    return list(outs)

def retrieve(q, k=5):
    expanded = expand_query(q)
    res = col.query(query_texts=expanded, n_results=k)
    docs = list(chain.from_iterable(res.get("documents", [])))
    metas = list(chain.from_iterable(res.get("metadatas", [])))
    # de-dup kasar berdasar fuzzy ratio
    picked = []
    for d, m in zip(docs, metas):
        if not any(fuzz.ratio(d, x[0]) > 95 for x in picked):
            picked.append((d, m))
    return picked[:k]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/387 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/57.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/694 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.11G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/418 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/280 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/200 [00:00<?, ?B/s]

In [None]:
# ===== 4) Contoh: tanya 'pengertian coronavirus adalah' =====
top = retrieve("Strategi Penanggulangan ", k=5)
for i, (d, m) in enumerate(top, 1):
    print(f"[{i}] {m.get('path')}\n{d[:900]}...\n")

[1] AB I PENDAHULUAN > AB I PENDAHULUAN > di dunia dapat dilihat pada gambar 1.1 S > 4. 2. Etiologi Penyebab COVID-19 adalah 
4. 2. Etiologi Penyebab COVID-19 adalah virus yang tergolong dalam family coronavirus. Coronavirus merupakan virus RNA strain tunggal positif, berkapsul dan tidak bersegmen. Terdapat 4 struktur protein utama pada Coronavirus yaitu: protein N (nukleokapsid), glikoprotein M (membran), glikoprotein spike S (spike), protein E (selubung). Coronavirus tergolong ordo Nidovirales, keluarga Coronaviridae. Coronavirus ini dapat menyebabkan penyakit pada hewan atau manusia. Terdapat 4 genus yaitu alphacoronavirus, betacoronavirus, gammacoronavirus, dan deltacoronavirus. Sebelum adanya COVID-19, ada 6 jenis coronavirus yang dapat menginfeksi manusia, yaitu HCoV-229E (alphacoronavirus), HCoV-OC43 (betacoronavirus), HCoVNL63 (alphacoronavirus) HCoV-HKU1 (betacoronavirus), SARS-CoV (betacoronavirus), dan MERS-CoV (betacoronavirus). Sumber: Shereen, et al. (2020) Journal of Adv

## Hirarki search

In [None]:
!pip install -q chromadb sentence-transformers

In [None]:
import json, uuid, re, os, shutil
from pathlib import Path
from itertools import chain
import chromadb
from chromadb.utils import embedding_functions

# ===== Konfigurasi =====
JSON_PATH   = "/content/doc_tree_1-10.json"     # ganti sesuai file yang kamu upload
STORE_PATH  = "/content/chroma_store_e5"        # store baru agar tidak konflik
HEAD_NAME   = "doc_headings_e5"
PARA_NAME   = "doc_paragraphs_e5"
MODEL_NAME  = "intfloat/multilingual-e5-base"   # E5 multi-bahasa

# Kalau ingin rebuild total, set True (hapus store lama)
RESET_STORE = False

# ===== Helpers =====
SPACE_RUN_RE = re.compile(r'[ \t\u00A0\u2000-\u200B\u202F\u205F\u3000]+')
def collapse_spaces(s: str) -> str:
    if not s: return s
    s = s.replace('\u200b','').replace('\xa0',' ')
    return SPACE_RUN_RE.sub(' ', s).strip()

def normalize_title(s: str) -> str:
    s = collapse_spaces(s or "")
    # opsional perbaikan OCR & nomor depan
    s = re.sub(r'^\s*AB(\s+|$)', 'BAB ', s)
    s = re.sub(r'^\s*\d+(\.\d+)*\s*[\)\.]?\s*', '', s)
    return s

def flatten_tree(json_roots):
    """Ratakan JSON bertingkat -> nodes (headings & paragraphs)."""
    nodes, edges, id2idx = [], {}, {}
    def visit(node, parent_id=None, path_titles=None, level=0, order_base=0):
        nid = node.get("id") or str(uuid.uuid4())
        text = collapse_spaces(node.get("text",""))
        tag  = node.get("tag")
        title_like = tag in {"HEADING","SUBHEADING","TITLE"}
        title = normalize_title(text) if title_like else None

        new_path = list(path_titles or [])
        if title_like and title:
            new_path.append(title)

        node_obj = {
            "node_id": nid,
            "parent_id": parent_id or "",
            "children_ids": [],
            "level": level,
            "tag": tag,
            "title": title if title_like else None,
            "text": text if not title_like else "",   # paragraf punya text, heading tidak
            "path_titles": new_path,
            "path_str": " > ".join(new_path),
            "order_idx": order_base,
        }
        idx = len(nodes)
        nodes.append(node_obj)
        id2idx[nid] = idx
        edges[nid] = {"parent": parent_id or "", "children": []}

        order = order_base
        for ch in node.get("children", []) or []:
            order += 1
            cid = visit(ch, nid, new_path, level+1, order)
            edges[nid]["children"].append(cid)
        return nid

    for r in json_roots:
        visit(r, None, [], 0, 0)

    for pid, rel in edges.items():
        if pid in id2idx:
            nodes[id2idx[pid]]["children_ids"] = rel["children"]
    return nodes, edges

def ensure_collections(store_path=STORE_PATH, model_name=MODEL_NAME):
    os.makedirs(store_path, exist_ok=True)
    client  = chromadb.PersistentClient(path=store_path)
    embedfn = embedding_functions.SentenceTransformerEmbeddingFunction(model_name=model_name)

    # Hapus koleksi lama jika ingin reset
    if RESET_STORE:
        try: client.delete_collection(HEAD_NAME)
        except Exception: pass
        try: client.delete_collection(PARA_NAME)
        except Exception: pass

    # Selalu buat koleksi baru dgn embedfn E5 supaya konsisten
    col_head = client.get_or_create_collection(name=HEAD_NAME, embedding_function=embedfn)
    col_para = client.get_or_create_collection(name=PARA_NAME, embedding_function=embedfn)
    return client, col_head, col_para

def build_index(json_path=JSON_PATH, store_path=STORE_PATH, model_name=MODEL_NAME):
    print("[INFO] Loading JSON...")
    roots = json.loads(Path(json_path).read_text(encoding="utf-8"))
    nodes, _ = flatten_tree(roots)
    print(f"[INFO] Total nodes: {len(nodes)}")

    heading_nodes = [n for n in nodes if n["title"]]
    para_nodes    = [n for n in nodes if n["text"]]
    print(f"[INFO] Heading nodes: {len(heading_nodes)} | Paragraph nodes: {len(para_nodes)}")

    client, col_head, col_para = ensure_collections(store_path, model_name)

    # HEADINGS → embed path_str (kuatkan sinyal bab/subbab)
    if heading_nodes:
        col_head.upsert(
            ids=[n["node_id"] for n in heading_nodes],
            documents=[(n["path_str"] or n["title"] or "").strip() for n in heading_nodes],
            metadatas=[{
                "node_id": n["node_id"],
                "parent_id": n["parent_id"],
                "level": n["level"],
                "tag": n["tag"] or "",
                "title": n.get("title") or "",
                "path_str": n["path_str"],
                "order_idx": n["order_idx"],
            } for n in heading_nodes]
        )

    # PARAGRAPHS → embed (path_str + teks) agar judul ikut mempengaruhi embedding
    if para_nodes:
        col_para.upsert(
            ids=[n["node_id"] for n in para_nodes],
            documents=[(f"{n['path_str']}\n\n{n['text']}".strip()) for n in para_nodes],
            metadatas=[{
                "node_id": n["node_id"],
                "parent_id": n["parent_id"],
                "level": n["level"],
                "tag": n["tag"] or "",
                "title": n.get("title") or "",
                "path_str": n["path_str"],
                "order_idx": n["order_idx"],
            } for n in para_nodes]
        )
    print("[INFO] Upsert selesai.")
    return client, col_head, col_para

def print_results(results, top=3):
    docs  = results.get("documents", [[]])[0]
    metas = results.get("metadatas", [[]])[0]
    for i, (d, m) in enumerate(zip(docs[:top], metas[:top]), 1):
        print(f"\n--- Result {i} ---")
        print("Path:", m.get("path_str",""))
        print(d[:1000])

# ===== FUNGSI QUERY (bisa dipanggil di cell lain) =====
client = col_head = col_para = None  # akan diisi setelah build_index()

def ask(query, n_results=5):
    """Query langsung ke paragraf (seluruh dokumen)."""
    if col_para is None:
        raise RuntimeError("Index belum dibuat. Jalankan build_index(...) dulu.")
    res = col_para.query(query_texts=[query], n_results=n_results)
    print_results(res, top=n_results)

def ask_hierarchical(query, beam_width=3, final_k=5):
    """Top-down: pilih heading terbaik → cari paragraf di bawah heading tersebut."""
    if col_head is None or col_para is None:
        raise RuntimeError("Index belum dibuat. Jalankan build_index(...) dulu.")

    # 1) Ambil beberapa heading terbaik (global)
    heads = col_head.query(
        query_texts=[query],
        n_results=max(beam_width, 3),
        include=["documents", "metadatas", "distances"]  # ← HAPUS "ids"
    )
    head_metas = heads.get("metadatas", [[]])[0]

    # 2) Untuk tiap heading kandidat, cari paragraf di bawahnya (filter by parent_id)
    cand = []
    for hmeta in head_metas:
        pid = hmeta.get("node_id") or ""
        if not pid:
            continue
        sub = col_para.query(
            query_texts=[query],
            n_results=final_k,
            where={"parent_id": pid},
            include=["documents", "metadatas", "distances"]  # ← HAPUS "ids"
        )
        for d, m, dist in zip(sub.get("documents",[[]])[0],
                              sub.get("metadatas",[[]])[0],
                              sub.get("distances",[[]])[0]):
            cand.append((dist, d, m))

    # 3) Ambil gabungan top-N terbaik
    cand.sort(key=lambda x: x[0])
    docs  = [c[1] for c in cand[:final_k]]
    metas = [c[2] for c in cand[:final_k]]
    results = {"documents":[docs], "metadatas":[metas]}
    print_results(results, top=len(docs))


# ===== Bangun index sekali (atau ulangi jika ganti JSON) =====
if RESET_STORE and os.path.isdir(STORE_PATH):
    shutil.rmtree(STORE_PATH, ignore_errors=True)
client, col_head, col_para = build_index(JSON_PATH, STORE_PATH, MODEL_NAME)
print("\n[READY] Panggil ask('...') atau ask_hierarchical('...') di cell lain.")


[INFO] Loading JSON...
[INFO] Total nodes: 101
[INFO] Heading nodes: 3 | Paragraph nodes: 98
[INFO] Upsert selesai.

[READY] Panggil ask('...') atau ask_hierarchical('...') di cell lain.


In [None]:
ask("Strategi Penanggulangan Pandemi", n_results=5)


--- Result 1 ---
Path: BAB II STRATEGI DAN INDIKATOR PENANGGULANGAN PANDEMI
BAB II STRATEGI DAN INDIKATOR PENANGGULANGAN PANDEMI

Jumlah kematian

--- Result 2 ---
Path: BAB II STRATEGI DAN INDIKATOR PENANGGULANGAN PANDEMI
BAB II STRATEGI DAN INDIKATOR PENANGGULANGAN PANDEMI

1. Strategi Penanggulangan Pandemi Sejak kasus pertama diumumkan pada tanggal 2 Maret 2020, penyebaran penularan COVID-19 terjadi dengan cepat di Indonesia. Hal ini memerlukan strategi penanggulangan sesuai dengan transmisi yang terjadi baik di tingkat nasional maupun provinsi, dengan tujuan:

--- Result 3 ---
Path: BAB II STRATEGI DAN INDIKATOR PENANGGULANGAN PANDEMI
BAB II STRATEGI DAN INDIKATOR PENANGGULANGAN PANDEMI

Jumlah konfirmasi

--- Result 4 ---
Path: BAB II STRATEGI DAN INDIKATOR PENANGGULANGAN PANDEMI
BAB II STRATEGI DAN INDIKATOR PENANGGULANGAN PANDEMI

Jumlah kasus rawat RS

--- Result 5 ---
Path: BAB II STRATEGI DAN INDIKATOR PENANGGULANGAN PANDEMI
BAB II STRATEGI DAN INDIKATOR PENANGGULANGAN PAND

In [None]:
ask_hierarchical("Strategi Penanggulangan Pandemi", beam_width=4, final_k=5)


--- Result 1 ---
Path: BAB II STRATEGI DAN INDIKATOR PENANGGULANGAN PANDEMI
BAB II STRATEGI DAN INDIKATOR PENANGGULANGAN PANDEMI

1. Strategi Penanggulangan Pandemi Sejak kasus pertama diumumkan pada tanggal 2 Maret 2020, penyebaran penularan COVID-19 terjadi dengan cepat di Indonesia. Hal ini memerlukan strategi penanggulangan sesuai dengan transmisi yang terjadi baik di tingkat nasional maupun provinsi, dengan tujuan:

--- Result 2 ---
Path: BAB II STRATEGI DAN INDIKATOR PENANGGULANGAN PANDEMI
BAB II STRATEGI DAN INDIKATOR PENANGGULANGAN PANDEMI

1. Strategi Penanggulangan Pandemi Sejak kasus pertama diumumkan pada tanggal 2 Maret 2020, penyebaran penularan COVID-19 terjadi dengan cepat di Indonesia. Hal ini memerlukan strategi penanggulangan sesuai dengan transmisi yang terjadi baik di tingkat nasional maupun provinsi, dengan tujuan:

--- Result 3 ---
Path: BAB II STRATEGI DAN INDIKATOR PENANGGULANGAN PANDEMI
BAB II STRATEGI DAN INDIKATOR PENANGGULANGAN PANDEMI

Wilayah yang belum 