In [1]:
# Environment setup: install the third-party packages imported by the cells below.
# NOTE(review): apart from "numpy<2.0" nothing here is version-pinned, so a fresh run
# may resolve different releases — consider pinning for reproducibility.
# NOTE(review): pytesseract is only a Python wrapper; this cell does not install the
# Tesseract binary or its language data — confirm the runtime image provides them.
!pip install faiss-cpu sentence-transformers wikipedia beautifulsoup4 pymupdf \
             pytesseract easyocr "numpy<2.0" langchain-text-splitters -q

# Replace whichever protobuf is present with the pinned 4.25.3 release.
# NOTE(review): presumably a compatibility workaround; the reason is not recorded here.
!pip uninstall -y protobuf -q
!pip install protobuf==4.25.3 -q

# Quietly download the NLTK "punkt" and "punkt_tab" resources.
import nltk
nltk.download("punkt", quiet=True)
nltk.download("punkt_tab", quiet=True)

print("\n\n--- All packages installed and NLTK imported successfully! ---")


  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m31.4/31.4 MB[0m [31m32.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.1/24.1 MB[0m [31m35.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m43.9 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m34.2 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m664.8/664.8 MB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m:00:01[0m00:01[0m


In [2]:
import os

# Suppress TensorFlow warning if present.
# This must run before any import below can load TensorFlow (the libraries imported
# further down may pull it in): the variable is read when TensorFlow starts up, so
# assigning it after the imports has no effect.
os.environ["TF_ENABLE_ONEDNN_OPTS"] = "0"

import shutil
import io
import time
import json
import re
import hashlib
import unicodedata
import html
from pathlib import Path
from functools import partial
from concurrent.futures import ProcessPoolExecutor, as_completed
from urllib.parse import quote, unquote


import numpy as np
import faiss
import torch
import requests
import wikipedia
import pymupdf
from PIL import Image
from tqdm import tqdm
from bs4 import BeautifulSoup, Tag
from sentence_transformers import SentenceTransformer
from transformers import AutoTokenizer
from langchain_text_splitters import (
    RecursiveCharacterTextSplitter,
    SentenceTransformersTokenTextSplitter,
)

# OCR backends: both are optional, so record which ones could be imported.
try:
    import pytesseract
    _tesseract_available = True
except ImportError:
    _tesseract_available = False

try:
    import easyocr
    _easyocr_available = True
except ImportError:
    _easyocr_available = False

print("--- Imports complete ---")


2025-11-16 04:29:05.682844: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1763267345.883557      48 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1763267345.934807      48 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


--- Imports complete ---


In [13]:
class Constants:
    """Central configuration values shared by the chunking, embedding, OCR and saving code."""

    # Availability flags computed by the guarded OCR imports in the imports cell.
    TESSERACT_PY_AVAILABLE = _tesseract_available
    EASYOCR_AVAILABLE = _easyocr_available

    # Sentence-embedding model and the batch size passed to its encode() call.
    EMBED_MODEL_NAME = "intfloat/multilingual-e5-large-instruct"
    EMBED_BATCH_SIZE = 32

    # Chunking budget: default chunk size and overlap (in tokens) used by Chunker.make_chunks.
    CHUNK_MAX_TOKENS = 300
    CHUNK_TOKEN_OVERLAP = 50
    # dtype the embedding matrices are cast to before they are returned.
    EMBED_DTYPE = "float32"

    # OCR
    OCR_DPI = 250                 # Default resolution used when rendering PDF pages to PNG
    USE_TESSERACT_AUTO = True     # Prefer Tesseract when available, EasyOCR otherwise (not read in the visible code — TODO confirm usage)
    TESSERACT_LANGS = "vie"       # Tesseract language code(s); presumably Vietnamese — TODO confirm
    OCR_WORKERS = max(1, min(4, (os.cpu_count() or 2) - 1)) # Number of OCR worker processes: CPU count minus one, clamped to 1..4
    DOWNSCALE_MAX_WIDTH = 1200    # Max width (px) to downscale images before OCR
    PAGE_RENDER_BATCH = 32      # Page render batch size for memory control (not read in the visible code — TODO confirm usage)

    # File names
    FAISS_INDEX_FILE = "faiss.index"
    CHUNKS_FILE = "chunks.jsonl"
    EMBEDDINGS_FILE = "embeddings.npy"

print("--- Constants defined ---")


--- Constants defined ---


In [5]:
class Utils:
    """Stateless text helpers (hashing, slugs, whitespace clean-up) used across the pipeline."""

    # "đ"/"Đ" are letters with a stroke, not base letter + combining mark, so Unicode
    # normalisation cannot reduce them to ASCII; they have to be mapped explicitly.
    _STROKE_LETTERS = str.maketrans({"\u0111": "d", "\u0110": "D"})

    @staticmethod
    def sha1(text: str, length: int = 12) -> str:
        """Return the first ``length`` hex characters of the SHA-1 of ``text``.

        Non-string input is converted with ``str()`` before hashing. The text is
        encoded as UTF-8.
        """
        if not isinstance(text, str):
            text = str(text)
        h = hashlib.sha1(text.encode("utf-8")).hexdigest()
        return h[:length]

    @staticmethod
    def slugify(text: str) -> str:
        """Build a lowercase ASCII slug (``[0-9a-z-]``) from arbitrary text.

        Diacritics are removed via NFKD decomposition, "đ"/"Đ" are mapped to "d"/"D",
        and every run of other characters becomes a single "-". Returns ``"doc"``
        when nothing usable is left. Non-string input is converted with ``str()``.
        """
        if not isinstance(text, str):
            text = str(text)
        # Must happen before the ASCII filter below, otherwise "Đồng" would become "ong".
        text = text.translate(Utils._STROKE_LETTERS)
        text = unicodedata.normalize("NFKD", text)
        text = "".join(ch for ch in text if not unicodedata.combining(ch))
        text = re.sub(r"[^0-9a-zA-Z]+", "-", text).strip("-")
        return text.lower() or "doc"

    @staticmethod
    def normalize_vi_text(text: str) -> str:
        """Unescape HTML entities and collapse all whitespace to single spaces.

        Returns ``""`` for non-string input.
        """
        if not isinstance(text, str):
            return ""
        # Basic cleanup: unescape HTML, normalize spaces
        text = html.unescape(text)
        text = text.replace("\u00a0", " ")
        text = re.sub(r"\s+", " ", text).strip()
        return text

    @staticmethod
    def clean_ocr_text(text: str) -> str:
        """Flatten OCR / PDF text into a single line of space-separated words.

        Words hyphenated across a line break ("exam-\\nple") are re-joined, then every
        whitespace run (newlines included) is collapsed to one space. Returns ``""``
        for non-string input, matching ``normalize_vi_text``.
        """
        if not isinstance(text, str):
            return ""
        # Re-join words that were split with a hyphen at a line end.
        text = re.sub(r"(\w)-\n(\w)", r"\1\2", text)
        # A separate "single newline -> space" pass is unnecessary: the collapse below
        # already turns every newline into a space.
        # NOTE(review): paragraph breaks are flattened as well; if they are meant to
        # survive, this final pass must not use \s+.
        text = re.sub(r"\s+", " ", text).strip()
        return text

print("--- Utils defined ---")


--- Utils defined ---


In [6]:
class Chunker:
    """Splits page dicts into token-bounded chunk dicts that carry the page metadata."""

    @staticmethod
    def _emit_chunk(
        chunks: list,
        cur_text: str,
        chunk_id: int,
        pinfo: dict,
        extra_meta: dict | None = None,
    ) -> int:
        """Append one chunk record to ``chunks`` and return the next free chunk id.

        Args:
            chunks: Output list, mutated in place.
            cur_text: Text of the chunk; empty or whitespace-only text is skipped.
            chunk_id: Numeric id used to build the ``"chunk_<n>"`` identifier.
            pinfo: Page metadata the chunk inherits (title, source, url, ...).
            extra_meta: Optional extra keys merged over the default record.

        Returns:
            ``chunk_id + 1`` when a chunk was appended, otherwise ``chunk_id`` unchanged.
        """
        # Whitespace-only text has nothing to embed, so treat it like empty text.
        if not cur_text or not cur_text.strip():
            return chunk_id

        fingerprint = Utils.sha1(cur_text)

        # Fixed metadata layout; keys missing from pinfo come out as None
        # (except doc_id and page, which have fallbacks).
        chunk_meta = {
            "id": f"chunk_{chunk_id}",
            "doc_id": pinfo.get("title", pinfo.get("source", "doc")),
            "source": pinfo.get("source"),
            "page": pinfo.get("page", 1),
            "text": cur_text,
            "image_url": pinfo.get("image_url"),
            "url": pinfo.get("url"),
            "hash": fingerprint,
            "iucn_text": pinfo.get("iucn_text"),
            "iucn_code": pinfo.get("iucn_code"),
            "vn_redbook_code": pinfo.get("vn_redbook_code"),
            "vn_redbook_text": pinfo.get("vn_redbook_text"),
        }

        if extra_meta:
            chunk_meta.update(extra_meta)

        chunks.append(chunk_meta)
        return chunk_id + 1

    @staticmethod
    def make_chunks(
        pages: list,
        strategy: str = "sentences",
        max_tokens: int = Constants.CHUNK_MAX_TOKENS,
        overlap_tokens: int = Constants.CHUNK_TOKEN_OVERLAP,
        model_name: str = Constants.EMBED_MODEL_NAME,
        start_id: int = 0,
    ) -> list:
        """Split pages into chunk dicts of at most ``max_tokens`` tokens.

        Args:
            pages: Page dicts; each needs a ``"text"`` string. Pages whose text is
                missing, not a string, or blank are skipped.
            strategy: ``"sentences"`` uses ``SentenceTransformersTokenTextSplitter``;
                ``"wiki_sections"`` uses a recursive character splitter that tries
                wiki heading markers first; any other value uses the recursive
                splitter with generic separators.
            max_tokens: Token budget per chunk.
            overlap_tokens: Tokens shared between neighbouring chunks.
            model_name: Hugging Face model whose tokenizer measures chunk length.
            start_id: First numeric chunk id. Pass the running total when chunking
                several sources into one corpus so the ``"chunk_<n>"`` ids stay unique.

        Returns:
            List of chunk dicts (see ``_emit_chunk``); ``[]`` when the tokenizer
            cannot be loaded.
        """
        print(f"[chunker] Loading tokenizer: {model_name}")
        try:
            tokenizer = AutoTokenizer.from_pretrained(model_name)
        except Exception as e:
            print(f"FATAL: Could not load tokenizer {model_name}. Error: {e}")
            return []

        from langchain_core.documents import Document

        # The isinstance guard keeps a page with text=None (or another non-string)
        # from crashing the whole batch on .strip().
        langchain_docs = [
            Document(page_content=p["text"], metadata=p)
            for p in pages
            if isinstance(p.get("text"), str) and p["text"].strip()
        ]

        if strategy == "sentences":
            # NOTE(review): despite the strategy name this splitter cuts on token
            # windows, not sentence boundaries — confirm that this is intended.
            splitter = SentenceTransformersTokenTextSplitter(
                model_name=model_name,
                chunk_overlap=overlap_tokens,
                tokens_per_chunk=max_tokens,
            )
            final_docs = splitter.split_documents(langchain_docs)
        else:
            if strategy == "wiki_sections":
                # The trailing "" is the last-resort split, as in the generic list
                # below, so one over-long unbroken run cannot exceed the token budget.
                separators = ["\n== ", "\n=== ", "\n\n", "\n", ". ", " ", ""]
            else:
                separators = ["\n\n", "\n", ". ", " ", ""]

            splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer(
                tokenizer,
                chunk_size=max_tokens,
                chunk_overlap=overlap_tokens,
                separators=separators,
                strip_whitespace=True,
                add_start_index=True,  # gives you start_index metadata
            )
            final_docs = splitter.split_documents(langchain_docs)

        print(f"[chunker] Đã xử lý {len(pages)} trang, tạo ra {len(final_docs)} chunks.")

        final_chunks_list = []
        chunk_id_counter = start_id
        for d in final_docs:
            extra_meta = {}
            if "start_index" in d.metadata:
                extra_meta["start_index"] = d.metadata["start_index"]

            chunk_id_counter = Chunker._emit_chunk(
                chunks=final_chunks_list,
                cur_text=d.page_content,
                chunk_id=chunk_id_counter,
                pinfo=d.metadata,
                extra_meta=extra_meta,
            )

        return final_chunks_list

print("--- Chunker class defined ---")


--- Chunker class defined ---


In [7]:
class Deduplicator:
    @staticmethod
    def dedupe_chunks(chunks: list, existing_hashes: set | None = None) -> tuple:
        seen = set(existing_hashes) if existing_hashes else set()
        unique_chunks = []
        added_hashes = set()
        for ch in chunks:
            h = ch.get("hash")
            if h and h not in seen:
                seen.add(h)
                unique_chunks.append(ch)
                added_hashes.add(h)
        return unique_chunks, added_hashes

print("--- Deduplicator class defined ---")


--- Deduplicator class defined ---


In [8]:
class Embedder:
    """Encodes chunk texts into L2-normalised sentence embeddings."""

    @staticmethod
    def embed_chunks(
        chunks: list,
        model_name: str = Constants.EMBED_MODEL_NAME,
        batch_size: int = Constants.EMBED_BATCH_SIZE,
        text_prefix: str = "passage: ",
    ) -> np.ndarray:
        """Embed the ``"text"`` of every chunk.

        Args:
            chunks: Chunk dicts with a ``"text"`` key.
            model_name: SentenceTransformer model to load (loaded on every call).
            batch_size: Batch size passed to ``model.encode``.
            text_prefix: String prepended to every text before encoding. The default
                keeps the previous hard-coded behaviour; pass ``""`` to embed the
                raw text.
                NOTE(review): the model card of the default instruct model describes
                an instruction prefix for queries only and none for documents —
                confirm which convention the query side uses before changing this.

        Returns:
            Array of shape ``(len(chunks), dim)`` cast to ``Constants.EMBED_DTYPE``;
            a ``(0, 1024)`` array when ``chunks`` is empty.
        """
        if not chunks:
            # The model is not loaded for empty input, so the width is assumed.
            dim = 1024  # e5-large-instruct
            return np.zeros((0, dim), dtype=Constants.EMBED_DTYPE)

        device = "cuda" if torch.cuda.is_available() else "cpu"
        print(f"[embed] Using device: {device}")

        model = SentenceTransformer(model_name, device=device)

        if text_prefix:
            print(f"[embed] Applying {text_prefix!r} prefix for instruct model.")
        texts = [f"{text_prefix}{Utils.normalize_vi_text(c['text'])}" for c in chunks]

        embeddings = model.encode(
            texts,
            batch_size=batch_size,
            convert_to_numpy=True,
            show_progress_bar=True,
            normalize_embeddings=True,
        )
        return embeddings.astype(Constants.EMBED_DTYPE)

print("--- Embedder class defined ---")


--- Embedder class defined ---


In [9]:
class Indexer:
    """Thin helpers around a flat inner-product FAISS index."""

    @staticmethod
    def build_faiss(embeddings: np.ndarray) -> faiss.Index:
        """Build an ``IndexFlatIP`` holding ``embeddings``.

        Args:
            embeddings: Matrix of shape ``(n, dim)``. It is converted to a
                C-contiguous float32 array, the layout FAISS works with.

        Returns:
            The populated index. For empty input the index is empty and takes its
            width from the matrix when that is known (``(0, dim)``), falling back
            to 1024 otherwise.
        """
        vectors = np.ascontiguousarray(embeddings, dtype=np.float32)

        if vectors.shape[0] == 0:
            # An empty 2-D matrix still carries its width; only guess when it does not.
            has_width = vectors.ndim == 2 and vectors.shape[1] > 0
            dim = vectors.shape[1] if has_width else 1024
            return faiss.IndexFlatIP(dim)

        index = faiss.IndexFlatIP(vectors.shape[1])
        index.add(vectors)
        return index

    @staticmethod
    def save_index(index: "faiss.Index", path: str):
        """Write ``index`` to ``path`` with ``faiss.write_index``."""
        faiss.write_index(index, path)

    @staticmethod
    def load_index(path: str) -> "faiss.Index":
        """Read an index from ``path`` with ``faiss.read_index``."""
        return faiss.read_index(path)

print("--- Indexer class defined ---")


--- Indexer class defined ---


In [10]:
class Ingestion:
    _worker_easy_reader = None

    # -----------------------------
    # WIKIPEDIA HELPERS
    # -----------------------------
    @staticmethod
    def page_url_from_title(title: str, lang: str = "vi") -> str:
        safe_title = title.replace(" ", "_")
        return f"https://{lang}.wikipedia.org/wiki/{quote(safe_title)}"

    @staticmethod
    def extract_first_column_titles_from_url(
        main_url: str,
        max_titles: int | None = None
    ) -> list[dict]:
        """Collect linked article titles from the first column of every ``wikitable``.

        Args:
            main_url: URL of the list page to download.
            max_titles: Stop after this many rows; ``None`` or ``0`` means no limit.

        Returns:
            One dict per distinct title, in page order, with keys ``"title"``,
            ``"vn_redbook_code"`` and ``"vn_redbook_text"``. Both code fields hold
            the text of the row's third cell, or ``None`` when it is absent or empty.

        Raises:
            requests.HTTPError: If the page request returns an error status.
        """
        headers = {"User-Agent": "Mozilla/5.0 (RAG-bot/1.0)"}
        resp = requests.get(main_url, headers=headers, timeout=30)
        resp.raise_for_status()

        soup = BeautifulSoup(resp.content, "html.parser")

        rows = []
        seen_titles = set()

        for table in soup.select("table.wikitable"):
            for row in table.select("tr"):
                cells = row.find_all(["th", "td"])
                if not cells:
                    continue

                # column 1: animal name. Anchors that wrap an <img> are thumbnail
                # links to a file page, not to the article, so they are skipped.
                a_tag = next(
                    (a for a in cells[0].find_all("a", href=True) if a.find("img") is None),
                    None,
                )
                if a_tag is None:
                    continue

                href = a_tag["href"]
                if not href.startswith("/wiki/") or "redlink=1" in href:
                    continue

                # Drop a "#section" fragment before decoding so it cannot end up in
                # the title (an encoded "%23" inside a real title is left alone).
                path = href.split("/wiki/", 1)[-1].split("#", 1)[0]
                title = unquote(path).replace("_", " ")
                if not title or title in seen_titles:
                    continue

                seen_titles.add(title)

                # column 3: VN Red Book code on the Red List page
                vn_code = None
                if len(cells) >= 3:
                    vn_code = cells[2].get_text(" ", strip=True).strip() or None

                rows.append({
                    "title": title,
                    "vn_redbook_code": vn_code,
                    "vn_redbook_text": vn_code,
                })

                if max_titles and len(rows) >= max_titles:
                    return rows

        return rows

    # -----------------------------
    # HTML → CLEAN TEXT
    # -----------------------------
    @staticmethod
    def _get_clean_text_from_html(html: str) -> str:
        """Extract the readable prose of a rendered Wikipedia article.

        Boilerplate elements (infobox, navboxes, tables, references, ...) are removed,
        then the text of ``<p>`` and ``<li>`` elements is collected in document order
        until a heading from ``STOP_HEADLINES`` (references, external links, see
        also, ...) is reached. Fragments of three words or fewer and certain generic
        legal boilerplate are skipped.

        Args:
            html: Full HTML of the page. The name shadows the stdlib ``html`` module
                inside this function only; it is kept because it is part of the
                signature.

        Returns:
            The kept paragraphs joined by blank lines (``""`` if nothing was kept).
        """
        soup = BeautifulSoup(html, "html.parser")

        # Find the main content body
        body = soup.find("div", {"id": "bodyContent"}) or soup

        # 1. REMOVE JUNK ELEMENTS FIRST
        junk_selectors = [
            "table.infobox",
            "div.navbox",
            "div.thumb",
            "div.reflist",
            "ol.references",
            "div.mw-references-wrap",
            "div#catlinks",
            "div.hatnote",
            "span.mw-editsection",
            "div.vertical-navbox",
            "table.vertical-navbox",
            "table.navbox",
            # The comma after this entry used to be missing, which fused it with the
            # next one into a single selector that matched nothing.
            "table.wikitable",
            "div#mw-navigation",
            "div#footer",
        ]
        for selector in junk_selectors:
            for el in body.select(selector):
                el.decompose()

        # Section headings (lower-cased) after which nothing useful follows.
        STOP_HEADLINES = {
            "chú thích",
            "tham khảo",
            "liên kết ngoài",
            "danh mục sách đỏ",
            "tài liệu tham khảo",
            "xem thêm",
        }

        texts = []
        for el in body.find_all(["p", "li", "h2", "h3", "h4"], recursive=True):
            if el.name in ("h2", "h3", "h4"):
                # Headings are never emitted; they only decide where to stop.
                headline = el.get_text(" ", strip=True).lower()
                if any(stop_word in headline for stop_word in STOP_HEADLINES):
                    break

            # If it's a text element, get the text
            elif el.name in ("p", "li"):
                txt = el.get_text(" ", strip=True)
                lower = txt.lower()
                if lower.startswith("tài liệu dẫn:"):
                    continue
                # Generic legal boilerplate is dropped unless it mentions Vietnam.
                generic_law = (
                    "danh mục cấm" in lower
                    or "luật các loài động vật hoang dã" in lower
                    or "luật các loài hoang dã" in lower
                    or "cấm săn bắt" in lower
                )
                if generic_law and "việt nam" not in lower:
                    continue
                if txt and len(txt.split()) > 3:
                    texts.append(txt)

        return "\n\n".join(texts)

    # -----------------------------
    # LOW-LEVEL HTTP
    # -----------------------------
    @staticmethod
    def _fetch_page_html(url: str) -> str | None:
        """Download ``url`` and return the response body as text.

        Any failure (network error, timeout, HTTP error status) is printed and
        reported as ``None`` instead of being raised.
        """
        try:
            response = requests.get(
                url,
                headers={"User-Agent": "Mozilla/5.0 (RAG-bot/1.0)"},
                timeout=30,
            )
            response.raise_for_status()
            page_text = response.text
        except Exception as exc:
            print(f"[Ingestion] Failed to fetch URL {url}: {exc}")
            page_text = None
        return page_text

    @staticmethod
    def _extract_first_image_url_from_html(html_content: str, lang: str = "vi") -> str | None:
        """Return an absolute URL for the page's lead image, or ``None``.

        The first ``<img>`` inside the infobox table is preferred; otherwise the first
        ``<img>`` anywhere in the document is used. Protocol-relative and
        site-relative ``src`` values are made absolute. Parsing errors are printed
        and reported as ``None``.
        """
        try:
            soup = BeautifulSoup(html_content, "html.parser")

            # Infobox image first, then any image on the page.
            infobox = soup.find("table", class_="infobox")
            picture = infobox.find("img") if infobox else None
            if picture is None:
                picture = soup.find("img")

            if not picture or not picture.get("src"):
                return None

            location = picture["src"]
            if location.startswith("//"):
                # protocol-relative
                location = "https:" + location
            elif location.startswith("/"):
                # site-relative
                location = f"https://{lang}.wikipedia.org{location}"
            return location
        except Exception as exc:
            print(f"[Ingestion] Failed to parse image from HTML: {exc}")
            return None

    @staticmethod
    def _fetch_page_wikitext_by_title(title: str, lang: str = "vi") -> str | None:
        """Fetch the raw wikitext of ``title`` through the MediaWiki ``parse`` API.

        Redirects are resolved, so a title that is only a redirect yields the wikitext
        of its target rather than the redirect stub (the HTML fetch of the same title
        already lands on the target page).

        Returns:
            The wikitext, or ``None`` when the request fails or the response carries
            no ``parse.wikitext`` entry. Failures are printed, not raised.
        """
        api_url = f"https://{lang}.wikipedia.org/w/api.php"
        headers = {"User-Agent": "Mozilla/5.0 (RAG-bot/1.0)"}
        params = {
            "action": "parse",
            "page": title,
            "prop": "wikitext",
            "redirects": 1,
            "format": "json",
            "formatversion": 2,
        }
        try:
            resp = requests.get(api_url, params=params, headers=headers, timeout=30)
            resp.raise_for_status()
            data = resp.json()
            return data.get("parse", {}).get("wikitext", None)
        except Exception as e:
            print(f"[Ingestion] Failed to fetch wikitext for {title}: {e}")
            return None

    @staticmethod
    def _extract_iucn_from_wikitext(wikitext: str) -> dict | None:
        if not wikitext:
            return None

        text = wikitext.replace("\r\n", "\n")

        status_code = None
        system = None

        # 1) Template form: | status = {{LC}} or {{VU|...}}
        m = re.search(
            r"\|\s*status\s*=\s*\{\{\s*([^}|]+)",
            text,
            flags=re.IGNORECASE,
        )
        if m:
            status_code = m.group(1).strip()
        else:
            # 2) Simple form: | status = LC
            m = re.search(
                r"\|\s*status\s*=\s*([A-Za-z0-9\.]+)",
                text,
                flags=re.IGNORECASE,
            )
            if m:
                status_code = m.group(1).strip()

        # status_system: usually "iucn3.1"
        m = re.search(
            r"\|\s*status_system\s*=\s*([^\n|]+)",
            text,
            flags=re.IGNORECASE,
        )
        if m:
            system = m.group(1).strip()

        # If we found nothing, return None
        if not status_code and not system:
            return None

        return {
            "iucn_global_code": status_code,
            "iucn_global_system": system,
        }

    # -----------------------------
    # WIKIPEDIA PAGE FETCHING
    # -----------------------------
    @staticmethod
    def fetch_wikipedia_titles(titles: list[str], lang: str = "vi") -> list[dict]:
        pages = []
        for title in titles:
            url = Ingestion.page_url_from_title(title, lang=lang)

            # 1) HTML → cleaned text + image
            html_content = Ingestion._fetch_page_html(url)
            if not html_content:
                continue

            # Use the new, robust scraper
            text = Ingestion._get_clean_text_from_html(html_content)
            image_url = Ingestion._extract_first_image_url_from_html(html_content, lang=lang)

            # 2) Wikitext → IUCN info
            wikitext = Ingestion._fetch_page_wikitext_by_title(title, lang=lang)
            iucn_meta = Ingestion._extract_iucn_from_wikitext(wikitext) if wikitext else None

            iucn_code = None
            iucn_text = None
            if iucn_meta and iucn_meta.get("iucn_global_code"):
                iucn_code = iucn_meta["iucn_global_code"]
                system = iucn_meta.get("iucn_global_system") or "IUCN"
                iucn_text = f"{iucn_code} ({system})"

            page_info = {
                "id": f"wiki_{Utils.sha1(url)}",
                "title": title,
                "source": url,
                "url": url,
                "page": 1,
                "text": text,
                "image_url": image_url,
                "iucn_code": iucn_code,
                "iucn_text": iucn_text,
            }
            pages.append(page_info)

        return pages
    # ---------------------------------
    # --- PDF / OCR METHODS (ADDED) ---
    # ---------------------------------

    @staticmethod
    def render_page_to_png_bytes(page: 'pymupdf.Page', dpi: int = Constants.OCR_DPI) -> bytes:
        """Rasterise ``page`` at ``dpi`` and return the image as PNG bytes (no alpha channel)."""
        # The render matrix is a zoom factor relative to 72 units per inch.
        zoom = dpi / 72.0
        pixmap = page.get_pixmap(matrix=pymupdf.Matrix(zoom, zoom), alpha=False)
        return pixmap.tobytes("png")

    @staticmethod
    def _ocr_worker_png_bytes(png_bytes: bytes, use_tesseract: bool,
                            tesseract_langs: str, downscale_max_width: int):
        """OCR a single PNG image and return the cleaned text.

        Intended to run inside a worker process. Tesseract is tried first when
        requested and importable; if it fails, or is not used, EasyOCR is tried.
        Failures are reported as a string starting with ``[ocr_error]`` or
        ``[ocr_easy_error]`` instead of being raised.
        """
        try:
            from io import BytesIO
            from PIL import Image as PILImage
        except Exception as exc:
            return f"[ocr_error] missing PIL in worker: {exc}"

        # Decode the PNG.
        try:
            image = PILImage.open(BytesIO(png_bytes)).convert("RGB")
        except Exception as exc:
            return f"[ocr_error] failed to open image: {exc}"

        # Shrink wide images, keeping the aspect ratio. Best effort: on any error the
        # image is used at its current size.
        try:
            width, height = image.size
            if downscale_max_width and width > downscale_max_width:
                ratio = downscale_max_width / float(width)
                target = (downscale_max_width, int(height * ratio))
                image = image.resize(target, PILImage.LANCZOS)
        except Exception:
            pass

        # Backend 1: Tesseract. Any failure falls through to EasyOCR.
        if use_tesseract and Constants.TESSERACT_PY_AVAILABLE:
            try:
                import pytesseract as _pt
                raw = _pt.image_to_string(image, lang=tesseract_langs, config="--psm 6")
                return Utils.clean_ocr_text(raw)
            except Exception:
                pass

        # Backend 2: EasyOCR.
        if Constants.EASYOCR_AVAILABLE:
            try:
                # The reader is created once per process and cached on the class.
                if Ingestion._worker_easy_reader is None:
                    import easyocr as _easy
                    # English and Vietnamese, CPU only.
                    Ingestion._worker_easy_reader = _easy.Reader(["en", "vi"], gpu=False)
                detections = Ingestion._worker_easy_reader.readtext(np.array(image))
                joined = "\n".join(item[1] for item in detections)
                return Utils.clean_ocr_text(joined)
            except Exception as exc:
                return f"[ocr_easy_error] {exc}"

        # Neither backend produced a result.
        return "[ocr_error] no ocr backend available in worker"

    @staticmethod
    def _run_parallel_ocr(ocr_jobs: list, use_tesseract: bool,
                        tesseract_langs: str, workers: int, downscale_max_width: int) -> list:
        
        pages_out = [] 
        if not ocr_jobs: 
            return pages_out 

        # Prepare the worker function with fixed parameters using partial
        worker_func = partial(Ingestion._ocr_worker_png_bytes, 
                            use_tesseract=use_tesseract, 
                            tesseract_langs=tesseract_langs, 
                            downscale_max_width=downscale_max_width) 

        with ProcessPoolExecutor(max_workers=workers) as executor: 
            futures = {executor.submit(worker_func, png_bytes): (title, page_no) 
                    for (title, page_no, png_bytes) in ocr_jobs} 
            
            for future in tqdm(as_completed(futures), total=len(futures), desc="OCR pages", unit="page"): 
                title, page_no = futures[future] 
                try:
                    text = future.result() 
                except Exception as e:
                    text = f"[ocr_exception] {e}" 
                
                pages_out.append({ 
                    "page": page_no, 
                    "text": text if text else "", 
                    "source": "pdf_ocr", 
                    "title": title 
                })

        # Sort results by document title and page number for consistency
        pages_out.sort(key=lambda x: (x.get("title", ""), x.get("page", 0))) 
        return pages_out 

    @staticmethod
    def pdf_to_pages_with_jobs(pdf_path: str, dpi: int = Constants.OCR_DPI) -> tuple:
        """Split a PDF into pages that already have text and pages that need OCR.

        Args:
            pdf_path: Path of the PDF file.
            dpi: Resolution used to render text-less pages for OCR.

        Returns:
            ``(pages_with_text, ocr_jobs)``: page dicts (``page``, ``text``,
            ``source="pdf"``, ``title``) for pages with extractable text, and
            ``(title, page_no, png_bytes)`` tuples for the remaining pages. Page
            numbers start at 1 and the title is the file's base name.

        Raises:
            FileNotFoundError: If ``pdf_path`` does not exist.
        """
        if not os.path.exists(pdf_path):
            raise FileNotFoundError(f"PDF file not found: {pdf_path}")

        pages_with_text = []
        ocr_jobs = []
        base_title = os.path.basename(pdf_path)

        # The context manager closes the document (and its file handle) even when
        # text extraction or rendering fails part-way through.
        with pymupdf.open(pdf_path) as doc:
            for i, page in enumerate(doc, start=1):
                page_text = page.get_text().strip()
                if page_text:
                    pages_with_text.append({
                        "page": i,
                        "text": Utils.clean_ocr_text(page_text),
                        "source": "pdf",
                        "title": base_title
                    })
                else:
                    # Page has no text, schedule for OCR
                    png_bytes = Ingestion.render_page_to_png_bytes(page, dpi=dpi)
                    ocr_jobs.append((base_title, i, png_bytes))

        return pages_with_text, ocr_jobs

print("--- Ingestion class defined ---\n")


--- Ingestion class defined ---



In [11]:
class Base:
    """Static helpers that build retrieval artifacts: chunks, embeddings and a FAISS index.

    ``prepare_from_pdf_paths`` is the entry point. It collects pages from PDFs
    (embedded text plus OCR) and from Wikipedia, chunks them, embeds the
    chunks, builds a FAISS index and saves everything under an output directory.
    """

    # Most recently loaded SentenceTransformer and the model name it was loaded
    # for; the name lets _get_embedder notice a request for a different model.
    _embedder_model = None
    _embedder_model_name = None

    # Column count of the placeholder matrix returned when there is nothing to embed.
    # NOTE(review): must match the embedding width of the configured model — confirm
    # if Constants.EMBED_MODEL_NAME ever changes.
    _EMPTY_EMBED_DIM = 1024

    # -----------------------------
    # Embedder
    # -----------------------------
    @staticmethod
    def _get_embedder(model_name: str = Constants.EMBED_MODEL_NAME):
        """Return a SentenceTransformer for ``model_name``, loading it at most once per name.

        The model is placed on CUDA when ``torch.cuda.is_available()`` and on CPU
        otherwise. Asking for a different ``model_name`` than the one currently
        held replaces the held model instead of silently returning the old one.
        """
        if Base._embedder_model is None or Base._embedder_model_name != model_name:
            device = "cuda" if torch.cuda.is_available() else "cpu"
            print(f"[Base] Loading embedder {model_name} on {device}")
            Base._embedder_model = SentenceTransformer(model_name, device=device)
            Base._embedder_model_name = model_name
        return Base._embedder_model

    @staticmethod
    def _embed_chunks(chunks: list, model_name: str | None = None) -> np.ndarray:
        """Embed the ``"text"`` field of every chunk.

        Args:
            chunks: List of dicts, each with a ``"text"`` entry.
            model_name: Embedding model to use; falls back to
                ``Constants.EMBED_MODEL_NAME`` when omitted.

        Returns:
            An array with one row per chunk, cast to ``Constants.EMBED_DTYPE``.
            For an empty ``chunks`` list a ``(0, _EMPTY_EMBED_DIM)`` array is
            returned without loading any model.
        """
        if not chunks:
            return np.zeros((0, Base._EMPTY_EMBED_DIM), dtype=Constants.EMBED_DTYPE)

        model = Base._get_embedder(model_name or Constants.EMBED_MODEL_NAME)

        # Each text is normalised and given the "passage: " prefix before encoding.
        texts = [f"passage: {Utils.normalize_vi_text(c['text'])}" for c in chunks]

        embeddings = model.encode(
            texts,
            batch_size=Constants.EMBED_BATCH_SIZE,
            convert_to_numpy=True,
            show_progress_bar=True,
            normalize_embeddings=True,
        )
        return embeddings.astype(Constants.EMBED_DTYPE)

    # -----------------------------
    # FAISS
    # -----------------------------
    @staticmethod
    def _build_faiss_index(embeddings: np.ndarray) -> faiss.Index:
        """Build a FAISS index over ``embeddings`` by delegating to ``Indexer.build_faiss``."""
        return Indexer.build_faiss(embeddings)

    # -----------------------------
    # Saving / loading
    # -----------------------------
    @staticmethod
    def _save_artifacts(out_dir: str, chunks: list, embeddings: np.ndarray, index: faiss.Index):
        """Write chunks (JSON Lines), embeddings (``np.save``) and the index into ``out_dir``.

        ``out_dir`` is created if missing. File names come from
        ``Constants.CHUNKS_FILE``, ``Constants.EMBEDDINGS_FILE`` and
        ``Constants.FAISS_INDEX_FILE``.
        """
        out_path = Path(out_dir)
        out_path.mkdir(parents=True, exist_ok=True)

        chunks_path = out_path / Constants.CHUNKS_FILE
        emb_path = out_path / Constants.EMBEDDINGS_FILE
        index_path = out_path / Constants.FAISS_INDEX_FILE

        with chunks_path.open("w", encoding="utf-8") as f:
            for ch in chunks:
                f.write(json.dumps(ch, ensure_ascii=False) + "\n")

        np.save(emb_path, embeddings)
        Indexer.save_index(index, str(index_path))

        print("[Base] Saved:")
        print(f"  - {chunks_path}")
        print(f"  - {emb_path}")
        print(f"  - {index_path}")

    @staticmethod
    def _load_chunks(chunks_path: Path) -> list:
        """Read the JSON Lines chunk file written by ``_save_artifacts``.

        The file is iterated line by line rather than split with
        ``str.splitlines()``: ``splitlines`` also breaks on characters such as
        U+2028/U+2029, which ``json.dumps(..., ensure_ascii=False)`` writes
        unescaped, so a record containing them would be cut in half. Blank
        lines are skipped instead of being handed to ``json.loads``.
        """
        chunks = []
        with chunks_path.open("r", encoding="utf-8") as f:
            for line in f:
                line = line.strip()
                if line:
                    chunks.append(json.loads(line))
        return chunks

    # -----------------------------
    # Main prepare function
    # -----------------------------
    @staticmethod
    def prepare_from_pdf_paths(
        pdf_paths: list[str],
        wiki_titles: list[dict] | None,
        wiki_lang: str,
        out_dir: str,
        params: dict,
        force: bool = False,
    ):
        """Build (or reload) chunks, embeddings and a FAISS index.

        Args:
            pdf_paths: PDF files to ingest; missing files are skipped with a
                warning. May be empty or ``None``.
            wiki_titles: Optional rows, each a dict with a ``"title"`` key and
                optionally ``"vn_redbook_code"`` / ``"vn_redbook_text"``.
            wiki_lang: Language code forwarded to
                ``Ingestion.fetch_wikipedia_titles``.
            out_dir: Directory where artifacts are stored and looked up.
            params: Overrides for the ``Constants`` defaults. Keys read here:
                ``CHUNKING_STRATEGY``, ``CHUNK_MAX_TOKENS``,
                ``CHUNK_TOKEN_OVERLAP``, ``OCR_DPI``, ``OCR_WORKERS``,
                ``TESSERACT_LANGS``, ``DOWNSCALE_MAX_WIDTH``,
                ``USE_TESSERACT_AUTO`` and ``EMBED_MODEL_NAME``. ``None`` is
                treated as an empty dict.
            force: When ``False`` and all three artifact files already exist in
                ``out_dir``, they are loaded and returned without recomputing.

        Returns:
            A ``(chunks, embeddings, index)`` tuple. When no pages are
            collected, an empty list, a zero-row embedding matrix and an index
            built from that matrix are returned and nothing is saved.
        """
        params = params or {}

        strategy = params.get("CHUNKING_STRATEGY", "paragraph")
        max_tokens = params.get("CHUNK_MAX_TOKENS", Constants.CHUNK_MAX_TOKENS)
        overlap_tokens = params.get("CHUNK_TOKEN_OVERLAP", Constants.CHUNK_TOKEN_OVERLAP)
        # One model name drives both the chunker's tokenizer and the embedder,
        # so token budgets are measured with the model that does the encoding.
        embed_model_name = params.get("EMBED_MODEL_NAME", Constants.EMBED_MODEL_NAME)

        # --- OCR settings: params override, Constants as fallback ---
        ocr_dpi = params.get("OCR_DPI", Constants.OCR_DPI)
        ocr_workers = params.get("OCR_WORKERS", Constants.OCR_WORKERS)
        tesseract_langs = params.get("TESSERACT_LANGS", Constants.TESSERACT_LANGS)
        downscale_width = params.get("DOWNSCALE_MAX_WIDTH", Constants.DOWNSCALE_MAX_WIDTH)
        # Tesseract is used only when requested AND flagged as available.
        use_tesseract_auto = params.get("USE_TESSERACT_AUTO", Constants.USE_TESSERACT_AUTO)
        use_tesseract = use_tesseract_auto and Constants.TESSERACT_PY_AVAILABLE

        out_path = Path(out_dir)
        chunks_path = out_path / Constants.CHUNKS_FILE
        emb_path = out_path / Constants.EMBEDDINGS_FILE
        index_path = out_path / Constants.FAISS_INDEX_FILE

        if not force and chunks_path.exists() and emb_path.exists() and index_path.exists():
            print("[Base] Artifacts already exist. Skipping prepare.")
            index = Indexer.load_index(str(index_path))
            embeddings = np.load(emb_path)
            chunks = Base._load_chunks(chunks_path)
            return chunks, embeddings, index

        # Holds every "page": text from PDFs, text from OCR, text from Wikipedia.
        pages = []

        # --- 1) PDF ingestion ---
        collected_pages = []
        ocr_jobs = []
        for pdf_path in pdf_paths or []:
            if not os.path.exists(pdf_path):
                print(f"[Base/warn] missing pdf: {pdf_path}; skipping")
                continue

            print(f"[Base/pdf] Processing PDF: {pdf_path}")

            pdf_pages, pdf_jobs = Ingestion.pdf_to_pages_with_jobs(pdf_path, dpi=ocr_dpi)
            collected_pages.extend(pdf_pages)
            ocr_jobs.extend(pdf_jobs)

        pages.extend(collected_pages)
        print(f"[Base/pdf] Found {len(collected_pages)} text pages and {len(ocr_jobs)} pages needing OCR.")

        # --- 2) OCR for pages without embedded text ---
        if ocr_jobs:
            print(f"[Base/ocr] Running OCR on {len(ocr_jobs)} pages with {ocr_workers} workers. Using Tesseract: {use_tesseract}")

            new_pages_from_ocr = Ingestion._run_parallel_ocr(
                ocr_jobs,
                use_tesseract=use_tesseract,
                tesseract_langs=tesseract_langs,
                workers=ocr_workers,
                downscale_max_width=downscale_width
            )
            pages.extend(new_pages_from_ocr)
            print(f"[Base/ocr] OCR complete, got {len(new_pages_from_ocr)} pages of text.")

        # --- 3) Wikipedia ingestion ---
        if wiki_titles:
            titles = [row["title"] for row in wiki_titles]
            redbook_map = {
                row["title"]: {
                    "vn_redbook_code": row.get("vn_redbook_code"),
                    "vn_redbook_text": row.get("vn_redbook_text"),
                }
                for row in wiki_titles
            }
            wiki_pages = Ingestion.fetch_wikipedia_titles(titles, lang=wiki_lang)

            for wiki_page in wiki_pages:
                extra = redbook_map.get(wiki_page["title"], {}) or {}
                vn_code = extra.get("vn_redbook_code")
                vn_text = extra.get("vn_redbook_text")

                wiki_page["vn_redbook_code"] = vn_code
                wiki_page["vn_redbook_text"] = vn_text

                # Fall back to the red-book status when the page carries no
                # IUCN code of its own; otherwise just make sure the keys exist.
                if not wiki_page.get("iucn_code") and vn_code:
                    wiki_page["iucn_code"] = vn_code
                    wiki_page["iucn_text"] = vn_text or vn_code
                else:
                    wiki_page.setdefault("iucn_code", None)
                    wiki_page.setdefault("iucn_text", None)

                pages.append(wiki_page)

        print(f"[Base] Total pages collected (PDF + OCR + Wiki): {len(pages)}")
        if not pages:
            print("[Base] No pages found. Stopping.")
            # Return empty but valid artifacts
            empty_embeds = np.zeros((0, Base._EMPTY_EMBED_DIM), dtype=Constants.EMBED_DTYPE)
            empty_index = Base._build_faiss_index(empty_embeds)
            return [], empty_embeds, empty_index

        # --- 4) Chunking ---
        chunks = Chunker.make_chunks(
            pages,
            strategy=strategy,
            max_tokens=max_tokens,
            overlap_tokens=overlap_tokens,
            model_name=embed_model_name,
        )
        print(f"[Base] Total chunks from Chunker: {len(chunks)}")

        # --- 5) Embedding ---
        print(f"[Base] Loading embedder: {embed_model_name}")
        embeddings = Base._embed_chunks(chunks, model_name=embed_model_name)

        print(f"[Base] Embedding shape: {embeddings.shape}")

        # --- 6) FAISS index ---
        index = Base._build_faiss_index(embeddings)
        print(f"[Base] FAISS index built with {index.ntotal} vectors (dim={index.d}).")

        # --- 7) Save ---
        Base._save_artifacts(out_dir, chunks, embeddings, index)

        return chunks, embeddings, index

# Progress marker: printed once the Base class above has been defined in this cell.
print("--- Base helper defined ---")


--- Base helper defined ---


In [15]:
# Paragraph-strategy run: artifacts are written to their own output directory.
out_dir = "/kaggle/working/data_files_paragraph"
# Vietnamese Wikipedia page whose table supplies the animal rows.
redlist_url = "https://vi.wikipedia.org/wiki/Danh_m%E1%BB%A5c_s%C3%A1ch_%C4%91%E1%BB%8F_%C4%91%E1%BB%99ng_v%E1%BA%ADt_Vi%E1%BB%87t_Nam"
params = {
    "CHUNK_MAX_TOKENS": 300,
    "CHUNK_TOKEN_OVERLAP": 50,
    "EMBED_MODEL_NAME": Constants.EMBED_MODEL_NAME,
    "CHUNKING_STRATEGY": "paragraph",
    # Cap on the number of rows taken from the table. 358 mirrors the value used
    # by the sentences and wiki_sections cells.
    # NOTE(review): confirm this is the intended cap for the paragraph run.
    "MAX_ANIMALS": 358,
}

max_animals = params.get("MAX_ANIMALS")

# Rows extracted here are reused by the later strategy cells via `wiki_rows`.
wiki_rows = Ingestion.extract_first_column_titles_from_url(
    redlist_url,
    max_titles=max_animals,
)

print("--- Running PARAGRAPH Strategy ---")
print(f"Using {len(wiki_rows)} animal rows from first column.")
print("Example rows:", wiki_rows[:3])

# No PDFs are passed; force=True rebuilds the artifacts even if they already exist.
chunks, embeddings, index = Base.prepare_from_pdf_paths(
    [],
    wiki_titles=wiki_rows,
    wiki_lang="vi",
    out_dir=out_dir,
    force=True,
    params=params,
)
if chunks:
    # Show a truncated preview of the first chunk as a quick sanity check.
    print("Example chunk 0:", json.dumps(chunks[0], ensure_ascii=False)[:500])


--- Running PARAGRAPH Strategy ---
Using 289 animal rows from first column.
Example rows: [{'title': 'Ác là', 'vn_redbook_code': 'V', 'vn_redbook_text': 'V'}, {'title': 'Báo gấm', 'vn_redbook_code': 'V', 'vn_redbook_text': 'V'}, {'title': 'Báo hoa mai Đông Dương', 'vn_redbook_code': 'E', 'vn_redbook_text': 'E'}]
[Base/pdf] Processing PDF: /kaggle/input/chunking-11/sachdongvattest.pdf
[Base/pdf] Found 28 text pages and 0 pages needing OCR.
[Base] Total pages collected (PDF + OCR + Wiki): 317
[chunker] Loading tokenizer: intfloat/multilingual-e5-large-instruct


tokenizer_config.json: 0.00B [00:00, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.1M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/964 [00:00<?, ?B/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (524 > 512). Running this sequence through the model will result in indexing errors


[chunker] Đã xử lý 317 trang, tạo ra 1319 chunks.
[Base] Total chunks from Chunker: 1319
[Base] Loading embedder: intfloat/multilingual-e5-large-instruct
[Base] Loading embedder intfloat/multilingual-e5-large-instruct on cuda


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/128 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_xlm-roberta_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/690 [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

config.json:   0%|          | 0.00/271 [00:00<?, ?B/s]

Batches:   0%|          | 0/42 [00:00<?, ?it/s]

[Base] Embedding shape: (1319, 1024)
[Base] FAISS index built with 1319 vectors (dim=1024).
[Base] Saved:
  - /kaggle/working/data_files_paragraph/chunks.jsonl
  - /kaggle/working/data_files_paragraph/embeddings.npy
  - /kaggle/working/data_files_paragraph/faiss.index
Example chunk 0: {"id": "chunk_0", "doc_id": "sachdongvattest.pdf", "source": "pdf", "page": 1, "text": "1 Bảo vệ Động vật hoang dã - Hướng dẫn tích hợp vào môn Sinh học Lớp 7 Hướng dẫn tích hợp vào môn Sinh học lớp 7 Bảo vệ động vật hoang dã Đỗ Thị Thanh Huyền Phạm Phương Bình Trần Văn Quang", "image_url": null, "url": null, "hash": "3d5aa5017199", "iucn_text": null, "iucn_code": null, "vn_redbook_code": null, "vn_redbook_text": null, "start_index": 0}


In [None]:
# Sentence-strategy run. It reuses `wiki_rows`, so the cell that defines
# `wiki_rows` (the paragraph-strategy cell) must have been executed first.
wiki_rows_sent = wiki_rows

# Artifacts for this strategy are kept in their own directory.
out_dir_sent = "/kaggle/working/data_files_sentences"

# Same token budget as the paragraph run, with a smaller overlap.
params_sent = {
    "CHUNK_MAX_TOKENS": 300,
    "CHUNK_TOKEN_OVERLAP": 20,
    "EMBED_MODEL_NAME": Constants.EMBED_MODEL_NAME,
    "CHUNKING_STRATEGY": "sentences",
    "MAX_ANIMALS": 358,
}

print("--- Running SENTENCES Strategy ---")
print(f"Using {len(wiki_rows_sent)} animal rows from first column.")

# Wikipedia-only ingestion (empty PDF list); force=True always rebuilds.
chunks_s, embeddings_s, index_s = Base.prepare_from_pdf_paths(
    pdf_paths=[],
    wiki_titles=wiki_rows_sent,
    wiki_lang="vi",
    out_dir=out_dir_sent,
    params=params_sent,
    force=True,
)


In [None]:
# Wiki-sections-strategy run. It reuses `wiki_rows`, so the cell that defines
# `wiki_rows` (the paragraph-strategy cell) must have been executed first.
wiki_rows_wiki = wiki_rows

# Artifacts for this strategy are kept in their own directory.
out_dir_wiki = "/kaggle/working/data_files_wiki_sections"

params_wiki = {
    "CHUNK_MAX_TOKENS": 300,
    "CHUNK_TOKEN_OVERLAP": 50,
    "EMBED_MODEL_NAME": Constants.EMBED_MODEL_NAME,
    "CHUNKING_STRATEGY": "wiki_sections",
    "MAX_ANIMALS": 358,
}

print("--- Running WIKI_SECTIONS Strategy ---")
print(f"Using {len(wiki_rows_wiki)} animal rows from first column.")

# Wikipedia-only ingestion (empty PDF list); force=True always rebuilds.
chunks_w, embeddings_w, index_w = Base.prepare_from_pdf_paths(
    pdf_paths=[],
    wiki_titles=wiki_rows_wiki,
    wiki_lang="vi",
    out_dir=out_dir_wiki,
    params=params_wiki,
    force=True,
)
