In [None]:
import kagglehub
# Download the two Kaggle datasets through kagglehub and keep the values it returns.
# NOTE(review): the cells visible in this notebook read from hard-coded
# /kaggle/input/... paths and never reference these two variables — confirm
# whether the returned paths are still needed.
riatleozia_filess_path = kagglehub.dataset_download('riatleozia/filess')
riatleozia_chunking_2_path = kagglehub.dataset_download('riatleozia/chunking-2')

print('Data source import complete.')


In [None]:
!pip install "transformers>=4.40.0" sentence-transformers faiss-cpu safetensors -q
!pip uninstall -y protobuf
!pip install protobuf==4.25.3

In [None]:
import os, textwrap, pathlib, pprint
# Print every directory under the mounted dataset together with its
# sub-directories and files, separated by a 40-dash rule.
root = "/kaggle/input/filess"
for path, dirs, files in os.walk(root):
    print(f"{path}\n  dirs: {dirs}\n  files: {files}\n{'-' * 40}")

In [None]:
import json
import re
from pathlib import Path
from typing import List, Dict, Tuple

import faiss
import numpy as np
from sentence_transformers import SentenceTransformer

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM


# ===========================
# Config for Kaggle
# ===========================
# Dataset mount that holds the chunk files, FAISS indexes and the QA file.
INPUT_ROOT = Path("/kaggle/input/chunking-2")

# Writable outputs
WORK_ROOT = Path("/kaggle/working")

# One knowledge base per chunking method. Every KB lives in
# INPUT_ROOT / "data_files_<kb_id>" and uses the same file names and the same
# embedding model, so the per-KB entries are generated rather than spelled out.
KB_CONFIGS = {
    kb_id: {
        "base_dir": INPUT_ROOT / f"data_files_{kb_id}",
        "chunks_file": "chunks.jsonl",
        "index_file": "index.faiss",
        "embed_model_name": "intfloat/multilingual-e5-large-instruct",
    }
    for kb_id in ("sentences", "wiki_sections", "paragraph")
}

# JSON file with the gold question/answer pairs used for evaluation.
QA_PATH = INPUT_ROOT / "sach_do_dong_vat_vietnam_qa_dataset.json"
# Per-KB evaluation reports are written here as <kb_id>.json.
OUT_DIR = WORK_ROOT / "eval_results"
# parents=True so the cell also works when WORK_ROOT does not exist yet
# (e.g. when the notebook is run outside Kaggle); exist_ok keeps re-runs idempotent.
OUT_DIR.mkdir(parents=True, exist_ok=True)


# ===========================
# LLM loading (Hugging Face)
# ===========================

# Hugging Face repo id of the causal LM that answers the questions.
# You can swap this to a larger model if Kaggle GPU allows, e.g.
# _LLM_MODEL_ID = "Phat-Dat/Llama-3.2-8B-RLHF-DPO"
_LLM_MODEL_ID = "Phat-Dat/Llama-3.2-1B-RLHF-DPO"

# Module-level singletons; both stay None until the first get_llm() call.
_llm_model = _llm_tokenizer = None


def get_llm():
    """Return the shared ``(model, tokenizer)`` pair, loading it on first use.

    The tokenizer and model are fetched from ``_LLM_MODEL_ID``. When CUDA is
    available the model is loaded in float16 with ``device_map="auto"``;
    otherwise in float32 with no device map. If the tokenizer has no pad token
    id, the EOS token id is used instead. The model is put in eval mode.

    Returns:
        Tuple of (model, tokenizer); the same objects on every call.
    """
    global _llm_model, _llm_tokenizer

    needs_loading = _llm_model is None or _llm_tokenizer is None
    if needs_loading:
        print(f"[llm] loading model: {_LLM_MODEL_ID}")
        _llm_tokenizer = AutoTokenizer.from_pretrained(_LLM_MODEL_ID)

        on_gpu = torch.cuda.is_available()
        _llm_model = AutoModelForCausalLM.from_pretrained(
            _LLM_MODEL_ID,
            torch_dtype=torch.float16 if on_gpu else torch.float32,
            device_map="auto" if on_gpu else None,
        )

        # Generation needs a pad token id; fall back to EOS when none is defined.
        if _llm_tokenizer.pad_token_id is None:
            _llm_tokenizer.pad_token_id = _llm_tokenizer.eos_token_id

        _llm_model.eval()

    return _llm_model, _llm_tokenizer


# ===========================
# QA loading
# ===========================

def load_qa_pairs() -> List[Dict]:
    """Parse the UTF-8 JSON file at ``QA_PATH`` and return its ``"qa_pairs"`` value.

    Raises:
        KeyError: if the top-level JSON object has no ``"qa_pairs"`` key.
    """
    with QA_PATH.open(encoding="utf-8") as qa_file:
        payload = json.load(qa_file)
    return payload["qa_pairs"]


# ===========================
# KB loading (one chunking method)
# ===========================

def load_kb(kb_id: str) -> Tuple[faiss.Index, List[Dict], str]:
    """Load the FAISS index and chunk records of one knowledge base.

    Args:
        kb_id: key into ``KB_CONFIGS`` selecting the chunking method.

    Returns:
        ``(index, chunks, embed_model_name)`` where ``chunks`` holds one parsed
        JSON object per non-blank line of the KB's chunks file, in file order.

    Raises:
        KeyError: if ``kb_id`` is not configured.
    """
    cfg = KB_CONFIGS[kb_id]
    kb_dir = cfg["base_dir"]
    chunks_path = kb_dir / cfg["chunks_file"]
    index_path = kb_dir / cfg["index_file"]
    embed_model_name = cfg["embed_model_name"]

    # JSON Lines: strip each line and skip the ones that end up empty.
    with chunks_path.open(encoding="utf-8") as jsonl:
        stripped_lines = (raw.strip() for raw in jsonl)
        chunks: List[Dict] = [json.loads(record) for record in stripped_lines if record]

    # faiss.read_index takes a string path, not a Path object.
    index = faiss.read_index(str(index_path))

    return index, chunks, embed_model_name


# ===========================
# Embedding + retrieval
# ===========================

# SentenceTransformer instances already loaded in this kernel, keyed by model name.
_embedder_cache: Dict[str, SentenceTransformer] = {}


def get_embedder(model_name: str) -> SentenceTransformer:
    """Return the SentenceTransformer for ``model_name``, loading it at most once.

    The first request for a given name constructs the model and stores it in
    ``_embedder_cache``; later requests return the stored instance.
    """
    embedder = _embedder_cache.get(model_name)
    if embedder is None:
        print(f"[embed] loading model {model_name}")
        embedder = SentenceTransformer(model_name)
        _embedder_cache[model_name] = embedder
    return embedder


def retrieve_topk(
    index: faiss.Index,
    embedder: SentenceTransformer,
    question: str,
    chunks: List[Dict],
    top_k: int = 5,
) -> Tuple[List[Dict], List[float]]:
    """Return the chunks most similar to ``question`` and their FAISS scores.

    Args:
        index: FAISS index to search.
        embedder: model used to embed the query (normalized embeddings).
        question: raw question text; the ``"query: "`` prefix is added here.
        chunks: chunk records; a label returned by the index is used as a
            position in this list.
        top_k: maximum number of neighbours to request.

    Returns:
        ``(retrieved, scores)``: two equally long lists, best hit first. They
        can be shorter than ``top_k`` (or empty) when the index yields fewer
        valid neighbours.
    """
    if top_k <= 0 or not chunks:
        return [], []

    # E5-style query prefix
    # NOTE(review): the configured embedder is an "-instruct" E5 variant; confirm
    # this prefix matches the query format used when the index was built.
    q_text = f"query: {question}"
    q_vec = embedder.encode([q_text], normalize_embeddings=True)
    # FAISS expects a C-contiguous float32 matrix.
    q_vec = np.ascontiguousarray(q_vec, dtype="float32")
    D, I = index.search(q_vec, top_k)

    retrieved: List[Dict] = []
    scores: List[float] = []
    for label, score in zip(I[0].tolist(), D[0].tolist()):
        # FAISS fills missing neighbours with label -1; indexing chunks[-1]
        # would silently return the last chunk. Also skip labels past the end
        # of `chunks` so a mismatched index cannot raise IndexError mid-run.
        if 0 <= label < len(chunks):
            retrieved.append(chunks[label])
            scores.append(float(score))
    return retrieved, scores


# ===========================
# Text normalization + scoring
# ===========================

# Tokens that remove_stopwords() drops before the content-only F1 is computed.
# NOTE: the entries containing "." ("v.v", "v.v.", "nam.") can never equal a
# token produced by tokenize(), which only emits \w+ runs; they are kept so the
# contents of the set stay exactly as before.
VN_STOPWORDS = set(
    (
        # General Vietnamese function words
        "và có là các một ở của trong đã được bị "
        "trên đến tại không từ về vào những năm như "
        "khi để cho rất rằng này kia đó nên "
        "thì ra vẫn cũng nữa cùng do vì nếu "
        "hay hoặc v.v v.v. "
        # Words that recur throughout the Red Book domain
        "loài động vật việt nam nam. sách đỏ "
        "nhóm nguy cấp quý hiếm bảo tồn tình trạng "
        "khoa học tên xếp thuộc "
        "họ giống đây "
        # Habitat and geography words
        "vùng miền khu vực sống phân bố "
        "châu á âu phi đông tây bắc"
    ).split()
)


def normalize_text(text: str) -> str:
    """Lowercase ``text``, collapse each whitespace run to one space, and trim the ends."""
    collapsed = re.sub(r"\s+", " ", text.lower())
    return collapsed.strip()


def tokenize(text: str) -> list[str]:
    """
    Split lowercased ``text`` into maximal runs of word characters.

    Anything that is not matched by ``\\w`` (punctuation, whitespace, ...) acts
    as a separator and is discarded; digits and underscores stay in tokens.
    """
    word_runs = re.finditer(r"\w+", text.lower(), flags=re.UNICODE)
    return [run.group(0) for run in word_runs]


def remove_stopwords(tokens: list[str]) -> list[str]:
    """Return ``tokens`` without single-character tokens and ``VN_STOPWORDS`` entries.

    Order and duplicates of the surviving tokens are preserved.
    """
    kept = []
    for token in tokens:
        if len(token) <= 1 or token in VN_STOPWORDS:
            continue
        kept.append(token)
    return kept


def f1_bow(pred_tokens: list[str], gold_tokens: list[str]) -> float:
    """
    Multiset (bag-of-words) F1 between predicted and gold tokens.

    A token repeated in both lists counts as many times as it appears in the
    list where it is rarer. Returns 0.0 when either list is empty or when the
    two lists share no token.
    """
    from collections import Counter

    if not (pred_tokens and gold_tokens):
        return 0.0

    pred_counts = Counter(pred_tokens)
    gold_counts = Counter(gold_tokens)

    # Counter intersection keeps, per token, the smaller of the two counts.
    overlap = sum((pred_counts & gold_counts).values())
    if overlap == 0:
        return 0.0

    precision = overlap / sum(pred_counts.values())
    recall = overlap / sum(gold_counts.values())
    return 2 * precision * recall / (precision + recall)


def exact_match(pred: str, gold: str) -> bool:
    """
    Strict EM: True when both strings are equal after ``normalize_text``
    (lowercasing, whitespace collapsing, trimming).
    """
    pred_norm, gold_norm = normalize_text(pred), normalize_text(gold)
    return pred_norm == gold_norm


def smoothed_f1(pred: str, gold: str) -> tuple[float, dict]:
    """
    Lenient F1 between a predicted and a gold answer.

    Steps:
      1. Bag-of-words F1 on all tokens and on content tokens (stopwords
         removed); the larger of the two is the base score.
      2. Subset boost: if the prediction has at least one content token and
         all of its content tokens occur in the gold answer, the base score
         is raised to at least 0.8.
      3. Substring boost: if both normalized strings have 10+ characters and
         one contains the other, the final score is raised to at least 0.9.
      4. The result is clamped to [0, 1].

    Returns (final_f1, debug) where ``debug`` records ``f1_all``,
    ``f1_content``, ``base_f1`` (after the subset boost), ``substring_match``
    and ``subset_boost``.
    """
    # Token-level scores, with and without stopwords.
    all_pred, all_gold = tokenize(pred), tokenize(gold)
    content_pred = remove_stopwords(all_pred)
    content_gold = remove_stopwords(all_gold)

    f1_all = f1_bow(all_pred, all_gold)
    f1_content = f1_bow(content_pred, content_gold)
    base_f1 = max(f1_all, f1_content)

    # A short answer made up only of gold content words counts as a good answer.
    pred_vocab = set(content_pred)
    subset_boost = bool(pred_vocab) and pred_vocab <= set(content_gold)
    if subset_boost:
        base_f1 = max(base_f1, 0.8)

    # Containment between the normalized strings is treated as a near-perfect
    # match, but only when neither string is very short.
    pred_norm, gold_norm = normalize_text(pred), normalize_text(gold)
    both_long_enough = len(pred_norm) >= 10 and len(gold_norm) >= 10
    substring_match = both_long_enough and (
        pred_norm in gold_norm or gold_norm in pred_norm
    )

    final_f1 = max(base_f1, 0.9) if substring_match else base_f1
    final_f1 = max(0.0, min(1.0, final_f1))

    return final_f1, {
        "f1_all": f1_all,
        "f1_content": f1_content,
        "base_f1": base_f1,
        "substring_match": substring_match,
        "subset_boost": subset_boost,
    }


# ===========================
# LLM call (uses retrieved chunks)
# ===========================

def _build_rag_prompts(question: str, retrieved_context: str) -> Tuple[str, str]:
    """Return the (system, user) prompt pair for one question and its context."""
    system_prompt = """Bạn là trợ lý AI chuyên nghiệp, trả lời các câu hỏi về động vật trong Sách đỏ Việt Nam.

NHIỆM VỤ CỐT LÕI:
- Chỉ được phép sử dụng thông tin từ "TÀI LIỆU CUNG CẤP" để trả lời..
- Tuyệt đối không thêm bất kỳ thông tin nào bên ngoài tài liệu.
- Nếu không tìm thấy thông tin trong tài liệu để trả lời câu hỏi, hãy trả lời chính xác là: "Không tìm thấy trong tài liệu."
"""

    user_prompt = f"""Dựa vào tài liệu sau:

TÀI LIỆU CUNG CẤP:
\"\"\"
{retrieved_context}
\"\"\"

Hãy trả lời câu hỏi sau:
CÂU HỎI: {question}
"""
    return system_prompt, user_prompt


def _fit_context_to_budget(tokenizer, context: str, max_tokens: int) -> str:
    """Keep at most the first ``max_tokens`` tokens of ``context``."""
    if max_tokens <= 0:
        return ""
    token_ids = tokenizer(context, add_special_tokens=False)["input_ids"]
    if len(token_ids) <= max_tokens:
        return context
    return tokenizer.decode(token_ids[:max_tokens], skip_special_tokens=True)


def call_llm(
    question: str,
    context_chunks: List[Dict],
    max_new_tokens: int = 256,
    max_prompt_tokens: int = 2048,
) -> str:
    """Answer ``question`` with the LLM, grounded on the retrieved chunks.

    Args:
        question: the user question.
        context_chunks: retrieved chunk records; their ``"text"`` values are
            joined with blank lines to form the context.
        max_new_tokens: generation length limit.
        max_prompt_tokens: prompt length limit, applied only on the plain-prompt
            path (tokenizers without a chat template).

    Returns:
        The generated answer with any leading "TRẢ LỜI:"-style marker removed,
        or the fixed "not found" sentence when the context is empty.
    """
    # Build context from retrieved chunks; tolerate a missing or None "text".
    retrieved_context = "\n\n".join((c.get("text") or "") for c in context_chunks)

    # If context is basically empty, obey the task rule explicitly.
    # Checked before get_llm() so no model is loaded just to return a constant.
    if not retrieved_context.strip():
        return "Không tìm thấy trong tài liệu."

    model, tokenizer = get_llm()
    device = next(model.parameters()).device

    # Every tokenizer has an apply_chat_template *method*; what matters is
    # whether a template is actually configured.
    if getattr(tokenizer, "chat_template", None):
        system_prompt, user_prompt = _build_rag_prompts(question, retrieved_context)
        messages = [
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ]
        encoded = tokenizer.apply_chat_template(
            messages,
            add_generation_prompt=True,
            tokenize=True,
            return_tensors="pt",
        )
        # Depending on the transformers version this is a Tensor or a mapping.
        if isinstance(encoded, torch.Tensor):
            input_ids, attention_mask = encoded, None
        else:
            input_ids = encoded["input_ids"]
            attention_mask = encoded.get("attention_mask")
    else:
        # Fallback: no chat template, build a plain prompt.
        def _plain_prompt(context: str) -> str:
            system_prompt, user_prompt = _build_rag_prompts(question, context)
            return f"{system_prompt}\n\n{user_prompt}\n\nTRẢ LỜI:"

        # Trim the *context* to fit the budget. Right-truncating the whole
        # prompt would cut off the question and the answer cue at its end.
        overhead = len(tokenizer(_plain_prompt(""))["input_ids"])
        fitted_context = _fit_context_to_budget(
            tokenizer, retrieved_context, max_prompt_tokens - overhead
        )
        encoded = tokenizer(
            _plain_prompt(fitted_context),
            return_tensors="pt",
            truncation=True,  # last-resort guard; the context is already trimmed
            max_length=max_prompt_tokens,
        )
        input_ids = encoded["input_ids"]
        attention_mask = encoded.get("attention_mask")

    input_ids = input_ids.to(device)
    # A single unpadded sequence: every position is a real token. Passing the
    # mask explicitly matters because pad_token_id may equal eos_token_id.
    if attention_mask is None:
        attention_mask = torch.ones_like(input_ids)
    else:
        attention_mask = attention_mask.to(device)

    with torch.no_grad():
        outputs = model.generate(
            input_ids=input_ids,
            attention_mask=attention_mask,
            max_new_tokens=max_new_tokens,
            do_sample=False,  # greedy decoding; sampling knobs are not passed
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id,
        )

    # Cut off the prompt by token count. Comparing decoded text against the
    # prompt string is unreliable once the prompt is truncated or normalized.
    prompt_len = input_ids.shape[1]
    answer = tokenizer.decode(outputs[0, prompt_len:], skip_special_tokens=True).strip()

    # Clean up leading markers like "TRẢ LỜI:" etc.
    answer = re.sub(
        r"^(TRẢ LỜI|trả lời|assistant|trợ lý)[:：]\s*",
        "",
        answer,
        flags=re.IGNORECASE,
    ).strip()

    return answer



# ===========================
# Evaluate one KB
# ===========================

def evaluate_kb(kb_id: str, top_k: int = 10) -> None:
    """Run the full QA evaluation for one knowledge base and save the report.

    For every QA pair: retrieve ``top_k`` chunks, generate an answer, and score
    it with exact match and smoothed F1 (a "relaxed hit" is F1 >= 0.3).
    Per-question results and the averaged summary are written to
    ``OUT_DIR / "<kb_id>.json"``; progress and the summary are printed.

    Args:
        kb_id: key into ``KB_CONFIGS``.
        top_k: number of chunks retrieved per question.
    """
    print(f"=== Evaluating KB: {kb_id} ===")

    index, chunks, embed_model_name = load_kb(kb_id)
    embedder = get_embedder(embed_model_name)
    qa_pairs = load_qa_pairs()
    n_questions = len(qa_pairs)

    def _mean(values):
        # Average of a list, defined as 0.0 for an empty list.
        return sum(values) / len(values) if values else 0.0

    results = []
    for i, qa in enumerate(qa_pairs):
        q = qa["question"]
        gold = qa["answer"]

        retrieved, scores = retrieve_topk(index, embedder, q, chunks, top_k=top_k)
        pred = call_llm(q, retrieved)

        em = exact_match(pred, gold)
        f1, debug = smoothed_f1(pred, gold)

        record = {
            "id": i,
            "question": q,
            "gold_answer": gold,
            "model_answer": pred,
            "exact_match": em,
            "f1": f1,
            "relaxed_hit": f1 >= 0.3,
            "retrieved_chunk_ids": [c.get("id") for c in retrieved],
            "retrieved_scores": [float(s) for s in scores],
            "f1_debug": debug,
        }
        results.append(record)

        print(f"[{kb_id}] {i}/{n_questions} → EM={em}, F1={f1:.3f}")

    summary = {
        "kb_id": kb_id,
        "n_questions": n_questions,
        "exact_match": _mean([1.0 if r["exact_match"] else 0.0 for r in results]),
        "f1": _mean([r["f1"] for r in results]),
        "relaxed_accuracy": _mean([1.0 if r["relaxed_hit"] else 0.0 for r in results]),
    }

    out_path = OUT_DIR / f"{kb_id}.json"
    report = json.dumps(
        {"summary": summary, "results": results},
        ensure_ascii=False,
        indent=2,
    )
    out_path.write_text(report, encoding="utf-8")

    print(
        f"[{kb_id}] DONE. "
        f"EM={summary['exact_match']:.3f}, "
        f"F1={summary['f1']:.3f}, "
        f"RelaxedAcc(F1>=0.3)={summary['relaxed_accuracy']:.3f}"
    )
    print(f"Saved to {out_path}")


# ===========================
# Main
# ===========================

def main():
    """Evaluate every knowledge base in ``KB_CONFIGS``, in order, with top_k=10."""
    for kb_id in KB_CONFIGS:
        evaluate_kb(kb_id, top_k=10)

main()
