In [10]:
!pip install faiss-cpu PyMuPDF



In [11]:
!pip install -U transformers



In [12]:
from huggingface_hub import login
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv‚Ä¶

In [13]:
!pip install rank_bm25



In [14]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("bkai-foundation-models/vietnamese-bi-encoder")
model.save("./vietnamese-bi-encoder")  

In [15]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("cross-encoder/ms-marco-MiniLM-L-12-v2")
model.save("./ms-marco-MiniLM-L-12-v2")  

In [16]:
%%writefile HFLLMAdapter.py
import os
import faiss
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from sentence_transformers import SentenceTransformer
from typing import List
import textwrap

class HFLLMAdapter:
    def __init__(self, model_id="/kaggle/input/qwen_law_instruct_v3/keras/default/2", device=None):
        self.device = device or ("cuda" if torch.cuda.is_available() else "cpu")
        self.tokenizer = AutoTokenizer.from_pretrained(model_id)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16 if "cuda" in self.device else torch.float32,
            device_map="auto"
        )

    def generate(self, prompt: str, max_new_tokens: int = 512) -> str:
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        with torch.no_grad():
            outputs = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=0.3,
                do_sample=False
            )
        return self.tokenizer.decode(outputs[0], skip_special_tokens=True)


Overwriting HFLLMAdapter.py


In [17]:
%%writefile LegalRetriever.py
import re
import numpy as np
import faiss
from rank_bm25 import BM25Okapi
from typing import List, Tuple, Dict
from sentence_transformers import SentenceTransformer
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification


class HybridLegalRetriever:
    def __init__(
        self,
        embedder_path: str = "./vietnamese-bi-encoder",
        reranker_model: str = "./ms-marco-MiniLM-L-12-v2",
        top_k: int = 3,
        batch_size: int = 8,
    ):
        # Embedding model
        self.embedder = SentenceTransformer(embedder_path)
        self.top_k = top_k
        self.batch_size = batch_size

        # Reranker (sequence classification model)
        self.reranker_tokenizer = AutoTokenizer.from_pretrained(reranker_model)
        self.reranker_model = AutoModelForSequenceClassification.from_pretrained(reranker_model)
        self.device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
        self.reranker_model.to(self.device)
        self.reranker_model.eval()

        # Indexes / storage
        self.bm25 = None
        self.faiss_index = None
        self.text_chunks: List[str] = []
        self.embeddings = None

    # =========================================================
    # Build index: BM25 + FAISS
    # =========================================================
    def build_index(self, chunks: List[str]):
        if not chunks:
            raise ValueError("‚ùå Danh s√°ch chunks r·ªóng, kh√¥ng th·ªÉ x√¢y d·ª±ng index.")

        self.text_chunks = chunks

        # --- BM25 index (tokenize with words) ---
        print("üîπ X√¢y d·ª±ng BM25 index...")
        tokenized_corpus = [re.findall(r'\w+', c.lower()) for c in chunks]
        self.bm25 = BM25Okapi(tokenized_corpus)

        # --- FAISS index (embedding vectors) ---
        print("üîπ ƒêang t·∫°o embeddings v√† x√¢y d·ª±ng FAISS index...")
        embeddings = self.embedder.encode(chunks, convert_to_numpy=True, show_progress_bar=True)
        embeddings = np.asarray(embeddings, dtype=np.float32)

        # normalize and add to index
        faiss.normalize_L2(embeddings)
        dim = embeddings.shape[1]
        self.faiss_index = faiss.IndexFlatIP(dim)
        self.faiss_index.add(embeddings)
        self.embeddings = embeddings

        print(f"‚úÖ Ho√†n t·∫•t! ƒê√£ x√¢y d·ª±ng index cho {len(chunks):,} ƒëo·∫°n lu·∫≠t v·ªõi vector dim = {dim}.")
        print(f"   ‚Ä¢ FAISS vectors: {self.faiss_index.ntotal}")
        print(f"   ‚Ä¢ Thi·∫øt b·ªã reranker: {self.device}")

    # =========================================================
    # Topic priority (placeholder, can extend)
    # =========================================================
    def _apply_topic_priority(self, query: str, text: str) -> float:
        # Example: boost when query contains keywords; keep 1.0 by default
        q = query.lower()
        if "m·∫•t nƒÉng l·ª±c" in q or "gi√°m h·ªô" in q or "giao d·ªãch" in q:
            # small boost for chunks that explicitly mention ƒêi·ªÅu/Kho·∫£n related
            if re.search(r"ƒêi·ªÅu\s+\d+", text):
                return 1.15
        return 1.0

    # =========================================================
    # Cross-encoder rerank. Returns list of (text, score)
    # =========================================================
    def _rerank(self, query: str, candidates: List[Tuple[str, float]]) -> List[Tuple[str, float]]:
        if not candidates:
            return []

        pairs = [(query, c[0]) for c in candidates]
        scores = []

        for batch_start in range(0, len(pairs), self.batch_size):
            batch = pairs[batch_start: batch_start + self.batch_size]
            # tokenizer accepts list of tuples for some HF tokenizers; for safety, join with special sep
            # we use encoding of pairs as two sentences
            enc = self.reranker_tokenizer(
                [q for q, _ in batch],
                [c for _, c in batch],
                padding=True,
                truncation=True,
                return_tensors="pt",
                max_length=512,
            )
            enc = {k: v.to(self.device) for k, v in enc.items()}
            with torch.no_grad():
                logits = self.reranker_model(**enc).logits  # shape (batch, num_labels)
                # If num_labels >1, pick first logit or compute score appropriately.
                # Many cross-encoders return single logit per pair (regression); handle both cases:
                if logits.dim() == 2 and logits.size(1) == 1:
                    batch_scores = torch.sigmoid(logits).squeeze(-1).cpu().numpy()
                else:
                    # fallback: take softmax over labels and take max-prob
                    probs = torch.softmax(logits, dim=1)
                    batch_scores = probs[:, 0].cpu().numpy()  # heuristic
            scores.extend(batch_scores.tolist())

        reranked = [(c[0], float(s)) for c, s in zip(candidates, scores)]
        reranked.sort(key=lambda x: x[1], reverse=True)
        return reranked

    # =========================================================
    # Retrieve top_k candidates
    # =========================================================
    def retrieve(self, query: str, top_k: int = None) -> List[Tuple[str, float]]:
        if self.faiss_index is None or self.bm25 is None:
            raise RuntimeError("‚ùå Ch∆∞a c√≥ index. H√£y g·ªçi build_index() tr∆∞·ªõc khi truy v·∫•n.")

        if top_k is None:
            top_k = self.top_k

        # --- FAISS semantic search ---
        q_emb = self.embedder.encode([query], convert_to_numpy=True)
        q_emb = np.asarray(q_emb, dtype=np.float32)
        faiss.normalize_L2(q_emb)
        D, I = self.faiss_index.search(q_emb, top_k)
        faiss_results = []
        for idx, score in zip(I[0], D[0]):
            if idx < 0:
                continue
            faiss_results.append((self.text_chunks[int(idx)], float(score)))

        # --- BM25 lexical search ---
        bm25_scores = self.bm25.get_scores(re.findall(r'\w+', query.lower()))
        bm25_top_idx = np.argsort(bm25_scores)[::-1][:top_k]
        bm25_results = [(self.text_chunks[int(i)], float(bm25_scores[int(i)])) for i in bm25_top_idx]

        # --- Merge, keep best score per doc ---
        merged: Dict[str, float] = {}
        for text, score in faiss_results + bm25_results:
            merged[text] = max(merged.get(text, 0.0), float(score))

        # --- Apply topic boosting ---
        boosted = [(text, score * self._apply_topic_priority(query, text)) for text, score in merged.items()]

        # --- Rerank by cross-encoder ---
        reranked = self._rerank(query, boosted)

        # --- Final top_k ---
        results = reranked[:top_k]

        # Print brief preview for debugging
        print(f"üîé Truy v·∫•n: {query!s}")
        print(f"üìë Top {len(results)} k·∫øt qu·∫£:")
        for i, (text, score) in enumerate(results, 1):
            preview = text.replace("\n", " ")[:200]
            print(f"   {i:02d}. ({score:.4f}) {preview}...")

        return results

    def __repr__(self):
        n = len(self.text_chunks)
        dim = self.embeddings.shape[1] if self.embeddings is not None else "?"
        return f"<HybridLegalRetriever: {n} docs | dim={dim} | device={self.device}>"


Overwriting LegalRetriever.py


In [18]:
%%writefile LegalRAGProcessor.py
import re
import ast
import textwrap
import torch
from typing import List, Tuple


LEGAL_PROMPT = """\
B·∫°n l√† **tr·ª£ l√Ω ph√°p l√Ω chuy√™n nghi·ªáp**, chuy√™n t∆∞ v·∫•n v√† ph√¢n t√≠ch **theo ph√°p lu·∫≠t Vi·ªát Nam**.

- Kh√¥ng tr√≠ch nguy√™n vƒÉn to√†n b·ªô ƒëi·ªÅu lu·∫≠t, ch·ªâ n√™u s·ªë ƒëi·ªÅu, kho·∫£n, ch∆∞∆°ng.

- Kh√¥ng vi·∫øt lan man ho·∫∑c l·∫≠p lu·∫≠n ngo√†i quy ƒë·ªãnh ph√°p lu·∫≠t.



‚öñÔ∏è H∆∞·ªõng d·∫´n ƒë·ªãnh d·∫°ng b·∫Øt bu·ªôc:

Tr·∫£ l·ªùi theo **ƒë√∫ng 4 m·ª•c ƒë√°nh s·ªë sau** (b·∫Øt bu·ªôc):

1. **Lƒ©nh v·ª±c** ‚Äì x√°c ƒë·ªãnh lƒ©nh v·ª±c lu·∫≠t √°p d·ª•ng (d√¢n s·ª±, lao ƒë·ªông, h√¥n nh√¢n v√† gia ƒë√¨nh, h√¨nh s·ª±, ...).

2. **CƒÉn c·ª© ph√°p l√Ω** ‚Äì ghi r√µ [CH∆Ø∆†NG], [ƒêI·ªÄU LU·∫¨T], [KHO·∫¢N] c√≥ li√™n quan.

3. **Ph√¢n t√≠ch** ‚Äì gi·∫£i th√≠ch ng·∫Øn g·ªçn t√¨nh hu·ªëng theo quy ƒë·ªãnh ph√°p lu·∫≠t.

4. **K·∫øt lu·∫≠n** ‚Äì n√™u r√µ giao d·ªãch, h√†nh vi, ho·∫∑c t√¨nh hu·ªëng c√≥ h·ª£p l·ªá hay kh√¥ng, v√† ƒëi·ªÅu ki·ªán c·∫ßn c√≥.



---



**C√¢u h·ªèi:**

{query}



**C√°c cƒÉn c·ª© ph√°p l√Ω ƒë∆∞·ª£c truy xu·∫•t t·ª´ kho d·ªØ li·ªáu (gi·ªØ nguy√™n k√Ω hi·ªáu):**

{context_text}



---



‚û°Ô∏è **Ph·∫ßn tr·∫£ l·ªùi ch√≠nh th·ª©c:**
"""


class LegalRAGProcessor:
    def __init__(self, llm_adapter, retriever, top_k=5, max_new_tokens=512):
        self.llm = llm_adapter
        self.retriever = retriever
        self.top_k = top_k
        self.max_new_tokens = max_new_tokens

    def _build_prompt(self, query, context_text):
        return LEGAL_PROMPT.format(query=query.strip(), context_text=context_text.strip())

    def _clean_context(self, docs: List[Tuple[str, float]]) -> str:
        merged = []
        seen = set()
        for i, (text, score) in enumerate(docs, 1):
            cleaned = re.sub(r'[ \t]+', ' ', text.strip())
            cleaned = cleaned.replace("√è", "I")
            if cleaned not in seen:
                seen.add(cleaned)
                merged.append(f"[ƒêO·∫†N {i}] {cleaned}")
        return "\n\n".join(merged[: self.top_k])

    def _generate(self, prompt_text: str) -> str:
        inputs = self.llm.tokenizer(prompt_text, return_tensors="pt", truncation=True).to(self.llm.device)
        outputs = self.llm.model.generate(
            **inputs,
            max_new_tokens=self.max_new_tokens,
            temperature=0.3,
            top_p=0.9,
            repetition_penalty=1.1,
            do_sample=False,
            eos_token_id=self.llm.tokenizer.eos_token_id,
        )
        text = self.llm.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return self._clean_response(text)

    def _try_parse_dict_text(self, text: str):
        m = re.search(r"\{.*\}", text, re.S)
        if not m:
            return None
        blob = m.group(0)
        try:
            parsed = ast.literal_eval(blob)
            if isinstance(parsed, dict):
                return parsed
        except Exception:
            try:
                clean_blob = blob.replace("\n", " ").replace("None", "null")
                parsed = ast.literal_eval(clean_blob)
                if isinstance(parsed, dict):
                    return parsed
            except Exception:
                return None
        return None

    def _extract_law_citations(self, text: str) -> str:
        if not text:
            return ""
        t = re.sub(r"\s+", " ", text)
        matches = []
        for m in re.finditer(r"ƒêi·ªÅu\s*\d+", t, flags=re.I):
            s = m.group(0)
            tail = ""
            rest = t[m.end(): m.end() + 60]
            k = re.search(r"(Kho·∫£n|kho·∫£n)\s*\d+", rest)
            if k:
                tail = " " + k.group(0)
            d = re.search(r"(ƒêi·ªÉm|ƒëi·ªÉm)\s*[a-z0-9]+", rest)
            if d:
                tail += " " + d.group(0)
            matches.append((s + tail).strip())
        if not matches:
            for m in re.finditer(r"Kho·∫£n\s*\d+", t, flags=re.I):
                matches.append(m.group(0))
        uniq = []
        for it in matches:
            it_norm = re.sub(r"\s+", " ", it).strip()
            if it_norm not in uniq:
                uniq.append(it_norm)
        return ", ".join(uniq)

    def _clean_cite_field(self, cite_text: str, context_text: str) -> str:
        c = ""
        if cite_text and len(cite_text) < 300:
            c = self._extract_law_citations(cite_text)
        if not c:
            c = self._extract_law_citations(context_text)
        if not c and context_text:
            tags = re.findall(r"\[ƒêI·ªÄU LU·∫¨T\]\s*ƒêi·ªÅu\s*\d+", context_text, flags=re.I)
            tags += re.findall(r"\[KHO·∫¢N\]\s*Kho·∫£n\s*\d+", context_text, flags=re.I)
            tags = [re.sub(r"\[.*?\]\s*", "", t).strip() for t in tags]
            c = ", ".join(dict.fromkeys(tags))
        return c or "Ch∆∞a x√°c ƒë·ªãnh"

    def _clean_response(self, text: str) -> str:
        text = re.sub(r"(?is)^.*?Ph·∫ßn tr·∫£ l·ªùi ch√≠nh th·ª©c[:Ôºö]\s*", "", text).strip()
        original_text = text
        
        structured = {
            "Lƒ©nh v·ª±c": "",
            "CƒÉn c·ª© ph√°p l√Ω": "",
            "Ph√¢n t√≠ch": "",
            "K·∫øt lu·∫≠n": ""
        }

        parsed = self._try_parse_dict_text(text)
        if parsed and isinstance(parsed, dict):
            for raw_key, value in parsed.items():
                if not isinstance(raw_key, str):
                    continue
                key_lower = raw_key.strip().lower()
                if any(x in key_lower for x in ["lƒ©nh v·ª±c", "linh v·ª±c", "field"]):
                    structured["Lƒ©nh v·ª±c"] = str(value).strip()
                elif any(x in key_lower for x in ["cƒÉn c·ª©", "can cu", "legal", "ph√°p l√Ω"]):
                    structured["CƒÉn c·ª© ph√°p l√Ω"] = str(value).strip()
                elif any(x in key_lower for x in ["ph√¢n t√≠ch", "phan tich", "analysis"]):
                    structured["Ph√¢n t√≠ch"] = str(value).strip()
                elif any(x in key_lower for x in ["k·∫øt lu·∫≠n", "ket luan", "conclusion"]):
                    structured["K·∫øt lu·∫≠n"] = str(value).strip()
        else:
            lines = text.split('\n')
            current_section = None
            section_content = []
            for line in lines:
                line_stripped = line.strip()
                if not line_stripped:
                    continue
                is_heading = False
                if re.search(r"^\s*1[\.\):]?\s*\*{0,2}Lƒ©nh\s*v·ª±c", line_stripped, flags=re.I):
                    if current_section and section_content:
                        structured[current_section] = " ".join(section_content).strip()
                    current_section = "Lƒ©nh v·ª±c"
                    section_content = []
                    content = re.sub(r"^\s*1[\.\):]?\s*\*{0,2}Lƒ©nh\s*v·ª±c.*?:\s*", "", line_stripped, flags=re.I)
                    if content:
                        section_content.append(content)
                    is_heading = True
                elif re.search(r"^\s*2[\.\):]?\s*\*{0,2}CƒÉn\s*c·ª©\s*ph√°p\s*l√Ω", line_stripped, flags=re.I):
                    if current_section and section_content:
                        structured[current_section] = " ".join(section_content).strip()
                    current_section = "CƒÉn c·ª© ph√°p l√Ω"
                    section_content = []
                    content = re.sub(r"^\s*2[\.\):]?\s*\*{0,2}CƒÉn\s*c·ª©\s*ph√°p\s*l√Ω.*?:\s*", "", line_stripped, flags=re.I)
                    if content:
                        section_content.append(content)
                    is_heading = True
                elif re.search(r"^\s*3[\.\):]?\s*\*{0,2}Ph√¢n\s*t√≠ch", line_stripped, flags=re.I):
                    if current_section and section_content:
                        structured[current_section] = " ".join(section_content).strip()
                    current_section = "Ph√¢n t√≠ch"
                    section_content = []
                    content = re.sub(r"^\s*3[\.\):]?\s*\*{0,2}Ph√¢n\s*t√≠ch.*?:\s*", "", line_stripped, flags=re.I)
                    if content:
                        section_content.append(content)
                    is_heading = True
                elif re.search(r"^\s*4[\.\):]?\s*\*{0,2}K·∫øt\s*lu·∫≠n", line_stripped, flags=re.I):
                    if current_section and section_content:
                        structured[current_section] = " ".join(section_content).strip()
                    current_section = "K·∫øt lu·∫≠n"
                    section_content = []
                    content = re.sub(r"^\s*4[\.\):]?\s*\*{0,2}K·∫øt\s*lu·∫≠n.*?:\s*", "", line_stripped, flags=re.I)
                    if content:
                        section_content.append(content)
                    is_heading = True
                if not is_heading and current_section:
                    if not (line_stripped.startswith("'") or line_stripped.startswith("{") or line_stripped.startswith("}")):
                        section_content.append(line_stripped)
            if current_section and section_content:
                structured[current_section] = " ".join(section_content).strip()

        if not structured["Lƒ©nh v·ª±c"]:
            linh_vuc_patterns = [
                r"(?:Lƒ©nh\s*v·ª±c|Field)[:Ôºö\-‚Äì]?\s*([^\n\.\,]{5,80})",
                r"thu·ªôc\s+lƒ©nh\s+v·ª±c\s+([^\n\.\,]{5,50})",
                r"(?:Lu·∫≠t|LU·∫¨T)\s+([A-ZƒêƒÇ√Ç√ä√î∆†∆Ø][a-zƒëƒÉ√¢√™√¥∆°∆∞\s]+)",
            ]
            for pattern in linh_vuc_patterns:
                match = re.search(pattern, original_text, flags=re.I)
                if match:
                    structured["Lƒ©nh v·ª±c"] = match.group(1).strip()
                    break

        if not structured["CƒÉn c·ª© ph√°p l√Ω"]:
            structured["CƒÉn c·ª© ph√°p l√Ω"] = self._extract_law_citations(original_text)

        if not structured["Ph√¢n t√≠ch"]:
            phan_tich_patterns = [
                r"(?:Ph√¢n\s*t√≠ch|Analysis)[:Ôºö\-‚Äì]?\s*([^\n]{50,400})",
                r"(?:Theo|cƒÉn c·ª©|d·ª±a v√†o).*?(?:ƒêi·ªÅu|Kho·∫£n).*?[,\.]?\s*([^\n]{50,300})",
            ]
            for pattern in phan_tich_patterns:
                match = re.search(pattern, original_text, flags=re.I | re.S)
                if match:
                    structured["Ph√¢n t√≠ch"] = match.group(1).strip()
                    break

        if not structured["K·∫øt lu·∫≠n"]:
            ket_luan_patterns = [
                r"(?:K·∫øt\s*lu·∫≠n|Conclusion)[:Ôºö\-‚Äì]?\s*([^\n]{20,400})",
                r"(?:V·∫≠y|Do ƒë√≥|Nh∆∞ v·∫≠y)[,\s]+([^\n]{20,300})",
                r"(?:h·ª£p\s*l·ªá|kh√¥ng\s*h·ª£p\s*l·ªá|ƒë∆∞·ª£c\s*ph√©p|kh√¥ng\s*ƒë∆∞·ª£c\s*ph√©p).*?([^\n]{10,200})",
            ]
            for pattern in ket_luan_patterns:
                match = re.search(pattern, original_text, flags=re.I)
                if match:
                    structured["K·∫øt lu·∫≠n"] = match.group(1).strip()
                    break

        structured["CƒÉn c·ª© ph√°p l√Ω"] = self._clean_cite_field(structured.get("CƒÉn c·ª© ph√°p l√Ω", ""), original_text)

        for key in ["Ph√¢n t√≠ch", "K·∫øt lu·∫≠n", "Lƒ©nh v·ª±c"]:
            value = structured.get(key, "") or ""
            value = re.sub(r"^[\'\"\{\[].*?[\'\"\}\]]\s*[:,]?\s*", "", value)
            value = re.sub(r"[\{\}\[\]]+", "", value)
            value = re.sub(r"[\'\"]{2,}", "", value)
            value = re.sub(r"\s+", " ", value).strip()
            if len(value) > 500:
                value = textwrap.shorten(value, width=500, placeholder="...")
            structured[key] = value

        # =========================
        # CLEAN-UP K·∫æT LU·∫¨N CH·∫∂T H∆†N
        #  - Lo·∫°i b·ªè m·ªçi m·∫£nh dict-like ho·∫∑c nh√£n tr∆∞·ªùng c√≤n s√≥t
        #  - Lo·∫°i b·ªè c√°c fragment d·∫°ng: , 'Lƒ©nh v·ª±c': '...'
        #  - Gi·ªØ t·ªëi ƒëa 1-2 c√¢u sau khi d·ªçn s·∫°ch
        # =========================
        if structured["K·∫øt lu·∫≠n"]:
            kl = structured["K·∫øt lu·∫≠n"]

            # 1) lo·∫°i b·ªè c√°c fragment dict-like: 'Lƒ©nh v·ª±c': '...'
            kl = re.sub(r"[,\s]*['\"]?\s*(Lƒ©nh v·ª±c|CƒÉn c·ª© ph√°p l√Ω|Ph√¢n t√≠ch|K·∫øt lu·∫≠n)\s*['\"]?\s*[:Ôºö]\s*['\"]?[^,'\"\}\]]+['\"]?", "", kl, flags=re.I)

            # 2) lo·∫°i b·ªè c√°c fragment d·∫°ng , Lƒ©nh v·ª±c: ... ho·∫∑c , Lƒ©nh v·ª±c '...'
            kl = re.sub(r",\s*(Lƒ©nh v·ª±c|CƒÉn c·ª© ph√°p l√Ω|Ph√¢n t√≠ch|K·∫øt lu·∫≠n)\s*[:Ôºö]\s*[^,\.]+", "", kl, flags=re.I)

            # 3) lo·∫°i b·ªè b·∫•t k·ª≥ c·∫∑p kh√≥a-gi√° tr·ªã d·∫°ng key=... ho·∫∑c key:... c√≤n s√≥t
            kl = re.sub(r"\b(Lƒ©nh v·ª±c|CƒÉn c·ª© ph√°p l√Ω|Ph√¢n t√≠ch|K·∫øt lu·∫≠n)\b\s*[=:]\s*[^,\.]{1,200}", "", kl, flags=re.I)

            # 4) d·ªçn s·∫°ch ngo·∫∑c, ngo·∫∑c nh·ªçn, d·∫•u nh√°y d∆∞
            kl = re.sub(r"[\{\}\[\]\"]+", "", kl)
            kl = re.sub(r"\s+", " ", kl).strip()

            # 5) c·∫Øt ch·ªâ 1-2 c√¢u ch·ªët
            sentences = re.split(r"(?<=[.!?])\s+", kl)
            # Remove any empty strings
            sentences = [s.strip() for s in sentences if s.strip()]
            if sentences:
                kl_final = " ".join(sentences[:2])
            else:
                kl_final = kl.strip()
            structured["K·∫øt lu·∫≠n"] = kl_final

        defaults = {
            "Lƒ©nh v·ª±c": "Lu·∫≠t D√¢n s·ª±",
            "CƒÉn c·ª© ph√°p l√Ω": "Ch∆∞a x√°c ƒë·ªãnh c·ª• th·ªÉ",
            "Ph√¢n t√≠ch": "C·∫ßn xem x√©t c√°c quy ƒë·ªãnh ph√°p lu·∫≠t c√≥ li√™n quan ƒë·ªÉ ƒë∆∞a ra ph√¢n t√≠ch ch√≠nh x√°c.",
            "K·∫øt lu·∫≠n": "C·∫ßn th√™m th√¥ng tin ƒë·ªÉ k·∫øt lu·∫≠n ch√≠nh x√°c v·ªÅ t√≠nh h·ª£p l·ªá."
        }
        for key in ["Lƒ©nh v·ª±c", "CƒÉn c·ª© ph√°p l√Ω", "Ph√¢n t√≠ch", "K·∫øt lu·∫≠n"]:
            if not structured[key] or len(structured[key]) < 3:
                structured[key] = defaults[key]

        final = []
        order = ["Lƒ©nh v·ª±c", "CƒÉn c·ª© ph√°p l√Ω", "Ph√¢n t√≠ch", "K·∫øt lu·∫≠n"]
        for i, key in enumerate(order, start=1):
            content = structured[key].strip()
            final.append(f"{i}. **{key}** ‚Äì {content}")
        return "\n\n".join(final)

    def _validate_structure(self, text: str) -> bool:
        required = ["Lƒ©nh v·ª±c", "CƒÉn c·ª© ph√°p l√Ω", "Ph√¢n t√≠ch", "K·∫øt lu·∫≠n"]
        return all(k in text for k in required)

    def _regenerate_if_incomplete(self, text: str, query: str, context_text: str) -> str:
        if self._validate_structure(text):
            return text
        repair_prompt = (
            f"C√¢u tr·∫£ l·ªùi tr√™n ch∆∞a ƒë√∫ng 4 m·ª•c. H√£y vi·∫øt l·∫°i ng·∫Øn g·ªçn theo 4 m·ª•c:\n"
            f"1. **Lƒ©nh v·ª±c** ‚Äì lƒ©nh v·ª±c lu·∫≠t √°p d·ª•ng\n"
            f"2. **CƒÉn c·ª© ph√°p l√Ω** ‚Äì ch·ªâ n√™u [CH∆Ø∆†NG], [ƒêI·ªÄU LU·∫¨T], [KHO·∫¢N]\n"
            f"3. **Ph√¢n t√≠ch** ‚Äì gi·∫£i th√≠ch ng·∫Øn g·ªçn\n"
            f"4. **K·∫øt lu·∫≠n** ‚Äì n√™u r√µ t√≠nh h·ª£p l·ªá v√† ƒëi·ªÅu ki·ªán\n\n"
            f"C√¢u h·ªèi: {query}\n\nCƒÉn c·ª© ph√°p l√Ω:\n{context_text}"
        )
        inputs = self.llm.tokenizer(repair_prompt, return_tensors="pt", truncation=True).to(self.llm.device)
        outputs = self.llm.model.generate(
            **inputs, 
            max_new_tokens=self.max_new_tokens, 
            temperature=0.3
        )
        fixed = self.llm.tokenizer.decode(outputs[0], skip_special_tokens=True)
        return self._clean_response(fixed)

    def process_query(self, query: str, top_k: int = None) -> str:
        docs = self.retriever.retrieve(query, top_k or self.top_k)
        context_text = self._clean_context(docs)
        prompt = self._build_prompt(query, context_text)
        answer = self._generate(prompt)
        return self._regenerate_if_incomplete(answer, query, context_text).strip()


Overwriting LegalRAGProcessor.py


In [19]:
%%writefile build_rag_from_txt_folder.py
import re
import os
from tqdm import tqdm
from typing import List

def normalize_legal_text(text: str) -> str:
    text = re.sub(r"\s+", " ", text)
    text = re.sub(r"(?<=ƒêi·ªÅu)\s+(\d+)", r" \1.", text)
    text = re.sub(r"(?<=Kho·∫£n)\s+(\d+)", r" \1.", text)
    text = re.sub(r"(?<!\.)\s*(?=ƒêi·ªÅu\s+\d+\.)", "\n", text)
    text = re.sub(r"(?<!\.)\s*(?=Kho·∫£n\s+\d+\.)", "\n", text)
    return text.strip()

def _tag_structures(text: str) -> str:
    """G·∫Øn tag [CH∆Ø∆†NG], [ƒêI·ªÄU], [KHO·∫¢N]"""
    chapter = re.findall(r"(CH∆Ø∆†NG\s+[IVXLC]+[^ƒê]*)", text)
    article = re.findall(r"(ƒêI·ªÄU LU·∫¨T\s+\d+)", text)
    clause = re.findall(r"(KHO·∫¢N\s+\d+)", text)

    ch = f"[CH∆Ø∆†NG] {chapter[0].strip()}" if chapter else ""
    di = f"[ƒêI·ªÄU LU·∫¨T] {article[0].strip()}" if article else ""
    kh = f"[KHO·∫¢N] {clause[0].strip()}" if clause else ""

    return " ".join([ch, di, kh]).strip()

def split_legal_text(text: str, max_length: int = 1000) -> List[str]:
    """T√°ch vƒÉn b·∫£n lu·∫≠t v√† g·∫Øn nh√£n c·∫•u tr√∫c."""
    text = re.sub(r"\s+", " ", text)
    sections = re.split(r"(?=ƒêi·ªÅu\s+\d+\.?)", text)
    refined_chunks = []

    for sec in sections:
        if len(sec.strip()) < 20:
            continue
        if len(sec) > max_length * 2:
            subchunks = re.split(r"(?=Kho·∫£n\s+\d+\.?)", sec)
            refined_chunks += [f"{_tag_structures(s)} {s.strip()}" for s in subchunks if len(s.strip()) > 50]
        else:
            refined_chunks.append(f"{_tag_structures(sec)} {sec.strip()}")

    return [c.strip() for c in refined_chunks if len(c.strip()) > 50]



def extract_text_from_txt(txt_path: str) -> str:
    """ƒê·ªçc n·ªôi dung vƒÉn b·∫£n t·ª´ file .txt (UTF-8)."""
    try:
        with open(txt_path, "r", encoding="utf-8") as f:
            text = f.read()
        return text
    except UnicodeDecodeError:
        # fallback n·∫øu file m√£ h√≥a kh√°c
        with open(txt_path, "r", encoding="utf-8-sig", errors="ignore") as f:
            return f.read()


def build_rag_from_txt_folder(folder_path: str):
    """
    ƒê·ªçc t·∫•t c·∫£ file .txt trong th∆∞ m·ª•c, chia nh·ªè th√†nh c√°c ƒëo·∫°n (chunks)
    ƒë·ªÉ d√πng l√†m d·ªØ li·ªáu cho m√¥ h√¨nh RAG.
    """
    txt_files = [f for f in os.listdir(folder_path) if f.lower().endswith(".txt")]
    print(f"üìö ƒêang ƒë·ªçc {len(txt_files)} t·ªáp TXT t·ª´: {folder_path}")

    all_chunks = []
    for file_name in tqdm(txt_files):
        txt_path = os.path.join(folder_path, file_name)
        try:
            text = extract_text_from_txt(txt_path)
            # ‚úÖ D√πng h√†m m·ªõi
            chunks = split_legal_text(text, max_length=1000)
            all_chunks.extend(chunks)
        except Exception as e:
            print(f"‚ö†Ô∏è L·ªói khi x·ª≠ l√Ω {file_name}: {e}")

    print(f"‚úÖ T·ªïng s·ªë ƒëo·∫°n vƒÉn b·∫£n (chunks): {len(all_chunks)}")
    return all_chunks


Writing build_rag_from_txt_folder.py


In [20]:
%%writefile Decoder.py
import gc, torch
from transformers import AutoTokenizer, AutoModelForCausalLM

class Decoder:
    """
    Decoder v·ªõi prompt template ƒë∆∞·ª£c t·ªëi ∆∞u ƒë·ªÉ sinh ƒë·∫ßy ƒë·ªß 4 ph·∫ßn
    """
    def __init__(self, model_id):
        self.device = "cuda" if torch.cuda.is_available() else "cpu"
        from transformers import AutoTokenizer, AutoModelForCausalLM
        
        self.tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True,use_fast=False)
        self.model = AutoModelForCausalLM.from_pretrained(
            model_id,
            torch_dtype=torch.float16,
            #device_map = 'auto'
            low_cpu_mem_usage=True
        ).eval()
        torch.set_grad_enabled(False)
    
    def _build_context(self, question, retrieved_context):
        """
        Prompt template ƒë∆∞·ª£c c·∫£i thi·ªán ƒë·ªÉ ƒë·∫£m b·∫£o model sinh ƒë·ªß 4 ph·∫ßn
        """
        return f"""B·∫°n l√† tr·ª£ l√Ω ph√°p l√Ω chuy√™n nghi·ªáp c·ªßa Vi·ªát Nam. Nhi·ªám v·ª• c·ªßa b·∫°n l√† tr·∫£ l·ªùi c√¢u h·ªèi ph√°p l√Ω theo ƒë·ªãnh d·∫°ng CHU·∫®N 4 ph·∫ßn b√™n d∆∞·ªõi.

üìö CƒÇN C·ª® PH√ÅP LU·∫¨T:
{retrieved_context}

‚ùì C√ÇU H·ªéI: {question}

‚ö†Ô∏è Y√äU C·∫¶U B·∫ÆT BU·ªòC - Tr·∫£ l·ªùi theo ƒê√öNG 4 PH·∫¶N sau (kh√¥ng ƒë∆∞·ª£c b·ªè s√≥t):

1. **Lƒ©nh v·ª±c** ‚Äì X√°c ƒë·ªãnh r√µ lƒ©nh v·ª±c lu·∫≠t (V√≠ d·ª•: Lu·∫≠t D√¢n s·ª±, Lu·∫≠t H√¨nh s·ª±, Lu·∫≠t Lao ƒë·ªông...).

2. **CƒÉn c·ª© ph√°p l√Ω** ‚Äì Ghi CH√çNH X√ÅC c√°c ƒêi·ªÅu, Kho·∫£n, Ch∆∞∆°ng li√™n quan (V√≠ d·ª•: ƒêi·ªÅu 123 B·ªô lu·∫≠t D√¢n s·ª± 2015).

3. **Ph√¢n t√≠ch** ‚Äì Gi·∫£i th√≠ch CHI TI·∫æT t√¨nh hu·ªëng theo quy ƒë·ªãnh ph√°p lu·∫≠t (t·ªëi thi·ªÉu 2-3 c√¢u).

4. **K·∫øt lu·∫≠n** ‚Äì Kh·∫≥ng ƒë·ªãnh R√ï R√ÄNG t√¨nh hu·ªëng c√≥ h·ª£p ph√°p/b·∫•t h·ª£p ph√°p hay kh√¥ng, v√† ƒëi·ªÅu ki·ªán c·∫ßn c√≥.

QUAN TR·ªåNG: B·∫°n PH·∫¢I sinh ƒë·ªß c·∫£ 4 ph·∫ßn theo th·ª© t·ª± tr√™n. Kh√¥ng ƒë∆∞·ª£c b·ªè qua b·∫•t k·ª≥ ph·∫ßn n√†o.
"""
    def _generate_one(self, inputs, temperature, max_new_tokens):
        with torch.inference_mode():
            output = self.model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                temperature=temperature,
                top_p=0.9,
                do_sample=True,
                pad_token_id=self.tokenizer.eos_token_id,
                repetition_penalty=1.1  # Tr√°nh l·∫∑p l·∫°i
            )
        gen_ids = output[0][inputs["input_ids"].shape[-1]:]
        text = self.tokenizer.decode(gen_ids, skip_special_tokens=True).strip()
        del output
        torch.cuda.empty_cache()
        gc.collect()
        return text
    
    def decode(self, prompt, temperature=0.5, max_new_tokens=512, num_samples=1):
        """
        Sinh tu·∫ßn t·ª± ƒë·ªÉ tr√°nh OOM, m·ªói l·∫ßn 1 c√¢u
        """
        results = []
        inputs = self.tokenizer(prompt, return_tensors="pt").to(self.device)
        
        for i in range(num_samples):
            text = self._generate_one(inputs, temperature, max_new_tokens)
            print(f"\nüß† H∆∞·ªõng #{i+1} (ƒë·ªô d√†i: {len(text)} k√Ω t·ª±)")
            print(text[:500], "..." if len(text) > 500 else "")
            results.append(text)
            torch.cuda.empty_cache()
        
        return results

Writing Decoder.py


In [21]:
%%writefile legal_self_consistency_rag.py
import gc
import re
import time
import json
import torch
from collections import Counter
from sentence_transformers import SentenceTransformer, util

# === GLOBAL: Shared Embedder ===
EMBEDDER = SentenceTransformer("./vietnamese-bi-encoder", device="cuda")
_EMB_BATCH_SIZE = 64
_RETRIEVAL_CACHE = {}
_SENT_EMB_CACHE = {}


def _embed_sentences_cached(sents, device="cuda", batch_size=_EMB_BATCH_SIZE):
    key = tuple(sents)
    if key in _SENT_EMB_CACHE:
        return _SENT_EMB_CACHE[key]
    use_cpu = len(sents) > 200
    enc_device = "cpu" if use_cpu else device
    with torch.no_grad():
        embs = EMBEDDER.encode(
            sents,
            convert_to_tensor=True,
            device=enc_device,
            show_progress_bar=False,
            batch_size=batch_size
        )
    if enc_device == "cpu" and device == "cuda":
        embs = embs.to("cuda")
    _SENT_EMB_CACHE[key] = embs
    return embs


def _clean_model_output(text: str) -> str:
    """
    Lo·∫°i b·ªè c√°c ph·∫ßn kh√¥ng c·∫ßn thi·∫øt t·ª´ output c·ªßa model
    """
    if not text:
        return ""

    # 1. Lo·∫°i b·ªè "C·∫£m ∆°n b·∫°n ƒë√£ tu√¢n th·ªß!"
    text = re.sub(r'(?i)c·∫£m\s+∆°n.*?tu√¢n\s+th·ªß.*?(?:\n|\r)+', '', text)

    # 2. Lo·∫°i b·ªè ph·∫ßn ### Instruction v√† ### Response
    text = re.sub(r'###\s*Instruction:.*?(?=###\s*Response:|$)', '', text, flags=re.DOTALL | re.IGNORECASE)
    text = re.sub(r'###\s*Response:\s*', '', text, flags=re.IGNORECASE)

    # 3. Lo·∫°i b·ªè emoji v√† c√°c k√Ω t·ª± kh√¥ng mong mu·ªën
    text = re.sub(r'[\U0001F300-\U0001F6FF\U0001F900-\U0001F9FFüß†üôèüòä‚≠ê‚ú®üåüüí¨üìùüîÑ]+', '', text)

    # 4. B·ªè ph·∫ßn "C·∫£m ∆°n" ·ªü cu·ªëi ho·∫∑c gi·ªØa
    text = re.sub(r'(?i)^\s*c·∫£m\s+∆°n[\s\S]*$', '', text, flags=re.MULTILINE)

    # 5. Trim v√† chu·∫©n ho√° kho·∫£ng tr·∫Øng
    text = re.sub(r'\r', '\n', text)
    text = re.sub(r'\n{3,}', '\n\n', text)
    text = re.sub(r'[ \t]+', ' ', text)

    return text.strip()

def _extract_from_dict_format(text: str) -> dict:
    """
    Tr√≠ch d·∫°ng dict t·ª´ output c·ªßa model.
    ∆Øu ti√™n dict c√≥ ƒë·ªß 2‚Äì4 m·ª•c h·ª£p l·ªá. Kh√¥ng crash, kh√¥ng m·∫•t d·ªØ li·ªáu.
    """
    sections = {}
    if not text:
        return sections

    # T√¨m t·∫•t c·∫£ block {...}
    dict_matches = re.findall(r'\{[^{}]{10,2000}\}',text)

    if not dict_matches:
        return {}

    best = None
    best_score = -1

    # Ch·ªçn dict c√≥ nhi·ªÅu key h·ª£p l·ªá nh·∫•t
    for d in dict_matches:
        test = d.replace("‚Äú", '"').replace("‚Äù", '"').replace("'", '"')
        try:
            parsed = json.loads(test)
            if not isinstance(parsed, dict):
               continue

            valid_keys = ["lƒ©nh", "linh", "cƒÉn", "can", "ph√¢n", "phan", "k·∫øt", "ket"]
            if len(parsed) > 6:
               continue
            if not any(any(vk in k.lower() for vk in valid_keys) for k in parsed.keys()):
               continue
            keys = parsed.keys()
            score = sum(k in keys for k in ['Lƒ©nh v·ª±c', 'CƒÉn c·ª© ph√°p l√Ω', 'Ph√¢n t√≠ch', 'K·∫øt lu·∫≠n'])
            if score > best_score:
                best_score = score
                best = parsed
        except Exception:
            continue

    if not best:
        return {}

    parsed = best

    # H√†m l·∫•y gi√° tr·ªã theo nhi·ªÅu t√™n kh·∫£ dƒ©
    def take(*keys):
        for k, v in parsed.items():
            if any(x.lower() in k.lower() for x in keys):
                return str(v).strip()
        return ""

    val = take("lƒ©nh", "linh", "field", "area")
    if isinstance(val, dict):
        val = ""
    sections = {
    "linh_vuc": val,
    "can_cu": take("cƒÉn", "can", "legal", "basis"),
    "phan_tich": take("ph√¢n", "phan", "analysis", "reasoning"),
    "ket_luan": take("k·∫øt", "ket", "conclusion", "result")
}


    # Ch·ªâ tr·∫£ c√°c key c√≥ n·ªôi dung th·∫≠t s·ª±
    sections = {k: v for k, v in sections.items() if v and len(v) > 3}

    return sections

    
def _parse_four_sections(text: str) -> dict:
    """
    Parse vƒÉn b·∫£n th√†nh 4 ph·∫ßn b·∫Øt bu·ªôc.
    Tr·∫£ v·ªÅ dict v·ªõi keys: linh_vuc, can_cu, phan_tich, ket_luan
    """
    text = _clean_model_output(text)
    sections = {"linh_vuc": "", "can_cu": "", "phan_tich": "", "ket_luan": ""}

    # ===== FIX: t√°ch block dict JSON-like tr∆∞·ªõc khi regex =====
    dict_sections = _extract_from_dict_format(text)
    if dict_sections:
        return dict_sections

    # N·∫øu kh√¥ng parse ƒë∆∞·ª£c dict ‚Üí khi ƒë√≥ m·ªõi lo·∫°i b·ªè block dict ƒë·ªÉ regex kh√¥ng ƒÉn nh·∫ßm
    m_dict = re.search(r'\{[\s\S]{10,5000}\}', text)
    if m_dict:
        text = text.replace(m_dict.group(0), "").strip()

    patterns = {
    "linh_vuc": r"(?:^|\n)\s*Lƒ©nh v·ª±c\s*[:\-‚Äì]\s*(.+?)(?=\n\S)",
    "can_cu": r"(?:^|\n)\s*CƒÉn c·ª© ph√°p l√Ω\s*[:\-‚Äì]\s*(.+?)(?=\n\S)",
    "phan_tich": r"(?:^|\n)\s*Ph√¢n t√≠ch\s*[:\-‚Äì]\s*(.+?)(?=\n\S)",
    "ket_luan": r"(?:^|\n)\s*K·∫øt lu·∫≠n\s*[:\-‚Äì]\s*(.+)$"
    }
    for key, patt in patterns.items():
        m = re.search(patt, text, re.DOTALL)
        if m:
           sections[key] = m.group(1).strip()

    # N·∫øu v·∫´n ch∆∞a c√≥, th·ª≠ t√°ch theo ƒëo·∫°n vƒÉn (double newline) v√† map heuristically
    if not any(sections.values()):
        parts = [p.strip() for p in re.split(r'\n\n+', text) if p.strip()]
        if parts:
            # heuristic mapping: ƒë·∫ßu ti√™n -> lƒ©nh v·ª±c, ti·∫øp -> cƒÉn c·ª©, ti·∫øp -> ph√¢n t√≠ch, cu·ªëi -> k·∫øt lu·∫≠n
            order = ['linh_vuc', 'can_cu', 'phan_tich', 'ket_luan']
            for i, part in enumerate(parts[:4]):
                sections[order[i]] = part

    return sections


def _deduplicate_sentences(text: str, threshold: float = 0.82) -> str:
    """
    Lo·∫°i c√¢u l·∫∑p d·ª±a tr√™n cosine embedding trong t·ª´ng ph·∫ßn.
    Gi·ªØ l·∫°i c√¢u mang th√¥ng tin m·ªõi.
    """
    if not text:
        return ""

    text = text.strip()
    parts = re.split(r'\n\s*\n', text)
    cleaned_parts = []

    for part in parts:
        sents = re.split(r'(?<=[.?!])\s+', part)
        sents = [s.strip() for s in sents if s.strip()]
        if len(sents) <= 1:
            cleaned_parts.append(part.strip())
            continue

        try:
            embs = _embed_sentences_cached(sents)
        except:
            cleaned_parts.append(part.strip())
            continue

        kept = [sents[0]]
        kept_embs = [embs[0]]

        for i in range(1, len(sents)):
            score = util.cos_sim(embs[i], torch.stack(kept_embs)).max().item()
            if score < threshold:
                kept.append(sents[i])
                kept_embs.append(embs[i])

        cleaned_parts.append(" ".join(kept).strip())

    return "\n\n".join(cleaned_parts).strip()



def extract_legal_conclusion(text: str):
    """Tr√≠ch xu·∫•t k·∫øt lu·∫≠n ph√°p l√Ω t·ª´ vƒÉn b·∫£n"""
    if not text:
        return ""
    m = re.search(
        r'([^.?!]*?(ƒêi·ªÅu\s*\d+|Kho·∫£n\s*\d+|Quy·∫øt\s*ƒë·ªãnh|K·∫øt\s*lu·∫≠n)[^.?!]*[.?!])',
        text, re.I
    )
    if m:
        return m.group(1).strip()
    sents = re.split(r'(?<=[.?!])\s+', text.strip())
    for s in reversed(sents[-3:]):
        if len(s.strip()) > 10:
            return s.strip()
    return text.strip()


def _validate_and_complete_sections(sections: dict, original_text: str) -> dict:
    """
    Ki·ªÉm tra v√† b·ªï sung c√°c ph·∫ßn c√≤n thi·∫øu.
    C·∫£i ti·∫øn: ph√°t hi·ªán lƒ©nh v·ª±c ph√°p l√Ω linh ho·∫°t, tr√°nh m·∫∑c ƒë·ªãnh 'D√¢n s·ª±' cho m·ªçi tr∆∞·ªùng h·ª£p.
    """
    cleaned_original = _clean_model_output(original_text)
    lower_text = cleaned_original.lower()

    # ƒê·∫£m b·∫£o ƒë·ªß kh√≥a
    for k in ['linh_vuc', 'can_cu', 'phan_tich', 'ket_luan']:
        sections.setdefault(k, "")

    # === üîπ Ph√°t hi·ªán lƒ©nh v·ª±c ph√°p l√Ω ===
    if not sections['linh_vuc']:
        field_map = {
            "d√¢n s·ª±": [
                "h·ª£p ƒë·ªìng", "b·ªìi th∆∞·ªùng", "di ch√∫c", "th·ª´a k·∫ø", "quy·ªÅn s·ªü h·ªØu", "t√†i s·∫£n", "giao d·ªãch d√¢n s·ª±"
            ],
            "h√¥n nh√¢n v√† gia ƒë√¨nh": [
                "ly h√¥n", "k·∫øt h√¥n", "nu√¥i con", "t√†i s·∫£n chung", "t√†i s·∫£n ri√™ng", "h√¥n nh√¢n", "gia ƒë√¨nh", "cha m·∫π", "con c√°i"
            ],
            "lao ƒë·ªông": [
                "ng∆∞·ªùi lao ƒë·ªông", "h·ª£p ƒë·ªìng lao ƒë·ªông", "ti·ªÅn l∆∞∆°ng", "ngh·ªâ vi·ªác", "sa th·∫£i", "b·∫£o hi·ªÉm x√£ h·ªôi", "l√†m vi·ªác", "tr·ª£ c·∫•p"
            ]
        }

        scores = Counter()
        for field, keywords in field_map.items():
            for kw in keywords:
                if re.search(rf"\b{re.escape(kw)}\b", lower_text):
                    scores[field] += 1

        if scores:
            best_field = scores.most_common(1)[0][0]
            sections['linh_vuc'] = f"Lu·∫≠t {best_field.title()}"
        else:
            sections['linh_vuc'] = "Ch∆∞a x√°c ƒë·ªãnh r√µ ‚Äì c·∫ßn cƒÉn c·ª© th√™m v√†o t√¨nh ti·∫øt"

    # === üîπ B·ªï sung cƒÉn c·ª© ph√°p l√Ω ===
    if not sections['can_cu']:
        legal_refs = re.findall(r'(ƒêi·ªÅu\s+\d+[a-zA-Z]?|Kho·∫£n\s+\d+|Ch∆∞∆°ng\s+[IVXLC]+)', cleaned_original, re.IGNORECASE)
        if legal_refs:
            seen = set()
            unique_refs = []
            for ref in legal_refs:
                if ref not in seen:
                    seen.add(ref)
                    unique_refs.append(ref)
            sections['can_cu'] = ", ".join(unique_refs[:5])
        else:
            sections['can_cu'] = "Theo quy ƒë·ªãnh c·ªßa ph√°p lu·∫≠t hi·ªán h√†nh"

    # === üîπ B·ªï sung ph·∫ßn ph√¢n t√≠ch ===
    if not sections['phan_tich']:
        sentences = re.split(r'(?<=[.?!])\s+', cleaned_original.strip())
        meaningful = [s for s in sentences if len(s) > 30 and not s.strip().startswith(('1.', '2.', '3.', '4.', '###'))]
        if meaningful:
            sections['phan_tich'] = " ".join(meaningful[:3])
        else:
            sections['phan_tich'] = "Theo quy ƒë·ªãnh ph√°p lu·∫≠t, c·∫ßn xem x√©t c√°c y·∫øu t·ªë li√™n quan."

    # === üîπ B·ªï sung ph·∫ßn k·∫øt lu·∫≠n ===
    if not sections['ket_luan'] or len(sections['ket_luan']) < 15:
        concl = extract_legal_conclusion(cleaned_original)
        if concl:
            sections['ket_luan'] = concl
        else:
            sections['ket_luan'] = "C·∫ßn xem x√©t c·ª• th·ªÉ t·ª´ng tr∆∞·ªùng h·ª£p theo quy ƒë·ªãnh ph√°p lu·∫≠t."

    # === üîπ Chu·∫©n ho√° l·∫°i c√°c ph·∫ßn ===
    for key in sections:
        if sections[key]:
            sections[key] = re.sub(r'^[\-‚Äì‚Äî\s]+', '', sections[key]).strip()

    return sections


def _legal_style_cleanup(text: str) -> str:
    """
    Chu·∫©n h√≥a vƒÉn phong ph√°p l√Ω: kh√°ch quan, ch√≠nh x√°c, b·ªè c·∫£m t√≠nh.
    """
    if not text:
        return text

    text = re.sub(r'\b(r√µ r√†ng|hi·ªÉn nhi√™n|theo quan ƒëi·ªÉm c·ªßa t√¥i|ch√∫ng ta th·∫•y r·∫±ng|t√¥i cho r·∫±ng)\b', '', text, flags=re.IGNORECASE)
    text = re.sub(r'\s{2,}', ' ', text)
    text = re.sub(r'ƒëi·ªÅu\s+(\d+)', lambda m: f"ƒêi·ªÅu {m.group(1)}", text, flags=re.IGNORECASE)
    text = text.replace(" - ", " ‚Äì ")
    text = text.strip()
    return text


def _format_legal_answer_enhanced(answer: str) -> str:
    """
    Chu·∫©n ho√° c√¢u tr·∫£ l·ªùi th√†nh 4 ph·∫ßn: Lƒ©nh v·ª±c, CƒÉn c·ª© ph√°p l√Ω, Ph√¢n t√≠ch, K·∫øt lu·∫≠n
    ƒê√É FIX l·ªói tr·ªôn nh·∫ßm CƒÉn c·ª© v√†o Lƒ©nh v·ª±c v√† tr√πng ƒë√°nh s·ªë.
    """
    text = _clean_model_output(answer)
    sections = _parse_four_sections(text)
    sections = _validate_and_complete_sections(sections, text)
    if all(sections[k] for k in ['linh_vuc', 'can_cu', 'phan_tich', 'ket_luan']):
        return (
        f"1. **Lƒ©nh v·ª±c** ‚Äì {sections['linh_vuc']}\n\n"
        f"2. **CƒÉn c·ª© ph√°p l√Ω** ‚Äì {sections['can_cu']}\n\n"
        f"3. **Ph√¢n t√≠ch** ‚Äì {sections['phan_tich']}\n\n"
        f"4. **K·∫øt lu·∫≠n** ‚Äì {sections['ket_luan']}"
    ).strip()
    # Lo·∫°i b·ªè numbering/bold d∆∞ ·ªü ƒë·∫ßu ph·∫ßn
    for key in sections:
        if sections[key]:
            sections[key] = re.sub(
                r'^\s*\d+\s*[.)\-‚Äì]*\s*', '',  # x√≥a "1. ", "1)", "1 -", ...
                sections[key], flags=re.IGNORECASE).strip()
            sections[key] = re.sub(
                r'^\*\*.+?\*\*\s*[‚Äì:\-]\s*', '',
                sections[key], flags=re.IGNORECASE).strip()

    # D·ªçn vƒÉn phong
    for k in sections:
        sections[k] = _legal_style_cleanup(_deduplicate_sentences(sections[k]))

    # Construct final standardized format (Duy nh·∫•t 1 l·∫ßn numbering)
    return (
        f"1. **Lƒ©nh v·ª±c** ‚Äì {sections['linh_vuc']}\n\n"
        f"2. **CƒÉn c·ª© ph√°p l√Ω** ‚Äì {sections['can_cu']}\n\n"
        f"3. **Ph√¢n t√≠ch** ‚Äì {sections['phan_tich']}\n\n"
        f"4. **K·∫øt lu·∫≠n** ‚Äì {sections['ket_luan']}"
    ).strip()


# === Main RAG orchestration (example simplified) ===
# Note: The functions below assume `retriever` and `decoder` objects are provided

def _retrieve_with_cache(retriever, query, top_k=5, use_cache=True):
    key = (query, top_k)
    if use_cache and key in _RETRIEVAL_CACHE:
        return _RETRIEVAL_CACHE[key]
    retrieved = retriever.retrieve(query, top_k=top_k)
    normalized = []
    for item in retrieved:
        try:
            if isinstance(item, (list, tuple)) and len(item) >= 1:
                normalized.append((item[0], item[1] if len(item) > 1 else None))
            else:
                normalized.append((str(item), None))
        except Exception:
            normalized.append((str(item), None))
    if use_cache:
        _RETRIEVAL_CACHE[key] = normalized
    return normalized


def legal_self_consistency_rag(
    retriever,
    decoder,
    query: str,
    num_samples: int = 2,
    temperature: float = 0.5,
    max_new_tokens: int = 384,
    top_k: int = 3,
    use_retrieval_cache: bool = True,
    dedup_threshold: float = 0.85,
    enforce_structure: bool = True
):
    """
    RAG v·ªõi self-consistency: sinh nhi·ªÅu h∆∞·ªõng l·∫≠p lu·∫≠n, chu·∫©n ho√° sau khi ch·ªçn k·∫øt qu·∫£ t·ªët nh·∫•t.
    """
    t0 = time.perf_counter()
    retrieved = _retrieve_with_cache(retriever, query, top_k=top_k, use_cache=use_retrieval_cache)
    context_text = "\n\n".join([doc for doc, _ in retrieved])
    prompt = decoder._build_context(query, context_text)

    # üîπ outputs_raw = ch·ª©a c√°c c√¢u tr·∫£ l·ªùi CH∆ØA format
    outputs_raw = []
    print("\nüîÑ ƒêang sinh c√°c h∆∞·ªõng l·∫≠p lu·∫≠n:")

    for i in range(num_samples):
        try:
            out = decoder.decode(prompt, temperature=temperature, max_new_tokens=max_new_tokens, num_samples=1)
            text_out = out[0].strip() if isinstance(out, (list, tuple)) else str(out).strip()
            if not text_out:
                print(f"‚ö†Ô∏è H∆∞·ªõng {i+1}: Output r·ªóng, b·ªè qua")
                continue

            text_out = _clean_model_output(text_out)  # ch·ªâ clean, KH√îNG format
            outputs_raw.append(text_out)
            print(f"‚úÖ H∆∞·ªõng {i+1} ho√†n th√†nh ({len(text_out)} k√Ω t·ª±)")

        except Exception as e:
            print(f"‚ö†Ô∏è L·ªói khi sinh h∆∞·ªõng {i+1}: {e}")

    if not outputs_raw:
        return {"final_answer": "Kh√¥ng sinh ƒë∆∞·ª£c c√¢u tr·∫£ l·ªùi h·ª£p l·ªá.", "candidates": []}

    # === Self-consistency: ch·ªçn c√¢u tr·∫£ l·ªùi trung b√¨nh v·ªÅ ng·ªØ nghƒ©a ===
    print("\nüß† ƒêang ƒë√°nh gi√° ƒë·ªô nh·∫•t qu√°n ng·ªØ nghƒ©a...")
    embs = _embed_sentences_cached(outputs_raw)
    sims = util.cos_sim(embs, embs)
    avg_scores = sims.mean(dim=1)
    best_idx = int(torch.argmax(avg_scores))

    # üî• Format CH·ªà ·ª©ng vi√™n t·ªët nh·∫•t
    final_answer = (
        _format_legal_answer_enhanced(outputs_raw[best_idx])
        if enforce_structure else outputs_raw[best_idx]
    )

    # === Format c√°c candidates c√≤n l·∫°i (tu·ª≥ ch·ªçn)
    candidates = [
        _format_legal_answer_enhanced(c) if enforce_structure else c
        for c in outputs_raw
    ]

    elapsed = time.perf_counter() - t0
    print(f"\n‚è±Ô∏è Ho√†n t·∫•t RAG trong {elapsed:.2f}s ‚Äî ch·ªçn h∆∞·ªõng {best_idx + 1}")

    return {"final_answer": final_answer, "candidates": candidates, "best_idx": best_idx}





Writing legal_self_consistency_rag.py


In [22]:
!pip install streamlit

Collecting streamlit
  Downloading streamlit-1.52.1-py3-none-any.whl.metadata (9.8 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.52.1-py3-none-any.whl (9.0 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m9.0/9.0 MB[0m [31m71.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.9/6.9 MB[0m [31m120.8 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.52.1


In [23]:
%%writefile legal_rag_app.py
import os
import streamlit as st
import torch
import gc

from LegalRetriever import HybridLegalRetriever
from Decoder import Decoder
from build_rag_from_txt_folder import build_rag_from_txt_folder
from legal_self_consistency_rag import legal_self_consistency_rag

# ==========================================
# ‚öôÔ∏è C·∫•u h√¨nh chung
# ==========================================
os.environ["TOKENIZERS_PARALLELISM"] = "false"
st.set_page_config(page_title="‚öñÔ∏è Legal RAG Chatbot (Vi·ªát Nam)", layout="centered")

st.markdown(
    """
    <style>
    .chat-message {
        padding: 0.8em 1.2em;
        border-radius: 1em;
        margin-bottom: 0.8em;
        max-width: 90%;
        word-wrap: break-word;
        font-size: 1rem;
        line-height: 1.5;
    }
    .user-msg {
        background-color: #DCF8C6;
        margin-left: auto;
    }
    .bot-msg {
        background-color: #F1F0F0;
        margin-right: auto;
    }
    </style>
    """,
    unsafe_allow_html=True
)

st.title("‚öñÔ∏è Legal RAG Chatbot (Lu·∫≠t Vi·ªát Nam)")

# ==========================================
# üîπ Kh·ªüi t·∫°o models v√† retriever
# ==========================================
@st.cache_resource
def load_decoder():
    """Load m√¥ h√¨nh Qwen ƒë√£ fine-tuned lu·∫≠t."""
    decoder = Decoder("/kaggle/input/qwen_law_instruct_v3/keras/default/2")
    if torch.cuda.is_available():
        decoder.model.to("cuda:0")
    return decoder

@st.cache_resource
def init_retriever(txt_folder="/kaggle/input/luat-txt"):
    """X√¢y d·ª±ng ch·ªâ m·ª•c t√¨m ki·∫øm lu·∫≠t."""
    docs = build_rag_from_txt_folder(txt_folder)
    retriever = HybridLegalRetriever()
    retriever.build_index(docs)
    return retriever

decoder = load_decoder()
retriever = init_retriever()

# ==========================================
# üí¨ Tr·∫°ng th√°i h·ªôi tho·∫°i
# ==========================================
if "messages" not in st.session_state:
    st.session_state.messages = []

# ==========================================
# ‚öôÔ∏è Sidebar: ƒëi·ªÅu ch·ªânh tham s·ªë sinh
# ==========================================
st.sidebar.header("‚öôÔ∏è C·∫•u h√¨nh m√¥ h√¨nh")

temperature = st.sidebar.slider("Temperature (ng·∫´u nhi√™n)", 0.0, 1.5, 0.5, 0.05)
num_samples = st.sidebar.slider("S·ªë m·∫´u sinh (N)", 1, 6, 3)
top_k = st.sidebar.slider("Top-K t√†i li·ªáu truy xu·∫•t", 1, 10, 5)
top_p = st.sidebar.slider("Top-P (nucleus sampling)", 0.1, 1.0, 0.9, 0.05)
repetition_penalty = st.sidebar.slider("Repetition Penalty", 1.0, 2.0, 1.1, 0.05)
alpha = st.sidebar.slider("Alpha (NLI vs ƒë·ªô d√†i)", 0.0, 1.0, 0.8, 0.05)
ideal_len = st.sidebar.slider("ƒê·ªô d√†i l√Ω t∆∞·ªüng (token)", 50, 300, 150, 10)
len_sigma = st.sidebar.slider("ƒê·ªô l·ªách chu·∫©n ƒë·ªô d√†i", 20, 200, 80, 10)
max_new_tokens = st.sidebar.slider("Max new tokens", 50, 2048, 512, 50)
if st.sidebar.button("üßπ Reset h·ªôi tho·∫°i"):
    st.session_state.messages = []
    st.sidebar.success("ƒê√£ x√≥a to√†n b·ªô h·ªôi tho·∫°i!")

# ==========================================
# üí¨ Giao di·ªán chat ch√≠nh
# ==========================================
st.markdown("### üí¨ Chatbot ph√°p l√Ω th√¥ng minh")

query = st.chat_input("Nh·∫≠p c√¢u h·ªèi ph√°p l√Ω c·ªßa b·∫°n...")

# Hi·ªÉn th·ªã l·∫°i h·ªôi tho·∫°i c≈©
for msg in st.session_state.messages:
    with st.chat_message(msg["role"]):
        st.markdown(msg["content"])

# ==========================================
# üöÄ X·ª≠ l√Ω c√¢u h·ªèi m·ªõi
# ==========================================
if query:
    st.session_state.messages.append({"role": "user", "content": query})
    with st.chat_message("user"):
        st.markdown(query)

    with st.chat_message("assistant"):
        with st.spinner("üß† ƒêang truy xu·∫•t v√† t·ªïng h·ª£p c√¢u tr·∫£ l·ªùi..."):
            try:
                answer = legal_self_consistency_rag(  
                retriever = retriever,
                decoder = decoder,
                query = query,
                num_samples = num_samples,
                temperature = temperature,
                max_new_tokens = max_new_tokens,
                top_k = top_k
            )
                reply = answer.get("final_answer", "Kh√¥ng c√≥ c√¢u tr·∫£ l·ªùi ph√π h·ª£p.")
            except Exception as e:
                reply = f"‚ö†Ô∏è L·ªói khi t·∫°o c√¢u tr·∫£ l·ªùi: {str(e)}"
                torch.cuda.empty_cache()
                gc.collect()

        st.markdown(reply)
        st.session_state.messages.append({"role": "assistant", "content": reply})

# ==========================================
# üßπ Cleanup GPU memory
# ==========================================
torch.cuda.empty_cache()
gc.collect()


Writing legal_rag_app.py


In [24]:
!pip install pyngrok

Collecting pyngrok
  Downloading pyngrok-7.5.0-py3-none-any.whl.metadata (8.1 kB)
Downloading pyngrok-7.5.0-py3-none-any.whl (24 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.5.0


In [25]:
!ngrok config add-authtoken 

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml                                


In [26]:
from pyngrok import ngrok
import threading, os

port = 8501
public_url = ngrok.connect(port).public_url
print("üåê Public URL:", public_url)

def run_app():
    os.system(f"streamlit run legal_rag_app.py --server.port {port}")

threading.Thread(target=run_app).start()

üåê Public URL: https://unsloped-pseudomedievally-makhi.ngrok-free.dev

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.


  You can now view your Streamlit app in your browser.

  Local URL: http://localhost:8501
  Network URL: http://172.19.2.2:8501
  External URL: http://136.112.3.221:8501



2025-12-17 10:43:50.219017: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1765968230.243585     187 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1765968230.251270     187 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
`torch_dtype` is deprecated! Use `dtype` instead!
Loading checkpoint shards: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2/2 [00:01<00:00,  1.76it/s]


üìö ƒêang ƒë·ªçc 3 t·ªáp TXT t·ª´: /kaggle/input/luat-txt


100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 3/3 [00:00<00:00, 34.19it/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at ./ms-marco-MiniLM-L-12-v2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Batches:   0%|          | 0/39 [00:00<?, ?it/s]

‚úÖ T·ªïng s·ªë ƒëo·∫°n vƒÉn b·∫£n (chunks): 1228
üîπ X√¢y d·ª±ng BM25 index...
üîπ ƒêang t·∫°o embeddings v√† x√¢y d·ª±ng FAISS index...


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 39/39 [00:11<00:00,  3.31it/s]


‚úÖ Ho√†n t·∫•t! ƒê√£ x√¢y d·ª±ng index cho 1,228 ƒëo·∫°n lu·∫≠t v·ªõi vector dim = 768.
   ‚Ä¢ FAISS vectors: 1228
   ‚Ä¢ Thi·∫øt b·ªã reranker: cuda:0


Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 34.86it/s]


üîé Truy v·∫•n: n·ªØ ƒë·ªß bao nhi√™u tu·ªïi th√¨ ƒë∆∞·ª£c ph√©p ƒëƒÉng k√Ω k·∫øt h√¥n
üìë Top 5 k·∫øt qu·∫£:
   01. (0.4860) ƒêi·ªÅu 8. ƒêi·ªÅu ki·ªán k·∫øt h√¥n [KHO·∫¢N] 1. Nam, n·ªØ k·∫øt h√¥n v·ªõi nhau ph·∫£i tu√¢n theo c√°c ƒëi·ªÅu ki·ªán sau ƒë√¢y: a) Nam t·ª´ ƒë·ªß 20 tu·ªïi tr·ªü l√™n, n·ªØ t·ª´ ƒë·ªß 18 tu·ªïi tr·ªü l√™n; b) Vi·ªác k·∫øt h√¥n do nam v√† n·ªØ t·ª± nguy·ªán quy·∫øt...
   02. (0.4848) ƒêi·ªÅu 14. Gi·∫£i quy·∫øt h·∫≠u qu·∫£ c·ªßa vi·ªác nam, n·ªØ chung s·ªëng v·ªõi nhau nh∆∞ v·ª£ ch·ªìng m√† kh√¥ng ƒëƒÉng k√Ω k·∫øt h√¥n [KHO·∫¢N] 1. Nam, n·ªØ c√≥ ƒë·ªß ƒëi·ªÅu ki·ªán k·∫øt h√¥n theo quy ƒë·ªãnh c·ªßa Lu·∫≠t n√†y chung s·ªëng v·ªõi nhau nh∆∞ v·ª£ ...
   03. (0.4844) ƒêi·ªÅu 13. X·ª≠ l√Ω vi·ªác ƒëƒÉng k√Ω k·∫øt h√¥n kh√¥ng ƒë√∫ng th·∫©m quy·ªÅn Trong tr∆∞·ªùng h·ª£p vi·ªác ƒëƒÉng k√Ω k·∫øt h√¥n kh√¥ng ƒë√∫ng th·∫©m quy·ªÅn th√¨ khi c√≥ y√™u c·∫ßu, c∆° quan nh√† n∆∞·ªõc c√≥ th·∫©m quy·ªÅn thu h·ªìi, h·ªßy b·ªè gi·∫•y ch·ª©ng nh·∫≠n...
   04. (0.4844) ƒêi·ªÅu 16 c

Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 93.98it/s]
Batches: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1/1 [00:00<00:00, 98.48it/s]
