#### DEMO

In [None]:
# Clone the pipeline repository; later cells presumably read its scripts, data and PDFs from this checkout.
! git clone https://github.com/Eric0801/NTCIR-18-CLIR-pipeline-team6939.git

In [2]:
!pip3 install transformers rank_bm25 sentence-transformers faiss-cpu jieba tqdm opencc pymupdf 

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting rank_bm25
  Using cached rank_bm25-0.2.2-py3-none-any.whl.metadata (3.2 kB)
Collecting sentence-transformers
  Downloading sentence_transformers-4.1.0-py3-none-any.whl.metadata (13 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.11.0-cp311-cp311-macosx_14_0_arm64.whl.metadata (4.8 kB)
Collecting jieba
  Using cached jieba-0.42.1-py3-none-any.whl
Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Collecting opencc
  Downloading opencc-1.1.9.tar.gz (3.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.30.2-py3-n

In [2]:
# Run the repository's model-setup script; its log shows it downloading the
# zhbert and labse checkpoints and saving them under models/.
!python3 setup_models.py

↓ Downloading zhbert from hfl/chinese-roberta-wwm-ext...
tokenizer_config.json: 100%|█████████████████| 19.0/19.0 [00:00<00:00, 23.0kB/s]
config.json: 100%|█████████████████████████████| 689/689 [00:00<00:00, 4.18MB/s]
vocab.txt: 100%|██████████████████████████████| 110k/110k [00:00<00:00, 259kB/s]
tokenizer.json: 100%|█████████████████████████| 269k/269k [00:00<00:00, 773kB/s]
added_tokens.json: 100%|█████████████████████| 2.00/2.00 [00:00<00:00, 9.22kB/s]
special_tokens_map.json: 100%|██████████████████| 112/112 [00:00<00:00, 773kB/s]
pytorch_model.bin: 100%|█████████████████████| 412M/412M [01:35<00:00, 4.33MB/s]
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at hfl/chinese-roberta-wwm-ext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[✓] Saved to models/zhbert
↓ Downloading labse from sentence-transformers

## Step 1: Initialize Models

If a model has not been downloaded yet, this cell downloads it and saves it to the `models/` directory; models that are already present are skipped.

In [3]:
# Step 1: Initialize Models (Colab / local)
from pathlib import Path
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification

def download_model(name, hf_id, is_classifier=True):
    """Fetch a Hugging Face checkpoint into ``models/<name>`` unless it is already there.

    Args:
        name: Sub-directory name under ``models/`` to save into.
        hf_id: Hub identifier handed to ``from_pretrained``.
        is_classifier: When true the weights are loaded with
            ``AutoModelForSequenceClassification``; otherwise with ``AutoModel``.

    Returns:
        None. The tokenizer and model are written to disk as a side effect.
    """
    target = Path("models") / name

    # Any non-empty target directory is treated as a finished download.
    already_present = target.exists() and any(target.iterdir())
    if already_present:
        print(f"[✓] {name} already exists, skipping download.")
        return

    print(f"↓ Downloading {name} from HuggingFace...")
    tokenizer = AutoTokenizer.from_pretrained(hf_id)
    if is_classifier:
        model = AutoModelForSequenceClassification.from_pretrained(hf_id)
    else:
        model = AutoModel.from_pretrained(hf_id)

    target.mkdir(parents=True, exist_ok=True)
    # Tokenizer first, then model — same order as the files are needed on reload.
    for artifact in (tokenizer, model):
        artifact.save_pretrained(target)
    print(f"[✓] {name} saved to {target}")

# Fetch the three checkpoints into models/zhbert, models/labse and models/cross_encoder.
# NOTE(review): the "cross_encoder" checkpoint is a sentence-embedding model that is
# loaded through AutoModelForSequenceClassification; presumably its classification
# head is newly initialised (as the zhbert log warns for that model) and needs
# fine-tuning before its scores are meaningful — confirm.
download_model("zhbert", "hfl/chinese-roberta-wwm-ext", is_classifier=True)
download_model("labse", "sentence-transformers/LaBSE", is_classifier=False)
download_model("cross_encoder", "sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2", is_classifier=True)

[✓] zhbert already exists, skipping download.
[✓] labse already exists, skipping download.
[✓] cross_encoder already exists, skipping download.


## Step 2: Extract Paragraphs from PDF (PyMuPDF + EasyOCR fallback)

In [None]:
import fitz
import easyocr
import json
import os
import numpy as np
from pdf2image import convert_from_path
from pathlib import Path
import traceback
import glob

# Install required poppler-utils if not already installed
# (presumably needed by pdf2image's convert_from_path; apt-get assumes a Debian-like host such as Colab)
!apt-get update -qq && apt-get install -qq -y poppler-utils

# Base directory settings
# NOTE(review): BASE_DIR is an absolute Colab path, while later cells use paths
# relative to the working directory — confirm which layout is intended.
BASE_DIR = Path("/content/NTCIR-18-CLIR-pipeline-team6939")
PDF_DIRS = [
    BASE_DIR / "pdfs/finance",
    BASE_DIR / "pdfs/insurance"
]
OUTPUT_DIR = BASE_DIR / "outputs"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Initialize EasyOCR
# `reader` is a module-level global read by fallback_ocr_easyocr; it is set to
# None when initialisation fails so that the OCR fallback can be skipped.
try:
    print("Initializing EasyOCR...")
    reader = easyocr.Reader(['ch_tra', 'en'], gpu=False)
    print("✓ EasyOCR initialized successfully")
except Exception as e:
    print(f"Failed to initialize EasyOCR: {e}")
    print("Will try to process without OCR fallback")
    reader = None

def extract_blocks_with_heuristics(pdf_path, min_block_length=40):
    """Extract text blocks from a PDF with PyMuPDF, one passage per block.

    Blocks on each page are visited in order of their top ``y`` coordinate.
    Newlines inside a block are replaced by spaces, and blocks whose cleaned
    text is shorter than ``min_block_length`` are dropped. Image blocks are
    skipped: PyMuPDF reports them with a placeholder description in the text
    slot, which would otherwise be emitted as a passage.

    Args:
        pdf_path: Path of the PDF to read.
        min_block_length: Minimum number of characters a cleaned block must
            have to be kept.

    Returns:
        A list of dicts with keys ``pid`` (``<doc>_p<page>_b<block>``),
        ``page``, ``bbox`` (``[x0, y0, x1, y1]``) and ``text``.

    Raises:
        Exception: any error raised while opening or reading the PDF is
            logged and re-raised so the caller can fall back to OCR.
    """
    print(f"Extracting text blocks using PyMuPDF from: {pdf_path}")
    try:
        doc = fitz.open(pdf_path)
        try:
            results = []
            doc_id = os.path.splitext(os.path.basename(pdf_path))[0]

            for page_num, page in enumerate(doc):
                blocks = sorted(page.get_text("blocks"), key=lambda b: b[1])
                # Enumerate over *all* blocks so that pids keep the same
                # numbering whether or not some blocks are skipped.
                for i, block in enumerate(blocks):
                    x0, y0, x1, y1, text, *extra = block
                    # extra is (block_no, block_type); a non-zero type marks
                    # an image block, which carries no real text.
                    if len(extra) >= 2 and extra[1] != 0:
                        continue
                    clean_text = text.strip().replace("\n", " ")
                    if len(clean_text) >= min_block_length:
                        results.append({
                            "pid": f"{doc_id}_p{page_num}_b{i}",
                            "page": page_num,
                            "bbox": [x0, y0, x1, y1],
                            "text": clean_text
                        })
        finally:
            # Release the file handle even when a page fails to parse.
            doc.close()

        print(f"✓ Extracted {len(results)} text blocks")
        return results
    except Exception as e:
        print(f"PyMuPDF extraction failed: {e}")
        traceback.print_exc()
        raise

def fallback_ocr_easyocr(pdf_path):
    """OCR every page of a PDF with the global EasyOCR ``reader``.

    Pages are rasterised at 300 dpi and recognised one at a time. A page that
    fails during recognition is logged and skipped; a page with no recognised
    text produces no record.

    Args:
        pdf_path: Path of the PDF to rasterise and recognise.

    Returns:
        A list with one dict per page that yielded text, with keys ``pid``
        (``<doc>_ocr_<page>``), ``page``, ``bbox`` (always ``None``) and
        ``text``. An empty list is returned when ``reader`` is ``None`` or the
        PDF cannot be processed at all.
    """
    print(f"Using EasyOCR for fallback OCR processing: {pdf_path}")
    if reader is None:
        print("Cannot perform OCR: EasyOCR not initialized")
        return []

    try:
        page_images = convert_from_path(pdf_path, dpi=300)
        passages = []
        stem, _extension = os.path.splitext(os.path.basename(pdf_path))

        for page_idx, page_image in enumerate(page_images):
            human_page = page_idx + 1
            print(f"Processing page {human_page}...")

            # EasyOCR works on numpy arrays rather than PIL images.
            pixels = np.array(page_image)

            try:
                detections = reader.readtext(pixels)
                # Each detection carries its recognised string at index 1.
                fragments = [det[1] for det in detections if det[1].strip()]
                page_text = " ".join(fragments).strip()
            except Exception as e:
                print(f"Error during OCR on page {human_page}: {e}")
                continue

            if page_text:
                passages.append({
                    "pid": f"{stem}_ocr_{page_idx}",
                    "page": page_idx,
                    "bbox": None,
                    "text": page_text
                })

        print(f"✓ OCR processing completed, extracted {len(passages)} pages of text")
        return passages
    except Exception as e:
        print(f"OCR processing failed: {e}")
        traceback.print_exc()
        return []

def process_pdf_file(pdf_path):
    """Turn one PDF into passage records, preferring PyMuPDF over OCR.

    PyMuPDF extraction is tried first. When it yields no segment of at least
    40 characters, EasyOCR is used instead; if OCR also yields nothing, the
    PyMuPDF result is returned. Any exception in that flow triggers a single
    OCR retry.

    Args:
        pdf_path: Path of the PDF to process.

    Returns:
        A list of passage dicts, or an empty list when the file does not exist
        or every extraction route fails.
    """
    print(f"Processing PDF file: {pdf_path}")

    # Guard clause: nothing to do for a missing file.
    if not os.path.exists(pdf_path):
        print(f"Error: PDF file does not exist: {pdf_path}")
        return []

    try:
        # Text-layer extraction is the cheap route, so it goes first.
        segments = extract_blocks_with_heuristics(pdf_path)

        # At least one reasonably long segment means the text layer is usable.
        if segments and any(len(seg['text']) >= 40 for seg in segments):
            return segments

        print("PyMuPDF extraction results poor or empty, attempting OCR")
        ocr_segments = fallback_ocr_easyocr(pdf_path)
        if ocr_segments:
            return ocr_segments

        # OCR found nothing either: hand back whatever PyMuPDF produced.
        print("OCR produced no results, returning PyMuPDF results instead")
        return segments
    except Exception as e:
        print(f"Error during PyMuPDF processing: {e}")
        print("Attempting OCR fallback")

        try:
            return fallback_ocr_easyocr(pdf_path)
        except Exception as e2:
            print(f"OCR fallback also failed: {e2}")
            print("Returning empty results for this PDF")
            return []

# Find all PDF files in the configured directories.
# Sorting makes the processing order (and any MAX_PDFS subset) reproducible.
all_pdf_files = []
for pdf_dir in PDF_DIRS:
    if pdf_dir.exists():
        pdf_files = sorted(pdf_dir.glob("**/*.pdf"))
        all_pdf_files.extend(pdf_files)
        print(f"Found {len(pdf_files)} PDF files in {pdf_dir}")
    else:
        print(f"Warning: Directory does not exist: {pdf_dir}")

print(f"Total PDFs found: {len(all_pdf_files)}")

# Optional cap on the number of PDFs to process; None means "process everything".
MAX_PDFS = None  # Set to an integer for a quick partial run
if MAX_PDFS is not None and len(all_pdf_files) > MAX_PDFS:
    print(f"Processing first {MAX_PDFS} PDFs out of {len(all_pdf_files)}")
    all_pdf_files = all_pdf_files[:MAX_PDFS]

# Process all PDF files.
# The progress counter comes from enumerate so it keeps advancing even when a
# PDF fails; successful_pdfs only counts the ones that completed.
all_results = []
successful_pdfs = 0
for pdf_index, pdf_file in enumerate(all_pdf_files, start=1):
    print(f"\nProcessing {pdf_file}... ({pdf_index}/{len(all_pdf_files)})")
    try:
        pdf_results = process_pdf_file(pdf_file)
        all_results.extend(pdf_results)
        print(f"Extracted {len(pdf_results)} segments from {pdf_file}")
        successful_pdfs += 1
    except Exception as e:
        print(f"Failed to process {pdf_file}: {e}")
        traceback.print_exc()
        continue

# Save all results as JSON Lines.
# The retrieval scripts look for the passages under outputs/runs/ (see their
# FileNotFoundError tracebacks), so a copy is written there in addition to the
# original outputs/ location.
output_file = OUTPUT_DIR / "structured_passages.jsonl"
runs_copy = OUTPUT_DIR / "runs" / "structured_passages.jsonl"
runs_copy.parent.mkdir(parents=True, exist_ok=True)
serialized_passages = [json.dumps(r, ensure_ascii=False) + "\n" for r in all_results]
for destination in (output_file, runs_copy):
    with open(destination, "w", encoding="utf-8") as f:
        f.writelines(serialized_passages)

print(f"✅ Extraction completed, results saved to: {output_file}")
print(f"Total extracted passages: {len(all_results)}")
print(f"Successfully processed {successful_pdfs}/{len(all_pdf_files)} PDFs")

# Display some sample text
if all_results:
    print("\nSample texts:")
    for i, r in enumerate(all_results[:3]):  # Show only first 3
        print(f"[{i+1}] {r['pid']}: {r['text'][:100]}...")
else:
    print("No text was extracted from any PDF.")

## Step 2.a: PyPDF2 + EasyOCR fallback (coarser passage chunking: one passage per PDF)

In [5]:
# 安裝：
# pip install PyPDF2 easyocr pdf2image

from PyPDF2 import PdfReader
from pdf2image import convert_from_path
import easyocr, numpy as np, json, traceback
from pathlib import Path

# Initialise OCR (Traditional Chinese + English, GPU requested).
# NOTE(review): this rebinds the global `reader` that fallback_ocr_easyocr also
# reads, so running this cell changes the OCR engine used by Step 2 — confirm intended.
reader = easyocr.Reader(['ch_tra','en'], gpu=True)

# Folder settings (relative to the current working directory)
PDF_DIRS = [Path("./pdfs/finance"), Path("./pdfs/insurance")]
OUTPUT = Path("./outputs/structured_passages.jsonl")
OUTPUT.parent.mkdir(exist_ok=True)

def extract_text_pypdf2(pdf_path):
    """Return the text layer of a PDF as one space-joined string.

    Pages without extractable text are skipped, and newlines inside a page are
    replaced with spaces.

    Args:
        pdf_path: Path of the PDF to read.

    Returns:
        The concatenated page texts, or ``""`` when reading fails (the error
        and traceback are printed).
    """
    try:
        pdf = PdfReader(str(pdf_path))
        raw_pages = (page.extract_text() for page in pdf.pages)
        # Falsy results (None or "") mean the page has no usable text layer.
        return " ".join(raw.replace("\n", " ") for raw in raw_pages if raw)
    except Exception as e:
        print(f"PyPDF2 failed on {pdf_path}: {e}")
        traceback.print_exc()
        return ""

def fallback_ocr(pdf_path):
    """OCR a whole PDF with the global EasyOCR ``reader`` and return one string.

    Each page is rasterised at 300 dpi; the recognised fragments of a page are
    joined with spaces, and the non-empty page strings are joined the same way.

    Args:
        pdf_path: Path of the PDF to rasterise and recognise.

    Returns:
        The recognised text of the document, or ``""`` when OCR fails (the
        error and traceback are printed).
    """
    try:
        page_texts = []
        for page_image in convert_from_path(str(pdf_path), dpi=300):
            detections = reader.readtext(np.array(page_image))
            # Index 1 of each detection holds the recognised string.
            joined = " ".join(det[1] for det in detections if det[1].strip())
            if joined:
                page_texts.append(joined)
        return " ".join(page_texts)
    except Exception as e:
        print(f"OCR failed on {pdf_path}: {e}")
        traceback.print_exc()
        return ""

# Records are streamed to a temporary file that is moved over OUTPUT only after
# every PDF has been handled. Opening OUTPUT itself in "w" mode would truncate
# it immediately, so an interrupted run would leave a partial passages file.
MIN_TEXT_CHARS = 40  # fewer characters than this from PyPDF2 triggers the OCR fallback
tmp_output = OUTPUT.with_name(OUTPUT.name + ".tmp")

with open(tmp_output, "w", encoding="utf-8") as fout:
    for pdf_dir in PDF_DIRS:
        if not pdf_dir.exists():
            continue
        # Sorted so the record order is the same on every run.
        for pdf in sorted(pdf_dir.glob("*.pdf")):
            print(f"\nProcessing {pdf.name}…")
            text = extract_text_pypdf2(pdf)
            source = "pypdf2"
            if len(text.strip()) < MIN_TEXT_CHARS:
                print(f" → too little ({len(text)} chars), OCR fallback…")
                text = fallback_ocr(pdf)
                source = "easyocr"
            if text.strip():
                # One record per PDF: the whole document is a single passage.
                rec = {
                    "pid": pdf.stem,
                    "page": 0,
                    "bbox": None,
                    "text": text.strip(),
                    "source": source
                }
                fout.write(json.dumps(rec, ensure_ascii=False) + "\n")
                print(f" ✓ extracted {len(text)} chars via {source}")
            else:
                print(f" ❌ still empty for {pdf.name}")

# Publish the finished file in one step.
tmp_output.replace(OUTPUT)

print(f"\nDone! Results in {OUTPUT}")

KeyboardInterrupt: 

## Step 3: Hugging Face Translate

In [1]:
from transformers import MarianMTModel, MarianTokenizer
import torch
import os
import json
from tqdm import tqdm
from opencc import OpenCC  # 新增：簡轉繁

# Download the HuggingFace model (English ➔ Chinese)
model_name = "Helsinki-NLP/opus-mt-en-zh"
tokenizer = MarianTokenizer.from_pretrained(model_name)
model = MarianMTModel.from_pretrained(model_name)

# Initialise the Simplified-to-Traditional converter
cc = OpenCC('s2t')  # Simplified Chinese ➔ Traditional Chinese

# Load the queries
# NOTE(review): absolute Colab path; other cells use paths relative to the
# working directory — confirm which layout is intended.
with open("/content/NTCIR-18-CLIR-pipeline-team6939/data/translated_query.json", "r", encoding="utf-8") as f:
    queries = json.load(f)

# Cache file path (named automatically after the model)
cache_path = f"/content/NTCIR-18-CLIR-pipeline-team6939/outputs/translated_cache_{model_name.replace('/', '_')}.json"
if os.path.exists(cache_path):
    with open(cache_path, "r", encoding="utf-8") as f:
        translated_cache = json.load(f)
else:
    translated_cache = {}

# 定義 NMT 翻譯函式（支援 cache + 自訂是否轉繁體）
def translate_with_nmt(query_en, convert_to_traditional=True):
    """Translate an English query into Chinese with the Marian model, using the cache.

    Results are memoised in the global ``translated_cache``. Traditional
    results are stored under the bare query string, which matches cache files
    written by earlier runs. Simplified results get a separate key, so a cached
    Traditional translation is never returned when Simplified output was
    requested (and vice versa).

    Args:
        query_en: English query text.
        convert_to_traditional: Convert the model output from Simplified to
            Traditional Chinese with OpenCC when true.

    Returns:
        The translated query, or ``""`` if translation raised an error (the
        error is printed and nothing is cached).
    """
    if convert_to_traditional:
        cache_key = query_en
    else:
        # A tab cannot be confused with an ordinary query ending.
        cache_key = f"{query_en}\t[zh-Hans]"

    if cache_key in translated_cache:
        return translated_cache[cache_key]
    try:
        inputs = tokenizer(query_en, return_tensors="pt", padding=True, truncation=True, max_length=512)
        # Inference only: no gradients needed.
        with torch.no_grad():
            translated = model.generate(**inputs, max_length=512, num_beams=5)
        result = tokenizer.decode(translated[0], skip_special_tokens=True)
        if convert_to_traditional:
            result = cc.convert(result)  # Simplified ➔ Traditional
        translated_cache[cache_key] = result
        return result
    except Exception as e:
        print(f"Translation error: {e}")
        return ""

# Run the batch translation: each query dict gains a "query_zh_nmt" field
translated_output = []
for item in tqdm(queries):
    zh = translate_with_nmt(item["query_en"], convert_to_traditional=True)  # toggle Traditional conversion here
    item["query_zh_nmt"] = zh
    translated_output.append(item)

# Save to disk: the translated queries and the translation cache
os.makedirs("/content/NTCIR-18-CLIR-pipeline-team6939/outputs", exist_ok=True)

with open("/content/NTCIR-18-CLIR-pipeline-team6939/outputs/translated_query_nmt.json", "w", encoding="utf-8") as f:
    json.dump(translated_output, f, indent=2, ensure_ascii=False)

with open(cache_path, "w", encoding="utf-8") as f:
    json.dump(translated_cache, f, indent=2, ensure_ascii=False)

print("✅ HuggingFace NMT 簡繁轉換翻譯完成並成功快取保存。")

ModuleNotFoundError: No module named 'transformers'

## Step 4: Run Retrieval (4 Models with Runtime Logging)

In [11]:

#%cd ./NTCIR-18-CLIR-pipeline-team6939
# Run every retrieval configuration through the repository's driver function.
# NOTE(review): the import only resolves when run_all_retrievals.py is on
# sys.path — presumably the working directory must be the repository root; confirm.
from run_all_retrievals import run_all_retrievals
run_all_retrievals()

🚀 Running BM25 baseline...
[⏱️] BM25 baseline took 0.18 seconds.

🚀 Running BM25 + Chinese BERT reranker...


Traceback (most recent call last):
  File "/Users/chiuyiting/Documents/GitHub/NTCIR-18-CLIR-pipeline-team6939/src/retrievers/bm25_only.py", line 28, in <module>
    with open(PASSAGE_PATH, 'r', encoding='utf-8') as f:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: 'NTCIR-18-CLIR-pipeline-team6939/outputs/runs/structured_passages.jsonl'
Traceback (most recent call last):
  File "/Users/chiuyiting/Documents/GitHub/NTCIR-18-CLIR-pipeline-team6939/src/reranker/reranker_zhbert.py", line 23, in <module>
    with open(PASSAGE_PATH, 'r', encoding='utf-8') as f:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: '/content/NTCIR-18-CLIR-pipeline-team6939/outputs/runs/structured_passages.jsonl'


[⏱️] BM25 + Chinese BERT reranker took 3.41 seconds.

🚀 Running Multilingual Dual Encoder...


No sentence-transformers model found with name ./models/labse. Creating a new one with mean pooling.
Traceback (most recent call last):
  File "/Users/chiuyiting/Documents/GitHub/NTCIR-18-CLIR-pipeline-team6939/src/retrievers/dual_encoder_dense.py", line 29, in <module>
    with open(PASSAGE_PATH, 'r', encoding='utf-8') as f:
         ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: 'outputs/runs/structured_passages.jsonl'


[⏱️] Multilingual Dual Encoder took 5.68 seconds.

🚀 Running Cross Encoder Reranker...


Traceback (most recent call last):
  File "/Users/chiuyiting/Documents/GitHub/NTCIR-pipeline/myenv/lib/python3.11/site-packages/transformers/utils/hub.py", line 424, in cached_files
    hf_hub_download(
  File "/Users/chiuyiting/Documents/GitHub/NTCIR-pipeline/myenv/lib/python3.11/site-packages/huggingface_hub/utils/_validators.py", line 106, in _inner_fn
    validate_repo_id(arg_value)
  File "/Users/chiuyiting/Documents/GitHub/NTCIR-pipeline/myenv/lib/python3.11/site-packages/huggingface_hub/utils/_validators.py", line 154, in validate_repo_id
    raise HFValidationError(
huggingface_hub.errors.HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': '/content/models/cross_encoder'. Use `repo_type` argument if needed.

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/Users/chiuyiting/Documents/GitHub/NTCIR-18-CLIR-pipeline-team6939/src/reranker/cross_encoder_multilingual.py", line 22, in <modu

[⏱️] Cross Encoder Reranker took 2.29 seconds.

🧪 Retrieval Runtime Summary:
BM25 baseline                           : 0.18 seconds
BM25 + Chinese BERT reranker            : 3.41 seconds
Multilingual Dual Encoder               : 5.68 seconds
Cross Encoder Reranker                  : 2.29 seconds


#### (fine-tune done ✅) 目前 zhbert reranker 跑不出來，但因為你說要 fine tune 所以我先跳過他 

## Step 5: 合併結果成 retrieval_rankings.json ，然後評分（評分的 code 有大改，因為原本切太細了，等你修好之後 evaluate 也要再改） 
### 4/29 更：evaluate 改了✅ : MRR 現在多了 cutoff 的參數 （MRR@10)

In [12]:
import json
from collections import defaultdict
from pathlib import Path
import sys

# ────────────────────────────────────────────────
# 1️⃣ Setup Path
# ────────────────────────────────────────────────
BASE         = Path("./")  # On Colab this is the root (working) directory
RUNS_DIR     = BASE / "outputs" / "runs"
OUT_RANK     = RUNS_DIR / "retrieval_rankings.json"
GROUND_TRUTH = BASE / "data" / "ground_truths_example.json"
CSV_OUT      = BASE / "outputs" / "evaluation_summary.csv"

RUNS_DIR.mkdir(parents=True, exist_ok=True)


def load_doc_rankings(run_file):
    """Read one run file (JSON Lines) into per-query document rankings.

    Every record must carry ``qid`` and ``pid``. The passage id is reduced to
    the part before the first underscore so that it lines up with the
    ground-truth ids. Because several passages of one document collapse to the
    same id, repeats are dropped while the first (best-ranked) occurrence is
    kept; otherwise the same document would fill several rank slots and skew
    the @k metrics. Blank lines are ignored.

    NOTE(review): ids are assumed to appear in rank order within the file, and
    document names are assumed not to contain underscores — confirm.

    Args:
        run_file: Path of the ``.jsonl`` run file.

    Returns:
        A ``defaultdict(list)`` mapping ``str(qid)`` to an ordered list of
        unique document ids.
    """
    rankings = defaultdict(list)
    seen = defaultdict(set)
    with open(run_file, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            r = json.loads(line)
            qid = str(r["qid"])
            doc_id = str(r["pid"]).split("_")[0]  # keep only the leading document id
            if doc_id not in seen[qid]:
                seen[qid].add(doc_id)
                rankings[qid].append(doc_id)
    return rankings


# ────────────────────────────────────────────────
# 2️⃣ Multiple models supported (only finished .jsonl runs are imported)
# ────────────────────────────────────────────────
retrieval = {}
available_models = [
    "bm25_only_query_zh_nmt",
    "bm25_rerank_query_zh_nmt",
    "bm25_only_query",
    "bm25_rerank_query",
    "dense_dual_encoder",
    "cross_encoder_rerank"
]

for model_name in available_models:
    fn = RUNS_DIR / f"{model_name}.jsonl"
    if not fn.exists():
        print(f"⚠️ Skipping {model_name}: file not found.")
        continue

    retrieval[model_name] = load_doc_rankings(fn)

# ────────────────────────────────────────────────
# 3️⃣ Save as retrieval_rankings.json
# ────────────────────────────────────────────────
with open(OUT_RANK, 'w', encoding='utf-8') as f:
    json.dump(retrieval, f, indent=2, ensure_ascii=False)
print(f"✅ grouped rankings saved to {OUT_RANK}")

# ────────────────────────────────────────────────
# 4️⃣ Evaluate all models (MRR@10, Recall@10/100, NDCG@10/100)
# ────────────────────────────────────────────────
# Make the repository's src/ directory importable before loading the evaluator.
sys.path.append(str(BASE / "src"))
from evaluation.evaluation_summary import evaluate_all_models

df = evaluate_all_models(
    ranking_path=str(OUT_RANK),
    ground_truth_path=str(GROUND_TRUTH),
    output_csv_path=str(CSV_OUT),
    ks=[10, 100]
)
df

⚠️ Skipping bm25_only: file not found.
⚠️ Skipping dense_dual_encoder: file not found.
⚠️ Skipping cross_encoder: file not found.
✅ grouped rankings saved to outputs/runs/retrieval_rankings.json


ModuleNotFoundError: No module named 'pandas'

                Model  MRR  Recall@10  NDCG@10  Recall@100  NDCG@100
0           bm25_only  0.0        0.0      0.0         0.0       0.0
1  dense_dual_encoder  0.0        0.0      0.0         0.0       0.0
2       cross_encoder  0.0        0.0      0.0         0.0       0.0

## Step 6: Translation Error Impact Analysis (還沒試過)
###  4/29 2100更：我也更新了這段 +translate error_analysis的code

In [None]:
from src.analysis.translate_error_analysis import extract_translation_impact

# Group queries by how translation affected retrieval.
# NOTE(review): the loop below assumes `impact` maps a category name to a list
# of (qid, en, zh, pred, gt) tuples — confirm against extract_translation_impact.
impact = extract_translation_impact(
    queries_path="data/translated_query.json",
    predictions_path="outputs/runs/retrieval_rankings.json",
    ground_truth_path="data/ground_truths_example.json"
)

for category, group in impact.items():
    print(f"\n== {category.upper()} ({len(group)} samples) ==")
    for qid, en, zh, pred, gt in group[:1]:  # show one example per category
        print(f"QID: {qid}\nEN: {en}\nZH(NMT): {zh}\nPRED_TOPK: {pred}\nGT: {gt}\n---")