# 📘 NTCIR Cross-Lingual Retrieval Pipeline Demo
From PDF to Passage Retrieval with Translation, BM25, Dense Embedding, and Reranking

In [2]:
# 📦 Step 1: Setup Environment and Mount Drive
!apt install poppler-utils -y
!pip3 install pymupdf easyocr pdf2image openai rank_bm25 sentence-transformers transformers faiss-cpu
from google.colab import drive
drive.mount('/content/drive')

zsh:1: command not found: apt

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.0.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49m/usr/local/opt/python@3.13/bin/python3.13 -m pip install --upgrade pip[0m
[1;31merror[0m: [1mexternally-managed-environment[0m

[31m×[0m This environment is externally managed
[31m╰─>[0m To install Python packages system-wide, try brew install
[31m   [0m xyz, where xyz is the package you are trying to
[31m   [0m install.
[31m   [0m 
[31m   [0m If you wish to install a Python library that isn't in Homebrew,
[31m   [0m use a virtual environment:
[31m   [0m 
[31m   [0m python3 -m venv path/to/venv
[31m   [0m source path/to/venv/bin/activate
[31m   [0m python3 -m pip install xyz
[31m   [0m 
[31m   [0m If you wish to install a Python application that isn't in Homebrew,
[31m   [0m it may be easiest to use 'pipx i

ModuleNotFoundError: No module named 'google.colab'

In [3]:
# 📄 Step 2: Extract Paragraphs from PDF (PyMuPDF + EasyOCR)
import fitz
import easyocr
import json
import os
from pdf2image import convert_from_path

# Shared EasyOCR reader for the 'ch_tra' and 'en' language models, pinned to
# CPU. Created once here and reused by fallback_ocr_easyocr().
reader = easyocr.Reader(
    lang_list=['ch_tra', 'en'],
    gpu=False,
)

def extract_blocks_with_heuristics(pdf_path, min_block_length=40):
    """Extract text blocks from a PDF's text layer as passage records.

    Every page's blocks (``page.get_text("blocks")``) are visited in order of
    their ``y0`` coordinate. A block is kept when its text, stripped of
    surrounding whitespace and with newlines turned into spaces, has at least
    ``min_block_length`` characters.

    Args:
        pdf_path: Path of the PDF file to open with PyMuPDF.
        min_block_length: Minimum length of the cleaned text for a block to
            be emitted.

    Returns:
        A list of dicts with the keys ``pid`` (``{doc_id}_p{page}_b{index}``,
        where ``index`` is the block's position in the sorted page blocks),
        ``page`` (0-based page index), ``bbox`` (``[x0, y0, x1, y1]``) and
        ``text`` (the cleaned block text).
    """
    doc_id = os.path.splitext(os.path.basename(pdf_path))[0]
    results = []
    # The context manager closes the document even if a page fails to parse.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc):
            blocks = sorted(page.get_text("blocks"), key=lambda b: b[1])
            for i, block in enumerate(blocks):
                x0, y0, x1, y1, text, *rest = block
                # Block tuples end with (block_no, block_type). A non-zero type
                # marks an image block whose "text" is only a descriptor string,
                # which would otherwise pass the length filter as a passage.
                if len(rest) >= 2 and rest[1] != 0:
                    continue
                clean_text = text.strip().replace("\n", " ")
                if len(clean_text) >= min_block_length:
                    results.append({
                        "pid": f"{doc_id}_p{page_num}_b{i}",
                        "page": page_num,
                        "bbox": [x0, y0, x1, y1],
                        "text": clean_text
                    })
    return results

def fallback_ocr_easyocr(pdf_path):
    """OCR each page of a PDF and return one passage record per page.

    Pages are rendered at 300 dpi with ``convert_from_path`` and recognised
    with the module-level EasyOCR ``reader``. The recognised fragments of a
    page are joined with single spaces; pages with no recognised text are
    omitted.

    Args:
        pdf_path: Path of the PDF file to rasterise and OCR.

    Returns:
        A list of dicts with the keys ``pid`` (``{doc_id}_ocr_{page}``),
        ``page`` (0-based page index), ``bbox`` (always ``None``) and
        ``text`` (the joined, stripped OCR text of the page).
    """
    # Local import: numpy is not imported by this notebook's import block.
    import numpy as np

    images = convert_from_path(pdf_path, dpi=300)
    results = []
    doc_id = os.path.splitext(os.path.basename(pdf_path))[0]
    for page_num, image in enumerate(images):
        # readtext() is documented for file paths, bytes and numpy arrays, so
        # hand it an ndarray rather than the rendered page image object itself.
        ocr_result = reader.readtext(np.asarray(image))
        full_text = " ".join([res[1] for res in ocr_result if len(res[1].strip()) > 0])
        if full_text.strip():
            results.append({
                "pid": f"{doc_id}_ocr_{page_num}",
                "page": page_num,
                "bbox": None,
                "text": full_text.strip()
            })
    return results

def process_pdf_file(pdf_path):
    """Extract passages from a PDF, falling back to OCR when needed.

    The text layer is tried first via ``extract_blocks_with_heuristics``.
    OCR (``fallback_ocr_easyocr``) is used when that extraction raises an
    error, returns nothing, or returns only segments shorter than 40
    characters.

    Args:
        pdf_path: Path of the PDF file to process.

    Returns:
        The list of passage dicts produced by whichever extractor was used.
    """
    try:
        segments = extract_blocks_with_heuristics(pdf_path)
        usable = bool(segments) and any(len(seg['text']) >= 40 for seg in segments)
    except Exception as exc:
        # Catch Exception rather than everything, so KeyboardInterrupt and
        # SystemExit still stop the run, and report why extraction failed.
        print(f"Text extraction failed for {pdf_path}: {exc}. Falling back to OCR.")
        usable = False

    if usable:
        return segments
    return fallback_ocr_easyocr(pdf_path)

# Run the extraction on the demo PDF and store the passages as JSON Lines
# (one JSON object per line, non-ASCII characters written as-is).
pdf_path = "/content/drive/MyDrive/pdfs/example.pdf"
results = process_pdf_file(pdf_path)

with open("structured_passages.jsonl", "w", encoding="utf-8") as jsonl_file:
    for passage in results:
        jsonl_file.write(json.dumps(passage, ensure_ascii=False) + "\n")

print("✅ Done extracting passages.")

ModuleNotFoundError: No module named 'frontend'

In [None]:
# 🌐 Step 3: GPT-based Batch Translation with Safety Check
import json
import os

import openai
from tqdm import tqdm

# Read the key from the OPENAI_API_KEY environment variable so a real key never
# has to be pasted into the notebook. The placeholder is only a fallback and
# must be replaced with your own key if the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "your-api-key-here")

# Load the query data.
with open("/content/questions_translated_en_fixed_q1.json", "r", encoding="utf-8") as f:
    full_queries = json.load(f)

# GPT translation helper (placeholder queries are skipped).
def translate_with_gpt(query_en, model="gpt-3.5-turbo"):
    """Translate an English search query into Traditional Chinese.

    Works with both generations of the ``openai`` package: the 1.x
    module-level client (``openai.chat.completions``) and the legacy 0.x
    ``openai.ChatCompletion`` API, which was removed in 1.0.

    Args:
        query_en: English query text.
        model: Name of the chat model to request.

    Returns:
        The translated text with surrounding whitespace removed.
        ``query_en`` is returned unchanged when it contains the
        ``"EN Translation of:"`` placeholder marker. An empty string is
        returned when the request fails (the error is printed) or the reply
        carries no content.
    """
    if "EN Translation of:" in query_en:
        return query_en  # treated as unprocessed / placeholder input
    try:
        messages = [
            {"role": "system", "content": "You are a professional translator who translates English financial search queries into Traditional Chinese."},
            {"role": "user", "content": f"Translate this search query into Traditional Chinese: '{query_en}'"}
        ]
        if hasattr(openai, "OpenAI"):
            # openai>=1.0: ChatCompletion is gone; the module-level client
            # honours openai.api_key just like the legacy API did.
            create_completion = openai.chat.completions.create
        else:
            create_completion = openai.ChatCompletion.create
        response = create_completion(
            model=model,
            messages=messages,
            temperature=0,
        )
        # content may be None (e.g. a refused or empty reply); treat as empty.
        content = response.choices[0].message.content
        return (content or "").strip()
    except Exception as e:
        print(f"Error translating: {query_en} -> {e}")
        return ""

# Run the translation: each query record gets its result stored under
# "query_zh_gpt" and is collected in input order.
translated_output = []
for query_record in tqdm(full_queries):
    query_record["query_zh_gpt"] = translate_with_gpt(query_record["query_en"])
    translated_output.append(query_record)

# Write the translated queries to disk as indented JSON.
with open("/content/translated_queries_gpt.json", "w", encoding="utf-8") as out_file:
    json.dump(translated_output, out_file, ensure_ascii=False, indent=2)

handled_count = len(translated_output)
print("✅ Complete Translate with GPT, %d queries handled! " % handled_count)