In [None]:
from qdrant_client import QdrantClient, models
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
import os
import requests

In [None]:
# Local directory that holds the Qwen3-Reranker weights (loaded offline).
model_path = os.path.expanduser("Qwen3-Reranker-0.6B")

# padding_side="left": compute_logits reads the logits at the LAST sequence
# position (logits[:, -1, :]). In a padded batch that position is only the
# real final prompt token for every row when padding goes on the left.
reranker_tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    local_files_only=True,
    trust_remote_code=True,
    padding_side="left",
)

# .eval() switches the model to inference mode.
reranker_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    local_files_only=True,
    trust_remote_code=True
).eval()

In [None]:
# Vocabulary ids of the two answer tokens the reranker chooses between
# ("no" -> not relevant, "yes" -> relevant); looked up in a single call.
token_false_id, token_true_id = reranker_tokenizer.convert_tokens_to_ids(["no", "yes"])

In [None]:
# Maximum number of tokens per reranker input (8192), prompt wrapper included.
max_reranker_length = 8 * 1024

In [None]:
# Chat-template text placed before the formatted instruction/query/document:
# the system turn plus the opening of the user turn.
prefix = (
    "<|im_start|>system\n"
    "Judge whether the Document meets the requirements based on the Query "
    "and the Instruct provided. Note that the answer can only be "
    '"yes" or "no".'
    "<|im_end|>\n"
    "<|im_start|>user\n"
)
# Closes the user turn and opens the assistant turn.
suffix = "<|im_end|>\n" "<|im_start|>assistant\n"

In [None]:
# Pre-tokenize the fixed prompt wrapper once so its length can be subtracted
# from the per-pair token budget.
prefix_tokens, suffix_tokens = (
    reranker_tokenizer.encode(template_text, add_special_tokens=False)
    for template_text in (prefix, suffix)
)

In [None]:
# Reranker helper functions
def format_instruction(instruction, query, doc):
    """Build the text block the reranker judges for one (query, document) pair.

    Args:
        instruction: Task instruction; when None a default retrieval
            instruction is used.
        query: The search query.
        doc: The candidate document text.

    Returns:
        A three-line string with ``<Instruct>``, ``<Query>`` and ``<Document>``
        fields.
    """
    task = '根據查詢檢索相關文件' if instruction is None else instruction
    return f"<Instruct>: {task}\n<Query>: {query}\n<Document>: {doc}"

In [None]:
def get_embedding(texts):
    """Fetch embeddings for one or more texts from the embedding HTTP API.

    Args:
        texts: A single string or a list of strings.

    Returns:
        The value of the ``"embedding"`` field of the JSON response (an empty
        list if the field is absent). Any failure is reported on stdout and
        an empty list is returned instead of raising.
    """
    # The API payload always carries a list, so wrap a bare string.
    batch = [texts] if isinstance(texts, str) else texts
    request_body = {"texts": batch, "normalize": True, "batch_size": 32}

    try:
        reply = requests.post(EMBEDDING_API_URL, json=request_body, timeout=10)
        reply.raise_for_status()
        return reply.json().get("embedding", [])
    except Exception as err:
        # Best-effort: callers receive [] rather than an exception.
        print(f"* Embedding failed: {err}")
        return []

In [None]:
def process_inputs(pairs):
    """Turn formatted instruction/query/document strings into model inputs.

    Each string is truncated to the token budget left after the prompt
    prefix and suffix, wrapped with them, and then the whole batch is
    tokenized and padded in one call.

    Args:
        pairs: Iterable of strings produced by ``format_instruction``.

    Returns:
        The tokenizer's batch output with every tensor moved to the
        reranker model's device.
    """
    # Tokens available for the pair itself once the fixed wrapper is counted.
    pair_budget = max_reranker_length - len(prefix_tokens) - len(suffix_tokens)

    processed_pairs = []
    for pair in pairs:
        pair_ids = reranker_tokenizer.encode(
            pair,
            add_special_tokens=False,
            truncation=True,
            max_length=pair_budget
        )
        full_ids = prefix_tokens + pair_ids + suffix_tokens
        processed_pairs.append(reranker_tokenizer.decode(full_ids))

    # Tokenize the whole batch once, after every pair has been wrapped.
    # return_tensors="pt" is required so the values are tensors that can be
    # moved to the model's device below.
    inputs = reranker_tokenizer(
        processed_pairs,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=max_reranker_length
    )

    for key in inputs:
        inputs[key] = inputs[key].to(reranker_model.device)

    return inputs

In [None]:
@torch.no_grad()
def compute_logits(inputs):
    """Compute one relevance score per input.

    The score is the probability of the "yes" token after a softmax taken
    over only the "no" and "yes" logits at the last sequence position.

    Args:
        inputs: Keyword inputs accepted by ``reranker_model``.

    Returns:
        A list of floats, one per batch row.
    """
    last_step_logits = reranker_model(**inputs).logits[:, -1, :]

    # Column 0 holds the "no" logit, column 1 the "yes" logit.
    yes_no_logits = last_step_logits[:, [token_false_id, token_true_id]]
    log_probs = torch.nn.functional.log_softmax(yes_no_logits, dim=1)

    return log_probs[:, 1].exp().tolist()

In [None]:
def rerank_documents(query, documents, task_instruction=None):
    """Rerank documents against a query with the Qwen3 reranker.

    Args:
        query: Query string.
        documents: Iterable of document strings. It is materialized into a
            list, so one-shot iterables are supported.
        task_instruction: Optional task instruction; a default technical
            retrieval instruction is used when None.

    Returns:
        List of ``(document, score)`` tuples sorted by score, highest first.
        An empty list when there are no documents.
    """
    # Materialize once: the documents are traversed twice (formatting, then
    # pairing with scores), which would silently exhaust a generator.
    documents = list(documents)
    if not documents:
        # Nothing to score; skip tokenization and the model call entirely.
        return []

    if task_instruction is None:
        task_instruction = '根據查詢檢索相關的技術文件'

    # Format one reranker input per document.
    pairs = [format_instruction(task_instruction, query, doc) for doc in documents]

    # Tokenize the batch and score it.
    inputs = process_inputs(pairs)
    scores = compute_logits(inputs)

    # Pair each document with its score and order by descending score.
    return sorted(zip(documents, scores), key=lambda item: item[1], reverse=True)


In [None]:
# Hybrid search with a reranking stage
def hybrid_search_with_rerank(query: str, initial_limit: int = 20, final_limit: int = 3):
    """Run RRF-fused hybrid search, then rerank the candidates.

    Args:
        query: Query string.
        initial_limit: Number of candidates requested from each prefetch and
            from the fused query; these are the documents that get reranked.
        final_limit: Number of top reranked results to return.

    Returns:
        Up to ``final_limit`` ``(document_text, score)`` tuples, highest
        score first, or an empty list when the search returns no points.
    """
    # Embed the query through the embedding API.
    # NOTE(review): `get_embeddings`, `client` and `collection_name` are not
    # defined in the visible cells (only `get_embedding`, without a
    # `task_description` parameter, is) — confirm they are defined earlier.
    query_embedding = get_embeddings([query], task_description="檢索技術文件")[0]

    # Hybrid search fused with Reciprocal Rank Fusion (RRF).
    response = client.query_points(
        collection_name=collection_name,
        prefetch=[
            # BM25 keyword search over the sparse vector.
            models.Prefetch(
                query=models.Document(
                    text=query,
                    model="Qdrant/bm25",
                ),
                using="sparse",
                limit=initial_limit,
            ),
            # Semantic search over the dense vector.
            models.Prefetch(
                query=query_embedding,
                using="dense",
                limit=initial_limit,
            ),
        ],
        # Fuse both result lists with RRF.
        query=models.FusionQuery(fusion=models.Fusion.RRF),
        limit=initial_limit,
    )

    # Extract the candidate document texts.
    candidate_docs = [point.payload["text"] for point in response.points]

    if not candidate_docs:
        return []

    # Rerank the candidates with the reranker model.
    print(f"正在對 {len(candidate_docs)} 個候選文件進行重排...")
    reranked_results = rerank_documents(query, candidate_docs)

    # Keep only the top-k results.
    top_results = reranked_results[:final_limit]

    print(f"\n查詢: {query}")
    print(f"重排後的 Top {final_limit} 結果:")
    print("=" * 80)

    for i, (doc, score) in enumerate(top_results, 1):
        print(f"\n[{i}] 相關性分數: {score:.4f}")
        print(f"文件: {doc}")
        print("-" * 80)

    return top_results

In [None]:
# Run hybrid search + reranking: fetch the top-N (20) candidates and keep
# the top-K (3) after reranking.
query = "如何使用向量資料庫進行語義搜索？"
results = hybrid_search_with_rerank(query=query, initial_limit=20, final_limit=3)


In [None]:
# Local directory that holds the Qwen3-Reranker weights ("~" is expanded).
model_path = os.path.expanduser("~/AI/Models/Qwen3-Reranker-0.6B")


# Load the tokenizer and model from local files only.
# padding_side="left": the relevance score is read from the logits at the
# last sequence position, so in a padded batch the padding must go on the
# left for that position to be the real final prompt token of every row.
reranker_tokenizer = AutoTokenizer.from_pretrained(
    model_path,
    local_files_only=True,
    trust_remote_code=True,
    padding_side="left",
)


# .eval() switches the model to inference mode.
reranker_model = AutoModelForCausalLM.from_pretrained(
    model_path,
    local_files_only=True,
    trust_remote_code=True
).eval()


print("Reranker model loaded successfully.")


# Vocabulary ids of the two answer tokens; needed later to read the
# "yes"/"no" logits out of the model output.
token_false_id = reranker_tokenizer.convert_tokens_to_ids("no")
token_true_id = reranker_tokenizer.convert_tokens_to_ids("yes")


# Maximum number of tokens per reranker input, prompt wrapper included.
max_reranker_length = 8192


# Prompt template: system turn + start of the user turn, and the text that
# closes the user turn and opens the assistant turn.
prefix = "<|im_start|>system\nJudge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be \"yes\" or \"no\".<|im_end|>\n<|im_start|>user\n"
suffix = "<|im_end|>\n<|im_start|>assistant\n"


# Pre-tokenize the fixed wrapper once so its length can be budgeted.
prefix_tokens = reranker_tokenizer.encode(prefix, add_special_tokens=False)
suffix_tokens = reranker_tokenizer.encode(suffix, add_special_tokens=False)


print("Reranker configuration completed.")


In [None]:
def format_instruction(instruction, query, doc):
    """Format one (instruction, query, document) triple for the reranker.

    A default retrieval instruction is substituted when ``instruction`` is
    None. The result is three newline-separated fields: ``<Instruct>``,
    ``<Query>`` and ``<Document>``.
    """
    if instruction is None:
        instruction = '根據查詢檢索相關文件'

    fields = (
        f"<Instruct>: {instruction}",
        f"<Query>: {query}",
        f"<Document>: {doc}",
    )
    return "\n".join(fields)

In [None]:
def process_inputs(pairs):
    """Tokenize formatted reranker inputs into a padded batch on the model device.

    Args:
        pairs: Iterable of strings produced by ``format_instruction``.

    Returns:
        The tokenizer's batch output with every tensor moved to the
        reranker model's device.
    """
    # Token budget left for the pair once the fixed wrapper is accounted for.
    pair_budget = max_reranker_length - len(prefix_tokens) - len(suffix_tokens)

    def _wrap(text):
        # Truncate the pair, surround it with the prompt wrapper, and turn
        # the ids back into text for the batched tokenizer call.
        body_ids = reranker_tokenizer.encode(
            text,
            add_special_tokens=False,
            truncation=True,
            max_length=pair_budget,
        )
        return reranker_tokenizer.decode(prefix_tokens + body_ids + suffix_tokens)

    prompts = [_wrap(text) for text in pairs]

    # Encode and pad the whole batch in a single call.
    inputs = reranker_tokenizer(
        prompts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=max_reranker_length,
    )

    # Move every tensor to the device the model lives on.
    target_device = reranker_model.device
    for name in inputs:
        inputs[name] = inputs[name].to(target_device)

    return inputs


In [None]:
from openai import OpenAI
from deepeval.models import DeepEvalBaseLLM


class LlamaCppModel(DeepEvalBaseLLM):
    """DeepEval LLM wrapper around an OpenAI-compatible chat-completions endpoint.

    Args:
        base_url: Base URL of the OpenAI-compatible server.
        model_name: Model identifier sent with every request.
    """

    def __init__(
        self,
        base_url="https://ws-02.wade0426.me/v1",
        model_name="local-model"
    ):
        self.base_url = base_url
        self.model_name = model_name

    def load_model(self):
        """Create an OpenAI client pointed at ``base_url``."""
        # "NoNeed" is a placeholder key — presumably the endpoint does not
        # check it; confirm against the server configuration.
        return OpenAI(
            api_key="NoNeed",
            base_url=self.base_url
        )

    def generate(self, prompt: str) -> str:
        """Send ``prompt`` as a single user message and return the reply text."""
        client = self.load_model()
        response = client.chat.completions.create(
            model=self.model_name,
            messages=[
                {"role": "user", "content": prompt}
            ],
            temperature=0.7,
        )
        return response.choices[0].message.content

    async def a_generate(self, prompt: str) -> str:
        """Async variant of ``generate``.

        The blocking HTTP call runs in the event loop's default executor so
        that awaiting it does not stall other coroutines. ``generate`` builds
        its own client per call, so no state is shared between threads.
        """
        import asyncio

        loop = asyncio.get_running_loop()
        return await loop.run_in_executor(None, self.generate, prompt)

    def get_model_name(self):
        """Return a display name for this model."""
        return f"Llama.cpp ({self.model_name})"


In [None]:
from deepeval.metrics import FaithfulnessMetric
from deepeval.test_case import LLMTestCase


# Instantiate the custom evaluation model against the OpenAI-compatible endpoint.
custom_llm = LlamaCppModel(base_url="https://ws-02.wade0426.me/v1", model_name="your-model-name")
