In [1]:
from ragflow_sdk import RAGFlow
import pandas as pd

In [2]:
base_url='http://192.168.5.85:9380'
RAGFLOW_API_KEY='ragflow-Y5ZjA5YWY2MzU0ZDExZjBhYjkwMDI0Mm'

In [3]:
MODEL_NAME='gemini-2.5-flash-lite-preview-06-17'

In [4]:
TEMPERATURE=0.2
TOP_P=0.3
PRESENCE_PENALTY=0.4
FREQUENCY_PENALTY=0.7

In [5]:
# Retrive config
RERANKER='BAAI/bge-reranker-v2-m3___Huggingface'
VECTOR_SIMILARITY_WEIGHT=0.9
SIMILARITY_THRESHOLD=0.2
EMBEDDING_TOP_K=100
RERANKER_TOP_K=100

In [6]:
def create_prompt_templates():
    bio_prompt_text = f"""Given a paragraph of patient information from discharge note, please extract the phenotype about this patient only.

Check the Human Phenotype Ontology (HPO) database to determine the phenotype.

Only output the extracted phenotypes.

Use the format: {{‘HPO’: ‘HP:0000000’, ‘Phenotype’: ‘Phenotype description’}}

Patient information:"""
    return bio_prompt_text

In [7]:
llm_config = {
    "model_name": MODEL_NAME,
    "temperature": TEMPERATURE,
    "top_p":TOP_P,
    "presence_penalty":PRESENCE_PENALTY,
    "frequency_penalty": FREQUENCY_PENALTY,
    #"max_tokens": 7000 # remove to debug
}

In [9]:
rag_client = RAGFlow(api_key=RAGFLOW_API_KEY, base_url=base_url)
rag_client.delete_chats()

In [10]:
import time
import random
import uuid

def create_chat(#clinical_summary: str,
                rerank_id, 
            vector_similarity_weight: float = 0.3, similarity_threshold: float = 0.2, 
            embedding_top_k: int = 100, reranker_top_k: int = 100):

        
    # Get prompt templates
    bio_prompt_text= create_prompt_templates()
    
    # Create prompt configuration based on type
    prompt_config = {
        "prompt": bio_prompt_text,
        "show_quote": False,
        "top_n": 1,
        "similarity_threshold": similarity_threshold,
        "keywords_similarity_weight": 1-vector_similarity_weight,
        "rerank_model": rerank_id,
        "top_k": 100,
        "variables": [{"key": "knowledge", "optional": True}],
        "empty_response": None,
    }
    unique_id = str(uuid.uuid4())
    # Create a unique name for the chat
    unique_name = f"phenotype_extractor_{int(time.time())}_{random.randint(1000, 99999999)}_{unique_id}"

    try:
        # Create the chat session
        assistant = rag_client.create_chat(
            name=unique_name,
        )
        
        update_data = {
            "llm": llm_config,
            "prompt": prompt_config
        }
        assistant.update(update_data)
        
        # 創建 session
        session = assistant.create_session(f'phenotype_type_summary_assistant_{int(time.time())}')
        return session
    except Exception as e:
        print(f"Error creating chat session: {str(e)}")
        raise

In [11]:
import json
import ast
import re
def output_json_formater(raw_string, output_type='json'):
    """
    一個強健的、有彈性的函式，用於解析來自 LLM 的 HPO 表型列表。
    它可以處理多種格式變化和單行錯誤，而不會導致整個函式崩潰。
    """
    # 1. 基本驗證：確保輸入不是空的
    if not raw_string or not isinstance(raw_string, str):
        return {} if output_type == 'json' else '-'

    # 2. 清洗字串：替換各種智慧引號，這是處理 LLM 輸出的關鍵第一步
    clean_string = raw_string.replace("’", "'").replace("‘", "'")
    clean_string = clean_string.replace('“', '"').replace('”', '"')

    # 3. 提取所有類字典結構：使用正規表達式尋找所有被 `{...}` 包裹的內容
    #    這是比按行分割更強大的方法，可以忽略行前綴和周圍的文字
    dict_strings = re.findall(r'\{.*?\}', clean_string)

    if not dict_strings:
        return {} if output_type == 'json' else '-'

    # 4. 逐一解析並收集有效資料
    valid_data_list = []
    for dict_str in dict_strings:
        try:
            # ast.literal_eval 是安全的，但仍需錯誤處理
            data = ast.literal_eval(dict_str)
            # 確保解析出來的是字典且包含必要欄位
            if isinstance(data, dict) and 'HPO' in data and 'Phenotype' in data:
                valid_data_list.append(data)
        except (ValueError, SyntaxError, MemoryError):
            # 如果單一項目解析失敗，印出提示並繼續處理下一個，而不是讓整個函式崩潰
            # print(f"Skipping malformed entry: {dict_str}")
            continue
    
    # 如果沒有任何一個項目被成功解析
    if not valid_data_list:
        return {} if output_type == 'json' else '-'

    # 5. 轉換成最終格式
    result_dict = {item['HPO']: item['Phenotype'] for item in valid_data_list}
    
    if output_type == 'json':
        return result_dict
    
    if output_type == 'string':
        # 直接從字典的鍵創建列表，因為鍵是唯一的
        return ','.join(result_dict.keys())
        
    # 預留一個預設回傳值
    return result_dict

In [12]:
import google.generativeai as genai

# 設定你的 API 金鑰
genai.configure(api_key="AIzaSyDcSzeBZQ8lvCkpl2693ogJ4HwFUCZ7MxQ")

import google.generativeai as genai
from typing import List, Optional

def gemini_price_counter(
    prompt: str,
    input_list: Optional[List[str]] = None,
    output_list: Optional[List[str]] = None,
    input_token_price_per_million: float = 0.1,
    output_token_price_per_million: float = 0.4,
    model_name: str = 'gemini-1.5-flash'
) -> dict:
    """
    計算 Gemini API 的 token 用量和預估費用。

    Args:
        prompt (str): 主要的提示字串。
        input_list (Optional[List[str]]): 額外的輸入內容列表。
        output_list (Optional[List[str]]): 模型回應的輸出內容列表。
        input_token_price_per_million (float): 每百萬個輸入 token 的價格 (美元)。
        output_token_price_per_million (float): 每百萬個輸出 token 的價格 (美元)。
        model_name (str): 用來計算 token 的模型名稱。

    Returns:
        dict: 包含 total_tokens 和 total_cost 的字典。
    """
    if input_list is None:
        input_list = []
    if output_list is None:
        output_list = []

    model = genai.GenerativeModel(model_name)
    total_input_tokens = 0
    total_output_tokens = 0

    # 處理輸入 token：將所有輸入內容合併為一個完整的請求
    combined_input_text = prompt + " ".join(input_list)
    total_input_tokens = model.count_tokens(combined_input_text).total_tokens

    # 處理輸出 token
    if output_list:
        combined_output_text = " ".join(output_list)
        total_output_tokens = model.count_tokens(combined_output_text).total_tokens

    # 計算總費用
    input_cost = (total_input_tokens / 1_000_000) * input_token_price_per_million
    output_cost = (total_output_tokens / 1_000_000) * output_token_price_per_million
    total_cost = input_cost + output_cost

    return {
        "total_input_tokens": total_input_tokens,
        "total_output_tokens": total_output_tokens,
        "total_tokens": total_input_tokens + total_output_tokens,
        "total_cost_usd": total_cost,
        "input_cost_usd": input_cost,
        "output_cost_usd": output_cost
    }

# --- 使用範例 ---
# if __name__ == "__main__":
#     main_prompt = "請幫我總結以下文字："
#     input_data = [
#         "Python 是一種高階、直譯式、通用且功能強大的程式語言。",
#         "Google Cloud 提供一系列雲端運算服務。"
#     ]
#     output_data = [
#         "Python 是一種功能強大的高階程式語言，而 Google Cloud 則提供多樣的雲端服務。"
#     ]

#     price_info = gemini_price_counter(
#         prompt=main_prompt,
#         input_list=input_data,
#         output_list=output_data,
#     )

#     print(f"總輸入 token 數: {price_info['total_input_tokens']}")
#     print(f"總輸出 token 數: {price_info['total_output_tokens']}")
#     print(f"總費用 (USD): {price_info['total_cost_usd']:.8f}")

In [12]:
clinical_summary = 'A a female shows clinical features including High forehead, Bulging forehead, Broad forehead, Arched eyebrows, small hypothenar eminence, Dropped arches, Abnormality of the reproductive system, Triangular head shape, Flat back of skull, Prominent back of the head, Decreased width of the forehead, Flat face, Face with broad temples and narrow chin, Circular face, Broad face, Wide-set eyes, Eye folds, Bushy eyebrows, Sparse eyebrow, Increased length of eyelashes, Downward slanting of the opening between the eyelids, Drooping upper eyelid, Deep set eye, Bulging eye, Cat eye, Clouding of the lens of the eye, ectopia lentis, esodeviation, exodeviation, Broad flat nasal bridge, Elongated nose, Bulbous nose, Nasal tip, upturned, Decreased size of nasal tip, low hanging columella, long philtrum, short philtrum, smooth philtrum, deep philtrum, Thin red part of the upper lip, Full upper lip, Decreased volume of lower lip, Full lower lip, Drooping lower lip, Elevated palate, Cleft palate, Abnormality of dental shape, Apple cheeks, Small ears, Low set ears, Ears rotated toward back of head, Conductive deafness, Low frontal hairline, Brittle hair, Decreased length of neck, Neck webbing, small thenar eminence, Broad thumb, Club feet, Long big toe, Bunion, Toe curvature, and umbilical hernia, consistent with a diagnosis of jacobsen syndrome (omim:147791).'

In [13]:
# # Retrive config
# RERANKER='BAAI/bge-reranker-v2-m3___Huggingface'
# VECTOR_SIMILARITY_WEIGHT=0.9
# SIMILARITY_THRESHOLD=0.2
# EMBEDDING_TOP_K=100
# RERANKER_TOP_K=100

In [14]:
# session = create_chat(RERANKER, VECTOR_SIMILARITY_WEIGHT, 
#     SIMILARITY_THRESHOLD, EMBEDDING_TOP_K, RERANKER_TOP_K)

In [43]:
import threading
import concurrent.futures
from functools import partial
from tqdm.notebook import tqdm
import json

# 假設這些是您既有的函式和變數
# from some_module import create_chat, output_json_formater, RERANKER, ...

# -----------------
# 步驟 1: 建立全域的執行緒安全快取
# -----------------
# 這是我們用來儲存已處理結果的地方。
# 使用 threading.Lock 來確保在讀取或寫入快取時不會有兩個執行緒同時操作，造成資料混亂。
output_cache = {}
cache_lock = threading.Lock()

# -----------------
# 步驟 2: 建立 Thread-Local 儲存物件
# -----------------
thread_local_data = threading.local()

# -----------------
# 步驟 3: 建立新的 Worker 函式，整合 Session 重複使用和快取邏輯
# -----------------
def worker_with_cache(summary, output_type):
    """
    這個 worker 函式會為每個執行緒創建並重複使用一個 session。
    並在處理前，先檢查快取中是否已有結果。
    """
    # 執行緒安全的快取檢查
    with cache_lock:
        if summary in output_cache:
            # 如果快取中已有結果，直接回傳
            print(f"快取命中！跳過處理：'{summary[:30]}...'")
            return output_cache[summary]

    # 檢查當前這個執行緒是否已經初始化過 session
    if not hasattr(thread_local_data, 'session'):
        print(f"執行緒 {threading.get_ident()}: 正在初始化新的 session...")
        thread_local_data.session = create_chat(
            RERANKER,
            VECTOR_SIMILARITY_WEIGHT,
            SIMILARITY_THRESHOLD,
            EMBEDDING_TOP_K,
            RERANKER_TOP_K
        )

    session = thread_local_data.session

    try:
        stream_response = []
        for ans in session.ask(summary, stream=True):
            stream_response.append(ans)

        if stream_response:
            last_response = stream_response[-1]
            result = output_json_formater(last_response.content, output_type=output_type)
            time.sleep(1)
        else:
            result = '-'
    
    except Exception as e:
        print(f"執行緒 {threading.get_ident()}: 處理 '{summary[:30]}...' 時發生錯誤: {e}")
        result = None

    # 將結果儲存到快取中
    # 在寫入快取時也需要鎖定
    with cache_lock:
        output_cache[summary] = result

    return result

# -----------------
# 步驟 4: 修改主函式以適配新的 Worker
# -----------------
def process_summaries_in_parallel(summaries_list, output_type='string', max_workers=10):
    """
    使用多執行緒平行處理臨床摘要列表，並為每個執行緒重複使用 session。
    """
    # 在開始處理前，你可以選擇性地檢查快取，排除已處理過的項目
    # 但在 worker 內部檢查通常更高效，因為它能處理重複的輸入。

    with concurrent.futures.ThreadPoolExecutor(max_workers=max_workers) as executor:
        task_function = partial(worker_with_cache, output_type=output_type)
        results_iterator = executor.map(task_function, summaries_list)
        
        # 使用 tqdm 顯示進度
        results = list(tqdm(results_iterator, total=len(summaries_list), desc="處理臨床摘要中"))

    return results

In [46]:
output_cache = {key: value for key, value in output_cache.items() if value != '-'}

In [48]:
len(output_cache)

888

In [49]:
output_cache

{'A a female shows clinical features including High forehead, Bulging forehead, Broad forehead, Arched eyebrows, small hypothenar eminence, Dropped arches, Abnormality of the reproductive system, Triangular head shape, Flat back of skull, Prominent back of the head, Decreased width of the forehead, Flat face, Face with broad temples and narrow chin, Circular face, Broad face, Wide-set eyes, Eye folds, Bushy eyebrows, Sparse eyebrow, Increased length of eyelashes, Downward slanting of the opening between the eyelids, Drooping upper eyelid, Deep set eye, Bulging eye, Cat eye, Clouding of the lens of the eye, ectopia lentis, esodeviation, exodeviation, Broad flat nasal bridge, Elongated nose, Bulbous nose, Nasal tip, upturned, Decreased size of nasal tip, low hanging columella, long philtrum, short philtrum, smooth philtrum, deep philtrum, Thin red part of the upper lip, Full upper lip, Decreased volume of lower lip, Full lower lip, Drooping lower lip, Elevated palate, Cleft palate, Abnor

In [None]:
discache

In [72]:
# output_str = gemini_phenotype_extractor(clinical_summary)

In [14]:
df_hpo_mapping = pd.read_csv('./reference/2025-08-01_orphanet_WGS_database(HPO_ID_Mapping_v20250506).csv', sep=',')
hpo_mapping_dict = {}
for idx in df_hpo_mapping.index:
    hpo_mapping_dict[df_hpo_mapping.input_hpo_id[idx]] = df_hpo_mapping.mapped_main_id[idx]
    

In [15]:
def hpo_map(hpo_id_list_str):
    if hpo_id_list_str =='-':
        return '-'
    hpo_id_list = hpo_id_list_str.split(',')
    normalized_hpo_id_list = []
    for hpo_id in hpo_id_list:
        if hpo_id not in normalized_hpo_id_list:
            normalized_hpo_id_list.append(hpo_mapping_dict.get(hpo_id,'-'))
    return ','.join(normalized_hpo_id_list)

def accuracy_calculator(true_hpo_str, pred_hpo_str):
    true_hpo_list = true_hpo_str.split(',')
    pred_hpo_list = pred_hpo_str.split(',')
    mapped_hpo_list = [hpo for hpo in pred_hpo_list if hpo in true_hpo_list]
    return len(mapped_hpo_list)/len(true_hpo_list)

## Gemini Flash Lite 2.5 Usage and Price 

In [16]:
import pandas as pd
phenopacket = pd.read_excel('./reference/20250805_phenopacket_with_demographics_summary_phenobert_result.xlsx')

In [17]:
phenopacket.head(2)

Unnamed: 0,case_id,sex,age,diagnoses,hpo_terms,clinical_summary,hpo_ids,normalized_hpo_ids,phenobert_hpo_ids,normalized_phenobert_hpo_ids,phenobert_accuracy
0,PMID_15266616_100,FEMALE,,Jacobsen syndrome (OMIM:147791),High forehead (HP:0000348); Prominent forehead...,A a female shows clinical features including H...,"HP:0000348,HP:0011220,HP:0000337,HP:0002553,HP...","HP:0000348,HP:0011220,HP:0000337,HP:0002553,HP...","HP:0006482,HP:0000078,HP:0000293,HP:0002553,HP...","HP:0006482,HP:0000078,HP:0000293,HP:0002553,HP...",0.952381
1,PMID_15266616_101,FEMALE,,Jacobsen syndrome (OMIM:147791),High forehead (HP:0000348); Prominent forehead...,This case involves A a female diagnosed with j...,"HP:0000348,HP:0011220,HP:0000337,HP:0000311,HP...","HP:0000348,HP:0011220,HP:0000337,HP:0000311,HP...","HP:0006482,HP:0000078,HP:0000293,HP:0002553,HP...","HP:0006482,HP:0000078,HP:0000293,HP:0002553,HP...",0.9375


In [18]:
clinical_summary_list = phenopacket['clinical_summary'].to_list()
hpo_terms_list = phenopacket['hpo_terms'].to_list()
prompt_1 = f"""Given a paragraph of patient information from discharge note, please extract the phenotype about this patient only.

Check the Human Phenotype Ontology (HPO) database to determine the phenotype.

Only output the extracted phenotypes.

Use the format: {{‘HPO’: ‘HP:0000000’, ‘Phenotype’: ‘Phenotype description’}}

Patient information:"""

In [19]:
price_info = gemini_price_counter(
    prompt=prompt_1,
    input_list=clinical_summary_list,
    output_list=hpo_terms_list,
)

print(f"總輸入 token 數: {price_info['total_input_tokens']}")
print(f"總輸出 token 數: {price_info['total_output_tokens']}")
print(f"總費用 (USD): {price_info['total_cost_usd']:.8f}")

總輸入 token 數: 1066473
總輸出 token 數: 2465233
總費用 (USD): 1.09274050


In [20]:
from tqdm.notebook import tqdm
tqdm.pandas()

In [44]:
gemini_FL_hpo_list =  process_summaries_in_parallel(phenopacket['clinical_summary'].to_list(), max_workers=1)

執行緒 123630722733760: 正在初始化新的 session...


處理臨床摘要中:   0%|          | 0/8174 [00:00<?, ?it/s]

快取命中！跳過處理：'This case involves A a female ...'
快取命中！跳過處理：'Recurrent fungal infections, d...'
快取命中！跳過處理：'Chronic mucocutaneous candidia...'
快取命中！跳過處理：'Chronic mucocutaneous candidia...'
快取命中！跳過處理：'Megaloblastic anemia, high uri...'
快取命中！跳過處理：'Clinical findings such as Inte...'
快取命中！跳過處理：'This case involves A a female ...'
快取命中！跳過處理：'Intellectual disability, incre...'
快取命中！跳過處理：'Clinical findings such as Inte...'
快取命中！跳過處理：'Abnormality of balance, loss o...'
快取命中！跳過處理：'Clinical findings such as Brui...'
快取命中！跳過處理：'Clinical findings such as lowe...'
快取命中！跳過處理：'Patchy darkened skin, decrease...'
快取命中！跳過處理：'Patchy darkened skin, decrease...'
快取命中！跳過處理：'A a male with intellectual dev...'
快取命中！跳過處理：'This case involves A a female ...'
快取命中！跳過處理：'Clinical findings such as Shor...'
快取命中！跳過處理：'Short feet, circular face, inc...'
快取命中！跳過處理：'Clinical findings such as Shor...'
快取命中！跳過處理：'Short feet, circular face, inc...'
快取命中！跳過處理：'A a female with ectopia lentis...'
快取命中！跳過處理：'A a female with ectopia

In [25]:
gemini_FL_hpo_list

['HP:0000218,HP:0011194,HP:0000598,HP:0010770,HP:0001892,HP:0000119,HP:0000278,HP:0005488,HP:0000256,HP:0002095,HP:0000324,HP:0000597,HP:0000322,HP:0000508,HP:0000490,HP:0000426,HP:0000343,HP:0000219,HP:0000175,HP:0000368,HP:0000405,HP:0002224',
 'HP:0000218,HP:0011191,HP:0000241,HP:0000634,HP:0000598,HP:0000636,HP:0000582,HP:0000508,HP:0000457,HP:0000173,HP:0000343,HP:0000220,HP:0000298,HP:0010773,HP:0001884,HP:0000230,HP:0000490,HP:0000505,HP:0000485,HP:0000507,HP:0000426,HP:0000219,HP:0000175,HP:0000368,HP:0001314,HP:0002224,HP:0000132,HP:0000955,HP:0000150',
 'HP:0000211,HP:0000213,HP:0000336,HP:0000587,HP:0000508,HP:0000343,HP:0000369,HP:0000370,HP:0011744,HP:0011745,HP:0001882,HP:0001874,HP:0001877,HP:0001875,HP:0005498,HP:0000244,HP:0000236,HP:0011220,HP:0000241,HP:0000586,HP:0000577,HP:0000578,HP:0000598,HP:0000490,HP:0000539,HP:0000505,HP:0000477,HP:0000511,HP:0000512,HP:0000513,HP:0000426,HP:0000431,HP:0000433,HP:0000434,HP:0000440,HP:0000219,HP:0000220,HP:0000218,HP:0000175,

In [26]:
phenopacket['gemini_flash_lite_25_hpo_ids'] = gemini_FL_hpo_list

In [27]:
phenopacket['gemini_flash_lite_25_hpo_idss'] = phenopacket['gemini_flash_lite_25_hpo_ids'].fillna('-')

In [30]:
phenopacket['normalized_gemini_flash_lite_25_hpo_ids'] = phenopacket['gemini_flash_lite_25_hpo_ids'].apply(hpo_map)

In [31]:
phenopacket['gemini_flash_lite_25_accuracy'] = phenopacket.apply(lambda x:accuracy_calculator(x.normalized_hpo_ids, x.normalized_gemini_flash_lite_25_hpo_ids), axis=1)

In [32]:
phenopacket['gemini_flash_lite_25_accuracy'].mean()

np.float64(0.005516715510062375)

In [36]:
phenopacket[phenopacket['normalized_gemini_flash_lite_25_hpo_ids'] =='-'].shape[0]

7858