In [1]:
import sys
#sys.path.append("/mnt/AI/wgs-database/phenotype_extractors/PhenoBERT/phenobert/utils")
from api import *

In [3]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /config/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# 方案1: 批次處理 + 進度條
import pandas as pd
from tqdm.notebook import tqdm
from io import StringIO
import numpy as np
from concurrent.futures import ThreadPoolExecutor
import time


# 啟用 tqdm 對 pandas 的支持
tqdm.pandas()

def phenobert_predict_optimized(description):
    """優化後的單筆預測函數"""
    try:
        data_string = annotate_text(description)
        if not data_string.strip():  # 如果沒有結果
            return ""
        
        df = pd.read_csv(StringIO(data_string), sep='\t', header=None, 
                        names=['start', 'end', 'description', 'hpo_id', 'score', 'note'])
        df = df.sort_values(by='description')
        return ','.join(list(df.hpo_id))
    except Exception as e:
        print(f"Error processing description: {e}")
        return ""

def batch_phenobert_predict(descriptions, batch_size=10, max_workers=10):
    """
    批次處理版本 - 降低模型重新載入的開銷
    """
    results = []
    
    # 分批處理
    for i in tqdm(range(0, len(descriptions), batch_size), desc="Processing batches"):
        batch = descriptions[i:i+batch_size]
        batch_results = []
        
        # 對每個批次進行處理
        for desc in tqdm(batch, desc=f"Batch {i//batch_size + 1}", leave=False):
            result = phenobert_predict_optimized(desc)
            batch_results.append(result)
        
        results.extend(batch_results)
        
        # 每個批次後短暫休息，避免過熱
        time.sleep(0.1)
    
    return results

def parallel_phenobert_predict(descriptions, max_workers=3):
    """
    並行處理版本 - 注意：只有在確認模型線程安全時才使用
    """
    results = [None] * len(descriptions)
    
    def process_single(args):
        idx, desc = args
        try:
            result = phenobert_predict_optimized(desc)
            return idx, result
        except Exception as e:
            print(f"Error processing index {idx}: {e}")
            return idx, ""
    
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = [executor.submit(process_single, (i, desc)) 
                  for i, desc in enumerate(descriptions)]
        
        for future in tqdm(futures, desc="Processing in parallel"):
            idx, result = future.result()
            results[idx] = result
    
    return results

In [3]:
df_hpo_mapping = pd.read_csv('../../reference/2025-08-01_orphanet_WGS_database(HPO_ID_Mapping_v20250506).csv', sep=',')

In [6]:
hpo_mapping_dict = {}
for idx in df_hpo_mapping.index:
    hpo_mapping_dict[df_hpo_mapping.input_hpo_id[idx]] = df_hpo_mapping.mapped_main_id[idx]

In [19]:
biolarkgsc_copd_phenochf = pd.read_excel('../../reference/biolarkgsc_copd_phenochf.xlsx',engine='openpyxl')

In [7]:
def hpo_map(hpo_id_list_str):
    if hpo_id_list_str =='-':
        return '-'
    hpo_id_list = hpo_id_list_str.split(',')
    normalized_hpo_id_list = []
    for hpo_id in hpo_id_list:
        if hpo_id not in normalized_hpo_id_list:
            normalized_hpo_id_list.append(hpo_mapping_dict.get(hpo_id,'-'))
    return ','.join(normalized_hpo_id_list)

def accuracy_calculator(true_hpo_str, pred_hpo_str):
    true_hpo_list = true_hpo_str.split(',')
    pred_hpo_list = pred_hpo_str.split(',')
    mapped_hpo_list = [hpo for hpo in pred_hpo_list if hpo in true_hpo_list]
    return len(mapped_hpo_list)/len(true_hpo_list)

In [21]:
biolarkgsc_copd_phenochf['normalized_hpo_ids'] = biolarkgsc_copd_phenochf['hpo_ids'].apply(hpo_map)

In [40]:
biolarkgsc_copd_phenochf['phenobert_hpo_ids'] = parallel_phenobert_predict(list(biolarkgsc_copd_phenochf['clinical_summary']))

Processing in parallel:   0%|          | 0/2138 [00:00<?, ?it/s]

Error processing description: 'WordNetCorpusReader' object has no attribute '_LazyCorpusLoader__args'Error processing description: 'WordNetCorpusReader' object has no attribute '_LazyCorpusLoader__args'

loading vocabulary file /mnt/AI/wgs-database/phenotype_extractors/PhenoBERT/phenobert/embeddings/biobert_v1.1_pubmed/vocab.txt
Load pre-trained BERT parameters from file /mnt/AI/wgs-database/phenotype_extractors/PhenoBERT/phenobert/embeddings/biobert_v1.1_pubmed/pytorch_model.bin.
Error processing description: sequence item 0: expected str instance, float found
Error processing description: sequence item 0: expected str instance, float found
Error processing description: sequence item 0: expected str instance, float found
Error processing description: sequence item 0: expected str instance, float found
Error processing description: sequence item 0: expected str instance, float found
Error processing description: sequence item 0: expected str instance, float found
Error processing descr

In [41]:
biolarkgsc_copd_phenochf['phenobert_hpo_ids'] = biolarkgsc_copd_phenochf['phenobert_hpo_ids'].fillna('-')

In [42]:
biolarkgsc_copd_phenochf['normalized_phenobert_hpo_ids'] = biolarkgsc_copd_phenochf['phenobert_hpo_ids'].apply(hpo_map)

In [43]:
biolarkgsc_copd_phenochf['phenobert_accuracy'] = biolarkgsc_copd_phenochf.apply(lambda x:accuracy_calculator(x.normalized_hpo_ids, x.normalized_phenobert_hpo_ids), axis=1)

In [44]:
biolarkgsc_copd_phenochf['phenobert_accuracy'].mean()

0.5809218329230357

In [45]:
biolarkgsc_copd_phenochf.shape[0]

2138

In [71]:
len(biolarkgsc_copd_phenochf.id.unique())

2138

In [74]:
biolarkgsc_copd_phenochf.head()

Unnamed: 0,id,clinical_summary,labels,hpo_ids,dataset,normalized_hpo_ids,phenobert_hpo_ids,normalized_phenobert_hpo_ids,phenobert_accuracy,normalized_pheno_tagger_hpo_ids,pheno_tagger_accuracy
0,1003450,A syndrome of brachydactyly (absence of some m...,HP_0001156;HP_0009881;HP_0001798;HP_0001792;HP...,"HP:0001156,HP:0009881,HP:0001798,HP:0001792,HP...",biolarkgsc,"HP:0001156,HP:0009881,HP:0001798,HP:0001792,HP...",,-,0.0,"HP:0001156,HP:0008386,HP:0100266,HP:0001363,HP...",0.3
1,10051003,Townes-Brocks syndrome (TBS) is an autosomal d...,HP_0000006;HP_0000006;HP_0000006;HP_0003828;HP...,"HP:0000006,HP:0000006,HP:0000006,HP:0003828,HP...",biolarkgsc,"HP:0000006,HP:0003828,HP:0003812,HP:0000356,HP...","HP:0000356,HP:0000365,HP:0002023,HP:0001249,HP...","HP:0000356,HP:0000365,HP:0002023,HP:0001249,HP...",0.5,"HP:0000356,HP:0000365,HP:0001177,HP:0002023,HP...",0.5
2,10066029,Nevoid basal cell carcinoma syndrome (NBCCS) i...,HP_0002671;HP_0000006;HP_0000006;HP_0000006;HP...,"HP:0002671,HP:0000006,HP:0000006,HP:0000006,HP...",biolarkgsc,"HP:0002671,HP:0000006,HP:0003828,HP:0003812,HP...",,-,0.0,"HP:0002671,HP:0010603,HP:0010612,HP:0000924,HP...",0.545455
3,10196695,Angelman syndrome (AS) is a neurodevelopmental...,HP_0000707;HP_0001466,"HP:0000707,HP:0001466",biolarkgsc,"HP:0000707,HP:0001466",HP:0012759,HP:0012759,0.0,HP:0012759,0.0
4,10417280,Prader-Willi syndrome (PWS) and Angelman syndr...,HP_0000708;HP_0003745,"HP:0000708,HP:0003745",biolarkgsc,"HP:0000708,HP:0003745",HP:0000708,HP:0000708,0.5,HP:0000708,0.5


In [1]:
# biolarkgsc_copd_phenochf_phenotagger = pd.read_excel('../../reference/biolarkgsc_copd_phenochf_phenotagger.xlsx',engine='openpyxl')
biolarkgsc_phenotagger = pd.read_excel('../../reference/biolarkgsc_splited_phenotagger.xlsx',engine='openpyxl')

NameError: name 'pd' is not defined

In [76]:
biolarkgsc_copd_phenochf_phenotagger.head(1)

Unnamed: 0,PMID,title,summary,hpo_terms,hpo_ids
0,1003450,A syndrome of brachydactyly (absence of some m...,A syndrome of brachydactyly (absence of some m...,brachydactyly (HP:0001156); aplastic or hypopl...,"HP:0001156,HP:0008386,HP:0100266,HP:0001363,HP..."


In [77]:
biolarkgsc_copd_phenochf_phenotagger['pheno_tagger_hpo_ids'] = biolarkgsc_copd_phenochf_phenotagger['hpo_ids'].fillna('-')
biolarkgsc_copd_phenochf_phenotagger['clinical_summary'] = biolarkgsc_copd_phenochf_phenotagger['summary']
biolarkgsc_copd_phenochf_phenotagger['id'] = biolarkgsc_copd_phenochf_phenotagger['PMID']
#del biolarkgsc_copd_phenochf['pheno_tagger_hpo_ids']

In [78]:
# GSC_database_phenotagger['file_name'] = GSC_database_phenotagger['PMID']
# GSC_database_phenotagger['pheno_tagger_hpo_ids'] = GSC_database_phenotagger['hpo_ids']
biolarkgsc_copd_phenochf_merged = biolarkgsc_copd_phenochf.merge(biolarkgsc_copd_phenochf_phenotagger[['id','pheno_tagger_hpo_ids']], on=['id'], how='left')

In [79]:
biolarkgsc_copd_phenochf_merged['pheno_tagger_hpo_ids'] = biolarkgsc_copd_phenochf_merged['pheno_tagger_hpo_ids'].fillna('-')
biolarkgsc_copd_phenochf_merged['normalized_pheno_tagger_hpo_ids'] = biolarkgsc_copd_phenochf_merged['pheno_tagger_hpo_ids'].apply(hpo_map)

In [80]:
biolarkgsc_copd_phenochf_merged['pheno_tagger_accuracy'] = biolarkgsc_copd_phenochf_merged.apply(lambda x:accuracy_calculator(x.normalized_hpo_ids, x.normalized_pheno_tagger_hpo_ids), axis=1)

In [81]:
biolarkgsc_copd_phenochf_merged['pheno_tagger_accuracy'].mean()

0.17531242710307943

In [84]:
biolarkgsc_copd_phenochf_merged.dataset.unique()

array(['biolarkgsc', 'copd', 'phenochf'], dtype=object)

In [85]:
biolarkgsc_copd_phenochf_merged.head()
['phenobert_accuracy','pheno_tagger_accuracy']

Unnamed: 0,id,clinical_summary,labels,hpo_ids,dataset,normalized_hpo_ids,phenobert_hpo_ids,normalized_phenobert_hpo_ids,phenobert_accuracy,normalized_pheno_tagger_hpo_ids,pheno_tagger_accuracy,pheno_tagger_hpo_ids
0,1003450,A syndrome of brachydactyly (absence of some m...,HP_0001156;HP_0009881;HP_0001798;HP_0001792;HP...,"HP:0001156,HP:0009881,HP:0001798,HP:0001792,HP...",biolarkgsc,"HP:0001156,HP:0009881,HP:0001798,HP:0001792,HP...",,-,0.0,"HP:0001156,HP:0008386,HP:0100266,HP:0001363,HP...",0.3,"HP:0001156,HP:0008386,HP:0100266,HP:0001363,HP..."
1,10051003,Townes-Brocks syndrome (TBS) is an autosomal d...,HP_0000006;HP_0000006;HP_0000006;HP_0003828;HP...,"HP:0000006,HP:0000006,HP:0000006,HP:0003828,HP...",biolarkgsc,"HP:0000006,HP:0003828,HP:0003812,HP:0000356,HP...","HP:0000356,HP:0000365,HP:0002023,HP:0001249,HP...","HP:0000356,HP:0000365,HP:0002023,HP:0001249,HP...",0.5,"HP:0000356,HP:0000365,HP:0001177,HP:0002023,HP...",0.5,"HP:0000356,HP:0000365,HP:0001177,HP:0002023,HP..."
2,10066029,Nevoid basal cell carcinoma syndrome (NBCCS) i...,HP_0002671;HP_0000006;HP_0000006;HP_0000006;HP...,"HP:0002671,HP:0000006,HP:0000006,HP:0000006,HP...",biolarkgsc,"HP:0002671,HP:0000006,HP:0003828,HP:0003812,HP...",,-,0.0,"HP:0002671,HP:0010603,HP:0010612,HP:0000924,HP...",0.545455,"HP:0002671,HP:0010603,HP:0010612,HP:0000924,HP..."
3,10196695,Angelman syndrome (AS) is a neurodevelopmental...,HP_0000707;HP_0001466,"HP:0000707,HP:0001466",biolarkgsc,"HP:0000707,HP:0001466",HP:0012759,HP:0012759,0.0,HP:0012759,0.0,HP:0012759
4,10417280,Prader-Willi syndrome (PWS) and Angelman syndr...,HP_0000708;HP_0003745,"HP:0000708,HP:0003745",biolarkgsc,"HP:0000708,HP:0003745",HP:0000708,HP:0000708,0.5,HP:0000708,0.5,HP:0000708


In [89]:
biolarkgsc_copd_phenochf_merged.clinical_summary[0]

'A syndrome of brachydactyly (absence of some middle or distal phalanges), aplastic or hypoplastic nails, symphalangism (ankylois of proximal interphalangeal joints), synostosis of some carpal and tarsal bones, craniosynostosis, and dysplastic hip joints is reported in five members of an Italian family. It may represent a previously undescribed autosomal dominant trait.'

In [88]:
biolarkgsc_copd_phenochf_merged.clinical_summary = biolarkgsc_copd_phenochf_merged.clinical_summary.fillna('-')
biolarkgsc_copd_phenochf_merged['length'] = biolarkgsc_copd_phenochf_merged.clinical_summary.apply(lambda x:len(x))

for dataset in biolarkgsc_copd_phenochf_merged.dataset.unique():
    print(f'{dataset} length mean:')
    print(biolarkgsc_copd_phenochf_merged[biolarkgsc_copd_phenochf_merged.dataset==dataset]['length'].mean())


biolarkgsc length mean:
998.640350877193
copd length mean:
821.9025
phenochf length mean:
8621.667741935484


In [None]:
# for model_score in ['phenobert_accuracy','pheno_tagger_accuracy']:
#     print(f'-----{model_score}-----')
for dataset in biolarkgsc_copd_phenochf_merged.dataset.unique():
    print(f'{dataset} text length mean:')
    print(biolarkgsc_copd_phenochf_merged[biolarkgsc_copd_phenochf_merged.dataset==dataset].mean())


In [33]:
# import matplotlib.pyplot as plt
# import numpy as np
# import pandas as pd
# import seaborn as sns

# # 設定中文字型（如果需要顯示中文）
# plt.rcParams['font.sans-serif'] = ['DejaVu Sans', 'SimHei', 'Arial']
# plt.rcParams['axes.unicode_minus'] = False

# def plot_phenobert_accuracy_distribution(df_phenopacket, column_name='phenobert_accuracy'):
#     """
#     繪製 PhenoBERT Accuracy 分數分布圖
    
#     Parameters:
#     df_phenopacket: DataFrame containing phenobert_accuracy column
#     """
    
#     # 建立分組區間 (0.0-0.1, 0.1-0.2, ..., 0.9-1.0)
#     bins = np.arange(0, 1.1, 0.1)  # [0, 0.1, 0.2, ..., 1.0]
    
#     # 創建圖表
#     plt.figure(figsize=(10, 6))
    
#     # 直方圖
#     n, bins_edges, patches = plt.hist(df_phenopacket[column_name], 
#                                      bins=bins, 
#                                      edgecolor='black', 
#                                      alpha=0.7,
#                                      color='skyblue')
    
#     plt.xlabel(f'{column_name} Score', fontsize=12)
#     plt.ylabel('Frequency', fontsize=12)
#     plt.title(f'{column_name} Distribution', fontsize=14, fontweight='bold')
#     plt.xticks(bins, [f'{i:.1f}' for i in bins])
#     plt.grid(True, alpha=0.3)
    
#     # 在每個bar上顯示數量
#     for i, count in enumerate(n):
#         if count > 0:
#             plt.text(bins_edges[i] + 0.05, count + max(n)*0.01, 
#                     f'{int(count)}', ha='center', va='bottom', fontsize=10)
    
#     plt.tight_layout()
#     plt.show()
    
#     # 顯示統計資訊
#     print("=== PhenoBERT Recall 統計資訊 ===")
#     print(f"總樣本數: {len(df_phenopacket[column_name])}")
#     print(f"平均值: {df_phenopacket[column_name].mean():.4f}")
#     print(f"中位數: {df_phenopacket[column_name].median():.4f}")
#     print(f"標準差: {df_phenopacket[column_name].std():.4f}")
#     print(f"最小值: {df_phenopacket[column_name].min():.4f}")
#     print(f"最大值: {df_phenopacket[column_name].max():.4f}")
    
#     # 顯示各區間的詳細統計
#     print("\n=== 各分數區間統計 ===")
#     bin_labels = [f'{bins[i]:.1f}-{bins[i+1]:.1f}' for i in range(len(bins)-1)]
#     counts, _ = np.histogram(df_phenopacket[column_name], bins=bins)
#     percentages = (counts / len(df_phenopacket[column_name])) * 100
    
#     for label, count, pct in zip(bin_labels, counts, percentages):
#         print(f"{label}: {count:4d} 樣本 ({pct:5.1f}%)")

# def plot_simple_histogram(df_phenopacket):
#     """
#     簡單版本的分布圖
#     """
#     bins = np.arange(0, 1.1, 0.1)
    
#     plt.figure(figsize=(10, 6))
#     plt.hist(df_phenopacket[column_name], 
#              bins=bins, 
#              edgecolor='black', 
#              alpha=0.7,
#              color='steelblue')
    
#     plt.xlabel(f'{column_name} Score', fontsize=12)
#     plt.ylabel('Frequency', fontsize=12)
#     plt.title(f'{column_name} Score Distribution', fontsize=14, fontweight='bold')
#     plt.xticks(bins, [f'{i:.1f}' for i in bins])
#     plt.grid(True, alpha=0.3)
#     plt.tight_layout()
#     plt.show()

# # 使用範例
# # plot_phenobert_accuracy_distribution(df_phenopacket)

# print("執行 plot_phenobert_accuracy_distribution(df_phenopacket) 來繪製分布圖")

In [34]:
# phenopacket = pd.read_excel('../../reference/20250811_phenopacket_with_demographics_summary_phenobert_phenotagger_result.xlsx',engine='openpyxl')

In [35]:
# phenopacket['phenobert_recall'] = phenopacket['phenobert_accuracy']
# phenopacket['pheno_tagger_recall'] = phenopacket['pheno_tagger_accuracy']

In [36]:
# GSC_database.head()

In [37]:
# plot_phenobert_accuracy_distribution(phenopacket, 'pheno_tagger_recall')

In [38]:
# plot_phenobert_accuracy_distribution(GSC_database_merged, column_name='pheno_tagger_accuracy')

In [39]:
# 