In [58]:
import pandas as pd
import nltk
from nltk.tokenize import sent_tokenize

# 下載NLTK模型
try:
    nltk.data.find('tokenizers/punkt')
except LookupError:
    nltk.download('punkt')

def text_to_sentence_pubtator(df, id_col='index', text_col='clinical_summary'):
    """
    將文本按句子拆分轉換為PubTator格式
    每個句子一個條目，使用原始ID
    """
    lines = []
    for _, row in df.iterrows():
        original_id = str(row[id_col])
        text = str(row[text_col])
        
        # 句子切割
        sentences = sent_tokenize(text)
        for sentence in sentences:
            sentence = sentence.strip()
            if sentence:
                lines.append(f"{original_id}|t|{sentence}")
                lines.append(f"{original_id}|a|{sentence}")
                lines.append("")  # 空行
    
    return "\n".join(lines)

def save_pubtator_format(df: pd.DataFrame, output_file: str = "output_pubtator.txt"):
    """
    將DataFrame轉換為PubTator格式並儲存到檔案
    
    Parameters:
    df (pd.DataFrame): 包含case_id和clinical_summary欄位的DataFrame
    output_file (str): 輸出檔案名稱
    """
    pubtator_text = text_to_sentence_pubtator(df)
    
    with open(output_file, 'w', encoding='utf-8') as f:
        f.write(pubtator_text)
    
    print(f"PubTator格式文件已儲存至: {output_file}")
    return pubtator_text

def read_pubtator_file(file_path):
    """
    從檔案讀取PubTator格式文本
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        return f.read()

def merge_hpo_by_id(pubtator_text):
    """
    讀取PubTator，將相同ID的HPO註釋合併
    """
    lines = pubtator_text.strip().split('\n')
    data = {}
    
    for line in lines:
        line = line.strip()
        if not line:
            continue
            
        # 解析註釋行
        if '\t' in line and len(line.split('\t')) >= 6:
            parts = line.split('\t')
            id_val = parts[0]
            text = parts[3]
            entity_type = parts[4]
            hpo_id = parts[5]
            
            if entity_type == 'Phenotype' and hpo_id.startswith('HP:'):
                if id_val not in data:
                    data[id_val] = {'hpo_terms': [], 'hpo_ids': []}
                
                if text not in data[id_val]['hpo_terms']:
                    data[id_val]['hpo_terms'].append(f"{text} ({hpo_id})")
                if hpo_id not in data[id_val]['hpo_ids']:
                    data[id_val]['hpo_ids'].append(hpo_id)
    
    # 轉換為DataFrame
    result = []
    for id_val, info in data.items():
        result.append({
            'index': id_val,
            'hpo_terms': ';'.join(info['hpo_terms']),
            'hpo_ids': ','.join(info['hpo_ids'])
        })
    
    return pd.DataFrame(result)


def read_and_merge_pubtator(file_path):
    """
    讀取PubTator檔案並合併相同ID的HPO註釋
    """
    pubtator_text = read_pubtator_file(file_path)
    return merge_hpo_by_id(pubtator_text)

# # 使用範例
# if __name__ == "__main__":
#     # 範例1: 文本轉句子PubTator
#     df = pd.DataFrame({
#         'case_id': ['case1', 'case2'],
#         'clinical_summary': [
#             'Patient has fever. Also shows rash.',
#             'Cardiac defect noted. Growth delay observed.'
#         ]
#     })
    
#     pubtator = text_to_sentence_pubtator(df)
#     print("=== 句子PubTator格式 ===")
#     print(pubtator)
    
#     # 範例2: 模擬HPO註釋後的PubTator
#     annotated_pubtator = """case1|t|Patient has fever.
# case1|a|Patient has fever.
# case1	0	5	fever	Phenotype	HP:0001945	0.99

# case1|t|Also shows rash.
# case1|a|Also shows rash.
# case1	11	15	rash	Phenotype	HP:0000988	0.95

# case2|t|Cardiac defect noted.
# case2|a|Cardiac defect noted.
# case2	0	14	Cardiac defect	Phenotype	HP:0001627	0.98"""
    
    # 範例3: 讀取檔案並合併HPO
    # merged_df = read_and_merge_pubtator('annotated_output.txt')
    # print("\n=== 從檔案讀取並合併 ===")
    # print(merged_df)

In [47]:
biolarkgsc_copd_phenochf = pd.read_excel('../PhenoBERT/reference/biolarkgsc_copd_phenochf.xlsx',engine='openpyxl')
biolarkgsc_copd_phenochf['index'] = list(range(0, biolarkgsc_copd_phenochf.shape[0]))
biolarkgsc_copd_phenochf.to_excel('../PhenoBERT/reference/biolarkgsc_copd_phenochf.xlsx',index=False)
save_pubtator_format(biolarkgsc_copd_phenochf[biolarkgsc_copd_phenochf.dataset=='biolarkgsc'], './input/biolarkgsc.PubTator')
#save_pubtator_format(biolarkgsc_copd_phenochf, './input/biolarkgsc_copd_phenochf_splited.PubTator')

PubTator格式文件已儲存至: ./input/biolarkgsc.PubTator


'0|t|A syndrome of brachydactyly (absence of some middle or distal phalanges), aplastic or hypoplastic nails, symphalangism (ankylois of proximal interphalangeal joints), synostosis of some carpal and tarsal bones, craniosynostosis, and dysplastic hip joints is reported in five members of an Italian family.\n0|a|A syndrome of brachydactyly (absence of some middle or distal phalanges), aplastic or hypoplastic nails, symphalangism (ankylois of proximal interphalangeal joints), synostosis of some carpal and tarsal bones, craniosynostosis, and dysplastic hip joints is reported in five members of an Italian family.\n\n0|t|It may represent a previously undescribed autosomal dominant trait.\n0|a|It may represent a previously undescribed autosomal dominant trait.\n\n1|t|Townes-Brocks syndrome (TBS) is an autosomal dominant disorder with multiple malformations and variable expression.\n1|a|Townes-Brocks syndrome (TBS) is an autosomal dominant disorder with multiple malformations and variable ex

In [37]:
len(biolarkgsc_copd_phenochf.id.unique())

2138

In [38]:
biolarkgsc_copd_phenochf.shape[0]

2138

In [None]:
# save_pubtator_format(biolarkgsc_copd_phenochf[biolarkgsc_copd_phenochf.dataset=='biolarkgsc'], './input/biolarkgsc.PubTator')

PubTator格式文件已儲存至: ./input/biolarkgsc.PubTator


'1003450|t|A syndrome of brachydactyly (absence of some middle or distal phalanges), aplastic or hypoplastic nails, symphalangism (ankylois of proximal interphalangeal joints), synostosis of some carpal and tarsal bones, craniosynostosis, and dysplastic hip joints is reported in five members of an Italian family.\n1003450|a|A syndrome of brachydactyly (absence of some middle or distal phalanges), aplastic or hypoplastic nails, symphalangism (ankylois of proximal interphalangeal joints), synostosis of some carpal and tarsal bones, craniosynostosis, and dysplastic hip joints is reported in five members of an Italian family.\n\n1003450|t|It may represent a previously undescribed autosomal dominant trait.\n1003450|a|It may represent a previously undescribed autosomal dominant trait.\n\n10051003|t|Townes-Brocks syndrome (TBS) is an autosomal dominant disorder with multiple malformations and variable expression.\n10051003|a|Townes-Brocks syndrome (TBS) is an autosomal dominant disorder with 

In [59]:
output_df = read_and_merge_pubtator('output/biolarkgsc.PubTator')

In [60]:
output_df

Unnamed: 0,index,hpo_terms,hpo_ids
0,0,brachydactyly (HP:0001156);aplastic or hypopla...,"HP:0001156,HP:0008386,HP:0001792,HP:0100266,HP..."
1,1,ear anomalies (HP:0000356);hearing loss (HP:00...,"HP:0000356,HP:0000365,HP:0100258,HP:0001177,HP..."
2,2,basal cell carcinoma (HP:0002671);carcinoma (H...,"HP:0002671,HP:0030731,HP:0010603,HP:0010612,HP..."
3,3,neurodevelopmental disorder (HP:0012759);neuro...,HP:0012759
4,4,neurobehavioral disorders (HP:0000708);neurobe...,HP:0000708
...,...,...,...
211,222,acoustic neuromas (HP:0009588);neuromas (HP:00...,"HP:0009588,HP:0030430,HP:0100008,HP:0002664,HP..."
212,223,overgrowth (HP:0001548);overgrowth (HP:0001548...,"HP:0001548,HP:0002591,HP:0001513"
213,224,unilateral vestibular schwannomas (HP:0009590)...,"HP:0009590,HP:0009588,HP:0100008,HP:0001067,HP..."
214,225,basal cell carcinoma (HP:0002671);carcinoma (H...,"HP:0002671,HP:0030731,HP:0002664,HP:0009824,HP..."


In [26]:
output_df.to_excel('../PhenoBERT/reference/20250818_biolarkgsc_splited_phenotagger.xlsx',index=False)

In [61]:
output_df.head(1)

Unnamed: 0,index,hpo_terms,hpo_ids
0,0,brachydactyly (HP:0001156);aplastic or hypopla...,"HP:0001156,HP:0008386,HP:0001792,HP:0100266,HP..."


In [62]:
output_df.hpo_terms[0]

'brachydactyly (HP:0001156);aplastic or hypoplastic nails (HP:0008386);hypoplastic nails (HP:0001792);carpal and tarsal bones (HP:0100266);tarsal bones (HP:0041247);craniosynostosis (HP:0001363);dysplastic hip (HP:0001385);brachydactyly (HP:0001156);aplastic or hypoplastic nails (HP:0008386);hypoplastic nails (HP:0001792);carpal and tarsal bones (HP:0100266);tarsal bones (HP:0041247);craniosynostosis (HP:0001363);dysplastic hip (HP:0001385)'

In [50]:
df_hpo_mapping = pd.read_csv('../PhenoBERT/reference/2025-08-01_orphanet_WGS_database(HPO_ID_Mapping_v20250506).csv', sep=',')

In [51]:
hpo_mapping_dict = {}
for idx in df_hpo_mapping.index:
    hpo_mapping_dict[df_hpo_mapping.input_hpo_id[idx]] = df_hpo_mapping.mapped_main_id[idx]

In [52]:
biolarkgsc_copd_phenochf = pd.read_excel('../PhenoBERT/reference/biolarkgsc_copd_phenochf.xlsx',engine='openpyxl')

In [53]:
def hpo_map(hpo_id_list_str):
    if hpo_id_list_str =='-':
        return '-'
    hpo_id_list = hpo_id_list_str.split(',')
    normalized_hpo_id_list = []
    for hpo_id in hpo_id_list:
        if hpo_id not in normalized_hpo_id_list:
            normalized_hpo_id_list.append(hpo_mapping_dict.get(hpo_id,'-'))
    return ','.join(normalized_hpo_id_list)

def accuracy_calculator(true_hpo_str, pred_hpo_str):
    true_hpo_list = true_hpo_str.split(',')
    pred_hpo_list = pred_hpo_str.split(',')
    mapped_hpo_list = [hpo for hpo in pred_hpo_list if hpo in true_hpo_list]
    return len(mapped_hpo_list)/len(true_hpo_list)

In [54]:
biolarkgsc_copd_phenochf['normalized_hpo_ids'] = biolarkgsc_copd_phenochf['hpo_ids'].apply(hpo_map)

In [55]:
biolarkgsc_copd_phenochf[biolarkgsc_copd_phenochf['normalized_hpo_ids'].isna()]

Unnamed: 0,id,clinical_summary,labels,hpo_ids,dataset,index,normalized_hpo_ids


In [63]:
# output_df.head()
output_df['pheno_tagger_terms'] = output_df['hpo_terms']
output_df['pheno_tagger_hpo_ids'] = output_df['hpo_ids'].fillna('-')
output_df['index'] = output_df['index'].apply(lambda x:int(x))
# biolarkgsc_tagger.id = biolarkgsc_tagger.id.astype(str)
biolarkgsc_tagger = biolarkgsc_copd_phenochf.merge(output_df[['index','pheno_tagger_hpo_ids']], on='index', how='right')

In [64]:
biolarkgsc_tagger

Unnamed: 0,id,clinical_summary,labels,hpo_ids,dataset,index,normalized_hpo_ids,pheno_tagger_hpo_ids
0,1003450,A syndrome of brachydactyly (absence of some m...,HP_0001156;HP_0009881;HP_0001798;HP_0001792;HP...,"HP:0001156,HP:0009881,HP:0001798,HP:0001792,HP...",biolarkgsc,0,"HP:0001156,HP:0009881,HP:0001798,HP:0001792,HP...","HP:0001156,HP:0008386,HP:0001792,HP:0100266,HP..."
1,10051003,Townes-Brocks syndrome (TBS) is an autosomal d...,HP_0000006;HP_0000006;HP_0000006;HP_0003828;HP...,"HP:0000006,HP:0000006,HP:0000006,HP:0003828,HP...",biolarkgsc,1,"HP:0000006,HP:0003828,HP:0003812,HP:0000356,HP...","HP:0000356,HP:0000365,HP:0100258,HP:0001177,HP..."
2,10066029,Nevoid basal cell carcinoma syndrome (NBCCS) i...,HP_0002671;HP_0000006;HP_0000006;HP_0000006;HP...,"HP:0002671,HP:0000006,HP:0000006,HP:0000006,HP...",biolarkgsc,2,"HP:0002671,HP:0000006,HP:0003828,HP:0003812,HP...","HP:0002671,HP:0030731,HP:0010603,HP:0010612,HP..."
3,10196695,Angelman syndrome (AS) is a neurodevelopmental...,HP_0000707;HP_0001466,"HP:0000707,HP:0001466",biolarkgsc,3,"HP:0000707,HP:0001466",HP:0012759
4,10417280,Prader-Willi syndrome (PWS) and Angelman syndr...,HP_0000708;HP_0003745,"HP:0000708,HP:0003745",biolarkgsc,4,"HP:0000708,HP:0003745",HP:0000708
...,...,...,...,...,...,...,...,...
211,9811917,Stereotactic radiosurgery is the principal alt...,HP_0009588;HP_0009588;HP_0100008;HP_0002664;HP...,"HP:0009588,HP:0009588,HP:0100008,HP:0002664,HP...",biolarkgsc,222,"HP:0009588,HP:0100008,HP:0002664,HP:0000707","HP:0009588,HP:0030430,HP:0100008,HP:0002664,HP..."
212,9831341,We had previously described a patient with an ...,HP_0001548;HP_0002591;HP_0001513;HP_0001548,"HP:0001548,HP:0002591,HP:0001513,HP:0001548",biolarkgsc,223,"HP:0001548,HP:0002591,HP:0001513","HP:0001548,HP:0002591,HP:0001513"
213,9863591,Patients who present with unilateral vestibula...,HP_0009590;HP_0009588;HP_0100008;HP_0003593;HP...,"HP:0009590,HP:0009588,HP:0100008,HP:0003593,HP...",biolarkgsc,224,"HP:0009590,HP:0009588,HP:0100008,HP:0003593,HP...","HP:0009590,HP:0009588,HP:0100008,HP:0001067,HP..."
214,9931336,Mutations of the human Patched gene ( PTCH ) h...,HP_0002671;HP_0003745;HP_0002671;HP_0002885;HP...,"HP:0002671,HP:0003745,HP:0002671,HP:0002885,HP...",biolarkgsc,225,"HP:0002671,HP:0003745,HP:0002885,HP:0002664","HP:0002671,HP:0030731,HP:0002664,HP:0009824,HP..."


In [19]:
print(type(biolarkgsc_copd_phenochf.id[0]))
print(type(output_df.id[0]))

<class 'int'>
<class 'str'>


In [65]:
biolarkgsc_tagger['normalized_pheno_tagger_hpo_ids'] = biolarkgsc_tagger['pheno_tagger_hpo_ids'].apply(hpo_map)

In [66]:
biolarkgsc_tagger['pheno_tagger_accuracy'] = biolarkgsc_tagger.apply(lambda x:accuracy_calculator(x.normalized_hpo_ids, x.normalized_pheno_tagger_hpo_ids), axis=1)

In [67]:
biolarkgsc_tagger['pheno_tagger_accuracy'].mean()

0.5954651972414159