In [4]:
import pandas as pd
import os
import re

def process_corpus_data(corpus_folder, ann_folder):
    # 處理 corpus 資料夾
    corpus_data = []
    for file_name in os.listdir(corpus_folder):
        # 只處理純數字檔名且為檔案
        if file_name.isdigit() and os.path.isfile(os.path.join(corpus_folder, file_name)):
            with open(os.path.join(corpus_folder, file_name), 'r', encoding='utf-8') as f:
                content = f.read().strip()
                corpus_data.append({
                    'file_name': file_name,
                    'clinical_summary': content
                })
    
    # 處理 ann 資料夾
    ann_data = []
    for file_name in os.listdir(ann_folder):
        # 只處理純數字檔名且為檔案
        if file_name.isdigit() and os.path.isfile(os.path.join(ann_folder, file_name)):
            with open(os.path.join(ann_folder, file_name), 'r', encoding='utf-8') as f:
                content = f.read().strip()
                
                # 提取HPO條目
                hpo_pattern = r'\[.*?\]\s+(HP_\d+)\s+\|\s+(.+)'
                matches = re.findall(hpo_pattern, content)
                
                if matches:
                    hpo_terms = []
                    hpo_ids = []
                    
                    for hpo_id, term in matches:
                        hpo_terms.append(f"{term} ({hpo_id.replace('_', ':')})")
                        hpo_ids.append(hpo_id.replace('_', ':'))
                    
                    ann_data.append({
                        'file_name': file_name,
                        'hpo_terms': ';'.join(hpo_terms),
                        'hpo_ids': ','.join(hpo_ids)
                    })
    
    # 建立DataFrame並合併
    df_corpus = pd.DataFrame(corpus_data)
    df_ann = pd.DataFrame(ann_data)
    
    # 合併兩個DataFrame
    result_df = pd.merge(df_corpus, df_ann, on='file_name', how='outer')
    
    return result_df

# 使用方法
# df = process_corpus_data('corpus資料夾路徑', 'ann資料夾路徑')
# print(df.head())
# df.to_csv('output.csv', index=False)

In [5]:
corpus_path = './phenobert/data/GSC/corpus/'
anno_path = './phenobert/data/GSC/ann/'
df = process_corpus_data(corpus_path, anno_path)

In [1]:
df.to_excel('./reference/GSC_database.xlsx', index=False)

NameError: name 'df' is not defined