In [1]:
from tqdm import tqdm
import json
import sys
import os
os.environ['MPLBACKEND'] = 'Agg'

import matplotlib
matplotlib.use('Agg')  

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

In [19]:
set_name = "train" # dev test

In [20]:
original_note = []
plain = []
# plain_remove_cxr = []
risk_factor = []
timeline = []

file_path = f"/ssd1/chanhwi/long-clinical-doc/dataset/{set_name}_summarization/total_output.jsonl"
with open(file_path, "r", encoding="utf-8") as f:
    for line in tqdm(f, desc="Reading file"):
        line = line.strip()

        if not line:
            continue
        
        data = json.loads(line)
        original_note.append(data['original_note'])
        plain.append(data['plain'])
        risk_factor.append(data['risk_factor'])
        timeline.append(data['timeline'])

Reading file: 1360it [00:00, 8543.47it/s]


In [21]:
original_note_tokens = [item[1] for item in original_note]
plain_tokens = [item[1] for item in plain]
risk_factor_tokens = [item[1] for item in risk_factor]
timeline_tokens = [item[1] for item in timeline]

final = {"original":original_note_tokens, 
         "plain":plain_tokens, 
         "risk":risk_factor_tokens,
         "timeline":timeline_tokens}

In [22]:
with open(f"/ssd1/chanhwi/long-clinical-doc/dataset/{set_name}_token_num.jsonl", "w") as f:
    json.dump(final, f)

In [4]:
def create_token_count_boxplot(original_note, plain, risk_factor, timeline):
    # 각 리스트에서 토큰 수 추출
    original_note_tokens = [item[1] for item in original_note]
    plain_tokens = [item[1] for item in plain]
    risk_factor_tokens = [item[1] for item in risk_factor]
    timeline_tokens = [item[1] for item in timeline]

    # Seaborn용 long-format 데이터프레임 생성
    data_long = []
    labels = ['Original Note', 'Plain', 'Risk Factor', 'Timeline']
    token_counts = [original_note_tokens, plain_tokens, risk_factor_tokens, timeline_tokens]

    for label, counts in zip(labels, token_counts):
        for count in counts:
            data_long.append({'Text Type': label, 'Token Count': count})

    df_long = pd.DataFrame(data_long)

    # 플롯 설정
    plt.figure(figsize=(8, 5))
    palette = sns.color_palette("deep", 4)
    ax = sns.boxplot(x='Text Type', y='Token Count', data=df_long,
                     palette=palette,
                     width=0.5,
                     showfliers=True,
                     boxprops=dict(alpha=1),
                     medianprops={"color": "darkred", "linewidth": 1})

    # 플롯 커스터마이징
    plt.title(f'Length Distribution in {set_name} set', fontsize=16, fontweight='bold', pad=20)
    plt.xlabel('Text Type', fontsize=14, fontweight='bold')
    plt.ylabel('Number of Tokens', fontsize=14, fontweight='bold')
    plt.xticks(rotation=0, ha='center')
    plt.grid(axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()

    # 플롯 저장 후 출력
    plt.savefig(f'{set_name}_token_count_boxplot.png', dpi=300)
    plt.show()
    # plt.close()

In [5]:
create_token_count_boxplot(original_note, plain, risk_factor, timeline)


Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `x` variable to `hue` and set `legend=False` for the same effect.

  ax = sns.boxplot(x='Text Type', y='Token Count', data=df_long,


In [6]:
import matplotlib.pyplot as plt
import numpy as np

# 데이터 구성: summary 별로 Baseline, + (AGE, RACE), - Radiology Info 조건의 AUROC 값
data_summary = {
    "Plain": {
        "Baseline": {"DN": 0.8392, "DN+RR": 0.8261, "DN+CXR": 0.8597},
        "+ (AGE, RACE)": {"DN": 0.8414, "DN+RR": 0.8120, "DN+CXR": 0.8657},
        "- Radiology Info": {"DN": 0.8263, "DN+RR": 0.8121, "DN+CXR": 0.8300},
    },
    "Riskfactor": {
        "Baseline": {"DN": 0.8001, "DN+RR": 0.7978, "DN+CXR": 0.8374},
        "+ (AGE, RACE)": {"DN": 0.8164, "DN+RR": 0.8032, "DN+CXR": 0.8535},
        "- Radiology Info": {"DN": 0.7659, "DN+RR": 0.7724, "DN+CXR": 0.8159},
    },
    "Timeline": {
        "Baseline": {"DN": 0.8076, "DN+RR": 0.7873, "DN+CXR": 0.8332},
        "+ (AGE, RACE)": {"DN": 0.8323, "DN+RR": 0.7998, "DN+CXR": 0.8522},
        "- Radiology Info": {"DN": 0.7675, "DN+RR": 0.7799, "DN+CXR": 0.8099},
    },
}

# 모달리티 및 summary type 리스트, 조건 리스트
modalities = ["DN", "DN+RR", "DN+CXR"]
summary_types = ["Plain", "Riskfactor", "Timeline"]
conditions = ["Baseline", "+ (AGE, RACE)", "- Radiology Info"]

# 행: summary type, 열: 모달리티로 3x3 서브플롯 생성
fig, axs = plt.subplots(nrows=len(summary_types), ncols=len(modalities), figsize=(10, 8), sharey=True)

# 각 셀에 대해 바 차트 그리기
for i, summary in enumerate(summary_types):         # 각 행은 summary type
    for j, modality in enumerate(modalities):         # 각 열은 모달리티
        ax = axs[i, j]
        # 해당 summary와 modality 조합의 조건별 AUROC 값 추출
        values = [
            data_summary[summary]["Baseline"][modality],
            data_summary[summary]["+ (AGE, RACE)"][modality],
            data_summary[summary]["- Radiology Info"][modality],
        ]
        x = np.arange(len(conditions))
        ax.bar(x, values, color='skyblue', edgecolor='black')
        ax.set_xticks(x)
        ax.set_xticklabels(conditions, rotation=45, ha='right', fontsize=9)
        ax.set_ylim(0.75, 0.9)
        
        # 첫 번째 행에서는 각 열에 모달리티 이름을 제목으로 설정
        if i == 0:
            ax.set_title(modality, fontsize=12)
        # 각 행의 첫 번째 열에 summary type 이름을 y-label로 표시 (세로 방향 중앙 정렬)
        if j == 0:
            ax.set_ylabel(summary, fontsize=12, rotation=0, labelpad=50, va='center')

# 전체 그래프 제목 및 레이아웃 조정
fig.suptitle("AUROC by Summary Type (Rows) and Modality (Columns)\n(with Baseline, + (AGE, RACE), - Radiology Info)", fontsize=16)
plt.tight_layout(rect=[0, 0, 1, 0.95])
plt.show()


In [7]:
with open("/ssd1/chanhwi/long-clinical-doc/dataset/train_summarization/total_output.jsonl") as f:
    tmp = f.readlines()

In [8]:
print(len(tmp))

6345
