In [None]:
import pandas as pd
from datasets import Dataset, DatasetDict

# Names of the label columns read from the CSV files. The position of each
# name fixes its index in the per-row "labels" vector built by clean_dataframe.
label_cols = [
    "Enlarged Cardiomediastinum",
    "Cardiomegaly",
    "Lung Opacity",
    "Lung Lesion",
    "Edema",
    "Consolidation",
    "Pneumonia",
    "Atelectasis",
    "Pneumothorax",
    "Pleural Effusion",
    "Pleural Other",
    "Fracture",
    "Support Devices",
]

def clean_dataframe(df):
    """Return a cleaned copy of ``df`` ready for conversion to a Dataset.

    The input frame is left untouched; all changes are made on a copy so that
    re-running a notebook cell on the same frame gives the same result.

    Steps applied to the copy:
      * missing values in the ``"text"`` column become the literal ``"NA"``;
      * missing values in every ``label_cols`` column become ``0`` and the
        columns are cast to ``float``;
      * a ``"labels"`` column is added holding, for each row, the list of
        that row's label values in ``label_cols`` order.

    Args:
        df: DataFrame containing a ``"text"`` column and every column named
            in the module-level ``label_cols``.

    Returns:
        A new DataFrame with the cleaned columns and the extra ``"labels"``
        column.

    Raises:
        KeyError: if ``"text"`` or any of ``label_cols`` is missing.
    """
    # Work on a copy: the original implementation wrote into the caller's frame.
    cleaned = df.copy()

    cleaned["text"] = cleaned["text"].fillna("NA")

    cleaned[label_cols] = cleaned[label_cols].fillna(0).astype(float)

    # One Python list of floats per row, ordered like label_cols.
    cleaned["labels"] = cleaned[label_cols].to_numpy().tolist()

    return cleaned

def load_and_clean_all(train_path, dev_path, test_path):
    """Read the three split CSVs, clean them, and bundle them as a DatasetDict.

    Args:
        train_path: path of the training CSV.
        dev_path: path of the development CSV (stored under ``"validation"``).
        test_path: path of the test CSV.

    Returns:
        A ``DatasetDict`` with the keys ``"train"``, ``"validation"`` and
        ``"test"``, each built from the corresponding cleaned DataFrame.
    """
    split_paths = {
        "train": train_path,
        "validation": dev_path,
        "test": test_path,
    }

    # First read and clean every split ...
    cleaned_frames = {
        split: clean_dataframe(pd.read_csv(path))
        for split, path in split_paths.items()
    }

    # ... then convert each cleaned frame to a Hugging Face Dataset.
    return DatasetDict({
        split: Dataset.from_pandas(frame)
        for split, frame in cleaned_frames.items()
    })

# CSV files for the three splits.
# NOTE(review): these are absolute paths at the filesystem root; adjust them
# to wherever the data actually lives.
train_file, dev_file, test_file = (
    "/text_train_data_noimpression.csv",
    "/text_val_data_noimpression.csv",
    "/text_test_data_noimpression.csv",
)

# Load, clean and wrap all three splits in a single DatasetDict.
raw_datasets = load_and_clean_all(train_file, dev_file, test_file)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
# Model identifiers passed to `from_pretrained`; each one is fine-tuned and
# evaluated in turn by the training loop.
model_list = [
    "dmis-lab/biobert-v1.1",
    "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
    "emilyalsentzer/Bio_ClinicalBERT",
]

# One classifier output per entry in `label_cols`.
num_labels = len(label_cols)
def tokenize_function(examples):
    """Tokenize the ``"text"`` field of a batch with the current tokenizer.

    This function reads the module-level name ``tokenizer`` at call time, so
    a tokenizer must have been bound to that name before it is used.

    Args:
        examples: mapping with a ``"text"`` entry holding the texts to encode.

    Returns:
        Whatever ``tokenizer`` returns for the texts, padded to and truncated
        at 256 tokens.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,
    }
    texts = examples["text"]
    return tokenizer(texts, **encode_options)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
import numpy as np
import torch

def _auc_or_sentinel(y_true, y_score, **kwargs):
    """Return ``roc_auc_score(...)`` or ``-1`` when it raises ``ValueError``."""
    try:
        return roc_auc_score(y_true, y_score, **kwargs)
    except ValueError:
        return -1


def multi_label_metrics(eval_preds):
    """Compute multi-label metrics from a ``(logits, labels)`` pair.

    Logits are passed through a sigmoid and thresholded at 0.5 to obtain the
    binary predictions used for F1 and accuracy; the sigmoid probabilities
    themselves are used for the AUC values.

    Args:
        eval_preds: pair ``(logits, labels)``. ``labels`` must be a 2-D array
            with one column per class. If ``logits`` is a tuple, its first
            element is used.

    Returns:
        dict with the keys ``macro_f1``, ``micro_f1``, ``accuracy``,
        ``macro_auc``, ``micro_auc`` and one ``auc_class_{i}`` per label
        column. Any AUC for which ``roc_auc_score`` raises ``ValueError`` is
        reported as ``-1``; each AUC is guarded independently, so one
        undefined value no longer discards the others.
    """
    logits, labels = eval_preds
    # Some models return several outputs; the logits come first.
    if isinstance(logits, tuple):
        logits = logits[0]

    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    y_true = np.asarray(labels)
    y_pred = (probs > 0.5).astype(int)

    # zero_division=0 yields the same value as the default but without the
    # warning emitted for classes that have no predicted/true samples.
    macro_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
    micro_f1 = f1_score(y_true, y_pred, average="micro", zero_division=0)
    # For a 2-D indicator target this is exact-match (subset) accuracy.
    acc = accuracy_score(y_true, y_pred)

    result = {
        "macro_f1": macro_f1,
        "micro_f1": micro_f1,
        "accuracy": acc,
        # Guarded separately: previously a failing macro AUC also forced
        # micro AUC to -1 even when the micro average was computable.
        "macro_auc": _auc_or_sentinel(y_true, probs, average="macro"),
        "micro_auc": _auc_or_sentinel(y_true, probs, average="micro"),
    }

    for i in range(y_true.shape[1]):
        result[f"auc_class_{i}"] = _auc_or_sentinel(y_true[:, i], probs[:, i])

    return result

In [None]:
from transformers import TrainingArguments, Trainer

# Maps each model name to the metrics dict returned by trainer.evaluate().
results_summary = {}


# Fine-tune and evaluate every model in `model_list`, one after another.
for model_name in model_list:
    print(f"\nProcessing Model: {model_name}")

    # `tokenizer` is bound at module level on purpose: tokenize_function
    # reads this global when raw_datasets.map() calls it below.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name,
        num_labels=num_labels,
        problem_type="multi_label_classification"
    )

    # Re-tokenize all splits with this model's tokenizer and expose only the
    # listed columns as torch tensors.
    tokenized_datasets = raw_datasets.map(tokenize_function, batched=True)
    tokenized_datasets.set_format("torch", columns=["input_ids", "attention_mask", "labels"])

    training_args = TrainingArguments(
        # split('/')[0] is the part of the model id BEFORE the first slash,
        # so two models sharing that prefix would share one output directory.
        output_dir=f"./checkpoints/{model_name.split('/')[0]}",
        eval_strategy="epoch",
        save_strategy="epoch",
        logging_steps=50,
        num_train_epochs=3,
        per_device_train_batch_size=8,
        per_device_eval_batch_size=8,
        learning_rate=2e-5,
        load_best_model_at_end=True,
        # "micro_f1" is one of the keys returned by multi_label_metrics.
        metric_for_best_model="micro_f1",
        greater_is_better=True,
    )

    # NOTE(review): newer transformers releases appear to rename the
    # `tokenizer` argument to `processing_class` - confirm against the
    # installed version.
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_datasets["train"],
        eval_dataset=tokenized_datasets["validation"],
        tokenizer=tokenizer,
        compute_metrics=multi_label_metrics
    )

    trainer.train()

    # Called without arguments; the Trainer's eval_dataset is the validation
    # split, so these are presumably validation metrics, not test metrics.
    metrics = trainer.evaluate()

    # Save model and tokenizer next to the checkpoints, with a "_best" suffix.
    best_model_dir = f"./checkpoints/{model_name.split('/')[0]}_best"
    trainer.save_model(best_model_dir)
    tokenizer.save_pretrained(best_model_dir)

    results_summary[model_name] = metrics

print("\nAll Models have been trained and evaluated!")
print("Evaluation Results Summary:")

# The ":.4f" format requires every metric value to be numeric.
for model_name, metrics in results_summary.items():
    print(f"\nModel: {model_name}")
    for metric_name, metric_value in metrics.items():
        print(f"{metric_name}: {metric_value:.4f}")

In [None]:
#PLOT RESULT FIGURE

import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc
from transformers import AutoModelForSequenceClassification, AutoTokenizer, Trainer
from scipy.special import expit

# One classifier output per entry in `label_cols`.
num_labels = len(label_cols)
# Maps each full model id to the short name used as the key of `y_score_dict`
# and in the plot legends/titles.
name_map = {
    "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext": "biomedbert",
    "dmis-lab/biobert-v1.1": "biobert",
    "emilyalsentzer/Bio_ClinicalBERT": "clinicalbert"
}
# NOTE: rebinds `model_list`; its order now follows `name_map`, so
# model_list[0] is the BiomedBERT entry.
model_list = list(name_map.keys())

# Tokenize the test split once per model, each time with the tokenizer loaded
# for that model id, and keep the results keyed by full model name.
tokenized_test_dict = {}
for model_name in model_list:
    # Rebinds the module-level `tokenizer` on every iteration.
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    def tokenize_fn(example):
        # Uses the `tokenizer` bound just above; map() below runs inside the
        # same iteration, so the matching tokenizer is the one applied.
        return tokenizer(example["text"], padding="max_length", truncation=True, max_length=256)
    tokenized = raw_datasets["test"].map(tokenize_fn, batched=True)
    # Expose only the listed columns, as torch tensors.
    tokenized.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    tokenized_test_dict[model_name] = tokenized

# Ground-truth label matrix, taken from the first model's tokenized test set.
# Every entry of tokenized_test_dict is derived from raw_datasets["test"].
y_true = np.vstack(tokenized_test_dict[model_list[0]]["labels"])

# Maps each short model name to its sigmoid probabilities on the test set.
y_score_dict = {}
for model_name in model_list:
    short = name_map[model_name]
    # Same "<prefix>_best" directory pattern the training loop saves to.
    ckpt = f"./checkpoints/{model_name.split('/')[0]}_best"
    model = AutoModelForSequenceClassification.from_pretrained(ckpt)
    tokenizer = AutoTokenizer.from_pretrained(ckpt)
    # No TrainingArguments are passed, so the Trainer runs with its defaults;
    # it is only used here for predict().
    trainer = Trainer(model=model, tokenizer=tokenizer)
    
    pred = trainer.predict(tokenized_test_dict[model_name])
    logits = pred.predictions
    # expit is the logistic sigmoid; tensors are moved to CPU/NumPy first.
    probs = expit(logits if isinstance(logits, np.ndarray) else logits.detach().cpu().numpy())
    y_score_dict[short] = probs

# Macro-average ROC curve for every model: interpolate each class's TPR onto
# the union of all per-class FPR points, then average over the classes.
plt.figure(figsize=(6, 6))
for short, probs in y_score_dict.items():
    # Compute each class's ROC curve once and reuse it for both the FPR grid
    # and the interpolation (the curves were previously computed twice).
    class_curves = [
        roc_curve(y_true[:, i], probs[:, i])[:2]
        for i in range(num_labels)
    ]
    all_fpr = np.unique(np.concatenate([fpr for fpr, _ in class_curves]))
    mean_tpr = np.zeros_like(all_fpr)
    for fpr, tpr in class_curves:
        mean_tpr += np.interp(all_fpr, fpr, tpr)
    mean_tpr /= num_labels
    macro_auc = auc(all_fpr, mean_tpr)
    plt.plot(all_fpr, mean_tpr, lw=2, label=f"{short} (AUC={macro_auc:.3f})")

# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], '--', color='gray', lw=1)
plt.title("Macro-average ROC Curves Across Models")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive Rate")
plt.legend(loc="lower right")
plt.grid(True, linestyle="--", alpha=0.6)
plt.tight_layout()
plt.show()

# Per-class ROC AUC and PR AUC bar chart for the first model in `model_list`.
# Imported here because neither name is imported by this cell otherwise
# (`average_precision_score` was not imported anywhere in the notebook).
from sklearn.metrics import average_precision_score, roc_auc_score

# `class_names` was previously undefined; the classes are the label columns.
class_names = label_cols

first_full = model_list[0]
first = name_map[first_full]
first_scores = y_score_dict[first]
roc_aucs = []
pr_aucs = []
for i in range(len(class_names)):
    class_true = y_true[:, i]
    class_score = first_scores[:, i]
    try:
        roc_aucs.append(roc_auc_score(class_true, class_score))
    except ValueError:
        # roc_auc_score raised for this class (e.g. only one label value in
        # the test set); keep the slot so the bars stay aligned.
        roc_aucs.append(np.nan)
    pr_aucs.append(average_precision_score(class_true, class_score))

x = np.arange(len(class_names))
w = 0.4
fig, ax = plt.subplots(figsize=(12, 5))
ax.bar(x - w/2, roc_aucs, w, label="ROC AUC", color="skyblue")
ax.bar(x + w/2, pr_aucs,  w, label="PR AUC",  color="gold")

# Write each bar's value just above it; skip bars without a finite height.
for bar in ax.patches:
    height = bar.get_height()
    if not np.isfinite(height):
        continue
    ax.annotate(f"{height:.3f}", (bar.get_x() + bar.get_width()/2, height),
                ha="center", va="bottom", fontsize=8)

ax.set_xticks(x)
ax.set_xticklabels(class_names, rotation=45, ha="right")
ax.set_ylabel("AUC")
ax.set_title(f"Per-class ROC & PR AUC ({first})")
ax.legend()
ax.grid(axis="y", linestyle="--", alpha=0.5)
plt.tight_layout()
plt.show()