In [None]:
# Note: this notebook works on the pre-filtered data.
import pandas as pd
from datasets import Dataset, DatasetDict

# Names of the label columns (adjust to match the label columns in your own CSV files).
# The order of this list defines the order of the model's output logits.
label_cols = [
    "Enlarged Cardiomediastinum",
    "Cardiomegaly",
    "Lung Opacity",
    "Lung Lesion",
    "Edema",
    "Consolidation",
    "Pneumonia",
    "Atelectasis",
    "Pneumothorax",
    "Pleural Effusion",
    "Pleural Other",
    "Fracture",
    "Support Devices",
]

# Main cleaning function: fills missing text with 'NA', fills missing labels with 0,
# and packs the label columns into a single list-valued 'labels' column.
def clean_dataframe(df, columns=None):
    """Return a cleaned copy of ``df`` ready for conversion to a Dataset.

    The input frame is left untouched, so re-running a cell that calls this
    function always starts from the same data.

    Args:
        df: DataFrame with a ``text`` column and one column per label.
        columns: Label column names to clean and pack. Defaults to the
            module-level ``label_cols`` (looked up at call time).

    Returns:
        A new DataFrame in which
          * missing ``text`` values are replaced by the string ``"NA"``,
          * missing label values are replaced by 0 and cast to float,
          * a ``labels`` column holds the label values of each row as a list
            of floats, in the order given by ``columns``.

    Raises:
        KeyError: if ``text`` or any requested label column is missing.
    """
    cols = list(label_cols) if columns is None else list(columns)

    missing = [name for name in ["text", *cols] if name not in df.columns]
    if missing:
        raise KeyError(f"clean_dataframe: missing required column(s): {missing}")

    # Work on a copy so the caller's frame is never mutated.
    cleaned = df.copy()

    # Empty reports become the literal string 'NA'.
    cleaned["text"] = cleaned["text"].fillna("NA")

    # Missing labels count as negative (0). The values are kept as floats
    # (not ints): the multi-label setup used later in this notebook feeds
    # them to a BCE-style loss.
    # NOTE(review): only NaN is replaced; any other value (e.g. -1) passes
    # through unchanged -- confirm the filtered data contains only 0/1.
    cleaned[cols] = cleaned[cols].fillna(0).astype(float)

    # One list[float] per row, in `cols` order.
    cleaned["labels"] = cleaned[cols].values.tolist()

    return cleaned

# Load and clean the three splits.
def load_and_clean_all(train_path, dev_path, test_path):
    """Read the train/dev/test CSV files, clean each one and bundle them.

    Every file is read with ``pd.read_csv``, passed through
    ``clean_dataframe`` and converted with ``Dataset.from_pandas``.

    Returns:
        A ``DatasetDict`` with the keys ``"train"``, ``"validation"`` and
        ``"test"`` (in that order).
    """
    csv_by_split = {
        "train": train_path,
        "validation": dev_path,
        "test": test_path,
    }

    splits = {}
    for split_name, csv_path in csv_by_split.items():
        frame = clean_dataframe(pd.read_csv(csv_path))
        # Convert the cleaned frame to a HuggingFace Dataset.
        splits[split_name] = Dataset.from_pandas(frame)

    return DatasetDict(splits)

# Replace these with your own paths.
train_file, dev_file, test_file = (
    "/content/text_train_data.csv",
    "/content/text_val_data.csv",
    "/content/text_test_data.csv",
)

# Run the cleaning step and build the DatasetDict.
raw_datasets = load_and_clean_all(
    train_path=train_file,
    dev_path=dev_file,
    test_path=test_file,
)

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_name = "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext"
num_labels = len(label_cols)  # one output logit per label column

tokenizer = AutoTokenizer.from_pretrained(model_name)

# Store the label names in the model config so that every saved checkpoint
# carries the logit-index -> label-name mapping. Without this the config only
# holds generic LABEL_0, LABEL_1, ... names and inference code has to
# hard-code the label order.
id2label = dict(enumerate(label_cols))
label2id = {name: idx for idx, name in id2label.items()}

model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
    # Multi-label setup: per the Transformers docs this selects
    # BCEWithLogitsLoss, which expects float targets.
    problem_type="multi_label_classification",
)

In [None]:
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch with the notebook's tokenizer.

    Every sequence is truncated and padded to exactly 256 tokens.
    """
    encode_options = {
        "padding": "max_length",
        "truncation": True,
        "max_length": 256,  # adjust to your needs
    }
    report_texts = examples["text"]
    return tokenizer(report_texts, **encode_options)

# Tokenize every split; batched=True hands tokenize_function a batch of rows per call.
tokenized_datasets = raw_datasets.map(
    function=tokenize_function,
    batched=True,
)

In [None]:
from sklearn.metrics import f1_score, accuracy_score, roc_auc_score
import numpy as np
import torch

def _safe_auc(y_true, y_score, **kwargs):
    """Return ``roc_auc_score(...)``, or -1 when the AUC is undefined.

    The AUC is undefined when ``y_true`` contains a single class. Depending on
    the scikit-learn version this surfaces as a ``ValueError`` or as a NaN
    result, so both are mapped to the -1 sentinel.
    """
    try:
        score = roc_auc_score(y_true, y_score, **kwargs)
    except ValueError:
        return -1
    return -1 if np.isnan(score) else float(score)


def multi_label_metrics(eval_preds):
    """Compute multi-label metrics for ``Trainer`` evaluation.

    Args:
        eval_preds: ``(logits, labels)`` pair. ``logits`` may be an array of
            shape (n_samples, n_labels) or a tuple whose first element is that
            array; ``labels`` is the 0/1 indicator matrix of the same shape.

    Returns:
        dict with
          * ``macro_f1`` / ``micro_f1`` at a 0.5 probability threshold,
          * ``accuracy`` -- exact-match (subset) accuracy over all labels,
          * ``macro_auc`` -- mean of the per-class AUCs that are defined,
          * ``micro_auc`` -- AUC over the flattened label matrix,
          * ``auc_class_<i>`` -- AUC of label column ``i``.
        Any AUC that cannot be computed is reported as -1.
    """
    logits, labels = eval_preds
    # Some models return extra outputs next to the logits; keep only the logits.
    if isinstance(logits, tuple):
        logits = logits[0]

    # Independent per-label probabilities (sigmoid, not softmax).
    probs = torch.sigmoid(torch.tensor(logits)).numpy()
    # Labels arrive as floats (BCE targets); scikit-learn wants an indicator matrix.
    y_true = np.asarray(labels).astype(int)
    y_pred = (probs > 0.5).astype(int)

    # Threshold-based metrics. zero_division=0 keeps the score at 0 (the
    # default outcome) for labels with no positives, without the warning.
    macro_f1 = f1_score(y_true, y_pred, average="macro", zero_division=0)
    micro_f1 = f1_score(y_true, y_pred, average="micro", zero_division=0)
    acc = accuracy_score(y_true, y_pred)

    # Per-class AUC (computed from probabilities, not thresholded predictions).
    per_class_auc = [
        _safe_auc(y_true[:, idx], probs[:, idx])
        for idx in range(y_true.shape[1])
    ]

    # Macro AUC: average only over classes whose AUC is defined, so a single
    # constant label column in the evaluation split does not wipe out the metric.
    defined_aucs = [auc for auc in per_class_auc if auc != -1]
    macro_auc = float(np.mean(defined_aucs)) if defined_aucs else -1

    # Micro AUC pools all (sample, label) pairs, so it is computed separately
    # and is not affected by individual constant columns.
    micro_auc = _safe_auc(y_true, probs, average="micro")

    result = {
        "macro_f1": macro_f1,
        "micro_f1": micro_f1,
        "accuracy": acc,
        "macro_auc": macro_auc,
        "micro_auc": micro_auc,
    }

    # Per-class AUC entries are named auc_class_0, auc_class_1, ...
    for idx, auc in enumerate(per_class_auc):
        result[f"auc_class_{idx}"] = auc

    return result

In [None]:
from transformers import TrainingArguments, Trainer

# Training configuration: evaluate and checkpoint once per epoch;
# load_best_model_at_end=True asks the Trainer to reload the best checkpoint
# when training finishes.
training_args = TrainingArguments(
    output_dir="./checkpoints",
    # optimisation
    learning_rate=2e-5,
    num_train_epochs=3,  # adjust as needed
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # evaluation / checkpointing / logging cadence
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    logging_steps=50,
)

# NOTE(review): newer transformers releases deprecate `tokenizer=` in favour
# of `processing_class=` -- confirm against the installed version before
# switching.
trainer = Trainer(
    args=training_args,
    model=model,
    compute_metrics=multi_label_metrics,
    tokenizer=tokenizer,
    eval_dataset=tokenized_datasets["validation"],
    train_dataset=tokenized_datasets["train"],
)
trainer.train()

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

# Path to the fine-tuned checkpoint (replace with your own).
model_path = "./checkpoints/checkpoint-13683"
tokenizer = AutoTokenizer.from_pretrained(model_path)
model = AutoModelForSequenceClassification.from_pretrained(model_path)
model.eval()

# Label names in EXACTLY the column order used for training: logit i of the
# model corresponds to label_cols[i]. Using a different list or order here
# would silently attach predictions to the wrong findings.
label_cols = [
    "Enlarged Cardiomediastinum", "Cardiomegaly", "Lung Opacity",
    "Lung Lesion", "Edema", "Consolidation", "Pneumonia",
    "Atelectasis", "Pneumothorax", "Pleural Effusion",
    "Pleural Other", "Fracture", "Support Devices"
]

# zip() below would silently truncate on a length mismatch, so fail loudly instead.
if len(label_cols) != model.config.num_labels:
    raise ValueError(
        f"label_cols has {len(label_cols)} entries but the checkpoint "
        f"predicts {model.config.num_labels} labels"
    )

# Input report.
text = "the left lower lobe. No pleural effusion or pneumothorax. Mild atelectasis noted."

# Encode + run inference. max_length matches the training-time tokenization (256).
inputs = tokenizer(
    text,
    return_tensors="pt",
    truncation=True,
    max_length=256,
    padding=True,
)
with torch.no_grad():
    outputs = model(**inputs)

# Multi-label head: independent sigmoid per label. Index the single batch
# element explicitly rather than squeeze(), which would also drop the label
# axis for a one-label model.
probs = torch.sigmoid(outputs.logits)[0].cpu().numpy()
preds = (probs > 0.5).astype(int)

# Map each label name to its 0/1 prediction.
processed_report = {label: int(pred) for label, pred in zip(label_cols, preds)}

# Print the result.
from pprint import pprint
pprint(processed_report)
