In [2]:
import pandas as pd
from datasets import Dataset
import csv

# 加载csv数据
data = pd.read_csv("bio_data.csv")
data['token'] = data['token'].astype(str)
data['label'] = data['label'].astype(str)

# 将CSV数据转换为NER格式
def prepare_ner_data(data):
    ner_data = []
    sentence = {"tokens": [], "ner_tags": []}
    
    for index, row in data.iterrows():
        token = row['token']
        label = row['label']
        
        # 每个句子可能以特殊标记结尾（比如 '[CLS]'），我们根据标记处理换行
        if token == "[CLS]":
            if sentence['tokens']:  # 如果当前句子不为空，保存该句子
                ner_data.append(sentence)
            sentence = {"tokens": [], "ner_tags": []}  # 开始新的句子
        sentence['tokens'].append(token)
        sentence['ner_tags'].append(label)
    
    if sentence['tokens']:  # 添加最后一个句子
        ner_data.append(sentence)
    
    return ner_data

ner_data = prepare_ner_data(data)
dataset = Dataset.from_list(ner_data)


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# 获取所有的唯一标签{'O': 0, 'I-TREATMENT': 1, 'B-TREATMENT': 2}
unique_labels = list(set(data['label'].values))

# 生成标签到数字的映射
label2id = {label: i for i, label in enumerate(unique_labels)}
id2label = {i: label for label, i in label2id.items()}

print("Label to ID mapping:", id2label)


Label to ID mapping: {0: 'I-TREATMENT', 1: 'B-TREATMENT', 2: 'O'}


In [3]:
from transformers import AutoTokenizer

# 选择一个预训练的BERT模型
model_name = "bert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_and_align_labels(examples):
    # 添加padding=True和truncation=True来确保输入长度一致
    tokenized_inputs = tokenizer(
        examples["tokens"],
        truncation=True,  # 截断超过最大长度的输入
        padding=True,  # 对输入进行padding以确保相同长度
        is_split_into_words=True,  # 保证tokenizer知道输入是分词后的列表
    )
    
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)  # 获取每个token的word id
        previous_word_idx = None
        label_ids = []
        
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)  # 对padding部分不计算损失
            elif word_idx != previous_word_idx:  # 只对第一个token进行标注
                label_ids.append(label2id[label[word_idx]])
            else:
                label_ids.append(-100)  # 对子词进行忽略
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs


# 对数据集进行tokenization和标签对齐
tokenized_datasets = dataset.map(tokenize_and_align_labels, batched=True)


Map: 100%|██████████| 4999/4999 [00:10<00:00, 489.93 examples/s]


In [4]:
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer

# 加载预训练的BERT模型
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label2id))

# 定义训练参数
training_args = TrainingArguments(
    output_dir="./results",  # 模型保存路径
    evaluation_strategy="no",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)

# 初始化Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
    tokenizer=tokenizer,
)

# 开始训练
model.to('cuda')
trainer.train()


Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Step,Training Loss
500,0.0437


TrainOutput(global_step=939, training_loss=0.03703850247481633, metrics={'train_runtime': 3195.5224, 'train_samples_per_second': 4.693, 'train_steps_per_second': 0.294, 'total_flos': 3918702889202688.0, 'train_loss': 0.03703850247481633, 'epoch': 3.0})

In [7]:
import torch

def predict(text):
    # 确保模型在GPU上
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)
    
    # 对输入的文本进行tokenization
    inputs = tokenizer(text.split(), return_tensors="pt", truncation=True, is_split_into_words=True)
    
    # 将输入数据移到GPU上
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    # 获取模型的预测结果
    with torch.no_grad():
        outputs = model(**inputs)
    
    # 处理预测结果
    predictions = outputs.logits.argmax(dim=2)
    predicted_labels = [id2label[prediction.item()] for prediction in predictions[0].cpu().numpy()]
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].cpu())
    
    return list(zip(tokens, predicted_labels))

# 示例预测
text = "The patient received radiation therapy for cancer treatment."
predictions = predict(text)
for token, label in predictions:
    print(f"{token}: {label}")


[CLS]: O
The: I-TREATMENT
patient: O
received: O
radiation: O
therapy: O
for: O
cancer: O
treatment: O
.: O
[SEP]: I-TREATMENT


In [10]:
from transformers import AutoModelForTokenClassification, AutoTokenizer
import torch

checkpoint_path = "./results/checkpoint-6000"
model = AutoModelForTokenClassification.from_pretrained(checkpoint_path)
tokenizer = AutoTokenizer.from_pretrained(checkpoint_path)

# 确保模型在GPU上
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

def predict(text):
    # 对输入的文本进行tokenization
    inputs = tokenizer(text.split(), return_tensors="pt", truncation=True, is_split_into_words=True)
    
    # 将输入数据移到GPU上
    inputs = {key: value.to(device) for key, value in inputs.items()}
    
    # 获取模型的预测结果
    with torch.no_grad():
        outputs = model(**inputs)
    
    # 处理预测结果
    predictions = outputs.logits.argmax(dim=2)
    predicted_labels = [id2label[prediction.item()] for prediction in predictions[0].cpu().numpy()]
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].cpu())
    
    # 筛选出标签为B-TREATMENT的内容
    result = [(token, label) for token, label in zip(tokens, predicted_labels) if label == 'B-TREATMENT']
    
    return result

# 示例预测
text = "The patient received radiation therapy for cancer treatment."
predictions = predict(text)
print(predictions)


[]
