In [9]:
!ls

EvaNer.ipynb  EvaNer.py  data.ipynb


In [10]:
!pip install pytorch-crf



In [14]:
from transformers import AutoTokenizer
import transformers
import torch
from datasets import load_dataset, load_from_disk
from transformers import AutoModel
from transformers import AdamW
import random
import numpy as np
import torch.nn.functional as F
from torch.utils.data import Dataset
from tqdm import tqdm
from torchcrf import CRF  # 引入 CRF

In [15]:
# Use the first CUDA GPU when one is visible, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')
device

device(type='cuda', index=0)

In [16]:
def same_seeds(seed):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random``, NumPy, PyTorch (CPU and, when CUDA is
    available, the current and all CUDA devices) and ``transformers``, and
    puts cuDNN into deterministic, non-benchmarking mode.

    Args:
        seed: Seed value handed unchanged to every generator.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)

    # Trade cuDNN autotuning for run-to-run determinism.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

    transformers.set_seed(seed)

# Fix the seed once, before any model or data loader is created below.
same_seeds(7890)

In [17]:
model_path = "../model/GujiRoBERTa_jian_fan"

# Load tokenizer and encoder from the local checkpoint only (no hub download).
tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModel.from_pretrained(model_path, local_files_only=True)

# Smoke test: encode one sample sentence and inspect the hidden-state shape.
text = "主唱太拼命了"
inputs = tokenizer(text, return_tensors="pt")
# This is inference only, so skip building an autograd graph.
with torch.no_grad():
    outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state
print(last_hidden_states.shape)

Some weights of BertModel were not initialized from the model checkpoint at ../model/GujiRoBERTa_jian_fan and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([1, 8, 768])


In [18]:
class TextLabelDataset(Dataset):
    """Character-level sequence-labelling dataset read from two parallel files.

    Line ``k`` of ``text_file`` is one sentence (split into characters) and
    line ``k`` of ``label_file`` is a Python list literal with its labels.
    Sentences longer than ``max_length`` characters are dropped; indexing and
    ``len()`` both refer to the remaining (filtered) samples.
    """

    def __init__(self, text_file, label_file, max_length=510):
        """
        Args:
            text_file: Path of the text file, one sentence per line.
            label_file: Path of the label file, one list literal per line.
            max_length: Maximum sentence length in characters; longer
                sentences are filtered out. Defaults to 510.
        """
        self.text_file = text_file
        self.label_file = label_file
        self.max_length = max_length
        self.texts, self.labels = self._load_data()

        # Samples actually served by __getitem__ (over-long sentences removed).
        self.dataset = self._filter_long_sentences()

    def _filter_long_sentences(self):
        """Return the (characters, labels) pairs whose text fits in max_length."""
        return [
            (text, label)
            for text, label in zip(self.texts, self.labels)
            if len(text) <= self.max_length
        ]

    def _load_data(self):
        """Read both files in lockstep.

        Returns:
            (texts, labels): ``texts`` is a list of character lists and
            ``labels`` is the list of parsed label literals, in file order.
        """
        import ast

        texts = []
        labels = []
        with open(self.text_file, 'r', encoding='utf-8') as f_text, \
                open(self.label_file, 'r', encoding='utf-8') as f_label:
            for text, label in zip(f_text, f_label):
                texts.append(list(text.strip()))
                # literal_eval parses list literals without executing
                # arbitrary code from the file, unlike the previous eval().
                labels.append(ast.literal_eval(label.strip()))
        return texts, labels

    def __len__(self):
        # Must match what __getitem__ can index, i.e. the filtered samples;
        # returning len(self.texts) here overshoots once anything is filtered.
        return len(self.dataset)

    def __getitem__(self, idx):
        tokens, labels = self.dataset[idx]
        return tokens, labels


text_file = '../data/text_B.txt'  # path of the sentence file
label_file = '../data/label_B.txt'  # path of the matching label file
dataset = TextLabelDataset(text_file, label_file)
tokens, labels = dataset[5]

# Show the dataset size and one sample as a quick sanity check.
len(dataset), tokens, labels

(3405,
 ['我', '命', '旨', '酒', '，', '以', '歌', '以', '謡', '。'],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [19]:
def collate_fn(data, pad_label=13, max_length=512):
    """Tokenise a batch of (characters, labels) samples and align the labels.

    Uses the module-level ``tokenizer`` and moves the result to the
    module-level ``device``.

    Args:
        data: List of ``(tokens, labels)`` pairs, where ``tokens`` is a list
            of characters and ``labels`` a list of integer label ids.
        pad_label: Label id written at the leading special-token position and
            at every position after the sentence's own labels ([SEP] and
            padding). Defaults to 13.
        max_length: Upper bound on the tokenised length (special tokens
            included). Without it ``truncation=True`` is a no-op and the
            tokenizer only emits a warning. Defaults to 512.

    Returns:
        ``(inputs, labels)``: the tokenizer output and a ``LongTensor`` of
        shape ``(batch, seq_len)``, both on ``device``.
    """
    tokens = [sample[0] for sample in data]
    labels = [sample[1] for sample in data]

    inputs = tokenizer(tokens,
                       truncation=True,
                       max_length=max_length,
                       padding=True,
                       return_tensors='pt',
                       is_split_into_words=True)

    seq_len = inputs['input_ids'].shape[1]

    # One label per position: pad_label for the leading special token, the
    # sentence labels, then pad_label up to seq_len (cut if longer).
    # NOTE(review): this assumes each character maps to exactly one token —
    # confirm for the tokenizer in use.
    aligned = [
        ([pad_label] + list(seq) + [pad_label] * seq_len)[:seq_len]
        for seq in labels
    ]

    # Inputs and labels are both moved to the target device here.
    return inputs.to(device), torch.LongTensor(aligned).to(device)


# Training loader: shuffled batches of two; an incomplete last batch is dropped.
loader = torch.utils.data.DataLoader(
    dataset=dataset,
    batch_size=2,
    collate_fn=collate_fn,
    shuffle=True,
    drop_last=True,
)

# Print the fourth batch to eyeball what collate_fn produces.
for i, data in enumerate(loader, start=1):
    if i == 4:
        print(data)
        break

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


({'input_ids': tensor([[ 101, 3501, 2300, 1251, 4761,  780, 2336, 8024, 2603, 2617, 4158,  679,
         3791, 8024, 6298, 5885, 1814, 4761, 5238, 4374, 4509, 5389, 8024, 3462,
         6843, 4352,  511,  102,    0,    0],
        [ 101, 1350, 5800, 2134, 2200, 5635, 3745, 1814, 8024, 4352, 1401, 1315,
         1343, 1071, 3431, 3451, 8024, 2876, 6342, 5445, 1139,  722, 8024,  718,
         6210, 3176, 6662,  979,  511,  102]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1]], device='cuda:0')}, tensor([[13,  1,  2,  3,  0,  5,  7,  0

In [20]:
# Load the pretrained encoder from the local checkpoint and move it to `device`
model_path = "../model/GujiRoBERTa_jian_fan"
pretrained = AutoModel.from_pretrained(model_path, local_files_only=True).to(device)
# pretrained = AutoModel.from_pretrained(model_path, local_files_only=True)


# Parameter count of the encoder, printed in units of 10,000
print(sum(i.numel() for i in pretrained.parameters()) / 10000)

Some weights of BertModel were not initialized from the model checkpoint at ../model/GujiRoBERTa_jian_fan and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


11269.7856


In [21]:
# Downstream tagging model
class Model(torch.nn.Module):
    """LSTM -> linear -> CRF tagger on top of the module-level ``pretrained`` encoder.

    While ``tuneing`` is False the encoder is called through the global
    ``pretrained`` under ``torch.no_grad()`` and is not a submodule. After
    ``fine_tuneing(True)`` it is attached as ``self.pretrained`` and trained
    together with the head.
    """

    def __init__(self):
        super().__init__()
        self.tuneing = False
        self.pretrained = None

        self.rnn = torch.nn.LSTM(768, 768, batch_first=True)
        self.fc = torch.nn.Linear(768, 14)
        self.crf = CRF(14, batch_first=True)

    def forward(self, inputs, labels=None):
        """Return the CRF loss when ``labels`` is given, else the decoded tags.

        Args:
            inputs: Tokenizer output; must contain ``attention_mask``.
            labels: Optional label tensor aligned with the input positions.

        Returns:
            The negated mean CRF log-likelihood if ``labels`` is not None,
            otherwise the result of ``crf.decode`` for the batch.
        """
        if self.tuneing:
            hidden = self.pretrained(**inputs).last_hidden_state
        else:
            # Frozen encoder: no gradients are tracked through it.
            with torch.no_grad():
                hidden = pretrained(**inputs).last_hidden_state

        hidden, _ = self.rnn(hidden)
        emissions = self.fc(hidden)

        # Padding positions are excluded from both the loss and the decoding.
        mask = inputs['attention_mask'].bool()

        if labels is None:
            return self.crf.decode(emissions, mask=mask)
        return -self.crf(emissions, labels, mask=mask, reduction='mean')

    def fine_tuneing(self, tuneing):
        """Switch between training the head only and fine-tuning the encoder too.

        Args:
            tuneing: Truthy to unfreeze the global ``pretrained`` encoder, put
                it in train mode and attach it as ``self.pretrained``; falsy
                to freeze it, put it in eval mode and detach it.
        """
        self.tuneing = tuneing

        trainable = bool(tuneing)
        for param in pretrained.parameters():
            param.requires_grad_(trainable)

        if tuneing:
            pretrained.train()
            self.pretrained = pretrained
        else:
            pretrained.eval()
            self.pretrained = None


model = Model().to(device)

In [22]:
# #对计算结果和label变形,并且移除pad
# def reshape_and_remove_pad(outs, labels, attention_mask):
#     #变形,便于计算loss
#     outs = outs.reshape(-1, 14)
#     labels = labels.reshape(-1)

#     #忽略对pad的计算结果
#     select = attention_mask.reshape(-1) == 1
#     outs = outs[select]
#     labels = labels[select]

#     return outs, labels


# reshape_and_remove_pad(torch.randn(2, 3, 14), torch.ones(2, 3),
#                        torch.ones(2, 3))

In [23]:
def get_correct_and_total_count(labels, outs, attention_mask):
    """Count correctly predicted positions, overall and for non-zero labels.

    Only positions where ``attention_mask`` is set are considered. Every
    masked-in position counts, so any label other than 0 (including whatever
    id is used for special tokens) is part of the "content" counts.

    Args:
        labels: Integer tensor of gold labels, same shape as ``attention_mask``.
        outs: List of per-sample predicted tag lists, each as long as the
            number of masked-in positions of that sample.
        attention_mask: Tensor that is non-zero at positions to evaluate.

    Returns:
        ``(correct, total, correct_content, total_content)`` as Python ints,
        where the ``*_content`` counts are restricted to gold labels != 0.

    Raises:
        ValueError: If the number of predicted tags differs from the number
            of masked-in label positions.
    """
    labels = torch.as_tensor(labels)
    mask = torch.as_tensor(attention_mask, device=labels.device).bool()

    # Boolean indexing keeps row-major order, matching the flattened outs.
    active_labels = labels[mask]
    active_outs = torch.tensor(
        [int(tag) for seq in outs for tag in seq],
        dtype=torch.long,
        device=labels.device,
    )

    if active_outs.numel() != active_labels.numel():
        raise ValueError(
            f"got {active_outs.numel()} predicted tags for "
            f"{active_labels.numel()} unmasked label positions"
        )

    hits = active_outs == active_labels
    correct = int(hits.sum().item())
    total = int(active_labels.numel())

    # Accuracy restricted to positions whose gold label is not 0.
    content = active_labels != 0
    correct_content = int(hits[content].sum().item())
    total_content = int(content.sum().item())

    return correct, total, correct_content, total_content

In [24]:
def train(epochs, save_path='../model/NER_crf_lstm_B.model'):
    """Train the module-level ``model`` on the module-level ``loader``.

    The learning rate is 2e-5 when the encoder is being fine-tuned
    (``model.tuneing``) and 5e-4 otherwise. Every 100 steps the accuracy on
    the current batch is shown in the progress bar. The whole model object is
    saved to ``save_path`` at the end of every epoch.

    Args:
        epochs: Number of passes over ``loader``.
        save_path: File the model is written to with ``torch.save``.
    """
    lr = 2e-5 if model.tuneing else 5e-4

    # torch.optim.AdamW replaces the deprecated transformers.AdamW; eps and
    # weight_decay are set to the defaults the transformers class used.
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, eps=1e-6, weight_decay=0.0)

    model.train()
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        progress_bar = tqdm(loader, desc="Training", unit="batch")

        for step, (inputs, labels) in enumerate(progress_bar):

            # No-op when the collate function already placed the batch on device.
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)

            # One optimisation step; the model returns the CRF loss directly.
            loss = model(inputs, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            if step % 100 == 0:
                # Decode in eval mode so dropout does not distort the probe,
                # then switch back to train mode.
                model.eval()
                with torch.no_grad():
                    outs = model(inputs)
                model.train()

                counts = get_correct_and_total_count(labels, outs, inputs['attention_mask'])
                accuracy = counts[0] / counts[1] if counts[1] > 0 else 0
                accuracy_content = counts[2] / counts[3] if counts[3] > 0 else 0

                progress_bar.set_postfix({
                    "loss": f"{loss.item():.4f}",
                    "accuracy": f"{accuracy:.4f}",
                    # Fixed precision keeps the progress bar width stable.
                    "accuracy_content": f"{accuracy_content:.4f}",
                })

        torch.save(model, save_path)

In [25]:
# Stage 1: keep the pretrained encoder frozen and train only the LSTM/linear/CRF head.
model.fine_tuneing(False)
print(sum(p.numel() for p in model.parameters()) / 10000)  # trainable-model parameter count, in units of 10,000
train(4)

473.5726
Epoch 1/4


Training: 100%|██████████████| 1702/1702 [02:17<00:00, 12.36batch/s, loss=0.2599, accuracy=1.0000, accuracy_content=1.0]


Epoch 2/4


Training: 100%|██████████████| 1702/1702 [02:18<00:00, 12.30batch/s, loss=1.7593, accuracy=1.0000, accuracy_content=1.0]


Epoch 3/4


Training: 100%|█| 1702/1702 [02:30<00:00, 11.29batch/s, loss=7.3795, accuracy=0.9016, accuracy_content=0.863636363636363


Epoch 4/4


Training: 100%|██████████████| 1702/1702 [02:39<00:00, 10.69batch/s, loss=0.4687, accuracy=1.0000, accuracy_content=1.0]


In [26]:
# Stage 2: attach and unfreeze the encoder, then fine-tune everything for one epoch.
model.fine_tuneing(True)
print(sum(p.numel() for p in model.parameters()) / 10000)  # now includes the encoder, in units of 10,000
train(1)

11743.3582
Epoch 1/1


Training: 100%|██████████████| 1702/1702 [09:49<00:00,  2.89batch/s, loss=0.7117, accuracy=1.0000, accuracy_content=1.0]


In [37]:
text_file = '../data/text_C_test.txt'
label_file = '../data/label_C_test.txt'


def _render_tags(token_ids, tags):
    """Render one tag sequence: '·' for tag 0, else decoded token + tag id."""
    parts = []
    for token_id, tag in zip(token_ids, tags):
        tag = int(tag)
        if tag == 0:
            parts.append('·')
        else:
            parts.append(tokenizer.decode(token_id) + str(tag))
    return ''.join(parts)


# Qualitative check
def predict(model_path='../model/NER_crf_C.model', batch_size=2):
    """Print gold and predicted tags for one random batch of the test set.

    Reads the module-level ``text_file`` / ``label_file`` and uses the
    module-level ``tokenizer`` and ``collate_fn``.

    Args:
        model_path: Pickled model to load.
            NOTE(review): the default differs from the path ``train`` writes
            to ('../model/NER_crf_lstm_B.model') — confirm which checkpoint
            is intended.
        batch_size: Number of sentences to draw and print.

    Raises:
        ValueError: If the test set cannot fill a single batch.
    """
    # weights_only=False unpickles arbitrary objects: only load trusted files.
    model_load = torch.load(model_path, weights_only=False)
    model_load.eval()

    loader_test = torch.utils.data.DataLoader(dataset=TextLabelDataset(text_file, label_file),
                                              batch_size=batch_size,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    batch = next(iter(loader_test), None)
    if batch is None:
        raise ValueError("test set is too small to form one batch")
    inputs, labels = batch

    with torch.no_grad():
        outs = model_load(inputs)  # decoded tags: one list of ints per sentence

    for i in range(len(outs)):
        # Drop padding positions.
        select = inputs['attention_mask'][i] == 1
        input_id = inputs['input_ids'][i, select]
        label = labels[i, select]

        # Original sentence.
        print(tokenizer.decode(input_id).replace(' ', ''))

        # Gold tags, then predicted tags, in the same rendering.
        print("Label:", _render_tags(input_id, label))
        print("Out:", _render_tags(input_id, outs[i]))

        print('==========================')

predict()

[CLS]故意合則胡越爲昆弟，由余、越人蒙是矣；不合，則骨肉出逐不收，朱、象、管、蔡是矣。[SEP]
Label: [CLS]25··········由1余3·越1人2蒙3··············朱4·象4·管4·蔡4···[SEP]25
Out: [CLS]25··········由1余3·越1人2蒙3··············朱4·象4·管4·蔡4···[SEP]25
[CLS]攻齊所以大破者，以其伐楚而肥韓、魏也。[SEP]
Label: [CLS]25·齊20·········楚20··韓20·魏20··[SEP]25
Out: [CLS]25·齊20·········楚20··韓20·魏20··[SEP]25


In [38]:
# Quantitative check
def test(model_path='../model/NER_crf_C.model', max_batches=5):
    """Print token accuracy over the first ``max_batches`` shuffled test batches.

    Reads the module-level ``text_file`` / ``label_file``. For every batch the
    per-batch counts are printed; at the end the overall accuracy and the
    accuracy restricted to non-zero gold labels are printed.

    Args:
        model_path: Pickled model to load.
        max_batches: Number of batches to evaluate; ``None`` evaluates every
            batch the loader yields. Defaults to 5.
    """
    # weights_only=False unpickles arbitrary objects: only load trusted files.
    model_load = torch.load(model_path, weights_only=False)
    model_load.eval()

    loader_test = torch.utils.data.DataLoader(dataset=TextLabelDataset(text_file, label_file),
                                              batch_size=2,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    correct = 0
    total = 0

    correct_content = 0
    total_content = 0

    for step, (inputs, labels) in enumerate(loader_test):
        if max_batches is not None and step >= max_batches:
            break

        with torch.no_grad():
            outs = model_load(inputs)  # decoded tags: one list of ints per sentence

        counts = get_correct_and_total_count(labels, outs, inputs['attention_mask'])
        print(counts)
        correct += counts[0]
        total += counts[1]
        correct_content += counts[2]
        total_content += counts[3]

    # Guard the ratios the same way train() does, so an empty loader or a
    # sample without non-zero labels does not raise ZeroDivisionError.
    accuracy = correct / total if total > 0 else 0
    accuracy_content = correct_content / total_content if total_content > 0 else 0
    print(accuracy, accuracy_content)

test()

(32, 33, 5, 6)
(57, 57, 11, 11)
(42, 42, 7, 7)
(64, 64, 19, 19)
(35, 35, 9, 9)
0.9956709956709957 0.9807692307692307
