In [9]:
!ls

EvaNer.ipynb  EvaNer.py  data.ipynb


In [10]:
!pip install pytorch-crf



In [1]:
from transformers import AutoTokenizer
import transformers
import torch
from datasets import load_dataset, load_from_disk
from transformers import AutoModel
from transformers import AdamW
import random
import numpy as np
import torch.nn.functional as F
from torch.utils.data import Dataset
from tqdm import tqdm
from torchcrf import CRF  # 引入 CRF

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [3]:
def same_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    transformers.set_seed(seed)

same_seeds(7890)

In [4]:
model_path = "../model/GujiRoBERTa_jian_fan"

tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModel.from_pretrained(model_path, local_files_only=True)


# 示例文本
text = "主唱太拼命了"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state
print(last_hidden_states.shape)

Some weights of BertModel were not initialized from the model checkpoint at ../model/GujiRoBERTa_jian_fan and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([1, 8, 768])


In [5]:
class TextLabelDataset(Dataset):
    def __init__(self, text_file, label_file, max_length=128):
        """
            text_file: 文本文件的路径.
            label_file: 标签文件的路径.
            tokenizer_name: 使用的 tokenizer 名称，默认为 'bert-base-chinese'.
            max_length: 最大序列长度，默认 128.
        """
        self.text_file = text_file
        self.label_file = label_file
        self.max_length = max_length
        self.texts, self.labels = self._load_data()
        
        self.dataset = self._filter_long_sentences() # 过滤掉过长的句子

    def _filter_long_sentences(self):
        """过滤掉过长的句子."""
        filtered_texts = []
        filtered_labels = []
        for text, label in zip(self.texts, self.labels):
            if len(text) <= self.max_length:
                filtered_texts.append(text)
                filtered_labels.append(label)

        return list(zip(filtered_texts,filtered_labels))

    def _load_data(self):
        """
        加载文本和标签数据。返回包含文本列表和标签列表的元组.
        """
        texts = []
        labels = []
        with open(self.text_file, 'r', encoding='utf-8') as f_text, \
                open(self.label_file, 'r', encoding='utf-8') as f_label:
            for text, label in zip(f_text, f_label):
                texts.append(list(text.strip()))
                labels.append(eval(label.strip()))  # 使用 eval 将字符串转换为 list
        return texts, labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens, labels = self.dataset[idx]
        return tokens, labels


text_file = '../data/text_A.txt'  # 文本文件的路径
label_file = '../data/label_A.txt'  # 标签文件的路径
dataset = TextLabelDataset(text_file, label_file)
tokens, labels = dataset[5]

len(dataset), tokens, labels

(1491,
 ['今',
  '楚',
  '地',
  '方',
  '五',
  '千',
  '里',
  '，',
  '持',
  '戟',
  '百',
  '萬',
  '，',
  '此',
  '霸',
  '王',
  '之',
  '資',
  '也',
  '。',
  '以',
  '楚',
  '之',
  '彊',
  '，',
  '天',
  '下',
  '弗',
  '能',
  '當',
  '。',
  '白',
  '起',
  '，',
  '小',
  '豎',
  '子',
  '耳',
  '，',
  '率',
  '數',
  '萬',
  '之',
  '衆',
  '，',
  '興',
  '師',
  '以',
  '與',
  '楚',
  '戰',
  '，',
  '一',
  '戰',
  '而',
  '舉',
  '鄢',
  '郢',
  '，',
  '再',
  '戰',
  '而',
  '燒',
  '夷',
  '陵',
  '，',
  '三',
  '戰',
  '而',
  '辱',
  '王',
  '之',
  '先',
  '人',
  '。',
  '此',
  '百',
  '世',
  '之',
  '怨',
  '而',
  '趙',
  '之',
  '所',
  '羞',
  '，',
  '而',
  '王',
  '弗',
  '知',
  '惡',
  '焉',
  '。',
  '合',
  '從',
  '者',
  '爲',
  '楚',
  '，',
  '非',
  '爲',
  '趙',
  '也',
  '。'],
 [24,
  20,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  20,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  20,
  0,
  0,


In [6]:
def collate_fn(data):
    tokens = [i[0] for i in data]
    labels = [i[1] for i in data]

    inputs = tokenizer.batch_encode_plus(tokens,
                                         truncation=True,
                                         padding=True,
                                         return_tensors='pt',
                                         is_split_into_words=True) 

    lens = inputs['input_ids'].shape[1]
    # print(lens)

    for i in range(len(labels)):
        labels[i] = [25] + labels[i]
        labels[i] += [25] * lens
        labels[i] = labels[i][:lens]

    return inputs.to(device), torch.LongTensor(labels).to(device)  # 将输入和标签都移动到设备上
    # return inputs, torch.LongTensor(labels)


loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=2,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

i = 0
for data in loader:
    i += 1
    if i == 4:
        print(data)
        break

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


({'input_ids': tensor([[  101,  4264,  1724,  7386,  8024,  1724,   758,  2541,   722,  8024,
           679,  5543,  1139,   511,  1071,  2200,  6725,  6635,  2886,  1139,
         23186,  1293,  5632,  3011,  2782,  8024,  4912,  6725,  2198,  3669,
          6635,  2886,   511,  2886,  6725,  3134,  8024,  1293,  1724,  1282,
          5857,   782,  7360,  3636,  2128,  1409,   511,  3636,  2128,  1409,
          6243,  3288,  8038,   519,  1184,  4912,  2347,  2869,   677,  7955,
          8024,   677,  7955,  3696,   679,  3556,  4264,  4912,  5445,  3645,
          6635,   511,  6635,  1293,  1353,  6208,   511,  7478,  4674,  3669,
           722,  8024,  2607,  4264,   748,   511,   520,   718,  2925,  6266,
          5445,  4674, 23621,  3669,   722,  8024,  6909,  1071,  2207,  5442,
           753,  4636,  1724,  1282,   782,  3645,  6635,   511,  1184,  2527,
          3170,  7674,  5996,  1724,  1282,   758,  5857,   782,   511,  6635,
           782,  1920,  7448,   511, 

In [9]:
#加载预训练模型
model_path = "../model/GujiRoBERTa_jian_fan"
pretrained = AutoModel.from_pretrained(model_path, local_files_only=True).to(device)
# pretrained = AutoModel.from_pretrained(model_path, local_files_only=True)


#统计参数量
print(sum(i.numel() for i in pretrained.parameters()) / 10000)

Some weights of BertModel were not initialized from the model checkpoint at ../model/GujiRoBERTa_jian_fan and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


11269.7856


In [10]:
#定义下游模型
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.tuneing = False
        self.pretrained = None

        # self.rnn = torch.nn.GRU(768, 768, batch_first=True)
        self.fc1 = torch.nn.Linear(768, 512)
        self.fc2 = torch.nn.Linear(512, 26)
        self.crf = CRF(26, batch_first=True)  # 添加 CRF 层

    def forward(self, inputs, labels=None): # 修改 forward 函数
        if self.tuneing:
            out = self.pretrained(**inputs).last_hidden_state
        else:
            with torch.no_grad():
                out = pretrained(**inputs).last_hidden_state

        # out, _ = self.rnn(out)

        out = self.fc1(out)
        out = self.fc2(out)

        if labels is not None:
            # 如果提供了 labels，则计算 CRF loss
            mask = inputs['attention_mask'].bool()
            loss = -self.crf(out, labels, mask=mask, reduction='mean')
            return loss
        else:
            # 否则，使用 CRF 进行解码
            mask = inputs['attention_mask'].bool()
            prediction = self.crf.decode(out, mask=mask)
            return prediction

    def fine_tuneing(self, tuneing):
        self.tuneing = tuneing
        if tuneing:
            for i in pretrained.parameters():
                i.requires_grad = True

            pretrained.train()
            self.pretrained = pretrained
        else:
            for i in pretrained.parameters():
                i.requires_grad_(False)

            pretrained.eval()
            self.pretrained = None


model = Model().to(device)
# model = Model()

In [11]:
# #对计算结果和label变形,并且移除pad
# def reshape_and_remove_pad(outs, labels, attention_mask):
#     #变形,便于计算loss
#     outs = outs.reshape(-1, 26)
#     labels = labels.reshape(-1)

#     #忽略对pad的计算结果
#     select = attention_mask.reshape(-1) == 1
#     outs = outs[select]
#     labels = labels[select]

#     return outs, labels


# reshape_and_remove_pad(torch.randn(2, 3, 26), torch.ones(2, 3),
#                        torch.ones(2, 3))

In [12]:
def get_correct_and_total_count(labels, outs, attention_mask):
    # 将预测结果和标签都转换为一维列表
    active_outs = [item for sublist in outs for item in sublist]  # outs 本身已经是 list of lists，直接展平
    active_labels = []
    
    # 遍历每个样本的标签和对应的 attention_mask
    for label_seq, mask_seq in zip(labels, attention_mask):
        for label, mask in zip(label_seq, mask_seq):
            if mask:  # 只保留 attention_mask 中为 True 的部分
                active_labels.append(label.item())

    # 确保 active_outs 和 active_labels 都是 list
    active_outs = [int(item) for item in active_outs]
    active_labels = [int(item) for item in active_labels]

    # 转换成 tensor
    active_outs = torch.tensor(active_outs).to(device)
    active_labels = torch.tensor(active_labels).to(device)

    correct = (active_outs == active_labels).sum().item()
    total = len(active_labels)

    # 计算除了0以外元素的正确率
    select = (active_labels != 0)
    active_outs = active_outs[select]
    active_labels = active_labels[select]
    correct_content = (active_outs == active_labels).sum().item()
    total_content = len(active_labels)

    return correct, total, correct_content, total_content

In [13]:
# #训练
# def train(epochs):
#     # lr = 2e-5 if model.tuneing else 5e-4
#     lr = 1e-5 if model.tuneing else 1e-4    
#     optimizer = AdamW(model.parameters(), lr=lr)
#     # optimizer = torch.optim.Adam(model.parameters(), lr=lr)

#     criterion = torch.nn.CrossEntropyLoss()

#     model.train()
#     for epoch in range(epochs):
#         for step, (inputs, labels) in enumerate(loader):
#             outs = model(inputs)
            
#             #对outs和label变形,并且移除pad
#             outs, labels = reshape_and_remove_pad(outs, labels,
#                                                 inputs['attention_mask'])

#             #梯度下降
#             loss = criterion(outs, labels)
#             loss.backward()
#             optimizer.step()
#             optimizer.zero_grad()

#             if step % 50 == 0:
#                 counts = get_correct_and_total_count(labels, outs)

#                 accuracy = counts[0] / counts[1]
#                 accuracy_content = counts[2] / counts[3]

#                 print(epoch, step, loss.item(), accuracy, accuracy_content)

#         torch.save(model, '../model/NER_ZH.model')

In [14]:
def train(epochs):
    lr = 2e-5 if model.tuneing else 5e-4
    # lr = 1e-5 if model.tuneing else 1e-4

    optimizer = AdamW(model.parameters(), lr=lr)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # # 计算每个类别的样本数量
    # label_counts = [0] * 26  # 有26个类别
    # for _, labels in dataset:
    #     for label in labels:
    #         if label != 25:
    #           label_counts[label] += 1

    # # 计算权重，做倒数
    # weights = [1.0 / count if count > 0 else 0 for count in label_counts]
    # weights = torch.tensor(weights).to(device)

    # criterion = torch.nn.CrossEntropyLoss(weight=weights, ignore_index=25) 
    
    model.train()
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        progress_bar = tqdm(loader, desc="Training", unit="batch")

        for step, (inputs, labels) in enumerate(progress_bar):

            # 将输入移动到设备
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)
            
            # 梯度下降
            loss = model(inputs, labels) # 直接用 model 计算 loss
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            if step % 100 == 0:
                with torch.no_grad():
                  outs = model(inputs) # 得到预测结果
                  counts = get_correct_and_total_count(labels, outs, inputs['attention_mask'])
                  accuracy = counts[0] / counts[1] if counts[1] > 0 else 0
                  accuracy_content = counts[2] / counts[3] if counts[3] > 0 else 0

                progress_bar.set_postfix({
                    "loss": f"{loss.item():.4f}",
                    "accuracy": f"{accuracy:.4f}",
                    # "accuracy_content": f"{accuracy_content:.4f}",
                    "accuracy_content": f"{accuracy_content}",
                })
        
        torch.save(model, '../model/NER_ZH.model')

In [15]:
model.fine_tuneing(False)
print(sum(p.numel() for p in model.parameters()) / 10000)
train(4)



40.7794
Epoch 1/4


Training: 100%|██████████| 745/745 [01:10<00:00, 10.63batch/s, loss=17.0045, accuracy=0.9167, accuracy_content=0.8]               


Epoch 2/4


Training: 100%|██████████| 745/745 [01:07<00:00, 11.02batch/s, loss=3.0452, accuracy=0.9957, accuracy_content=0.9705882352941176] 


Epoch 3/4


Training: 100%|██████████| 745/745 [01:10<00:00, 10.62batch/s, loss=5.9724, accuracy=0.9877, accuracy_content=0.9583333333333334] 


Epoch 4/4


Training: 100%|██████████| 745/745 [01:11<00:00, 10.38batch/s, loss=2.5367, accuracy=0.9837, accuracy_content=0.9333333333333333] 


In [16]:
model.fine_tuneing(True)
print(sum(p.numel() for p in model.parameters()) / 10000)
train(1)

11310.565
Epoch 1/1


Training: 100%|██████████| 745/745 [01:29<00:00,  8.32batch/s, loss=1.6133, accuracy=0.9955, accuracy_content=0.9861111111111112] 


In [20]:
text_file = '../data/text_A_test.txt'
label_file = '../data/label_A_test.txt'

#测试
def predict():
    model_load = torch.load('../model/NER_ZH.model')
    model_load.eval()

    loader_test = torch.utils.data.DataLoader(dataset=TextLabelDataset(text_file, label_file),
                                              batch_size=2,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    for i, (inputs, labels) in enumerate(loader_test):
        break

    with torch.no_grad():
        outs = model_load(inputs) # outs 直接是解码后的结果，是一个 list of lists

    for i in range(2):
        # 移除 pad
        select = inputs['attention_mask'][i] == 1
        input_id = inputs['input_ids'][i, select]
        out = outs[i]  # 直接使用 outs[i] 获取当前样本的预测标签序列
        label = labels[i, select]

        # 输出原句子
        print(tokenizer.decode(input_id).replace(' ', ''))

        # 输出 tag
        # 标签
        s = ''
        for j in range(len(label)):
            if label[j] == 0:
                s += '·'
                continue
            s += tokenizer.decode(input_id[j])
            s += str(label[j].item())
        print("Label:", s)

        # 预测
        s = ''
        for j in range(len(out)):
            if out[j] == 0:
                s += '·'
                continue
            s += tokenizer.decode(input_id[j])
            s += str(out[j])
        print("Out:", s)

        print('==========================')

predict()

  model_load = torch.load('../model/NER_ZH.model')


[CLS]法不當砭灸，砭灸至氣逐。」問臣意：「師慶安受之？聞於齊諸侯不？」對曰：「不知慶所師受。慶家富，善爲醫，不肯爲人治病，當以此故不聞。慶又告臣意曰：『慎毋令我子孫知若學我方也。』」問臣意：「師慶何見於意而愛意，欲悉教意方？」對曰：「臣意不聞師慶爲方善也。[SEP]
Label: [CLS]25···············意4···慶4······齊20···········慶4····慶4·····················慶4···意4····················意4···慶4···意4··意4····意4········意4···慶4·····[SEP]25
Out: [CLS]25···············意4···慶4······齊20···········慶4····慶4·····················慶4···意4····················意4···慶4···意4··意4····意4········意4···慶4·····[SEP]25
[CLS]今青臣等又面諛以重陛下過，非忠臣也。」始皇下其議丞相。丞相謬其説，絀其辭，乃上書曰：「古者天下散亂，莫能相一，是以諸侯並作，語皆道古以害今，飾虚言以亂實，人善其所私學，以非上所建立。[SEP]
Label: [CLS]25今24青4·················始1皇3···丞13相15·丞13相15······························································[SEP]25
Out: [CLS]25今24青1臣3················始1皇3···丞13相15·丞13相15··············古21者23··············································[SEP]25


In [21]:
#测试
def test():
    model_load = torch.load('../model/NER_ZH.model')
    model_load.eval()

    loader_test = torch.utils.data.DataLoader(dataset=TextLabelDataset(text_file, label_file),
                                              batch_size=2,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    correct = 0
    total = 0

    correct_content = 0
    total_content = 0

    for step, (inputs, labels) in enumerate(loader_test):
        if step == 5:
            break
        # print(step)

        with torch.no_grad():
            outs = model_load(inputs) # outs 直接是解码后的结果，是一个 list of lists

        counts = get_correct_and_total_count(labels, outs, inputs['attention_mask'])
        print(counts)
        correct += counts[0]
        total += counts[1]
        correct_content += counts[2]
        total_content += counts[3]

    print(correct / total, correct_content / total_content)

test()

  model_load = torch.load('../model/NER_ZH.model')


(249, 258, 50, 59)
(241, 244, 56, 56)
(230, 231, 30, 31)
(223, 227, 74, 75)
(233, 236, 46, 48)
0.9832775919732442 0.9516728624535316
