In [22]:
!ls

EvaNer.ipynb  EvaNer.py  NER_ZH.ipynb  data.ipynb


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [23]:
from transformers import AutoTokenizer
import transformers
import torch
from datasets import load_dataset, load_from_disk
from transformers import AutoModel
from transformers import AdamW
import random
import numpy as np
import torch.nn.functional as F
from torch.utils.data import Dataset
from tqdm import tqdm

In [24]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [25]:
def same_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    transformers.set_seed(seed)

same_seeds(7890)

In [26]:
model_path = "../model/GujiRoBERTa_jian_fan"

tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModel.from_pretrained(model_path, local_files_only=True)


# 示例文本
text = "主唱太拼命了"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state
print(last_hidden_states.shape)

Some weights of BertModel were not initialized from the model checkpoint at ../model/GujiRoBERTa_jian_fan and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([1, 8, 768])


In [27]:
class TextLabelDataset(Dataset):
    def __init__(self, text_file, label_file, max_length=128):
        """
            text_file: 文本文件的路径.
            label_file: 标签文件的路径.
            tokenizer_name: 使用的 tokenizer 名称，默认为 'bert-base-chinese'.
            max_length: 最大序列长度，默认 128.
        """
        self.text_file = text_file
        self.label_file = label_file
        self.max_length = max_length
        self.texts, self.labels = self._load_data()
        
        self.dataset = self._filter_long_sentences() # 过滤掉过长的句子

    def _filter_long_sentences(self):
        """过滤掉过长的句子."""
        filtered_texts = []
        filtered_labels = []
        for text, label in zip(self.texts, self.labels):
            if len(text) <= self.max_length:
                filtered_texts.append(text)
                filtered_labels.append(label)

        return list(zip(filtered_texts,filtered_labels))

    def _load_data(self):
        """
        加载文本和标签数据。返回包含文本列表和标签列表的元组.
        """
        texts = []
        labels = []
        with open(self.text_file, 'r', encoding='utf-8') as f_text, \
                open(self.label_file, 'r', encoding='utf-8') as f_label:
            for text, label in zip(f_text, f_label):
                texts.append(list(text.strip()))
                labels.append(eval(label.strip()))  # 使用 eval 将字符串转换为 list
        return texts, labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens, labels = self.dataset[idx]
        return tokens, labels


text_file = '../data/text_A.txt'  # 文本文件的路径
label_file = '../data/label_A.txt'  # 标签文件的路径
dataset = TextLabelDataset(text_file, label_file)
tokens, labels = dataset[5]

len(dataset), tokens, labels

(1491,
 ['今',
  '楚',
  '地',
  '方',
  '五',
  '千',
  '里',
  '，',
  '持',
  '戟',
  '百',
  '萬',
  '，',
  '此',
  '霸',
  '王',
  '之',
  '資',
  '也',
  '。',
  '以',
  '楚',
  '之',
  '彊',
  '，',
  '天',
  '下',
  '弗',
  '能',
  '當',
  '。',
  '白',
  '起',
  '，',
  '小',
  '豎',
  '子',
  '耳',
  '，',
  '率',
  '數',
  '萬',
  '之',
  '衆',
  '，',
  '興',
  '師',
  '以',
  '與',
  '楚',
  '戰',
  '，',
  '一',
  '戰',
  '而',
  '舉',
  '鄢',
  '郢',
  '，',
  '再',
  '戰',
  '而',
  '燒',
  '夷',
  '陵',
  '，',
  '三',
  '戰',
  '而',
  '辱',
  '王',
  '之',
  '先',
  '人',
  '。',
  '此',
  '百',
  '世',
  '之',
  '怨',
  '而',
  '趙',
  '之',
  '所',
  '羞',
  '，',
  '而',
  '王',
  '弗',
  '知',
  '惡',
  '焉',
  '。',
  '合',
  '從',
  '者',
  '爲',
  '楚',
  '，',
  '非',
  '爲',
  '趙',
  '也',
  '。'],
 [24,
  20,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  20,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  1,
  3,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  20,
  0,
  0,


In [28]:
def collate_fn(data):
    tokens = [i[0] for i in data]
    labels = [i[1] for i in data]

    inputs = tokenizer.batch_encode_plus(tokens,
                                         truncation=True,
                                         padding=True,
                                         return_tensors='pt',
                                         is_split_into_words=True) 

    lens = inputs['input_ids'].shape[1]
    # print(lens)

    for i in range(len(labels)):
        labels[i] = [25] + labels[i]
        labels[i] += [25] * lens
        labels[i] = labels[i][:lens]

    return inputs.to(device), torch.LongTensor(labels).to(device)  # 将输入和标签都移动到设备上
    # return inputs, torch.LongTensor(labels)


loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=2,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

i = 0
for data in loader:
    i += 1
    if i == 4:
        print(data)
        break

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


({'input_ids': tensor([[  101,  4264,  1724,  7386,  8024,  1724,   758,  2541,   722,  8024,
           679,  5543,  1139,   511,  1071,  2200,  6725,  6635,  2886,  1139,
         23186,  1293,  5632,  3011,  2782,  8024,  4912,  6725,  2198,  3669,
          6635,  2886,   511,  2886,  6725,  3134,  8024,  1293,  1724,  1282,
          5857,   782,  7360,  3636,  2128,  1409,   511,  3636,  2128,  1409,
          6243,  3288,  8038,   519,  1184,  4912,  2347,  2869,   677,  7955,
          8024,   677,  7955,  3696,   679,  3556,  4264,  4912,  5445,  3645,
          6635,   511,  6635,  1293,  1353,  6208,   511,  7478,  4674,  3669,
           722,  8024,  2607,  4264,   748,   511,   520,   718,  2925,  6266,
          5445,  4674, 23621,  3669,   722,  8024,  6909,  1071,  2207,  5442,
           753,  4636,  1724,  1282,   782,  3645,  6635,   511,  1184,  2527,
          3170,  7674,  5996,  1724,  1282,   758,  5857,   782,   511,  6635,
           782,  1920,  7448,   511, 

In [31]:
#加载预训练模型
model_path = "../model/GujiRoBERTa_jian_fan"
pretrained = AutoModel.from_pretrained(model_path, local_files_only=True).to(device)
# pretrained = AutoModel.from_pretrained(model_path, local_files_only=True)


#统计参数量
print(sum(i.numel() for i in pretrained.parameters()) / 10000)

Some weights of BertModel were not initialized from the model checkpoint at ../model/GujiRoBERTa_jian_fan and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


11269.7856


In [32]:
#定义下游模型
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.tuneing = False
        self.pretrained = None

        # self.rnn = torch.nn.GRU(768, 768, batch_first=True)
        self.fc1 = torch.nn.Linear(768, 512)
        self.fc2 = torch.nn.Linear(512, 26)

    def forward(self, inputs):
        if self.tuneing:
            out = self.pretrained(**inputs).last_hidden_state
        else:
            with torch.no_grad():
                out = pretrained(**inputs).last_hidden_state

        # out, _ = self.rnn(out)

        out = self.fc1(out)
        out = F.softmax(self.fc2(out), dim=2)

        return out

    def fine_tuneing(self, tuneing):
        self.tuneing = tuneing
        if tuneing:
            for i in pretrained.parameters():
                i.requires_grad = True

            pretrained.train()
            self.pretrained = pretrained
        else:
            for i in pretrained.parameters():
                i.requires_grad_(False)

            pretrained.eval()
            self.pretrained = None


model = Model().to(device)
# model = Model()

In [33]:
#对计算结果和label变形,并且移除pad
def reshape_and_remove_pad(outs, labels, attention_mask):
    #变形,便于计算loss
    outs = outs.reshape(-1, 26)
    labels = labels.reshape(-1)

    #忽略对pad的计算结果
    select = attention_mask.reshape(-1) == 1
    outs = outs[select]
    labels = labels[select]

    return outs, labels


reshape_and_remove_pad(torch.randn(2, 3, 26), torch.ones(2, 3),
                       torch.ones(2, 3))

(tensor([[-0.8420,  0.5070,  0.1182,  1.1984,  1.0579, -1.2031, -0.7358,  0.3194,
           0.3987, -0.4950, -0.2447,  0.7975, -1.0523, -1.7899, -1.3929,  0.3385,
          -0.8516,  1.0509, -0.1571, -0.2533,  1.4901, -0.0327, -1.2014, -0.7821,
           0.2197,  0.2431],
         [-2.6020, -0.8985, -0.5458, -0.6226, -0.5878, -2.0267,  0.0862, -0.2964,
           0.9675,  0.2973,  0.6631, -1.2984,  0.5594, -0.1245, -0.4351, -1.3926,
           0.6670, -0.3480, -0.7823,  0.1771,  0.0228, -1.1189,  1.6513, -1.4321,
          -0.6285,  0.0367],
         [ 1.1671, -0.1989,  2.2370,  0.4147,  0.2827, -0.2688,  0.9631, -0.6835,
           0.4847,  1.1674,  0.1868, -0.0167, -0.3660, -0.0143, -0.4304,  1.2072,
          -1.5384,  1.9352, -0.0049,  1.3987,  0.5744,  1.7467,  1.5225, -1.4022,
           0.8777,  0.9350],
         [ 0.8227,  1.6903, -1.1558,  0.1568, -1.6183,  1.3930,  0.3279, -0.1916,
          -0.0409,  0.6019,  0.3101,  0.8075, -0.3037,  0.5119,  1.1302, -0.4810,
          -

In [34]:
#获取正确数量和总数
def get_correct_and_total_count(labels, outs):
    outs = outs.argmax(dim=1)
    correct = (outs == labels).sum().item()
    total = len(labels)

    #计算除了0以外元素的正确率
    select = (labels != 0)
    outs = outs[select]
    labels = labels[select]
    correct_content = (outs == labels).sum().item()
    total_content = len(labels)

    return correct, total, correct_content, total_content


get_correct_and_total_count(torch.ones(16), torch.randn(16, 26))

(0, 16, 0, 16)

In [35]:
# #训练
# def train(epochs):
#     # lr = 2e-5 if model.tuneing else 5e-4
#     lr = 1e-5 if model.tuneing else 1e-4    
#     optimizer = AdamW(model.parameters(), lr=lr)
#     # optimizer = torch.optim.Adam(model.parameters(), lr=lr)

#     criterion = torch.nn.CrossEntropyLoss()

#     model.train()
#     for epoch in range(epochs):
#         for step, (inputs, labels) in enumerate(loader):
#             outs = model(inputs)
            
#             #对outs和label变形,并且移除pad
#             outs, labels = reshape_and_remove_pad(outs, labels,
#                                                 inputs['attention_mask'])

#             #梯度下降
#             loss = criterion(outs, labels)
#             loss.backward()
#             optimizer.step()
#             optimizer.zero_grad()

#             if step % 50 == 0:
#                 counts = get_correct_and_total_count(labels, outs)

#                 accuracy = counts[0] / counts[1]
#                 accuracy_content = counts[2] / counts[3]

#                 print(epoch, step, loss.item(), accuracy, accuracy_content)

#         torch.save(model, '../model/NER_ZH.model')

In [36]:
#训练
def train(epochs):
    lr = 1e-6 if model.tuneing else 1e-5
    # lr = 1e-5 if model.tuneing else 1e-4

    optimizer = AdamW(model.parameters(), lr=lr)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # criterion = torch.nn.CrossEntropyLoss()

    # 计算每个类别的样本数量
    label_counts = [0] * 26  # 有26个类别
    for _, labels in dataset:
        for label in labels:
            if label != 25:
              label_counts[label] += 1

    # 计算权重，做倒数
    weights = [1.0 / count if count > 0 else 0 for count in label_counts]
    weights = torch.tensor(weights).to(device)

    criterion = torch.nn.CrossEntropyLoss(weight=weights, ignore_index=25) 


    model.train()
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        progress_bar = tqdm(loader, desc="Training", unit="batch")

        for step, (inputs, labels) in enumerate(progress_bar):

            # 将输入移动到设备
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)
            
            outs = model(inputs)

            # 对outs和label变形,并且移除pad
            outs, labels = reshape_and_remove_pad(outs, labels,
                                                inputs['attention_mask'])

            # 梯度下降
            loss = criterion(outs, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            if step % 500 == 0:
                counts = get_correct_and_total_count(labels, outs)
                accuracy = counts[0] / counts[1] if counts[1] > 0 else 0
                accuracy_content = counts[2] / counts[3] if counts[3] > 0 else 0

                progress_bar.set_postfix({
                    "loss": f"{loss.item():.4f}",
                    "accuracy": f"{accuracy:.4f}",
                    # "accuracy_content": f"{accuracy_content:.4f}",
                    "accuracy_content": f"{accuracy_content}",
                })
        
        torch.save(model, '../model/NER_ZH.model')


In [37]:
model.fine_tuneing(False)
print(sum(p.numel() for p in model.parameters()) / 10000)
train(20)

40.7066
Epoch 1/20


Training: 100%|██████████| 745/745 [00:07<00:00, 96.48batch/s, loss=3.1929, accuracy=0.0315, accuracy_content=0.05970149253731343] 


Epoch 2/20


Training: 100%|██████████| 745/745 [00:07<00:00, 100.61batch/s, loss=2.9568, accuracy=0.6926, accuracy_content=0.5116279069767442]


Epoch 3/20


Training: 100%|██████████| 745/745 [00:07<00:00, 100.73batch/s, loss=2.5486, accuracy=0.8607, accuracy_content=0.7407407407407407]


Epoch 4/20


Training: 100%|██████████| 745/745 [00:07<00:00, 104.64batch/s, loss=3.1126, accuracy=0.8008, accuracy_content=0.6388888888888888]


Epoch 5/20


Training: 100%|██████████| 745/745 [00:07<00:00, 105.17batch/s, loss=2.8768, accuracy=0.9148, accuracy_content=0.78]


Epoch 6/20


Training: 100%|██████████| 745/745 [00:07<00:00, 105.65batch/s, loss=3.1792, accuracy=0.8634, accuracy_content=0.7931034482758621]


Epoch 7/20


Training: 100%|██████████| 745/745 [00:07<00:00, 106.15batch/s, loss=2.4366, accuracy=0.8738, accuracy_content=0.8076923076923077]


Epoch 8/20


Training: 100%|██████████| 745/745 [00:07<00:00, 104.38batch/s, loss=2.3851, accuracy=0.8578, accuracy_content=0.75]              


Epoch 9/20


Training: 100%|██████████| 745/745 [00:07<00:00, 103.45batch/s, loss=2.3694, accuracy=0.8340, accuracy_content=0.8767123287671232]


Epoch 10/20


Training: 100%|██████████| 745/745 [00:07<00:00, 101.94batch/s, loss=2.3964, accuracy=0.8964, accuracy_content=0.75]              


Epoch 11/20


Training: 100%|██████████| 745/745 [00:07<00:00, 101.78batch/s, loss=2.4088, accuracy=0.8904, accuracy_content=0.7666666666666667]


Epoch 12/20


Training: 100%|██████████| 745/745 [00:07<00:00, 100.90batch/s, loss=2.4235, accuracy=0.8732, accuracy_content=0.9]               


Epoch 13/20


Training: 100%|██████████| 745/745 [00:07<00:00, 100.18batch/s, loss=2.4329, accuracy=0.8863, accuracy_content=0.8461538461538461]


Epoch 14/20


Training: 100%|██████████| 745/745 [00:07<00:00, 103.87batch/s, loss=2.9707, accuracy=0.7966, accuracy_content=0.7222222222222222]


Epoch 15/20


Training: 100%|██████████| 745/745 [00:07<00:00, 105.67batch/s, loss=2.6050, accuracy=0.8590, accuracy_content=0.8055555555555556]


Epoch 16/20


Training: 100%|██████████| 745/745 [00:07<00:00, 106.06batch/s, loss=2.3642, accuracy=0.9204, accuracy_content=0.7777777777777778]


Epoch 17/20


Training: 100%|██████████| 745/745 [00:07<00:00, 100.67batch/s, loss=2.4701, accuracy=0.8654, accuracy_content=0.8095238095238095]


Epoch 18/20


Training: 100%|██████████| 745/745 [00:07<00:00, 101.41batch/s, loss=2.4000, accuracy=0.9000, accuracy_content=0.775]             


Epoch 19/20


Training: 100%|██████████| 745/745 [00:07<00:00, 100.87batch/s, loss=2.3403, accuracy=0.8962, accuracy_content=0.9166666666666666]


Epoch 20/20


Training: 100%|██████████| 745/745 [00:07<00:00, 101.14batch/s, loss=2.7293, accuracy=0.8814, accuracy_content=0.8360655737704918]


In [38]:
model.fine_tuneing(True)
print(sum(p.numel() for p in model.parameters()) / 10000)
train(5)

11310.4922
Epoch 1/5


Training: 100%|██████████| 745/745 [00:30<00:00, 24.73batch/s, loss=2.3683, accuracy=0.8927, accuracy_content=0.8979591836734694]


Epoch 2/5


Training: 100%|██████████| 745/745 [00:30<00:00, 24.72batch/s, loss=2.4521, accuracy=0.8726, accuracy_content=0.8055555555555556]


Epoch 3/5


Training: 100%|██████████| 745/745 [00:30<00:00, 24.67batch/s, loss=2.3713, accuracy=0.8719, accuracy_content=0.8157894736842105]


Epoch 4/5


Training: 100%|██████████| 745/745 [00:30<00:00, 24.55batch/s, loss=2.4028, accuracy=0.9136, accuracy_content=0.8571428571428571]


Epoch 5/5


Training: 100%|██████████| 745/745 [00:30<00:00, 24.37batch/s, loss=2.3739, accuracy=0.9058, accuracy_content=0.75]              


In [39]:
text_file = '../data/text_A_test.txt'
label_file = '../data/label_A_test.txt'

#测试
def predict():
    model_load = torch.load('../model/NER_ZH.model')
    model_load.eval()

    loader_test = torch.utils.data.DataLoader(dataset=TextLabelDataset(text_file, label_file),
                                              batch_size=2,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    for i, (inputs, labels) in enumerate(loader_test):
        break

    with torch.no_grad():
        outs = model_load(inputs).argmax(dim=2)

    for i in range(2):
        #移除pad
        select = inputs['attention_mask'][i] == 1
        input_id = inputs['input_ids'][i, select]
        out = outs[i, select]
        label = labels[i, select]
        
        #输出原句子
        print(tokenizer.decode(input_id).replace(' ', ''))

        #输出tag
        for tag in [label, out]:
            s = ''
            for j in range(len(tag)):
                if tag[j] == 0:
                    s += '·'
                    continue
                s += tokenizer.decode(input_id[j])
                s += str(tag[j].item())

            print(s)
        print('==========================')


predict()

  model_load = torch.load('../model/NER_ZH.model')


[CLS]所以知循病者，切其脈時，右口氣急，脈無五藏氣，右口脈大而數。數者中下熱而湧，左爲下，右爲上，皆無五藏應，故曰湧疝。中熱，故溺赤也。齊中御府長信病，臣意入診其脈，告曰：「熱病氣也。然暑汗，脈少衰，不死。」曰：「此病得之當浴流水而寒甚，已則熱。[SEP]
[CLS]25···循4·····························································齊20中13御14府14長15信4···意4·············································[SEP]25
··································································齊20中13御14府15長1信3··臣1意4···············暑24······························
[CLS]故意合則胡越爲昆弟，由余、越人蒙是矣；不合，則骨肉出逐不收，朱、象、管、蔡是矣。今人主誠能用齊、秦之義，後宋、魯之聽，則五伯不足稱，三王易爲也。是以聖王覺寤，捐子之之心，而能不説於田常之賢；封比干之後，修孕婦之墓，故功業復就於天下。何則？欲善無厭也。[SEP]
[CLS]25··········由1余3·越1人2蒙3··············朱4·象4·管4·蔡4···今24·····齊20·秦20····宋20·魯20························子1之3········田1常3····比1干3···························[SEP]25
······越20····由1余3·越8·蒙4··············朱4·象4·管4·蔡4···今24·····齊20·秦20····宋20·魯20····五1伯3····三21王3······聖1王3····子1之3········田1常3····比1干3····孕1婦3······················


In [43]:
#测试
def test():
    model_load = torch.load('../model/NER_ZH.model')
    model_load.eval()

    loader_test = torch.utils.data.DataLoader(dataset=TextLabelDataset(text_file, label_file),
                                              batch_size=2,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    correct = 0
    total = 0

    correct_content = 0
    total_content = 0

    for step, (inputs, labels) in enumerate(loader_test):
        if step == 5:
            break
        # print(step)

        with torch.no_grad():
            outs = model_load(inputs)
            # outs = torch.zeros(model_load(inputs).shape).to(device)

        #对outs和label变形,并且移除pad
        outs, labels = reshape_and_remove_pad(outs, labels,
                                              inputs['attention_mask'])

        counts = get_correct_and_total_count(labels, outs)
        print(counts)
        correct += counts[0]
        total += counts[1]
        correct_content += counts[2]
        total_content += counts[3]

    print(correct / total, correct_content / total_content)


test()

  model_load = torch.load('../model/NER_ZH.model')


(202, 238, 50, 56)
(196, 215, 53, 59)
(216, 236, 18, 24)
(225, 238, 40, 45)
(188, 207, 36, 44)
0.9056437389770723 0.8640350877192983
