In [22]:
!ls

EvaNer.ipynb  EvaNer.py  NER_ZH.ipynb  data.ipynb


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [41]:
from transformers import AutoTokenizer
import transformers
import torch
from datasets import load_dataset, load_from_disk
from transformers import AutoModel
from transformers import AdamW
import random
import numpy as np
import torch.nn.functional as F
from torch.utils.data import Dataset
from tqdm import tqdm

In [42]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda', index=0)

In [43]:
def same_seeds(seed):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(seed)
        torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True
    transformers.set_seed(seed)

same_seeds(7890)

In [44]:
model_path = "../model/GujiRoBERTa_jian_fan"

tokenizer = AutoTokenizer.from_pretrained(model_path, local_files_only=True)
model = AutoModel.from_pretrained(model_path, local_files_only=True)


# 示例文本
text = "主唱太拼命了"
inputs = tokenizer(text, return_tensors="pt")
outputs = model(**inputs)
last_hidden_states = outputs.last_hidden_state
print(last_hidden_states.shape)

Some weights of BertModel were not initialized from the model checkpoint at ../model/GujiRoBERTa_jian_fan and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


torch.Size([1, 8, 768])


In [45]:
class TextLabelDataset(Dataset):
    def __init__(self, text_file, label_file, max_length=128):
        """
            text_file: 文本文件的路径.
            label_file: 标签文件的路径.
            tokenizer_name: 使用的 tokenizer 名称，默认为 'bert-base-chinese'.
            max_length: 最大序列长度，默认 128.
        """
        self.text_file = text_file
        self.label_file = label_file
        self.max_length = max_length
        self.texts, self.labels = self._load_data()
        
        self.dataset = self._filter_long_sentences() # 过滤掉过长的句子

    def _filter_long_sentences(self):
        """过滤掉过长的句子."""
        filtered_texts = []
        filtered_labels = []
        for text, label in zip(self.texts, self.labels):
            if len(text) <= self.max_length:
                filtered_texts.append(text)
                filtered_labels.append(label)

        return list(zip(filtered_texts,filtered_labels))

    def _load_data(self):
        """
        加载文本和标签数据。返回包含文本列表和标签列表的元组.
        """
        texts = []
        labels = []
        with open(self.text_file, 'r', encoding='utf-8') as f_text, \
                open(self.label_file, 'r', encoding='utf-8') as f_label:
            for text, label in zip(f_text, f_label):
                texts.append(list(text.strip()))
                labels.append(eval(label.strip()))  # 使用 eval 将字符串转换为 list
        return texts, labels

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens, labels = self.dataset[idx]
        return tokens, labels


text_file = '../data/text_A.txt'  # 文本文件的路径
label_file = '../data/label_A.txt'  # 标签文件的路径
dataset = TextLabelDataset(text_file, label_file)
tokens, labels = dataset[5]

len(dataset), tokens, labels

(6246,
 ['若',
  '此',
  '二',
  '士',
  '者',
  '，',
  '非',
  '不',
  '能',
  '成',
  '小',
  '廉',
  '而',
  '行',
  '小',
  '節',
  '也',
  '，',
  '以',
  '爲',
  '殺',
  '身',
  '亡',
  '軀',
  '，',
  '絶',
  '世',
  '滅',
  '後',
  '，',
  '功',
  '名',
  '不',
  '立',
  '，',
  '非',
  '智',
  '也',
  '。'],
 [0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0,
  0])

In [46]:
def collate_fn(data):
    tokens = [i[0] for i in data]
    labels = [i[1] for i in data]

    inputs = tokenizer.batch_encode_plus(tokens,
                                         truncation=True,
                                         padding=True,
                                         return_tensors='pt',
                                         is_split_into_words=True) 

    lens = inputs['input_ids'].shape[1]
    # print(lens)

    for i in range(len(labels)):
        labels[i] = [25] + labels[i]
        labels[i] += [25] * lens
        labels[i] = labels[i][:lens]

    return inputs.to(device), torch.LongTensor(labels).to(device)  # 将输入和标签都移动到设备上
    # return inputs, torch.LongTensor(labels)


loader = torch.utils.data.DataLoader(dataset=dataset,
                                     batch_size=2,
                                     collate_fn=collate_fn,
                                     shuffle=True,
                                     drop_last=True)

i = 0
for data in loader:
    i += 1
    if i == 4:
        print(data)
        break

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


({'input_ids': tensor([[  101,  7401,   862,  1728,  6304,  1922,  2153,  3288,  8038,   519,
          4374,   722,   679,  6210,   862,  8024,  2553,   809,  3504,  4264,
         23051,  8024,   809,  4031,  4264,  2483,  8024,  3634,  5628,   722,
          2792,   809,  4264,   886,   511,   102],
        [  101,  3669,   782,  6912,   790,  8024,  5645,  3678,   510,  1992,
          1963,  7968,  8024,   809,  2248,  4264,   752,   511,   102,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [47]:
len([25,  0,  0,  0,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          4,  0,  0,  0,  0,  0,  1,  3,  0,  8,  0,  0,  0,  0,  0,  0,  4,  0,
          0,  0,  0,  0,  4,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
          0,  0, 25])

75

In [48]:
len([  101,   520,   718, 30184,   928,  3288,  8038,   519,  1062,  7478,
          7269,  5442,  8013,   520,  1293,  5632, 28472,   511,   928,  2898,
          1071,  7674,  8024,  6332,  7770,  4862,  3176,  7376,   511,   677,
           808,  3636,  1894,  5236,   928,  8024,  6734,  2527,  6722,   511,
           928,  3288,  8038,   519,  3362,  5735,   782,  6241,  8024,   521,
          4322,  1052,  3647,  8024,  5679,  4318,   773,  8039,  7770,  7852,
          4674,  8024,  5679,  2469,  5966,  8039,  3147,  1751,  4788,  8024,
          6331,  5628,   767,   511,   102])

75

In [49]:
#加载预训练模型
model_path = "../model/GujiRoBERTa_jian_fan"
pretrained = AutoModel.from_pretrained(model_path, local_files_only=True).to(device)
# pretrained = AutoModel.from_pretrained(model_path, local_files_only=True)


#统计参数量
print(sum(i.numel() for i in pretrained.parameters()) / 10000)

Some weights of BertModel were not initialized from the model checkpoint at ../model/GujiRoBERTa_jian_fan and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


11269.7856


In [50]:
#定义下游模型
class Model(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.tuneing = False
        self.pretrained = None

        # self.rnn = torch.nn.GRU(768, 768, batch_first=True)
        self.fc1 = torch.nn.Linear(768, 512)
        self.fc2 = torch.nn.Linear(512, 26)

    def forward(self, inputs):
        if self.tuneing:
            out = self.pretrained(**inputs).last_hidden_state
        else:
            with torch.no_grad():
                out = pretrained(**inputs).last_hidden_state

        # out, _ = self.rnn(out)

        out = self.fc1(out)
        out = F.softmax(self.fc2(out), dim=2)

        return out

    def fine_tuneing(self, tuneing):
        self.tuneing = tuneing
        if tuneing:
            for i in pretrained.parameters():
                i.requires_grad = True

            pretrained.train()
            self.pretrained = pretrained
        else:
            for i in pretrained.parameters():
                i.requires_grad_(False)

            pretrained.eval()
            self.pretrained = None


model = Model().to(device)
# model = Model()

In [51]:
#对计算结果和label变形,并且移除pad
def reshape_and_remove_pad(outs, labels, attention_mask):
    #变形,便于计算loss
    outs = outs.reshape(-1, 26)
    labels = labels.reshape(-1)

    #忽略对pad的计算结果
    select = attention_mask.reshape(-1) == 1
    outs = outs[select]
    labels = labels[select]

    return outs, labels


reshape_and_remove_pad(torch.randn(2, 3, 26), torch.ones(2, 3),
                       torch.ones(2, 3))

(tensor([[-0.8420,  0.5070,  0.1182,  1.1984,  1.0579, -1.2031, -0.7358,  0.3194,
           0.3987, -0.4950, -0.2447,  0.7975, -1.0523, -1.7899, -1.3929,  0.3385,
          -0.8516,  1.0509, -0.1571, -0.2533,  1.4901, -0.0327, -1.2014, -0.7821,
           0.2197,  0.2431],
         [-2.6020, -0.8985, -0.5458, -0.6226, -0.5878, -2.0267,  0.0862, -0.2964,
           0.9675,  0.2973,  0.6631, -1.2984,  0.5594, -0.1245, -0.4351, -1.3926,
           0.6670, -0.3480, -0.7823,  0.1771,  0.0228, -1.1189,  1.6513, -1.4321,
          -0.6285,  0.0367],
         [ 1.1671, -0.1989,  2.2370,  0.4147,  0.2827, -0.2688,  0.9631, -0.6835,
           0.4847,  1.1674,  0.1868, -0.0167, -0.3660, -0.0143, -0.4304,  1.2072,
          -1.5384,  1.9352, -0.0049,  1.3987,  0.5744,  1.7467,  1.5225, -1.4022,
           0.8777,  0.9350],
         [ 0.8227,  1.6903, -1.1558,  0.1568, -1.6183,  1.3930,  0.3279, -0.1916,
          -0.0409,  0.6019,  0.3101,  0.8075, -0.3037,  0.5119,  1.1302, -0.4810,
          -

In [52]:
#获取正确数量和总数
def get_correct_and_total_count(labels, outs):
    outs = outs.argmax(dim=1)
    correct = (outs == labels).sum().item()
    total = len(labels)

    #计算除了0以外元素的正确率
    select = (labels != 0)
    outs = outs[select]
    labels = labels[select]
    correct_content = (outs == labels).sum().item()
    total_content = len(labels)

    return correct, total, correct_content, total_content


get_correct_and_total_count(torch.ones(16), torch.randn(16, 26))

(0, 16, 0, 16)

In [53]:
# #训练
# def train(epochs):
#     # lr = 2e-5 if model.tuneing else 5e-4
#     lr = 1e-5 if model.tuneing else 1e-4    
#     optimizer = AdamW(model.parameters(), lr=lr)
#     # optimizer = torch.optim.Adam(model.parameters(), lr=lr)

#     criterion = torch.nn.CrossEntropyLoss()

#     model.train()
#     for epoch in range(epochs):
#         for step, (inputs, labels) in enumerate(loader):
#             outs = model(inputs)
            
#             #对outs和label变形,并且移除pad
#             outs, labels = reshape_and_remove_pad(outs, labels,
#                                                 inputs['attention_mask'])

#             #梯度下降
#             loss = criterion(outs, labels)
#             loss.backward()
#             optimizer.step()
#             optimizer.zero_grad()

#             if step % 50 == 0:
#                 counts = get_correct_and_total_count(labels, outs)

#                 accuracy = counts[0] / counts[1]
#                 accuracy_content = counts[2] / counts[3]

#                 print(epoch, step, loss.item(), accuracy, accuracy_content)

#         torch.save(model, '../model/NER_ZH.model')

In [54]:
#训练
def train(epochs):
    lr = 1e-6 if model.tuneing else 1e-5
    # lr = 1e-5 if model.tuneing else 1e-4

    optimizer = AdamW(model.parameters(), lr=lr)
    # optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    # criterion = torch.nn.CrossEntropyLoss()

    # 计算每个类别的样本数量
    label_counts = [0] * 26  # 有26个类别
    for _, labels in dataset:
        for label in labels:
            if label != 25:
              label_counts[label] += 1

    # 计算权重，做倒数
    weights = [1.0 / count if count > 0 else 0 for count in label_counts]
    weights = torch.tensor(weights).to(device)

    criterion = torch.nn.CrossEntropyLoss(weight=weights, ignore_index=25) 


    model.train()
    for epoch in range(epochs):
        print(f"Epoch {epoch+1}/{epochs}")
        progress_bar = tqdm(loader, desc="Training", unit="batch")

        for step, (inputs, labels) in enumerate(progress_bar):

            # 将输入移动到设备
            inputs = {k: v.to(device) for k, v in inputs.items()}
            labels = labels.to(device)
            
            outs = model(inputs)

            # 对outs和label变形,并且移除pad
            outs, labels = reshape_and_remove_pad(outs, labels,
                                                inputs['attention_mask'])

            # 梯度下降
            loss = criterion(outs, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()

            if step % 500 == 0:
                counts = get_correct_and_total_count(labels, outs)
                accuracy = counts[0] / counts[1] if counts[1] > 0 else 0
                accuracy_content = counts[2] / counts[3] if counts[3] > 0 else 0

                progress_bar.set_postfix({
                    "loss": f"{loss.item():.4f}",
                    "accuracy": f"{accuracy:.4f}",
                    # "accuracy_content": f"{accuracy_content:.4f}",
                    "accuracy_content": f"{accuracy_content}",
                })
        
        torch.save(model, '../model/NER_weight.model')


In [55]:
model.fine_tuneing(False)
print(sum(p.numel() for p in model.parameters()) / 10000)
train(8)

40.7066
Epoch 1/8


Training: 100%|█████████████| 3123/3123 [00:36<00:00, 86.72batch/s, loss=2.9356, accuracy=0.8727, accuracy_content=0.25]


Epoch 2/8


Training: 100%|█| 3123/3123 [00:36<00:00, 86.08batch/s, loss=3.1156, accuracy=0.8000, accuracy_content=0.181818181818181


Epoch 3/8


Training: 100%|█| 3123/3123 [00:36<00:00, 86.55batch/s, loss=2.4036, accuracy=0.8667, accuracy_content=0.333333333333333


Epoch 4/8


Training: 100%|█| 3123/3123 [00:37<00:00, 83.11batch/s, loss=2.3356, accuracy=0.9167, accuracy_content=0.714285714285714


Epoch 5/8


Training: 100%|█████████████| 3123/3123 [00:42<00:00, 72.77batch/s, loss=3.1043, accuracy=0.8819, accuracy_content=0.56]


Epoch 6/8


Training: 100%|█| 3123/3123 [00:42<00:00, 72.63batch/s, loss=3.1961, accuracy=0.7419, accuracy_content=0.384615384615384


Epoch 7/8


Training: 100%|█| 3123/3123 [00:43<00:00, 72.09batch/s, loss=2.6037, accuracy=0.8269, accuracy_content=0.166666666666666


Epoch 8/8


Training: 100%|██████████████| 3123/3123 [00:44<00:00, 70.22batch/s, loss=2.4011, accuracy=0.8594, accuracy_content=0.2]


In [56]:
model.fine_tuneing(True)
print(sum(p.numel() for p in model.parameters()) / 10000)
train(1)

11310.4922
Epoch 1/1


Training: 100%|██████████████| 3123/3123 [03:38<00:00, 14.30batch/s, loss=2.3313, accuracy=0.8571, accuracy_content=0.6]


In [57]:
text_file = '../data/text_A_test.txt'
label_file = '../data/label_A_test.txt'

#测试
def predict():
    model_load = torch.load('../model/NER_weight.model', weights_only=False)
    model_load.eval()

    loader_test = torch.utils.data.DataLoader(dataset=TextLabelDataset(text_file, label_file),
                                              batch_size=2,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    for i, (inputs, labels) in enumerate(loader_test):
        break

    with torch.no_grad():
        outs = model_load(inputs).argmax(dim=2)

    for i in range(2):
        #移除pad
        select = inputs['attention_mask'][i] == 1
        input_id = inputs['input_ids'][i, select]
        out = outs[i, select]
        label = labels[i, select]
        
        #输出原句子
        print(tokenizer.decode(input_id).replace(' ', ''))

        #输出tag
        for tag in [label, out]:
            s = ''
            for j in range(len(tag)):
                if tag[j] == 0:
                    s += '·'
                    continue
                s += tokenizer.decode(input_id[j])
                s += str(tag[j].item())

            print(s)
        print('==========================')


predict()

[CLS]夜半傳發，選輕騎二千人，人持一赤幟，從閒道萆山而望趙軍，誡曰：「趙見我走，必空壁逐我，若疾入趙壁，拔趙幟，立漢赤幟。[SEP]
[CLS]25·半24·······················趙20································[SEP]25
·夜21半23·······················趙20······趙20·············趙20···趙20···漢20····
[CLS]」灌嬰雖少，然數力戰，乃拜灌嬰爲中大夫，令李必、駱甲爲左右校尉，將郎中騎兵擊楚騎於滎陽東，大破之。[SEP]
[CLS]25·灌1嬰3··········灌1嬰3·中13大14夫15··李1必3·駱1甲3·左13右14校14尉15·······楚20··滎5陽7······[SEP]25
··灌1嬰3··········灌1嬰3·中13大14夫15·令16李1必3·駱1甲3·左13右14校14尉15··郎13中14···楚20··滎5陽7東7······


In [58]:
#测试
def test():
    model_load = torch.load('../model/NER_weight.model', weights_only=False)
    model_load.eval()

    loader_test = torch.utils.data.DataLoader(dataset=TextLabelDataset(text_file, label_file),
                                              batch_size=2,
                                              collate_fn=collate_fn,
                                              shuffle=True,
                                              drop_last=True)

    correct = 0
    total = 0

    correct_content = 0
    total_content = 0

    for step, (inputs, labels) in enumerate(loader_test):
        if step == 5:
            break
        # print(step)

        with torch.no_grad():
            outs = model_load(inputs)
            # outs = torch.zeros(model_load(inputs).shape).to(device)

        #对outs和label变形,并且移除pad
        outs, labels = reshape_and_remove_pad(outs, labels,
                                              inputs['attention_mask'])

        counts = get_correct_and_total_count(labels, outs)
        print(counts)
        correct += counts[0]
        total += counts[1]
        correct_content += counts[2]
        total_content += counts[3]

    print(correct / total, correct_content / total_content)


test()

(49, 57, 15, 21)
(50, 67, 18, 24)
(52, 56, 7, 11)
(20, 25, 8, 12)
(50, 59, 3, 8)
0.8371212121212122 0.6710526315789473
