# 基于BiLSTM + Attention 的中文分词
20307140044 李培基

In [8]:
import torch
import torch.nn as nn
import collections
import json
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

### 获取词嵌入矩阵

In [9]:
def read_embedding(file_path = "./gigaword_chn.all.a2b.uni.ite50.vec"):
    data = {} # 单词和向量的映射字典
    tokens = [] # 单词的列表
    embedding = [] # 词嵌入矩阵
    with open(file_path,"r") as f:
        line = f.readline()
        while line:
            line = line.strip().split(' ')
            if len(line) < 2:
                line = f.readline()
                continue
            word = line[0]
            tokens.append(word)
            vec = [float(item) for item in line[1:]]
            embedding.append(vec)
            # round(float(item), 3)
            data[word] = vec
            line = f.readline()
    return tokens, torch.tensor(embedding)



tokens, embedding = read_embedding("./gigaword_chn.all.a2b.uni.ite50.vec")
print('词嵌入表总数：',len(tokens))
print('Embedding Matrix Shape: ', embedding.shape)
    

词嵌入表总数： 11327
Embedding Matrix Shape:  torch.Size([11327, 50])


### 建立Vocab类

In [10]:
class Vocab:
    '''
        从词嵌入矩阵建立词表Vocab
        有些其他做法则是从训练数据建立词表,
    '''
    def __init__(self, tokens, embedding, reserved_tokens=['<pad>','<unk>']):
        self.idx_to_token = reserved_tokens
        self.token_to_idx = {token: idx
                                    for idx, token in enumerate(self.idx_to_token)}
        for token in tokens:
            assert token not in self.token_to_idx # 确保一一映射
            self.idx_to_token.append(token) 
            self.token_to_idx[token] = len(self.idx_to_token) - 1
        unk_vec = torch.mean(embedding,dim = 0) # 计算未知词元的向量，可以使用词向量的平均值
        pad_vec = torch.zeros(embedding.size(1)) # 创建一个全零的向量，作为填充词元的向量
        self.embedding = torch.cat((pad_vec.unsqueeze(0), unk_vec.unsqueeze(0), embedding), dim=0) # 将填充和未知词元的向量添加到词向量张量的开头

    def __len__(self):
        return len(self.idx_to_token)

    @property
    def pad(self):  
        return 0
     
    @property
    def unk(self):  
        return 1
    
    def __getitem__(self, tokens):
        '''
        从word到序号:
        
        输入token:可以是单个词或者元组列表, 均可以实现返回
        
        存在一次非常巧妙的递归调用
        '''
        if not isinstance(tokens, (list, tuple)): # 如果tokens是一个单词, 单独返回单词
            return self.token_to_idx.get(tokens, self.unk) # 若没有出现在词表中，会返回unk
        return [self.__getitem__(token) for token in tokens] # tokens是一串单词，以列表形式返回，即使这串单词中有未出现在词表中的单词也没有关系

    
    def to_tokens(self, indices):
        '''
        从序号到word
        '''
        if not isinstance(indices, (list, tuple)):
            return self.idx_to_token[indices]
        return [self.idx_to_token[index] for index in indices]
    
vocab = Vocab(tokens, embedding)
print('vocab 长度:',len(vocab))
print('embedding shape:', vocab.embedding.shape)

vocab 长度: 11329
embedding shape: torch.Size([11329, 50])


### 读取数据集的函数

In [11]:
label_to_idx = {'B-CWS': 0, 'E-CWS': 1, 'S-CWS': 2, 'I-CWS': 3}
idx_to_label = ['B-CWS', 'E-CWS', 'S-CWS', 'I-CWS']

def read_data(filename):
    sentences = [] # 存储句子列表
    labels = [] # 存储标签列表
    with open(filename, 'r', encoding='utf-8') as f:
        sentence = [] # 存储当前句子
        label = [] # 存储当前标签
        for line in f:
            line = line.strip() # 去掉首尾空格
            if line: # 如果不是空行
                char, tag = line.split() # 分割字符和标签
                sentence.append(char) # 添加到当前句子
                label.append(label_to_idx[tag]) # 添加到当前标签
            else: # 如果是空行
                if sentence: # 如果当前句子不为空
                    sentences.append(sentence) # 添加到句子列表
                    labels.append(label) # 添加到标签列表
                    sentence = [] # 清空当前句子
                    label = [] # 清空当前标签
    return sentences, labels

sentences, labels = read_data('./pku/train.txt')
print(sentences[0])
print([idx_to_label[i] for i in labels[0]])
print(vocab[sentences[0]])
print(vocab.to_tokens(vocab[sentences[0]]))
print(labels[0])

['迈', '向', '充', '满', '希', '望', '的', '新', '世', '纪', '—', '—', '一', '九', '九', '八', '年', '新', '年', '讲', '话', '（', '附', '图', '片', '１', '张', '）']
['B-CWS', 'E-CWS', 'B-CWS', 'E-CWS', 'B-CWS', 'E-CWS', 'S-CWS', 'S-CWS', 'B-CWS', 'E-CWS', 'B-CWS', 'E-CWS', 'B-CWS', 'I-CWS', 'I-CWS', 'I-CWS', 'E-CWS', 'B-CWS', 'E-CWS', 'B-CWS', 'E-CWS', 'S-CWS', 'S-CWS', 'B-CWS', 'E-CWS', 'S-CWS', 'S-CWS', 'S-CWS']
[2182, 238, 913, 1039, 395, 368, 4, 33, 252, 853, 1298, 1298, 8, 147, 147, 180, 18, 33, 18, 1207, 706, 1, 1174, 974, 787, 1, 617, 1]
['迈', '向', '充', '满', '希', '望', '的', '新', '世', '纪', '—', '—', '一', '九', '九', '八', '年', '新', '年', '讲', '话', '<unk>', '附', '图', '片', '<unk>', '张', '<unk>']
[0, 1, 0, 1, 0, 1, 2, 2, 0, 1, 0, 1, 0, 3, 3, 3, 1, 0, 1, 0, 1, 2, 2, 0, 1, 2, 2, 2]


### 建立数据集

In [12]:
class CWSDataset(Dataset):
    def __init__(self, sentences, labels, vocab):
        assert len(sentences) == len(labels) # 确保句子和标签的数量相同
        self.sentences = sentences # 存储句子列表
        self.labels = labels # 存储标签列表
        self.vocab = vocab
    
    def __len__(self):
        return len(self.sentences) # 返回数据集的大小
    
    def __getitem__(self, idx):
        return torch.tensor(self.vocab[self.sentences[idx]]), torch.tensor(self.labels[idx]) # 返回指定索引的句子和标签

# 读取数据文件，得到句子和标签列表
train_sentences, train_labels = read_data('./pku/train.txt')
test_sentences, test_labels = read_data('./pku/test.txt')

# 创建数据集对象，传入句子和标签列表
train_dataset = CWSDataset(train_sentences, train_labels, vocab)
test_dataset = CWSDataset(test_sentences, test_labels, vocab)
print(len(train_dataset),len(test_dataset))
print(train_dataset[0])

44565 4590
(tensor([2182,  238,  913, 1039,  395,  368,    4,   33,  252,  853, 1298, 1298,
           8,  147,  147,  180,   18,   33,   18, 1207,  706,    1, 1174,  974,
         787,    1,  617,    1]), tensor([0, 1, 0, 1, 0, 1, 2, 2, 0, 1, 0, 1, 0, 3, 3, 3, 1, 0, 1, 0, 1, 2, 2, 0,
        1, 2, 2, 2]))


In [13]:
def collate_fn(data):
    sentences, labels = zip(*data) # 分离句子和标签序列    
    # 获取每个句子的长度，并按照长度降序排列
    lengths = [s.size(0) for s in sentences]
    lengths, indices = torch.sort(torch.tensor(lengths), descending=True)
    sentences = [sentences[i] for i in indices]
    labels = [labels[i] for i in indices]
    # 对不同长度的句子和标签进行填充，指定填充值为0，并且按照长度降序排列
    sentences = torch.nn.utils.rnn.pad_sequence(sentences, padding_value=0, batch_first=True)
    labels = torch.nn.utils.rnn.pad_sequence(labels, padding_value=4, batch_first=True)
    
    return sentences, labels, lengths

batch_size = 32
# 创建数据加载器对象，传入数据集对象，批次大小，是否打乱顺序，以及collate_fn函数
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True,  collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=100, shuffle=False, collate_fn=collate_fn)

# 测试
for sentences, labels, lengths in train_loader: # 遍历每个批次的数据
    print(sentences.shape) # 打印句子的张量
    print(labels.shape) # 打印标签的张量
    break# 可以在打印一个批次后就退出循环，或者继续打印更多的批次


torch.Size([32, 85])
torch.Size([32, 85])


### 建立模型

In [14]:
def create_key_padding_mask(lengths):
    '''
        input: lengths 是有效长度的列表 (batch_size)
        output: 一个key_padding_mask: 用于multihead_attn
        由gpt3.5辅助编写
    '''
    batch_size = len(lengths)
    max_seq_len = max(lengths)
    # 创建一个形状为 (batch_size, max_seq_len) 的零矩阵
    key_padding_mask = torch.zeros(batch_size, max_seq_len, dtype=torch.bool).to(device)

    for i, length in enumerate(lengths):
        # 对于每个样本，将有效长度之后的位置设置为True
        key_padding_mask[i, length:] = True

    return key_padding_mask

class RNNModel(torch.nn.Module):
    def __init__(self, embedding_matrix, embed_size, hidden_size, num_layers, num_classes):
        super(RNNModel, self).__init__()
        # 创建一个预训练的embedding层
        self.embedding = torch.nn.Embedding.from_pretrained(embedding_matrix)
        # 创建一个双向的LSTM层
        self.lstm = torch.nn.LSTM(input_size=embed_size, hidden_size=hidden_size, num_layers=num_layers, bidirectional=True)
        # 创建一个全连接层
        self.linear = torch.nn.Linear(hidden_size * 2, num_classes)
    
    def forward(self, x, lengths):
        # x: 可以认为是 (batch_size, seq_len) 
        x = self.embedding(x) # x: (batch_size, seq_len, embed_size)
        # 使用pack_padded_sequence将填充后的序列打包成PackedSequence对象
        x = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)
        x, _ = self.lstm(x) 
        x, out_len = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
        # 解包: (batch_size, seq_len, hidden_size * 2)
        # 将LSTM的输出序列传入全连接层，得到最终的输出序列
        output = self.linear(x) # x: (batch_size, seq_len, num_classes)
        return output

class RNNAttention(torch.nn.Module):
    def __init__(self, embedding_matrix, embed_size, hidden_size, num_layers, num_classes, attn_heads, dropout):
        super(RNNAttention, self).__init__()
        # 创建一个预训练的embedding层
        self.embedding = torch.nn.Embedding.from_pretrained(embedding_matrix)
        # 创建一个双向的LSTM层
        self.lstm = torch.nn.LSTM(input_size=embed_size, hidden_size=hidden_size, num_layers=num_layers, bidirectional=True)
        # 创建一个全连接层
        self.multihead_attn = nn.MultiheadAttention(embed_dim=hidden_size*2, num_heads=attn_heads, dropout=dropout)
        self.linear = torch.nn.Linear(hidden_size * 2, num_classes)
    
    def forward(self, x, lengths):
        # x: 可以认为是 (batch_size, seq_len) 
        x = self.embedding(x) # x: (batch_size, seq_len, embed_size)
        # 使用pack_padded_sequence将填充后的序列打包成PackedSequence对象
        x = torch.nn.utils.rnn.pack_padded_sequence(x, lengths, batch_first=True)
        x, _ = self.lstm(x) 
        x, out_len = torch.nn.utils.rnn.pad_packed_sequence(x, batch_first=True)
        # 解包: (batch_size, seq_len, hidden_size * 2)
        # 将LSTM的输出序列传入全连接层，得到最终的输出序列
        key_padding_mask = create_key_padding_mask(lengths)
        x = x.permute(1,0,2) # multiheadattention 要求第一维度是seq_len
        x,_ = self.multihead_attn(x,x,x,key_padding_mask)
        x = x.permute(1,0,2)
        output = self.linear(x) # x: (batch_size, seq_len, num_classes)
        return output

### 选择模型

In [20]:
# RNN
model = RNNModel(embedding_matrix=vocab.embedding, embed_size=50,hidden_size=128,num_layers=2,num_classes=4)
optimizer = torch.optim.AdamW(model.parameters(),lr=1e-3)
loss_fn = nn.CrossEntropyLoss(ignore_index=4) # 这个很必要 完全忽略padding的标签(对应index为4)
num_epochs = 20

In [15]:
# RNN with Attention
model = RNNAttention(embedding_matrix=vocab.embedding, embed_size=50,hidden_size=128,num_layers=2,num_classes=4,
                     attn_heads=4, dropout=0.2)
optimizer = torch.optim.AdamW(model.parameters(),lr=1e-3)
loss_fn = nn.CrossEntropyLoss(ignore_index=4) # 这个很必要 完全忽略padding的标签(对应index为4)
num_epochs = 20

### 训练与评测
* 尝试不同超参数搜索组合，调整batch_size,dropout,lr等等多种方式
* 测试集上micro-F1可达93.2%，macro-F1 可达 90.8


In [9]:
def calculate(predictions, labels):
    '''
        计算micro-f1 和 macro-f1
        需要注意的是, 这个任务中每个词只属于一个类别, 所以micro-f1等价于acc
    '''
    correct = 0
    total = 0
    TP = [0,0,0,0]
    FP = [0,0,0,0]
    FN = [0,0,0,0]
    P = [0,0,0,0]
    R = [0,0,0,0]
    for i in range(predictions.shape[0]):
        for j in range(predictions.shape[1]):
            if labels[i][j] == 4:
                continue
            if predictions[i][j] == labels[i][j]:
                correct += 1
                TP[predictions[i][j]] += 1
            else:
                FN[predictions[i][j]] += 1
                FP[labels[i][j]] += 1
            total += 1
    
    accuracy = correct / total # 即micro-f1
    for i in range(4):
        P[i] = TP[i]/(TP[i] + FP[i] + 1e-9)
        R[i] = TP[i]/(TP[i] + FN[i] + 1e-9)
    macro_f1 = sum([ 2*P[i]*R[i]/(P[i] + R[i] + 1e-9) for i in range(4)])/4
    return accuracy, macro_f1


def train(model, train_loader, optimizer, num_epochs, loss_fn, log_freq = 1000, save_path = 'model.pth', policy = 'micro'):
    # device = 'cpu'
    print(f'Start Training, Dev Policy:{policy}_f1 ')
    model.to(device)
    model.train()
    best_micro = 0.0
    best_macro = 0.0
    for epoch in range(num_epochs):
        total_loss = 0.0
        model.train()
        # 遍历数据加载器对象，得到每个批次的数据
        for batch_idx, (sentences, labels, lengths) in enumerate(train_loader):
            sentences = sentences.to(device)
            labels = labels.to(device)
            outputs = model(sentences, lengths) # outputs: (batch_size, seq_len, num_classes)
            preds = torch.argmax(outputs,dim=2) # (batch_size, seq_len)
            if(batch_idx%log_freq == 0):
                micro, macro = calculate(preds,labels)
                print(f'Log: current micro_f1(acc):{micro}, current macro_f1:{macro}')
            outputs = outputs.permute(0,2,1) # CrossEntropy 要求输出的第二维度是分类数
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()
            optimizer.zero_grad()
            total_loss += loss.item()
        avg_loss = total_loss / len(train_loader)
        print(f'Epoch {epoch + 1}, average loss: {avg_loss:.4f}')
        # 评测
        model.eval()
        test_micro = 0.0
        test_macro = 0.0
        for batch_idx, (sentences, labels, lengths) in enumerate(test_loader):
            sentences = sentences.to(device)
            labels = labels.to(device)
            outputs = model(sentences, lengths) # outputs: (batch_size, seq_len, num_classes)
            preds = torch.argmax(outputs,dim=2) # (batch_size, seq_len)
            micro,macro = calculate(preds,labels)
            test_micro += micro
            test_macro += macro
        test_micro = test_micro/len(test_loader)
        test_macro = test_macro/len(test_loader)
        print(f'Test: micro_f1(acc) on test set:{test_micro}, macro_f1 on test set: {test_macro}')
        if(test_micro > best_micro):
            best_micro = test_micro
            if policy != 'macro':
                torch.save(model.state_dict(), save_path)
                print(f'Best Model Parameter updated and saved')
        if(test_macro > best_macro):
            best_macro = test_macro
            if policy == 'macro':
                torch.save(model.state_dict(), save_path)
                print(f'Best Model Parameter updated and saved')




In [10]:
train(model,train_loader,optimizer,num_epochs,loss_fn, log_freq=1000)

Start Training, Dev Policy:micro_f1 
Log: current micro_f1(acc):0.22630718954248366, current macro_f1:0.13551542099990785
Log: current micro_f1(acc):0.8971036585365854, current macro_f1:0.8715305059853139
Epoch 1, average loss: 0.3338
Test: micro_f1(acc) on test set:0.915131676351508, macro_f1 on test set: 0.8821977666302673
Best Model Parameter updated and saved
Log: current micro_f1(acc):0.9327102803738317, current macro_f1:0.9110797069663046
Log: current micro_f1(acc):0.9695885509838998, current macro_f1:0.9635713119248549
Epoch 2, average loss: 0.1570
Test: micro_f1(acc) on test set:0.9256190366373019, macro_f1 on test set: 0.8959623733585168
Best Model Parameter updated and saved
Log: current micro_f1(acc):0.9570647931303669, current macro_f1:0.9435405855257601
Log: current micro_f1(acc):0.9584996009577015, current macro_f1:0.9521265291079267
Epoch 3, average loss: 0.1223
Test: micro_f1(acc) on test set:0.9296235784946032, macro_f1 on test set: 0.9032053904347437
Best Model Parame

### Demo

In [41]:
# 在这里输入要进行分词的中文语句
input = '感谢复旦大学自然语言处理实验室的黄老师和几位助教的指导，我在本次实验中收获了很多'
init_string = input

In [42]:
def word_split(input, model_path = './model.pth'):
    input = [char for char in input]
    model = RNNAttention(embedding_matrix=vocab.embedding, embed_size=50,hidden_size=128,num_layers=2,num_classes=4,
                        attn_heads=4, dropout=0.2)
    model.load_state_dict(torch.load(model_path))
    model.to(device)
    model.eval()

    x = torch.tensor(vocab[input]).unsqueeze(0).to(device)
    output = model(x,[x.shape[1]])
    preds = torch.argmax(output,dim=2).squeeze().tolist()
    assert len(input) == len(preds)
    res = []
    char = ''
    for idx in range(len(input)):
        if preds[idx] == 0: # B
            char += input[idx]
            continue
        if preds[idx] == 1:
            char += input[idx]
            res.append(char)
            char = ''
            continue
        if preds[idx] == 2:
            if char!='': # 自己加一点约束
                res.append(char)
            res.append(input[idx])
        else:
            char += input[idx]
            continue
            
    return res

In [43]:
res = word_split(input)
print(input)
print(res)

感谢复旦大学自然语言处理实验室的黄老师和几位助教的指导，我在本次实验中收获了很多
['感谢', '复旦', '大学', '自然', '语言', '处理', '实验室', '的', '黄', '老师', '和', '几', '位', '助教', '的', '指导', '，', '我', '在', '本次', '实验', '中', '收获', '了', '很多']


### 总结与不足
* 把分词任务从一个机器学习的统计任务变为了深度学习的分类标记任务：深度学习的本质也是统计
* 不同的类别之间应该有转移概率的约束，其实应用上条件随机场(CRF)会更好，但是我的服务器断网了，有些包没法安装，以后再试试。
* 比较令人欣喜的是，从结果上来看，模型自己学到了这些约束


### 清除显存

In [68]:
import os
pid = os.getpid()
!kill -9 $pid

: 