# 导入需要的包

In [1]:
#part2: bert feature-base
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
import torch
import transformers as tfs
import warnings

warnings.filterwarnings('ignore')

# 读入数据

In [2]:
train_df = pd.read_csv('train.tsv', delimiter='\t', header=None)

In [3]:
train_df.sample(3)

Unnamed: 0,0,1
4995,"just about the best straight up , old school h...",1
441,men in black ii achieves ultimate insignifican...,0
458,decent but dull,0


In [4]:
len(train_df)

6920

In [5]:
train_set = train_df[:3000]

print("Train set shape:", train_set.shape)

Train set shape: (3000, 2)


In [6]:
train_set[1].value_counts()

1    1565
0    1435
Name: 1, dtype: int64

# 利用BERT进行特征抽取

在这里，我们利用BERT对数据集进行特征抽取，即把输入数据经过BERT模型，来获取输入数据的特征，这些特征包含了整个句子的信息，是语境层面的。这种做法类似于EMLo的特征抽取。需要注意的是，这里并没有使用到BERT的微调，因为BERT并不参与后面的训练，仅仅进行特征抽取操作。

In [7]:
model_class, tokenizer_class, pretrained_weights = (tfs.BertModel, tfs.BertTokenizer, 'bert-base-uncased')
tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
model = model_class.from_pretrained(pretrained_weights)

我们使用预训练好的"bert-base-uncased"模型参数进行处理，采用的模型是BertModel，采用的分词器是BertTokenizer。由于我们的输入句子是英文句子，所以需要先分词；然后把单词映射成词汇表的索引，再喂给模型。实际上Bert的分词操作，不是以传统的单词为单位的，而是以wordpiece为单位，这是比单词更细粒度的单位。我们执行以下代码：

In [9]:
pretrained_weights

'bert-base-uncased'

In [8]:
type(pretrained_weights)

str

In [10]:
train_tokenized = train_set[0].apply((lambda x: tokenizer.encode(x, add_special_tokens=True)))

In [13]:
train_set[0]

0       a stirring , funny and finally transporting re...
1       apparently reassembled from the cutting room f...
2       they presume their audience wo n't sit still f...
3       this is a visually stunning rumination on love...
4       jonathan parker 's bartleby should have been t...
                              ...                        
2995    but this new jangle of noise , mayhem and stup...
2996               darkly funny and frequently insightful
2997                                  formuliac , but fun
2998    there 's some good material in their story abo...
2999    damon brings the proper conviction to his role...
Name: 0, Length: 3000, dtype: object

In [14]:
train_tokenized

0       [101, 1037, 18385, 1010, 6057, 1998, 2633, 182...
1       [101, 4593, 2128, 27241, 23931, 2013, 1996, 62...
2       [101, 2027, 3653, 23545, 2037, 4378, 24185, 10...
3       [101, 2023, 2003, 1037, 17453, 14726, 19379, 1...
4       [101, 5655, 6262, 1005, 1055, 12075, 2571, 376...
                              ...                        
2995    [101, 2021, 2023, 2047, 23769, 2571, 1997, 500...
2996     [101, 27148, 6057, 1998, 4703, 12369, 3993, 102]
2997      [101, 2433, 20922, 2278, 1010, 2021, 4569, 102]
2998    [101, 2045, 1005, 1055, 2070, 2204, 3430, 1999...
2999    [101, 11317, 7545, 1996, 5372, 10652, 2000, 20...
Name: 0, Length: 3000, dtype: object

然后，为了提升训练速度，我们需要把句子都处理成同一个长度，即常见的pad操作，我们在短的句子末尾添加一系列的[PAD]符号：

In [15]:
train_max_len = 0
for i in train_tokenized.values:
    if len(i) > train_max_len:
        train_max_len = len(i)

train_padded = np.array([i + [0] * (train_max_len-len(i)) for i in train_tokenized.values])
print("train set shape:",train_padded.shape)

train set shape: (3000, 66)


In [16]:
train_max_len

66

In [17]:
train_padded

array([[  101,  1037, 18385, ...,     0,     0,     0],
       [  101,  4593,  2128, ...,     0,     0,     0],
       [  101,  2027,  3653, ...,     0,     0,     0],
       ...,
       [  101,  2433, 20922, ...,     0,     0,     0],
       [  101,  2045,  1005, ...,     0,     0,     0],
       [  101, 11317,  7545, ...,     0,     0,     0]])

最后，我们还需要让模型知道，哪些词是不用处理的，即上面我们添加的[PAD]符号：

In [18]:
print(train_padded[0])
train_attention_mask = np.where(train_padded != 0, 1, 0)
print(train_attention_mask[0])

[  101  1037 18385  1010  6057  1998  2633 18276  2128 16603  1997  5053
  1998  1996  6841  1998  5687  5469  3152   102     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0     0     0     0     0     0     0
     0     0     0     0     0     0]
[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


经过上面一系列步骤的处理，此时输入数据已经可以正确被Bert模型接收并处理了，我们直接进行特征的输出：

In [19]:
# 训练集
train_input_ids = torch.tensor(train_padded).long()
train_attention_mask = torch.tensor(train_attention_mask).long()
with torch.no_grad():
    train_last_hidden_states = model(train_input_ids, attention_mask=train_attention_mask)

我们来看以下Bert模型给我们的输出是什么样的：

In [23]:
type(train_last_hidden_states)

transformers.modeling_outputs.BaseModelOutputWithPoolingAndCrossAttentions

In [32]:
train_last_hidden_states[1]

tensor([[-0.7915, -0.3743, -0.7874,  ..., -0.5462, -0.5542,  0.7294],
        [-0.9337, -0.3978, -0.7455,  ..., -0.7045, -0.6715,  0.9190],
        [-0.6180, -0.3858, -0.9156,  ..., -0.7709, -0.6184,  0.7713],
        ...,
        [-0.7152, -0.2524, -0.0280,  ..., -0.2456, -0.5770,  0.8065],
        [-0.6578, -0.4616, -0.9024,  ..., -0.7255, -0.6244,  0.7785],
        [-0.8487, -0.3505, -0.6283,  ..., -0.0346, -0.6380,  0.8275]])

In [20]:
train_last_hidden_states[0].size()

torch.Size([3000, 66, 768])

第一维的是样本数量，第二维的是序列长度，第三维是特征数量。也就是说，Bert对于我们的每一个位置的输入，都会输出一个对应的特征向量。

## 切分数据成训练集和测试集

In [33]:
train_features = train_last_hidden_states[0][:,0,:].numpy()
train_labels = train_set[1]

In [34]:
train_features.shape

(3000, 768)

请注意：我们使用[:,0,:]来提取序列第一个位置的输出向量，因为第一个位置是[CLS]，比起其他位置，该向量应该更具有代表性，蕴含了整个句子的信息。紧接着，我们利用sklearn库的方法来把数据集切分成训练集和测试集。

In [35]:
train_features, test_features, train_labels, test_labels = train_test_split(train_features, train_labels)

In [38]:
train_features.shape

(2250, 768)

In [39]:
type(train_features)

numpy.ndarray

## 使用逻辑回归进行训练

在这一部分，我们使用sklearn的逻辑回归模块对我们的训练集进行拟合，最后在测试集上进行评价：

In [40]:
lr_clf = LogisticRegression()
lr_clf.fit(train_features, train_labels)

LogisticRegression()

输出：

In [41]:
lr_clf.score(test_features, test_labels)

0.848

经过逻辑回归模型的拟合，其准确率达到了83.06，分类效果还不错。那么，我们还能进一步提升吗？

## 利用BERT基于微调的方式进行建模

在上一部分，我们利用了Bert抽取特征的能力进行建模，提取了Bert的输出特征，再输入给一个线性层以预测。但Bert本身的不参与模型的训练。现在我们采取另一种方式，即fine-tuned，Bert与线性层一起参与训练，反向传播会更新二者的参数，使得Bert模型更加适合这个分类任务。那么，让我们开始吧~

### 建立模型

In [43]:
#part 2 - bert fine-tuned
import torch
from torch import nn
from torch import optim
import transformers as tfs
import math

class BertClassificationModel(nn.Module):
    def __init__(self):
        super(BertClassificationModel, self).__init__()   
        model_class, tokenizer_class, pretrained_weights = (tfs.BertModel, tfs.BertTokenizer, 'bert-base-uncased')         
        self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
        self.bert = model_class.from_pretrained(pretrained_weights)
        self.dense = nn.Linear(768, 2)  #bert默认的隐藏单元数是768， 输出单元是2，表示二分类
        
    def forward(self, batch_sentences):
        #print(batch_sentences[4])
        batch_tokenized = self.tokenizer.batch_encode_plus(batch_sentences, add_special_tokens=True,
                                max_length=66, pad_to_max_length=True, truncation=True)      #tokenize、add special token、pad
        input_ids = torch.tensor(batch_tokenized['input_ids'])
        attention_mask = torch.tensor(batch_tokenized['attention_mask'])
        bert_output = self.bert(input_ids, attention_mask=attention_mask)
        bert_cls_hidden_state = bert_output[0][:,0,:]       #提取[CLS]对应的隐藏状态
        linear_output = self.dense(bert_cls_hidden_state)
        return linear_output

In [42]:
help(tokenizer.batch_encode_plus)

Help on method batch_encode_plus in module transformers.tokenization_utils_base:

batch_encode_plus(batch_text_or_text_pairs:Union[List[str], List[Tuple[str, str]], List[List[str]], List[Tuple[List[str], List[str]]], List[List[int]], List[Tuple[List[int], List[int]]]], add_special_tokens:bool=True, padding:Union[bool, str, transformers.tokenization_utils_base.PaddingStrategy]=False, truncation:Union[bool, str, transformers.tokenization_utils_base.TruncationStrategy]=False, max_length:Union[int, NoneType]=None, stride:int=0, is_split_into_words:bool=False, pad_to_multiple_of:Union[int, NoneType]=None, return_tensors:Union[str, transformers.tokenization_utils_base.TensorType, NoneType]=None, return_token_type_ids:Union[bool, NoneType]=None, return_attention_mask:Union[bool, NoneType]=None, return_overflowing_tokens:bool=False, return_special_tokens_mask:bool=False, return_offsets_mapping:bool=False, return_length:bool=False, verbose:bool=True, **kwargs) -> transformers.tokenization_utils

In [53]:
import torch
from torch import nn
from torch import optim
import transformers as tfs
import math

class BertClassificationModel(nn.Module):
    def __init__(self):
        super(BertClassificationModel, self).__init__()
        model_class, tokenizer_class, pretrained_weights = (tfs.BertModel, tfs.BertTokenizer,'bert-base-uncased')
        self.tokenizer = tokenizer_class.from_pretrained(pretrained_weights)
        self.bert = model_class.from_pretrained(pretrained_weights)
        self.dense = nn.Linear(768, 2)
        
    def forward(self, batch_sentences):
        batch_tokenized = self.tokenizer.batch_encode_plus(batch_sentences,add_special_tokens=True,
                          max_length = 66, pad_to_max_length = True, truncation = True)
        input_ids = torch.tensor(batch_tokenized['input_ids'])
        attention_mask = torch.tensor(batch_tokenized['attention_mask'])
        bert_output = self.bert(input_ids, attention_mask = attention_mask)
        bert_cls_hidden_state = bert_output[0][:,0,:] #提取[CLS]对应的隐藏状态
        linear_output = self.dense(bert_cls_hidden_state)
        return linear_output


模型很简单，关键代码都在上面注释了。其主要构成是在bert模型的[CLS]输出位置接上一个线性层，用以预测句子的分类。

### 数据分批

下面我们对原来的数据集进行一些改造，分成batch_size为64大小的数据集，以便模型进行批量梯度下降。

In [46]:
sentences = train_set[0].values
targets = train_set[1].values
train_inputs, test_inputs, train_targets, test_targets = train_test_split(sentences, targets)

batch_size = 64
batch_count = int(len(train_inputs) / batch_size)
batch_train_inputs, batch_train_targets = [], []
for i in range(batch_count):
    batch_train_inputs.append(train_inputs[i*batch_size : (i+1)*batch_size])
    batch_train_targets.append(train_targets[i*batch_size : (i+1)*batch_size])

### 训练模型

In [56]:
#train the model
epochs = 2
lr = 0.01
print_every_batch = 10
bert_classifier_model = BertClassificationModel()
#optimizer = optim.SGD(bert_classifier_model.parameters(), lr=lr)
optimizer = optim.Adam(bert_classifier_model.parameters(), lr=lr)
criterion = nn.CrossEntropyLoss()

for epoch in range(epochs):
    print_avg_loss = 0
    print('第{}次迭代:'.format(epoch))
    for i in range(batch_count):
        inputs = batch_train_inputs[i] #输入向量
        labels = torch.tensor(batch_train_targets[i]) #输入向量标签
        optimizer.zero_grad() #梯度清零
        outputs = bert_classifier_model(inputs) #前向传播
        loss = criterion(outputs, labels) #计算损失
        loss.backward() #反向传播
        optimizer.step()#更新梯度
        
        print_avg_loss += loss.item() #同一批次内的损失相加
        if i % print_every_batch == (print_every_batch-1):
            print("Batch: %d, Loss: %.4f" % ((i+1), print_avg_loss/print_every_batch))
            print_avg_loss = 0

第0次迭代:
Batch: 10, Loss: 1.5290
Batch: 20, Loss: 0.7111
Batch: 30, Loss: 0.7068
第1次迭代:
Batch: 10, Loss: 0.6958
Batch: 20, Loss: 0.7138
Batch: 30, Loss: 0.7080


### 模型评价

In [57]:
# eval the trained model
total = len(test_inputs)
hit = 0
with torch.no_grad():
    for i in range(total):
        outputs = bert_classifier_model([test_inputs[i]])
        _, predicted = torch.max(outputs, 1)
        if predicted == test_targets[i]:
            hit += 1

print("Accuracy: %.2f%%" % (hit / total * 100))

Accuracy: 51.73%


In [None]:
warnings.filterwarnings('ignore')

可以看出，通过微调的方式来建模，经过3个轮次的训练后，模型的准确率达到了90.53%，比起基于特征的建模方式有了较大提升。