# 获取数据路径及文件

In [1]:
import os
all_file_path = []
for root, dirs, files in os.walk('./data'):
    for _dir in dirs:
        all_file_path.append(os.path.join(root, _dir, 'neg.txt'))
        all_file_path.append(os.path.join(root, _dir, 'pos.txt'))
all_file_path

['./data\\clothing\\neg.txt',
 './data\\clothing\\pos.txt',
 './data\\fruit\\neg.txt',
 './data\\fruit\\pos.txt',
 './data\\hotel\\neg.txt',
 './data\\hotel\\pos.txt',
 './data\\pda\\neg.txt',
 './data\\pda\\pos.txt',
 './data\\shampoo\\neg.txt',
 './data\\shampoo\\pos.txt']

# 将数据集中的所有物品类型的评论整合到一起

In [2]:
neg_datalist = []
pos_datalist = []
for file in all_file_path:
    if (os.path.basename(file) == 'neg.txt'):
        with open(file, 'r', encoding='utf-8') as f:
            neg_data = f.read()
            neg_datalist.extend(neg_data.split('\n'))
    else:
        with open(file, 'r', encoding='utf-8') as f:
            pos_data = f.read()
            pos_datalist.extend(pos_data.split('\n'))
len(neg_datalist)

25000

In [3]:
import numpy as np

dataset = np.array(neg_datalist+pos_datalist)
labels = np.array([0]*len(neg_datalist)+[1]*len(pos_datalist))
print(len(dataset))

50000


In [4]:
np.random.seed(2022)
mix_index = np.random.choice(len(dataset),len(dataset))
dataset = dataset[mix_index]
labels = labels[mix_index]

print(labels)

[0 0 0 ... 0 1 1]


# 划分训练集和测试集

In [5]:
from sklearn.model_selection import train_test_split

train_samples, eval_samples, train_labels, eval_labels = train_test_split(
    dataset, labels, test_size=0.1, shuffle=True, random_state=2022)
train_samples = train_samples.tolist()
eval_samples = eval_samples.tolist()
print(len(train_samples))
print(len(eval_samples))

45000
5000


# 得到标签的One-hot编码

In [6]:
def get_dummies(l, size=2):
    res = list()
    for i in l:
        tmp = [0]*size
        tmp[i] = 1
        res.append(tmp)
    return res

# 定义Bert模型

In [7]:
import torch
from transformers import BertTokenizer
from torch.utils.data import DataLoader, TensorDataset

model_name_path = './pretrain_model/bert-base-chinese/'

tokenizer = BertTokenizer.from_pretrained(model_name_path)

In [8]:
# tokenized_text = [tokenizer.tokenize(i) for i in train_sample]
# input_ids = [tokenizer.convert_tokens_to_ids(i) for i in tokenized_text]
# input_labels = get_dummies(train_labels)
# input_ids[0], input_labels[0]

In [9]:
# for i in range(len(input_ids)):
#     # 将数据样本填充至长度为512
#     seq = input_ids[i]
#     if len(seq) != 512:
#         input_ids[i].extend([0]*(512-len(seq)))

# 词元化文本

In [10]:
model_train_inputs = tokenizer(train_samples,
                               padding=True,
                               truncation=True,
                               max_length=512,
                               return_tensors='pt')

# 转换标签

In [11]:
input_labels = get_dummies(train_labels)

# 构建训练和测试的数据集

In [12]:
train_set = TensorDataset(model_train_inputs['input_ids'], torch.FloatTensor(input_labels))

In [13]:
train_loader = DataLoader(dataset=train_set, batch_size=4, shuffle=True)
train_loader

<torch.utils.data.dataloader.DataLoader at 0x19c2c4aee20>

In [14]:
model_eval_inputs = tokenizer(eval_samples,
                              padding=True,
                              truncation=True,
                              max_length=512,
                              return_tensors='pt')
input_labels = get_dummies(eval_labels)

eval_set = TensorDataset(model_eval_inputs['input_ids'],
                         torch.FloatTensor(input_labels))
eval_loader = DataLoader(dataset=eval_set, batch_size=1, shuffle=True)
eval_loader

<torch.utils.data.dataloader.DataLoader at 0x19ba668e3a0>

# 定义训练的设备，GPU or CPU

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

# 搭建Bert分类模型

In [16]:
import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel


class fn_cls(nn.Module):
    def __init__(self):
        super(fn_cls, self).__init__()
        self.model = BertModel.from_pretrained(model_name_path, cache_dir="./")
        self.model.to(device)
        self.dropout = nn.Dropout(0.1)
        self.l1 = nn.Linear(768, 2)

    def forward(self, x, attention_mask=None):
        outputs = self.model(x, attention_mask=attention_mask)
        x = outputs[1]  # 取池化后的结果 batch * 768
        x = x.view(-1, 768)
        x = self.dropout(x)
        x = self.l1(x)
        return x


# 定义优化器，损失函数等

In [17]:
from torch import optim

cls = fn_cls()
cls.to(device)
cls.train()

criterion = nn.BCELoss()
sigmoid = nn.Sigmoid()
optimizer = optim.Adam(cls.parameters(), lr=1e-5)


Some weights of the model checkpoint at ./pretrain_model/bert-base-chinese/ were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


# 定义预测函数

In [18]:
def predict(logits):
    res = torch.argmax(logits, 1)
    return res


# 训练过程 fine-tuning

In [19]:
from torch.autograd import Variable
import time

pre = time.time()

accumulation_steps = 8
epoch = 2

for i in range(epoch):
    for batch_idx, (data, target) in enumerate(train_loader):
        data, target = Variable(data).to(device), Variable(
            target.view(-1, 2)).to(device)

        mask = []
        for sample in data:
            mask.append([1 if i != 0 else 0 for i in sample])
        mask = torch.Tensor(mask).to(device)
        
        output = cls(data, attention_mask=mask)
        pred = predict(output)

        loss = criterion(sigmoid(output).view(-1, 2), target)

        # 梯度积累
        loss = loss/accumulation_steps
        loss.backward()

        if((batch_idx+1) % accumulation_steps) == 0:
            # 每 8 次更新一下网络中的参数
            optimizer.step()
            optimizer.zero_grad()

        if ((batch_idx+1) % accumulation_steps) == 1:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss:{:.6f}'.format(
                i+1, batch_idx, len(train_loader), 100. *
                batch_idx/len(train_loader), loss.item()
            ))
        if batch_idx == len(train_loader)-1:
            # 在每个 Epoch 的最后输出一下结果
            print('labels:', target)
            print('pred:', pred)

print('训练时间：', time.time()-pre)


















labels: tensor([[0., 1.],
        [1., 0.],
        [0., 1.],
        [0., 1.]], device='cuda:0')
pred: tensor([1, 0, 1, 1], device='cuda:0')
















labels: tensor([[1., 0.],
        [0., 1.],
        [0., 1.],
        [1., 0.]], device='cuda:0')
pred: tensor([0, 1, 1, 0], device='cuda:0')
训练时间： 10693.34944152832


# 测试过程

In [20]:
from tqdm import tqdm_notebook as tqdm

cls.eval()

correct = 0
total = 0

for batch_idx, (data, target) in enumerate(tqdm(eval_loader)):
    data = data.to(device)
    target = target.long().to(device)

    mask = []
    for sample in data:
        mask.append([1 if i != 0 else 0 for i in sample])
    mask = torch.Tensor(mask).to(device)

    output = cls(data, attention_mask=mask)
    pred = predict(output)

    correct += (pred == target).sum().item()
    total += len(data)

# 准确率应该达到百分之 90 以上
print('正确分类的样本数：{}，样本总数：{}，准确率：{:.2f}%'.format(
    correct, total, 100.*correct/total))


Please use `tqdm.notebook.tqdm` instead of `tqdm.tqdm_notebook`
  for batch_idx, (data, target) in enumerate(tqdm(eval_loader)):


  0%|          | 0/5000 [00:00<?, ?it/s]

正确分类的样本数：5000，样本总数：5000，准确率：100.00%


In [46]:
test_samples = ['很棒的购物体验，但是东西有些问题，物流可以，但是东西没有吊牌', '物流还是挺快的，很好']

In [47]:
model_inputs = tokenizer(test_samples,
                               padding=True,
                               truncation=True,
                               max_length=512,
                               return_tensors='pt')
model_inputs

{'input_ids': tensor([[ 101, 2523, 3472, 4638, 6579, 4289,  860, 7741, 8024,  852, 3221,  691,
         6205, 3300,  763, 7309, 7579, 8024, 4289, 3837, 1377,  809, 8024,  852,
         3221,  691, 6205, 3766, 3300, 1396, 4277,  102],
        [ 101, 4289, 3837, 6820, 3221, 2923, 2571, 4638, 8024, 2523, 1962,  102,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0]])}

In [48]:
cls.eval()

output = cls(model_inputs['input_ids'].to(device), attention_mask=model_inputs['attention_mask'].to(device))
pred = predict(output)
pred


tensor([1, 1], device='cuda:0')