# Bert在情感分类任务的微调

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import BertTokenizer,BertModel
from torchtext.legacy import data,datasets
import numpy as np
import random
import time


SEED = 2022
TRAIN = False
BATCH_SIZE=128
N_EPOCHS=5
HIDDEN_DIM=256
OUTPUT_DIM=1
N_LAYERS=2
BIDIRECTIONAL=True
DROPOUT=0.25

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.backends.cudnn.deterministic=True

In [3]:
# 通过类的静态方法获取对象,这是预训练的对象
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [4]:
init_token_id = tokenizer.cls_token_id
eos_token_id = tokenizer.sep_token_id
pad_token_id = tokenizer.pad_token_id
unk_token_id = tokenizer.unk_token_id
print('init_token_id',init_token_id)
print('eos_token_id',eos_token_id)
print('pad_token_id',pad_token_id)
print('unk_token_id',unk_token_id)

init_token_id 101
eos_token_id 102
pad_token_id 0
unk_token_id 100


In [5]:
max_input_len = tokenizer.max_model_input_sizes['bert-base-uncased']# bert的输入句子长度
print('max_input_len',max_input_len)

max_input_len 512


In [None]:
device='cuda' if torch.cuda.is_available() else 'cpu'
# 把句子长度切成510，加入开头跟结尾符
def tokenize_and_crop(sentence):
    tokens = tokenizer.tokenize(sentence)
    tokens = tokens[:max_input_len-2]
    return tokens

def load_data():
    text = data.Field(
        batch_first=True,
        use_vocab=False,
        tokenize=tokenize_and_crop,
        preprocessing=tokenizer.convert_tokens_to_ids,
        init_token=init_token_id,
        pad_token=pad_token_id,
        unk_token=unk_token_id
    )
    label = data.LabelField(dtype=torch.float)
    # 对于自己的数据集可以修改IMDB，对外调用只要
    train_data,test_data = datasets.IMDB.splits(text,label)
    print(train_data)
    train_data,valid_data = train_data.split(random_state=random.seed(SEED))
    print(f'train examples counts:{len(train_data)}')
    print(f'test examples counts:{len(test_data)}')
    print(f'valid examples counts:{len(valid_data)}')

    label.build_vocab(train_data)

    train_iter,valid_iter,test_iter = data.BucketIterator.splits(
        (train_data,valid_data,test_data),
        batch_size=BATCH_SIZE,
        device=device
        )
    return train_iter,valid_iter,test_iter

res = load_data()

In [38]:
# 查看loaddata内部实现
text = data.Field(
    batch_first=True,
    use_vocab=False,
    tokenize=tokenize_and_crop,
    preprocessing=tokenizer.convert_tokens_to_ids,
    init_token=init_token_id,
    pad_token=pad_token_id,
    unk_token=unk_token_id
)
label = data.LabelField(dtype=torch.float)
# IMDB
train_data,test_data = datasets.IMDB.splits(text_field=text,label_field=label)
print(train_data)
train_data,valid_data = train_data.split(random_state=random.seed(SEED))
print(f'train examples counts:{len(train_data)}')
print(f'test examples counts:{len(test_data)}')
print(f'valid examples counts:{len(valid_data)}')

train examples counts:17500
test examples counts:25000
valid examples counts:7500


In [None]:
train_data.__dict__

In [63]:
train_iter,valid_iter,test_iter = data.BucketIterator.splits(
    (train_data,valid_data,test_data),
    batch_size=BATCH_SIZE,
    device=device
    )
train_iter

<torchtext.legacy.data.iterator.BucketIterator at 0x7f914039e128>

In [62]:
label.build_vocab(train_data)
label.vocab

<torchtext.legacy.vocab.Vocab at 0x7f914039e278>

In [2]:
bert_model = BertModel.from_pretrained('bert-base-uncased')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
bert_model

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0): BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
          

In [66]:
for batch in train_iter:
    print(batch.text.shape)
    break


torch.Size([128, 511])


In [71]:
type(bert_model.config)

transformers.models.bert.configuration_bert.BertConfig

# 测试中文分词工具jieba

In [1]:
# 测试
import jieba
seg_list = jieba.cut('我爱人工智能')
print(' '.join(seg_list))

Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.264 seconds.
Prefix dict has been built successfully.


我 爱 人工智能


# 测试中文转拼音工具Pypinyin

In [2]:
from pypinyin import lazy_pinyin,Style
a = lazy_pinyin('今天是个钓鱼的好天气',style=Style.TONE3)
print(a)

['jin1', 'tian1', 'shi4', 'ge4', 'diao4', 'yu2', 'de', 'hao3', 'tian1', 'qi4']


In [4]:
from pypinyin.contrib.mmseg import seg
text = '今天是个钓鱼的好天气'
b = list(seg.cut(text))
print(b)
print(' '.join(jieba.cut(text)))

['今', '天', '是', '个', '钓', '鱼', '的', '好', '天', '气']
今天 是 个 钓鱼 的 好 天气


# 评估翻译质量的算法库 ScareBleu

In [None]:
import sacrebleu
