# A Gentle Introduction to implementing BERT using Hugging Face: Practice

[Project Link](https://medium.com/analytics-vidhya/a-gentle-introduction-to-implementing-bert-using-hugging-face-35eb480cff3)
* `BERT`
* Sentiment Analysis
* DataSet -- [Stanford Treebank Dataset](https://github.com/rajatbhatnagar94/bert_getting_started/tree/master/data)

In [314]:
import os
import pandas as pd
import torch
import transformers
import sklearn
from transformers import BertTokenizer,BertForSequenceClassification
from IPython.core.display import display, HTML
# alibaba modelscope
from modelscope import snapshot_download
# Route model downloads through an HTTP(S) proxy.
# NOTE(review): the address is hard-coded and looks environment-specific -- confirm before reuse.
os.environ.update({
    'HTTP_PROXY': 'http://10.188.48.164:8365/',
    'HTTPS_PROXY': 'http://10.188.48.164:8365/',
})

In [315]:
# Dataset configuration: a display name, one data-file path per split
# ("<split>_path"), and the list of class labels.
dataset = {
    "name": "Standford treebank",
    **{f"{split}_path": f"data/{split}.csv" for split in ("train", "dev", "test")},
    "classes": ["neg", "pos"],
}
# Load the three data splits with pandas.
def read_data():
    """Read the train, dev and test files into pandas DataFrames.

    The file paths come from the module-level ``dataset`` configuration and
    every file is parsed as tab-separated text.

    Returns:
        tuple: ``(train, dev, test)`` DataFrames, in that order.
    """
    split_keys = ("train_path", "dev_path", "test_path")
    frames = [pd.read_csv(dataset[key], sep="\t") for key in split_keys]
    return tuple(frames)
train, dev, test = read_data()

In [316]:
# Alternative: load bert-base-uncased from ModelScope
# bert_base_uncased = snapshot_download('AI-ModelScope/bert-base-uncased')
# Tokenize the sequences and convert them to vectorized form

# from_pretrained loads the pretrained tokenizer with the given name
# bert-base-uncased: the BERT base model in its lowercase-only (uncased) variant
# NOTE: loading BertTokenizer.from_pretrained("bert-base-uncased") was difficult in BML CodeLab, via both Hugging Face and ModelScope
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
# Next: create batches of the vectorized tokens for train, dev and test

`tokenizer.encode_plus`

作用：将文本序列编码为模型所需要的输入格式

参数说明


| 序号 | 参数名 | 作用 |
| -------- | -------- | -------- |
| 1     | `text`     | 要编码的seq, `encode_plus`将对seq每个字符串执行相同的编码操作，并将结果合并到一个`tensor`中     |
| 2     | `max_length`     | 编码后序列的最大长度，如果输入序列长度超过这个值，将会被截断（需配合`truncation=True`）     |
| 3     | `pad_to_max_length`     | 为`True`则短于`max_length`部分会被填充（新版本中已由`padding='max_length'`取代）     |
| 4     | `add_special_tokens`     | 是否在编码后的序列中加入特殊标记，例如`[CLS]`和`[SEP]`     |
| 5     | `padding_side`     | 启用填充时，填充是在序列左边还是右边     |
| 6     | `return_attention_mask`     | 是否返回`attention_mask`     |

`encode`仅返回`input_ids`

`encode_plus`返回所有的编码信息，具体如下：
1. `input_ids`: 是单词在词典中的编码
2. `token_type_ids`: 区分两个句子的编码（上句全为0，下句全为1）
3. `attention_mask`: 指定对哪些词进行self-attention操作




In [317]:
def encode(data, tokenizer, max_length=128):
    """Tokenize texts into fixed-length id and attention-mask tensors.

    Args:
        data: Iterable of text strings to encode.
        tokenizer: Tokenizer exposing a Hugging Face style ``encode_plus``.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        tuple: ``(input_ids, attention_mask)``, both ``torch.long`` tensors
        holding one row per input text.
    """
    input_ids = []
    attention_mask = []

    for text in data:
        # encode_plus returns all encoding information; the fields used here:
        #   input_ids: vocabulary ids of the tokens
        #   attention_mask: marks real tokens versus padding positions
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=max_length,
            add_special_tokens=True,
            padding='max_length',
            # Request truncation explicitly: every row must have the same
            # length, otherwise the torch.tensor() calls below fail on ragged input.
            truncation=True,
            return_attention_mask=True,
        )
        input_ids.append(tokenized_text["input_ids"])
        attention_mask.append(tokenized_text["attention_mask"])
    return torch.tensor(input_ids, dtype=torch.long), torch.tensor(attention_mask, dtype=torch.long)

In [318]:
# making batches
def get_batches(df, tokenizer, batch_size=2):
    """Build a randomly-sampled DataLoader over one data split.

    Args:
        df: DataFrame with a ``"text"`` column and a ``"classification"``
            column whose values appear in ``dataset["classes"]``.
        tokenizer: Tokenizer forwarded to ``encode``.
        batch_size: Number of examples per batch.

    Returns:
        torch.utils.data.DataLoader yielding
        ``(input_ids, attention_mask, label)`` batches.
    """
    texts = list(df["text"].values)
    # A label's integer id is its position in the configured class list.
    label_ids = [dataset["classes"].index(label) for label in df["classification"]]
    targets = torch.tensor(label_ids, dtype=torch.long)
    token_ids, masks = encode(texts, tokenizer)
    split_data = torch.utils.data.TensorDataset(token_ids, masks, targets)
    return torch.utils.data.DataLoader(
        split_data,
        sampler=torch.utils.data.RandomSampler(split_data),
        batch_size=batch_size,
    )

In [320]:
# Build one DataLoader per split, in train / dev / test order.
batch_train, batch_dev, batch_test = (
    get_batches(split_df, tokenizer, batch_size=2) for split_df in (train, dev, test)
)

In [341]:
def train_model(batch, model, optimizer, scheduler, epochs, device, max_grad_norm=None):
    """Fine-tune ``model`` on the given batches.

    Args:
        batch: Iterable of ``(input_ids, attention_mask, labels)`` tensor
            tuples, e.g. a DataLoader.
        model: Model whose forward pass accepts ``input_ids``,
            ``attention_mask`` and ``labels`` and returns an object with a
            ``loss`` attribute.
        optimizer: Optimizer stepped once per batch.
        scheduler: Learning-rate scheduler stepped once per batch.
        epochs: Number of passes over ``batch``.
        device: Device each batch is moved to before the forward pass.
        max_grad_norm: Gradient-norm clipping threshold. When ``None``, the
            value is read from the module-level ``parameters`` dict, which
            is what earlier callers relied on.
    """
    if max_grad_norm is None:
        max_grad_norm = parameters['max_grad_norm']

    # set the mode to training
    model.train()
    # iterate via each batch and transfer it to the target device
    for e in range(epochs):
        for i, batch_tuple in enumerate(batch):
            input_ids, attention_mask, labels = (t.to(device) for t in batch_tuple)
            # pass input_ids, attention_mask and labels to the model
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            if i % 100 == 0:
                # The two numbers printed after "iteration" are the 1-based
                # epoch and the 0-based batch index.
                print("loss - {0}, iteration - {1}/{2}".format(loss, e + 1, i))
            # Clear gradients left over from the previous step.
            model.zero_grad()
            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()
            scheduler.step()

In [346]:
# Evaluation
def evaluate(batch, model, device):
    """Run the model over every batch and collect predictions.

    Args:
        batch: Iterable of ``(input_ids, attention_mask, labels)`` tensor tuples.
        model: Model whose output exposes ``logits`` and ``attentions``.
        device: Device each batch is moved to before the forward pass.

    Returns:
        tuple: ``(input_ids, predictions, true_labels, attentions)`` as plain
        Python lists with one entry per example.
    """
    all_input_ids, all_predictions, all_labels, all_attentions = [], [], [], []
    # Switch the model to evaluation mode.
    model.eval()
    for batch_cpu in batch:
        input_ids_gpu, attention_mask, labels = (t.to(device) for t in batch_cpu)
        with torch.no_grad():
            outputs = model(input_ids=input_ids_gpu, attention_mask=attention_mask, labels=labels)
            # The logits are un-normalised class scores; argmax picks the class.
            batch_predictions = torch.argmax(outputs.logits.cpu(), dim=1).tolist()
            batch_labels = labels.cpu().tolist()
            batch_input_ids = input_ids_gpu.cpu().tolist()
            # Last attention layer, then index [:, -1, 0]: last head, first
            # position ([CLS]) -- assumes a (batch, heads, query, key) layout.
            last_layer = outputs.attentions[-1].cpu()
            batch_attentions = last_layer[:, -1, 0].tolist()
            all_input_ids.extend(batch_input_ids)
            all_predictions.extend(batch_predictions)
            all_labels.extend(batch_labels)
            all_attentions.extend(batch_attentions)
    return all_input_ids, all_predictions, all_labels, all_attentions




In [347]:
# Configure the run: hyper-parameters, device, model, optimizer and scheduler.
epochs = 2
parameters = dict(
    learning_rate=2e-5,
    num_warmup_steps=1000,
    # One scheduler step is taken per training batch.
    num_training_steps=epochs * len(batch_train),
    max_grad_norm=1,
)

device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
# Hidden states and attentions are requested so the model output exposes them.
model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased",
    num_labels=2,
    output_hidden_states=True,
    output_attentions=True,
).to(device)
# NOTE(review): transformers.AdamW appears to be deprecated in recent
# transformers releases -- confirm against the installed version.
optimizer = transformers.AdamW(
    model.parameters(),
    lr=parameters["learning_rate"],
    correct_bias=False,
)
scheduler = transformers.get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=parameters["num_warmup_steps"],
    num_training_steps=parameters["num_training_steps"],
)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

In [348]:
# Fine-tune the model on the training batches.
train_model(
    batch=batch_train,
    model=model,
    optimizer=optimizer,
    scheduler=scheduler,
    epochs=epochs,
    device=device,
)

loss - 0.3467331528663635, iteration - 1/0
loss - 0.790759265422821, iteration - 1/100
loss - 0.17640027403831482, iteration - 1/200
loss - 0.027052801102399826, iteration - 1/300
loss - 0.014387719333171844, iteration - 1/400
loss - 0.004789090249687433, iteration - 1/500
loss - 2.378199815750122, iteration - 1/600
loss - 0.008542665280401707, iteration - 1/700
loss - 0.005200526677072048, iteration - 1/800
loss - 0.017449242994189262, iteration - 1/900
loss - 0.10507798939943314, iteration - 1/1000
loss - 1.2595596313476562, iteration - 1/1100
loss - 0.007012128364294767, iteration - 1/1200
loss - 0.0033053935039788485, iteration - 1/1300
loss - 0.0027264056261628866, iteration - 1/1400
loss - 0.005942234303802252, iteration - 1/1500
loss - 0.0006619238411076367, iteration - 1/1600
loss - 0.0057159350253641605, iteration - 1/1700
loss - 0.00575929693877697, iteration - 1/1800
loss - 0.0021406777668744326, iteration - 1/1900
loss - 0.0027160923928022385, iteration - 1/2000
loss - 0.00

In [349]:
# Evaluate on the dev split and print per-class precision / recall / F1.
# `import sklearn` alone does not reliably load the `metrics` submodule, so
# import it explicitly before using it.
import sklearn.metrics

input_ids, predictions, true_labels, attentions = evaluate(batch_dev, model, device)
print(sklearn.metrics.classification_report(true_labels, predictions))


              precision    recall  f1-score   support

           0       0.92      0.89      0.90       428
           1       0.89      0.93      0.91       444

    accuracy                           0.91       872
   macro avg       0.91      0.91      0.91       872
weighted avg       0.91      0.91      0.91       872



In [350]:
# visualization
def get_length_without_special_tokens(sentence, pad_token_id=0):
    """Return how many token ids precede the first padding id.

    Despite the name, only padding is excluded: any other special token
    that appears before the first ``pad_token_id`` is counted.

    Args:
        sentence: Iterable of token ids.
        pad_token_id: Id treated as padding; counting stops at its first
            occurrence.

    Returns:
        int: Number of leading ids that are not ``pad_token_id``; the full
        length when no padding id is present.
    """
    length = 0
    for token_id in sentence:
        if token_id == pad_token_id:
            break
        length += 1
    return length
def print_attention(input_ids_all, attentions_all, tokenizer):
    """Display each sentence as HTML with tokens shaded by attention weight.

    Args:
        input_ids_all: Sequence of token-id lists, one per sentence.
        attentions_all: Sequence of per-token attention values, aligned with
            ``input_ids_all``.
        tokenizer: Tokenizer providing ``convert_ids_to_tokens``.
    """
    # Local import: tokens come from arbitrary text, so they are escaped
    # before being embedded in markup.
    from html import escape

    for input_ids, attention in zip(input_ids_all, attentions_all):
        # Drop everything from the first padding id onwards.
        len_input_ids = get_length_without_special_tokens(input_ids)
        spans = []
        for input_id, attention_value in zip(input_ids[:len_input_ids], attention[:len_input_ids]):
            token = tokenizer.convert_ids_to_tokens(input_id)
            # The attention value is multiplied by 10 and used as the alpha
            # channel of a yellow background; rgba() is the CSS function that
            # takes an alpha argument.
            spans.append(
                '<span style="background-color: rgba(255,255,0,{0})">{1}</span>'.format(
                    10 * attention_value, escape(token)
                )
            )
        display(HTML(" ".join(spans)))
# Render the attention visualisation for the evaluated examples.
print_attention(input_ids_all=input_ids, attentions_all=attentions, tokenizer=tokenizer)

In [351]:
# save the model
def save(model, tokenizer, output_dir='output'):
    """Save the model and tokenizer into ``output_dir``.

    Args:
        model: Object exposing ``save_pretrained(directory)``.
        tokenizer: Object exposing ``save_pretrained(directory)``.
        output_dir: Target directory; created, including parents, when it
            does not exist yet.
    """
    # exist_ok avoids the check-then-create race of a separate os.path.exists test.
    os.makedirs(output_dir, exist_ok=True)
    print("Saving model to {}".format(output_dir))
    model.save_pretrained(output_dir)
    tokenizer.save_pretrained(output_dir)
# Persist the fine-tuned model together with its tokenizer.
save(model=model, tokenizer=tokenizer)

Saving model to output
