<a href="https://colab.research.google.com/github/Bossism/DailyStudy/blob/master/bert_classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.10.2-py3-none-any.whl (2.8 MB)
[K     |████████████████████████████████| 2.8 MB 5.1 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.45-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 37.3 MB/s 
[?25hCollecting huggingface-hub>=0.0.12
  Downloading huggingface_hub-0.0.17-py3-none-any.whl (52 kB)
[K     |████████████████████████████████| 52 kB 1.4 MB/s 
[?25hCollecting pyyaml>=5.1
  Downloading PyYAML-5.4.1-cp37-cp37m-manylinux1_x86_64.whl (636 kB)
[K     |████████████████████████████████| 636 kB 42.5 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 32.8 MB/s 
Installing collected packages: tokenizers, sacremoses, pyyaml, huggingface-hub, transformers
  Attempting uninstall: pyyaml
    Found existing installati

In [10]:
import torch
from transformers import BertTokenizer
from IPython.display import clear_output
import random
import pandas as pd
from torch.utils.data import Dataset
# import pysnooper
from torch.utils.data import  DataLoader
from torch.nn.utils.rnn import pad_sequence

PRETRAINED_MODEL_NAME = "bert-base-chinese" # 指定预训练模型
# 取得此预训练模型所使用的tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

clear_output()
# print("PyTorch 版本：", torch.__version__)

vocab = tokenizer.vocab  # 21128
# print("字典大小：", len(vocab))

random_tokens = random.sample(list(vocab), 10)
random_ids = [vocab[t] for t in random_tokens]
# print("{0:20}{1:15}".format("token", "index"))
# print("-"*25)
# for t, id in zip(random_tokens, random_ids):
#     print("{0:15}{1:10}".format(t, id))

text = "[CLS] 等到潮水[MASK]了,就知道谁没穿裤子。"
tokens = tokenizer.tokenize(text)
ids = tokenizer.convert_tokens_to_ids(tokens)
# print(text)
# print(tokens)
# print(ids)

from transformers import BertForMaskedLM
tokens_tensor = torch.tensor([ids])
segments_tensors = torch.zeros_like(tokens_tensor)
maskedLM_model = BertForMaskedLM.from_pretrained(PRETRAINED_MODEL_NAME)
clear_output()
maskedLM_model.eval()
with torch.no_grad():
    outputs = maskedLM_model(tokens_tensor, segments_tensors)
    predictions = outputs[0]
del maskedLM_model
masked_index = 5
k = 3
probs, indices = torch.topk(torch.softmax(predictions[0, masked_index], -1), k)
predicted_tokens = tokenizer.convert_ids_to_tokens(indices.tolist())
# print("输入 tokens:", tokens[:10], '...')
# print('-'*50)
# for i, (t, p) in enumerate(zip(predicted_tokens, probs), 1):
#     tokens[masked_index] = t
#     print("Top {} ({:2}%) : {}".format(i, int(p.item() * 100), tokens[:10]), '...')

# prepare raw data
df_train = pd.read_csv("train.csv")
# empty_title = ((df_train['title1_zh'].isnull()) | (df_train['title2_zh'].isnull()) | (df_train['title2_zh'] == '0') | (df_train['title2_zh'] == ''))
empty_title = ((df_train['title2_zh'].isnull()) \
               | (df_train['title1_zh'].isnull()) \
               | (df_train['title2_zh'] == '') \
               | (df_train['title2_zh'] == '0'))
df_train = df_train[~empty_title]
MAX_LENGTH = 30
df_train = df_train[~(df_train.title1_zh.apply(lambda x : len(x)) > MAX_LENGTH)]
df_train = df_train[~(df_train.title2_zh.apply(lambda x : len(x)) > MAX_LENGTH)]
SAMPLE_FRAC = 0.01
df_train = df_train.sample(frac=SAMPLE_FRAC, random_state=9527)
df_train = df_train.reset_index()
df_train = df_train.loc[:, ['title1_zh', 'title2_zh', 'label']]
df_train.columns = ['text_a', 'text_b', 'label']
df_train.to_csv("train.tsv", sep='\t', index=False)
# print("训练样本数：", len(df_train)) # 2657
# print(df_train.head())
# print(df_train.label.value_counts() / len(df_train)) # unrelated  0.679338

df_test = pd.read_csv("test.csv")
df_test = df_test.loc[:, ["title1_zh", "title2_zh", "id"]]
df_test.columns = ["text_a", "text_b", "Id"]
df_test.to_csv("test.tsv", sep="\t", index=False)
# print("预测样本数:", len(df_test))  # 80126
# print(df_test.head())

# Convert the original text into a BERT compatible input format
class FakeNewDataset(Dataset):
    def __init__(self, mode, tokenizer):
        assert mode in ["train", "test"]
        self.mode = mode
        self.df = pd.read_csv(mode + ".tsv", sep="\t").fillna("")
        self.len = len(self.df)
        self.label_map = {'agreed': 0, 'disagreed': 1, 'unrelated': 2}
        self.tokenizer = tokenizer

    def __len__(self):
        return self.len
    # @pysnooper.snoop()
    # Define a function to return a training/test data
    def __getitem__(self, idx):
        if self.mode == "test":
            text_a, text_b = self.df.iloc[idx, :2].values
            label_tensor = None
        else:
            text_a, text_b, label = self.df.iloc[idx, :].values
            label_id = self.label_map[label]
            label_tensor = torch.tensor(label_id)

        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(text_a)
        word_pieces += tokens_a + ["[SEP]"]
        len_a = len(word_pieces)

        tokens_b = self.tokenizer.tokenize(text_b)
        word_pieces += tokens_b + ["[SEP]"]
        len_b = len(word_pieces) - len_a

        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)

        segments_tensor = torch.tensor([0]*len_a + [1]*len_b, dtype=torch.long)

        return (tokens_tensor, segments_tensor, label_tensor)


trainset = FakeNewDataset("train", tokenizer=tokenizer)

# 選擇第一個樣本
sample_idx = 0

# 將原始文本拿出做比較
text_a, text_b, label = trainset.df.iloc[sample_idx].values

# 利用剛剛建立的 Dataset 取出轉換後的 id tensors
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]

# 將 tokens_tensor 還原成文本
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = "".join(tokens)

# 渲染前後差異，毫無反應就是個 print。可以直接看輸出結果
# print(f"""[原始文本]
# 句子 1：{text_a}
# 句子 2：{text_b}
# 分類  ：{label}
#
# --------------------
#
# [Dataset 回傳的 tensors]
# tokens_tensor  ：{tokens_tensor}
#
# segments_tensor：{segments_tensor}
#
# label_tensor   ：{label_tensor}
#
# --------------------
#
# [還原 tokens_tensors]
# {combined_text}
# """)

def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]

    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None

    tokens_tensors = pad_sequence(tokens_tensors, batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, batch_first=True)

    # attention mask 将tokens_tensors中不为zero padding的位置设为1， 让bert只关注这些位置的tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(tokens_tensors != 0, 1)

    return tokens_tensors, segments_tensors, masks_tensors, label_ids

BATCH_SIZE = 64
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch)

data = next(iter(trainloader))

tokens_tensors, segments_tensors, masks_tensors, label_ids = data

# print(f"""
# tokens_tensors.shape   = {tokens_tensors.shape}
# {tokens_tensors}
# ------------------------
# segments_tensors.shape = {segments_tensors.shape}
# {segments_tensors}
# ------------------------
# masks_tensors.shape    = {masks_tensors.shape}
# {masks_tensors}
# ------------------------
# label_ids.shape        = {label_ids.shape}
# {label_ids}
# """)

# add a layer to BERT
from transformers import BertForSequenceClassification

PRETRAINED_MODEL_NAME = "bert-base-chinese"
NUM_LABELS = 3
model = BertForSequenceClassification.from_pretrained(PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)
clear_output()
# print("""
# name            module
# ----------------------""")
# for name, module in model.named_children():
#     if name == "bert":
#         for n, _ in module.named_children():
#             print(f"{name}:{n}")
#     else:
#         print("{:15} {}".format(name, module))

def get_predictions(model, dataloader, compute_acc=False):
    predictions = None
    correct = 0
    total = 0

    with torch.no_grad():
        for data in dataloader:
            if next(model.parameters()).is_cuda:
                data = [t.to("cuda:0") for t in data if t is not None]

            tokens_tensors, segments_tensors, masks_tensors = data[:3]
            outputs = model(input_ids=tokens_tensors,
                            token_type_ids=segments_tensors,
                            attention_mask=masks_tensors)

            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)

            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()

            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))

        if compute_acc:
            acc = correct / total
            return predictions, acc
        return predictions

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# print("device:", device)
model = model.to(device)
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)

def get_learnable_params(module):
    return [p for p in module.parameters() if p.requires_grad]


model_params = get_learnable_params(model)
clf_params = get_learnable_params(model.classifier)
print(f"""
整個分類模型的參數量：{sum(p.numel() for p in model_params)}
線性分類器的參數量：{sum(p.numel() for p in clf_params)}
""")

model.train()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)
EPOCHS = 6
for epoch in range(EPOCHS):
    running_loss = 0.0
    for data in trainloader:
        tokens_tensors, segments_tensors, masks_tensors, labels = [t.to(device) for t in data]
        optimizer.zero_grad()
        outputs = model(input_ids=tokens_tensors,
                        token_type_ids=segments_tensors,
                        attention_mask=masks_tensors,
                        labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d loss: %.3f, acc: %.3f' % (epoch + 1, running_loss, acc))

# %%time
# 建立測試集。這邊我們可以用跟訓練時不同的 batch_size，看你 GPU 多大
testset = FakeNewDataset("test", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=256, 
                        collate_fn=create_mini_batch)

# 用分類模型預測測試集
predictions = get_predictions(model, testloader)

# 用來將預測的 label id 轉回 label 文字
index_map = {v: k for k, v in testset.label_map.items()}

# 生成 Kaggle 繳交檔案
df = pd.DataFrame({"Category": predictions.tolist()})
df['Category'] = df.Category.apply(lambda x: index_map[x])
df_pred = pd.concat([testset.df.loc[:, ["Id"]], 
                          df.loc[:, 'Category']], axis=1)
df_pred.to_csv('bert_1_prec_training_samples.csv', index=False)
df_pred.head()

classification acc: 0.029356417011667295

整個分類模型的參數量：102269955
線性分類器的參數量：2307

[epoch 1 loss: 32.024, acc: 0.808
[epoch 2 loss: 17.966, acc: 0.864
[epoch 3 loss: 13.264, acc: 0.882
[epoch 4 loss: 9.913, acc: 0.899
[epoch 5 loss: 6.871, acc: 0.925
[epoch 6 loss: 5.748, acc: 0.930


Unnamed: 0,Id,Category
0,321187,unrelated
1,321190,unrelated
2,321189,unrelated
3,321193,unrelated
4,321191,unrelated
