In [8]:
import torch
from transformers import BertTokenizer
from IPython.display import clear_output

PRETRAINED_MODEL_NAME = "bert-base-chinese"  # 指定繁簡中文 BERT-BASE 預訓練模型

# 取得此預訓練模型所使用的 tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

clear_output()
print("PyTorch 版本：", torch.__version__)

PyTorch 版本： 1.7.0


In [7]:
!pip install pandas

distutils: /Users/thetawang/miniforge_x86_64/envs/pytorch_x86/include/python3.7m/UNKNOWN
sysconfig: /Users/thetawang/miniforge_x86_64/envs/pytorch_x86/include/python3.7m[0m
user = False
home = None
root = None
prefix = None[0m
Collecting pandas
  Downloading pandas-1.2.4-cp37-cp37m-macosx_10_9_x86_64.whl (10.4 MB)
[K     |████████████████████████████████| 10.4 MB 2.6 MB/s eta 0:00:01
[?25hCollecting pytz>=2017.3
  Downloading pytz-2021.1-py2.py3-none-any.whl (510 kB)
[K     |████████████████████████████████| 510 kB 4.4 MB/s eta 0:00:01
Installing collected packages: pytz, pandas
distutils: /Users/thetawang/miniforge_x86_64/envs/pytorch_x86/include/python3.7m/UNKNOWN
sysconfig: /Users/thetawang/miniforge_x86_64/envs/pytorch_x86/include/python3.7m[0m
user = False
home = None
root = None
prefix = None[0m
Successfully installed pandas-1.2.4 pytz-2021.1


In [2]:
vocab = tokenizer.vocab
print("字典大小：", len(vocab))

字典大小： 21128


In [3]:
import random
random_tokens = random.sample(list(vocab), 10)
random_ids = [vocab[t] for t in random_tokens]

print("{0:20}{1:15}".format("token", "index"))
print("-" * 25)
for t, id in zip(random_tokens, random_ids):
    print("{0:15}{1:10}".format(t, id))

token               index          
-------------------------
##橋                 16635
凿                    1142
##刚                 14214
错                    7231
il                  12197
限                    7361
##遗                 19947
佘                     864
##☀                 13619
漱                    4038


## 視覺化

In [68]:

# 安裝 BertViz
"""
import sys
!test -d bertviz_repo || git clone https://github.com/jessevig/bertviz bertviz_repo
if not 'bertviz_repo' in sys.path:
  sys.path += ['bertviz_repo']
"""

!pip install bertviz

distutils: /Users/thetawang/miniforge_x86_64/envs/pytorch_x86/include/python3.7m/UNKNOWN
sysconfig: /Users/thetawang/miniforge_x86_64/envs/pytorch_x86/include/python3.7m[0m
user = False
home = None
root = None
prefix = None[0m
Collecting bertviz
  Downloading bertviz-1.0.0-py3-none-any.whl (162 kB)
[K     |████████████████████████████████| 162 kB 783 kB/s eta 0:00:01
Collecting sentencepiece
  Downloading sentencepiece-0.1.95-cp37-cp37m-macosx_10_6_x86_64.whl (1.1 MB)
[K     |████████████████████████████████| 1.1 MB 1.0 MB/s eta 0:00:01
Collecting future
  Downloading future-0.18.2.tar.gz (829 kB)
[K     |████████████████████████████████| 829 kB 1.7 MB/s eta 0:00:01
Collecting dataclasses
  Downloading dataclasses-0.6-py3-none-any.whl (14 kB)
Building wheels for collected packages: future
  Building wheel for future (setup.py) ... [?25ldone
[?25h  Created wheel for future: filename=future-0.18.2-py3-none-any.whl size=491059 sha256=e3353b560dda5c11590a819bd452ca6424563c895416d4de

In [15]:
# import packages
from transformers import BertTokenizer, BertModel
from bertviz import head_view

# 在 jupyter notebook 裡頭顯示 visualzation 的 helper
def call_html():
  import IPython
  display(IPython.core.display.HTML('''
        <script src="/static/components/requirejs/require.js"></script>
        <script>
          requirejs.config({
            paths: {
              base: '/static/base',
              "d3": "https://cdnjs.cloudflare.com/ajax/libs/d3/3.5.8/d3.min",
              jquery: '//ajax.googleapis.com/ajax/libs/jquery/2.0.0/jquery.min',
            },
          });
        </script>
        '''))

clear_output()

## 猜測識別

In [37]:
import torch
from IPython.display import clear_output
from transformers import BertTokenizer
tokenizer = BertTokenizer.from_pretrained(model_version)



# 記得我們是使用中文 BERT
model_version = 'bert-base-chinese'

# 情境句子
text_a = "[CLS]等到潮水[MASK]了,就知道誰沒穿褲子"
text_b = "等到潮水[MASK]了"
text_c = "就知道誰沒穿褲子"

#
def get_input_from_mask_sentence(a):
    tokens = tokenizer.tokenize(a)
    ids = tokenizer.convert_tokens_to_ids(tokens)
    
    # 除了 tokens 以外我們還需要辨別句子的 segment ids
    input_ids = torch.tensor([ids])  # (1, seq_len)
    token_type_ids = torch.zeros_like(input_ids)  # (1, seq_len)
    return tokens, input_ids ,token_type_ids

#
def get_input_from_two_sentence(a, b):
    inputs = tokenizer.encode_plus(a, b, return_tensors='pt', add_special_tokens=True)
    input_ids ,token_type_ids = inputs['input_ids'],inputs['token_type_ids']
    
    return input_ids ,token_type_ids

In [38]:
from transformers import BertForMaskedLM

#
maskedLM_model = BertForMaskedLM.from_pretrained(model_version)
clear_output()

tokens, input_ids ,token_type_ids = get_input_from_mask_sentence(text_a)

# 使用 masked LM 估計 [MASK] 位置所代表的實際 token 
maskedLM_model.eval()
with torch.no_grad():
    outputs = maskedLM_model(input_ids, token_type_ids=token_type_ids)
    predictions = outputs[0]
    # (1, seq_len, num_hidden_units)
del maskedLM_model

# 將 [MASK] 位置的機率分佈取 top k 最有可能的 tokens 出來
masked_index = 5
k = 3
probs, indices = torch.topk(torch.softmax(predictions[0, masked_index], -1), k)
predicted_tokens = tokenizer.convert_ids_to_tokens(indices.tolist())

# 顯示 top k 可能的字。一般我們就是取 top 1 當作預測值
print("輸入 tokens ：", tokens[:10], '...')
print('-' * 50)
for i, (t, p) in enumerate(zip(predicted_tokens, probs), 1):
    tokens[masked_index] = t
    print("Top {} ({:2}%)：{}".format(i, int(p.item() * 100), tokens[:10]), '...')
    


##
attention = model(input_ids, token_type_ids=token_type_ids)[-1]
input_id_list = input_ids[0].tolist() # Batch index 0
tokens = tokenizer.convert_ids_to_tokens(input_id_list)
call_html()



# 交給 BertViz 視覺化
head_view(attention, tokens)


輸入 tokens ： ['[CLS]', '等', '到', '潮', '水', '[MASK]', '了', ',', '就', '知'] ...
--------------------------------------------------
Top 1 (60%)：['[CLS]', '等', '到', '潮', '水', '來', '了', ',', '就', '知'] ...
Top 2 (11%)：['[CLS]', '等', '到', '潮', '水', '濕', '了', ',', '就', '知'] ...
Top 3 ( 4%)：['[CLS]', '等', '到', '潮', '水', '過', '了', ',', '就', '知'] ...


<IPython.core.display.Javascript object>

In [35]:
from transformers import BertModel

model = BertModel.from_pretrained(model_version, output_attentions=True)
tokenizer = BertTokenizer.from_pretrained(model_version)

##
input_ids ,token_type_ids = get_input_from_two_sentence(text_b, text_c)
attention = model(input_ids, token_type_ids=token_type_ids)[-1]

#
input_id_list = input_ids[0].tolist() # Batch index 0
tokens = tokenizer.convert_ids_to_tokens(input_id_list)
call_html()

# 交給 BertViz 視覺化
head_view(attention, tokens)


<IPython.core.display.Javascript object>

## 1. 準備資料

In [39]:
import os
import pandas as pd

# 簡單的數據清理，去除空白標題的 examples
df_train = pd.read_csv("./kaggle/train.csv")
empty_title = ((df_train['title2_zh'].isnull()) \
               | (df_train['title1_zh'].isnull()) \
               | (df_train['title2_zh'] == '') \
               | (df_train['title2_zh'] == '0'))
df_train = df_train[~empty_title]
# 剔除過長的樣本以避免 BERT 無法將整個輸入序列放入記憶體不多的 GPU
MAX_LENGTH = 30
df_train = df_train[~(df_train.title1_zh.apply(lambda x : len(x)) > MAX_LENGTH)]
df_train = df_train[~(df_train.title2_zh.apply(lambda x : len(x)) > MAX_LENGTH)]

# 只用 1% 訓練數據看看 BERT 對少量標註數據有多少幫助
SAMPLE_FRAC = 0.01
df_train = df_train.sample(frac=SAMPLE_FRAC, random_state=9527)

# 去除不必要的欄位並重新命名兩標題的欄位名
df_train = df_train.reset_index()
df_train = df_train.loc[:, ['title1_zh', 'title2_zh', 'label']]
df_train.columns = ['text_a', 'text_b', 'label']

# idempotence, 將處理結果另存成 tsv 供 PyTorch 使用
df_train.to_csv("train.tsv", sep="\t", index=False)

print("訓練樣本數：", len(df_train))

print(f"{df_train.label.value_counts() / len(df_train)}")
df_train.head()


訓練樣本數： 2657
unrelated    0.679338
agreed       0.294317
disagreed    0.026346
Name: label, dtype: float64


Unnamed: 0,text_a,text_b,label
0,苏有朋要结婚了，但网友觉得他还是和林心如比较合适,好闺蜜结婚给不婚族的秦岚扔花球，倒霉的秦岚掉水里笑哭苏有朋！,unrelated
1,爆料李小璐要成前妻了贾乃亮模仿王宝强一步到位、快刀斩乱麻！,李小璐要变前妻了？贾乃亮可能效仿王宝强当机立断，快刀斩乱麻！,agreed
2,为彩礼，母亲把女儿嫁给陌生男子，十年后再见面，母亲湿了眼眶,阿姨，不要彩礼是觉得你家穷，给你台阶下，不要以为我嫁不出去！,unrelated
3,猪油是个宝，一勺猪油等于十副药，先备起来再说,传承千百的猪油为何变得人人唯恐避之不及？揭开猪油的四大谣言！,unrelated
4,剖析：香椿，为什么会致癌？,香椿含亚硝酸盐多吃会致癌？测完发现是谣言,disagreed


In [58]:
df_test = pd.read_csv("./kaggle/test.csv")
df_test = df_test.loc[:, ["title1_zh", "title2_zh", "id"]]
df_test.columns = ["text_a", "text_b", "Id"]
df_test.to_csv("test.tsv", sep="\t", index=False)

print("預測樣本數：", len(df_test))
df_test.head()

預測樣本數： 80126


Unnamed: 0,text_a,text_b,Id
0,萨拉赫人气爆棚!埃及总统大选未参选获百万选票 现任总统压力山大,辟谣！里昂官方否认费基尔加盟利物浦，难道是价格没谈拢？,321187
1,萨达姆被捕后告诫美国的一句话，发人深思,10大最让美国人相信的荒诞谣言，如蜥蜴人掌控着美国,321190
2,萨达姆此项计划没有此国破坏的话，美国还会对伊拉克发动战争吗,萨达姆被捕后告诫美国的一句话，发人深思,321189
3,萨达姆被捕后告诫美国的一句话，发人深思,被绞刑处死的萨达姆是替身？他的此男人举动击破替身谣言！,321193
4,萨达姆被捕后告诫美国的一句话，发人深思,中国川贝枇杷膏在美国受到热捧？纯属谣言！,321191


## 2. 將原始文本轉換成 BERT 相容的輸入格式

In [6]:
"""
實作一個可以用來讀取訓練 / 測試集的 Dataset，這是你需要徹底了解的部分。
此 Dataset 每次將 tsv 裡的一筆成對句子轉換成 BERT 相容的格式，並回傳 3 個 tensors：
- tokens_tensor：兩個句子合併後的索引序列，包含 [CLS] 與 [SEP]
- segments_tensor：可以用來識別兩個句子界限的 binary tensor
- label_tensor：將分類標籤轉換成類別索引的 tensor, 如果是測試集則回傳 None
"""
from torch.utils.data import Dataset
 
    
class FakeNewsDataset(Dataset):
    # 讀取前處理後的 tsv 檔並初始化一些參數
    def __init__(self, mode, tokenizer,**kwargs):
        assert mode in ["train", "test", "once"]  # 一般訓練你會需要 dev set
        
        self.mode = mode
        # 大數據你會需要用 iterator=True
        if mode != "once":
            self.df = pd.read_csv(mode + ".tsv", sep="\t").fillna("")
        else:
            assert "text_a" in kwargs and "text_b" in kwargs and "label" in kwargs, f"args missing: {kwargs}"
            self.df = self.input_preprocess(kwargs["text_a"],kwargs["text_b"],kwargs["label"])
            
        self.len = len(self.df)
        self.label_map = {'agreed': 0, 'disagreed': 1, 'unrelated': 2}
        self.tokenizer = tokenizer  # 我們將使用 BERT tokenizer
        
    
    # 定義回傳一筆訓練 / 測試數據的函式
    def __getitem__(self, idx):
        if self.mode == "test":
            text_a, text_b = self.df.iloc[idx, :2].values
            label_tensor = None
        else:
            text_a, text_b, label = self.df.iloc[idx, :].values
            # 將 label 文字也轉換成索引方便轉換成 tensor
            label_id = self.label_map[label]
            label_tensor = torch.tensor(label_id)
            
        # 建立第一個句子的 BERT tokens 並加入分隔符號 [SEP]
        word_pieces = ["[CLS]"]
        tokens_a = self.tokenizer.tokenize(text_a)
        word_pieces += tokens_a + ["[SEP]"]
        len_a = len(word_pieces)
        
        # 第二個句子的 BERT tokens
        tokens_b = self.tokenizer.tokenize(text_b)
        word_pieces += tokens_b + ["[SEP]"]
        len_b = len(word_pieces) - len_a
        
        # 將整個 token 序列轉換成索引序列
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)
        
        # 將第一句包含 [SEP] 的 token 位置設為 0，其他為 1 表示第二句
        segments_tensor = torch.tensor([0] * len_a + [1] * len_b, 
                                        dtype=torch.long)
        
        return (tokens_tensor, segments_tensor, label_tensor)
    
    def __len__(self):
        return self.len
    
    @staticmethod
    def input_preprocess(text_a,text_b,label):
        assert len(text_a) == len(text_b) == len(label), "count have to equal of sample sets"
        return pd.DataFrame({'text_a': text_a, 'text_b': text_b ,"label":label})
    
# 初始化一個專門讀取訓練樣本的 Dataset，使用中文 BERT 斷詞
trainset = FakeNewsDataset("train", tokenizer=tokenizer)

In [13]:


# 選擇第一個樣本
sample_idx = 0

# 將原始文本拿出做比較
text_a, text_b, label = trainset.df.iloc[sample_idx].values

# 利用剛剛建立的 Dataset 取出轉換後的 id tensors
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]

# 將 tokens_tensor 還原成文本
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())
combined_text = "".join(tokens)

# 渲染前後差異，毫無反應就是個 print。可以直接看輸出結果
print(f"""[原始文本]
句子 1：{text_a}
句子 2：{text_b}
分類  ：{label}

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：{tokens_tensor}

segments_tensor：{segments_tensor}

label_tensor   ：{label_tensor}

--------------------

[還原 tokens_tensors]
{combined_text}
""")


[原始文本]
句子 1：苏有朋要结婚了，但网友觉得他还是和林心如比较合适
句子 2：好闺蜜结婚给不婚族的秦岚扔花球，倒霉的秦岚掉水里笑哭苏有朋！
分類  ：unrelated

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：tensor([ 101, 5722, 3300, 3301, 6206, 5310, 2042,  749, 8024,  852, 5381, 1351,
        6230, 2533,  800, 6820, 3221, 1469, 3360, 2552, 1963, 3683, 6772, 1394,
        6844,  102, 1962, 7318, 6057, 5310, 2042, 5314,  679, 2042, 3184, 4638,
        4912, 2269, 2803, 5709, 4413, 8024,  948, 7450, 4638, 4912, 2269, 2957,
        3717, 7027, 5010, 1526, 5722, 3300, 3301, 8013,  102])

segments_tensor：tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
        1, 1, 1, 1, 1, 1, 1, 1, 1])

label_tensor   ：2

--------------------

[還原 tokens_tensors]
[CLS]苏有朋要结婚了，但网友觉得他还是和林心如比较合适[SEP]好闺蜜结婚给不婚族的秦岚扔花球，倒霉的秦岚掉水里笑哭苏有朋！[SEP]



In [14]:
"""
實作可以一次回傳一個 mini-batch 的 DataLoader
這個 DataLoader 吃我們上面定義的 `FakeNewsDataset`，
回傳訓練 BERT 時會需要的 4 個 tensors：
- tokens_tensors  : (batch_size, max_seq_len_in_batch)
- segments_tensors: (batch_size, max_seq_len_in_batch)
- masks_tensors   : (batch_size, max_seq_len_in_batch)
- label_ids       : (batch_size)
"""

from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# 這個函式的輸入 `samples` 是一個 list，裡頭的每個 element 都是
# 剛剛定義的 `FakeNewsDataset` 回傳的一個樣本，每個樣本都包含 3 tensors：
# - tokens_tensor
# - segments_tensor
# - label_tensor
# 它會對前兩個 tensors 作 zero padding，並產生前面說明過的 masks_tensors
def create_mini_batch(samples):
    tokens_tensors = [s[0] for s in samples]
    segments_tensors = [s[1] for s in samples]
    
    # 測試集有 labels
    if samples[0][2] is not None:
        label_ids = torch.stack([s[2] for s in samples])
    else:
        label_ids = None
    
    # zero pad 到同一序列長度
    tokens_tensors = pad_sequence(tokens_tensors, 
                                  batch_first=True)
    segments_tensors = pad_sequence(segments_tensors, 
                                    batch_first=True)
    
    # attention masks，將 tokens_tensors 裡頭不為 zero padding
    # 的位置設為 1 讓 BERT 只關注這些位置的 tokens
    masks_tensors = torch.zeros(tokens_tensors.shape, 
                                dtype=torch.long)
    masks_tensors = masks_tensors.masked_fill(
        tokens_tensors != 0, 1)
    
    return tokens_tensors, segments_tensors, masks_tensors, label_ids


# 初始化一個每次回傳 64 個訓練樣本的 DataLoader
# 利用 `collate_fn` 將 list of samples 合併成一個 mini-batch 是關鍵
BATCH_SIZE = 16
trainloader = DataLoader(trainset, batch_size=BATCH_SIZE, 
                         collate_fn=create_mini_batch)

In [43]:
data = next(iter(trainloader))

tokens_tensors, segments_tensors, \
    masks_tensors, label_ids = data

print(f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
""")


tokens_tensors.shape   = torch.Size([64, 63]) 
tensor([[ 101, 5722, 3300,  ...,    0,    0,    0],
        [ 101, 4255, 3160,  ..., 8013,  102,    0],
        [ 101,  711, 2506,  ..., 8013,  102,    0],
        ...,
        [ 101,  671, 2157,  ...,    0,    0,    0],
        [ 101, 1380,  677,  ...,    0,    0,    0],
        [ 101, 2458, 1853,  ...,    0,    0,    0]])
------------------------
segments_tensors.shape = torch.Size([64, 63])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 1, 1, 0],
        [0, 0, 0,  ..., 1, 1, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
------------------------
masks_tensors.shape    = torch.Size([64, 63])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        [1, 1, 1,  ..., 1, 1, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
------------------------
label_ids.shape        

## 3. 在 BERT 之上加入新 layer 成下游任務模型

In [44]:
# 載入一個可以做中文多分類任務的模型，n_class = 3
from transformers import BertForSequenceClassification

PRETRAINED_MODEL_NAME = "bert-base-chinese"
NUM_LABELS = 3

model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)

clear_output()

# high-level 顯示此模型裡的 modules
print("""
name            module
----------------------""")
for name, module in model.named_children():
    if name == "bert":
        for n, _ in module.named_children():
            print(f"{name}:{n}")
    else:
        print("{:15} {}".format(name, module))

print(model.config)


name            module
----------------------
bert:embeddings
bert:encoder
bert:pooler
dropout         Dropout(p=0.1, inplace=False)
classifier      Linear(in_features=768, out_features=3, bias=True)
BertConfig {
  "_name_or_path": "bert-base-chinese",
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "directionality": "bidi",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "pooler_fc_size": 768,
  "pooler_num_attention_heads": 12,
  "pooler_num_fc_layers": 3,
  "pooler_size_per_head": 128,
  "pooler_type": "first_t

In [22]:
"""
定義一個可以針對特定 DataLoader 取得模型預測結果以及分類準確度的函式
之後也可以用來生成上傳到 Kaggle 競賽的預測結果

2019/11/22 更新：在將 `tokens`、`segments_tensors` 等 tensors
丟入模型時，強力建議指定每個 tensor 對應的參數名稱，以避免 HuggingFace
更新 repo 程式碼並改變參數順序時影響到我們的結果。
"""

def get_pred(sentence):
    pass

def get_predictions(model, dataloader, compute_acc=False, sample_num=-1):
    predictions = None
    correct = 0
    total = 0
      
    with torch.no_grad():
        # 遍巡整個資料集
        for data in dataloader:
            # 將所有 tensors 移到 GPU 上
            if next(model.parameters()).is_cuda:
                data = [t.to(device_name) for t in data if t is not None]
            
            
            # 別忘記前 3 個 tensors 分別為 tokens, segments 以及 masks
            # 且強烈建議在將這些 tensors 丟入 `model` 時指定對應的參數名稱
            tokens_tensors, segments_tensors, masks_tensors = data[:3]

            outputs = model(input_ids=tokens_tensors, 
                            token_type_ids=segments_tensors, 
                            attention_mask=masks_tensors)
            
            logits = outputs[0]
            _, pred = torch.max(logits.data, 1)
            
            # 用來計算訓練集的分類準確率
            if compute_acc:
                labels = data[3]
                total += labels.size(0)
                correct += (pred == labels).sum().item()
                
            # 將當前 batch 記錄下來
            if predictions is None:
                predictions = pred
            else:
                predictions = torch.cat((predictions, pred))
            
            print(f"total: {total}")
            if total > sample_num and sample_num != -1
                break
    
    if compute_acc:
        acc = correct / total
        return predictions, acc
    return predictions
    
# 讓模型跑在 GPU 上並取得訓練集的分類準確率
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print("device:", device)
model = model.to(device)
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)

device: cpu
total: 16
classification acc: 0.6875


## 4. 訓練該下游任務模型

In [54]:
import datetime
# 訓練模式
model.train()

# 使用 Adam Optim 更新整個分類模型的參數
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)


EPOCHS = 6  # 幸運數字
print(f"start training...")
for epoch in range(EPOCHS):
    print(f"[epoch {epoch}]")
    running_loss = 0.0
    step = 0
    for data in trainloader:
        step+=1
        print(f"steps: {step}")
        
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)

        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()


        # 紀錄當前 batch loss
        running_loss += loss.item()
        
        if step>10:
            break
        
    # 計算分類準確率
    _, acc = get_predictions(model, trainloader, compute_acc=True,sample_num=)

    print('[epoch %d] loss: %.3f, acc: %.3f' % (epoch + 1, running_loss, acc))

start training...
[epoch 0]
steps: 1
steps: 2
steps: 3
steps: 4
steps: 5
steps: 6
steps: 7
steps: 8
steps: 9
steps: 10
steps: 11
total: 64
[epoch 1] loss: 4.555, acc: 0.969
[epoch 1]
steps: 1
steps: 2
steps: 3
steps: 4
steps: 5
steps: 6
steps: 7
steps: 8
steps: 9
steps: 10
steps: 11
total: 64
[epoch 2] loss: 4.043, acc: 0.969
[epoch 2]
steps: 1
steps: 2
steps: 3
steps: 4
steps: 5
steps: 6
steps: 7
steps: 8
steps: 9
steps: 10
steps: 11
total: 64
[epoch 3] loss: 3.501, acc: 0.953
[epoch 3]
steps: 1
steps: 2
steps: 3
steps: 4
steps: 5
steps: 6
steps: 7
steps: 8
steps: 9
steps: 10
steps: 11
total: 64
[epoch 4] loss: 2.678, acc: 0.969
[epoch 4]
steps: 1
steps: 2
steps: 3
steps: 4
steps: 5
steps: 6
steps: 7
steps: 8
steps: 9
steps: 10
steps: 11
total: 64
[epoch 5] loss: 2.272, acc: 0.969
[epoch 5]
steps: 1
steps: 2
steps: 3
steps: 4
steps: 5
steps: 6
steps: 7
steps: 8
steps: 9
steps: 10
steps: 11
total: 64
[epoch 6] loss: 1.924, acc: 0.969


In [106]:
BATCH_SIZE =16
testset = FakeNewsDataset("test", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=BATCH_SIZE, 
                        collate_fn=create_mini_batch)

# 用來將預測的 label id 轉回 label 文字
index_map = {v: k for k, v in testset.label_map.items()}

# 用分類模型預測測試集
predictions = get_predictions(model, testloader)

#
df = pd.DataFrame({"predicted": predictions.tolist()})
df['predicted'] = df.predicted.apply(lambda x: index_map[x])
df1 = pd.concat([trainset.df, df.loc[:, 'predicted']], axis=1)[:BATCH_SIZE]
df1.head()

tokens_tensors:tensor([[ 101, 5855, 2861, 6622,  782, 3698, 4255, 3476,  106, 1812, 1350, 2600,
         5320, 1920, 6848, 3313, 1346, 6848, 5815, 4636,  674, 6848, 4873, 4385,
          818, 2600, 5320, 1327, 1213, 2255, 1920,  102, 6792, 6469, 8013, 7027,
         3203, 2135, 3175, 1415, 6371, 6589, 1825, 2209, 1217, 4673, 1164, 4289,
         3855, 8024, 7410, 6887, 3221,  817, 3419, 3766, 6448, 2879, 8043,  102],
        [ 101, 5855, 6809, 1990, 6158, 2936, 1400, 1440, 6425, 5401, 1744, 4638,
          671, 1368, 6413, 8024, 1355,  782, 3918, 2590,  102, 8108, 1920, 3297,
         6375, 5401, 1744,  782, 4685,  928, 4638, 5774, 6414, 6469, 6241, 8024,
         1963, 6060, 6062,  782, 2958, 2971, 4708, 5401, 1744,  102,    0,    0,
            0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0],
        [ 101, 5855, 6809, 1990, 3634, 7555, 6369, 1153, 3766, 3300, 3634, 1744,
         4788, 1776, 4638, 6413, 8024, 5401, 1744, 6820,  833, 2190,  823, 2861,
         10

total: 0


Unnamed: 0,text_a,text_b,label,predicted
0,苏有朋要结婚了，但网友觉得他还是和林心如比较合适,好闺蜜结婚给不婚族的秦岚扔花球，倒霉的秦岚掉水里笑哭苏有朋！,unrelated,agreed
1,爆料李小璐要成前妻了贾乃亮模仿王宝强一步到位、快刀斩乱麻！,李小璐要变前妻了？贾乃亮可能效仿王宝强当机立断，快刀斩乱麻！,agreed,unrelated
2,为彩礼，母亲把女儿嫁给陌生男子，十年后再见面，母亲湿了眼眶,阿姨，不要彩礼是觉得你家穷，给你台阶下，不要以为我嫁不出去！,unrelated,unrelated
3,猪油是个宝，一勺猪油等于十副药，先备起来再说,传承千百的猪油为何变得人人唯恐避之不及？揭开猪油的四大谣言！,unrelated,unrelated
4,剖析：香椿，为什么会致癌？,香椿含亚硝酸盐多吃会致癌？测完发现是谣言,disagreed,unrelated


In [3]:
import torch
PATH_MODEL_SAVE = "./save.pt"

# Save:
torch.save(model.state_dict(), PATH_MODEL_SAVE)

NameError: name 'model' is not defined

In [24]:
from transformers import BertConfig, BertForSequenceClassification,BertTokenizer
import torch
import pandas as pd
from torch.utils.data import DataLoader



# Load:
NUM_LABELS = 3
PRETRAINED_MODEL_NAME = "bert-base-chinese"  # 指定繁簡中文 BERT-BASE 預訓練模型

# 取得此預訓練模型所使用的 tokenizer
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
config = BertConfig.from_pretrained("bert-base-uncased", num_labels=NUM_LABELS, hidden_dropout_prob=0.1)
model = BertForSequenceClassification(config=config)
model.load_state_dict(torch.load(PATH_MODEL_SAVE))
model.eval()

# TEST:

BATCH_SIZE =16
text_a = ["警署稱黑衣人砸辦公室事件純屬誤會"]
text_b = ["警署遭爆料公社爆料黑衣人砸辦公室事件吃案"]
text_label = ["disagreed"]
onceset = FakeNewsDataset("once", tokenizer=tokenizer,text_a=text_a,text_b=text_b,label=text_label)
onceloader = DataLoader(onceset, batch_size=BATCH_SIZE, collate_fn=create_mini_batch)

# 用來將預測的 label id 轉回 label 文字
index_map = {v: k for k, v in onceset.label_map.items()}

# 用分類模型預測測試集
predictions = get_predictions(model, onceloader)
df = pd.DataFrame({"predicted": predictions.tolist()})
df['predicted'] = df.predicted.apply(lambda x: index_map[x])
df1 = pd.concat([onceset.df, df.loc[:, 'predicted']], axis=1)[:BATCH_SIZE]
df1.head()

total: 0


Unnamed: 0,text_a,text_b,label,predicted
0,警署稱黑衣人砸辦公室事件純屬誤會,警署遭爆料公社爆料黑衣人砸辦公室事件吃案,disagreed,unrelated
