# 透過BERT實現假新聞預測
### 修改自 https://leemeng.tw/attack_on_bert_transfer_learning_in_nlp.html

## 引入需要的library

In [1]:
import os
import csv
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer
from IPython.display import clear_output
from transformers import BertForSequenceClassification
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset

I0213 03:56:04.017554 139715234146048 file_utils.py:38] PyTorch version 1.1.0 available.


## 讀取資料並合併

In [2]:
# Read the two worksheets of the workbook as header-less DataFrames.
# The workbook is opened through a context manager so its file handle is
# closed as soon as both sheets have been parsed (the original left it open).
with pd.ExcelFile('line_fake_news.xlsx') as xls:
    df1 = pd.read_excel(xls, sheet_name='工作表1', header=None)
    df2 = pd.read_excel(xls, sheet_name='工作表2', header=None)

In [3]:
# Discard the positional columns that are not needed from each sheet, then
# give the two remaining columns the names used by the rest of the notebook.
not_rumor = df1.drop([0, 1, 4], axis=1)
rumor = df2.drop([0, 1, 4, 5], axis=1)
not_rumor.columns = rumor.columns = ['label', 'content']

In [4]:
# Stack the rumor and non-rumor rows into one frame and shuffle them.
# The shuffle is seeded: without a fixed random_state the row order differs
# on every run, so any seeded sampling done on this frame afterwards would
# still select different rows each time.
all_data = pd.concat([rumor, not_rumor], ignore_index=True).sample(
    frac=1, random_state=200)
all_data.head(10)

Unnamed: 0,label,content
387,NOT_RUMOR,這篇文章寫的真好！介紹給所有家人，值得推薦給全國中老年人看！（別錯過）“喝水”是長壽的第一要...
157,RUMOR,生魚片中毒案例日增日本當局呼籲提防海獸胃線蟲
85,RUMOR,根據日本論壇網站「Rocket24」報導，一名日本星巴克的前員工表示，自己絕不是故意要讓星巴...
47,RUMOR,「EXTRA無糖口香糖」含阿斯巴甜，會致癌
298,NOT_RUMOR,#兒童節優惠~2017全台遊樂園/博物館/觀光工廠連假優惠#宜蘭蘭陽博物館優惠期間：4/4(...
389,NOT_RUMOR,中央氣象局已發佈「尼莎」颱風海上警報，因應颱風來襲，自來水公司第三區管理處呼籲所有自來水用戶...
245,NOT_RUMOR,真逆齡台灣高齡醫學專家陳亮恭醫師，推翻過去陳舊觀念，在《真逆齡》一書中，告訴你抗老大智慧：)...
301,NOT_RUMOR,海倫清桃前天臉書爆炸性發文，沒多久她和苦守寒窯18年的老公戴發奎（葛格）攜手現身自掀她年齡、...
58,RUMOR,人家結婚干你屁事？政治學博士：結婚是兩個人的事，修法卻是兩千萬人的事！政治學博士曰：「人家結...
120,RUMOR,桃園傳吳郭魚羅湖病毒疫情，高雄嚴陣以待；請各位兄弟姐妹、親朋好友最近少碰吳郭魚；目前病毒無藥...


In [5]:
# Keep only the rows whose content is shorter than 512 characters,
# then preview the first ten remaining rows.
all_data = all_data.loc[all_data['content'].str.len().lt(512)]
all_data.head(10)

Unnamed: 0,label,content
157,RUMOR,生魚片中毒案例日增日本當局呼籲提防海獸胃線蟲
85,RUMOR,根據日本論壇網站「Rocket24」報導，一名日本星巴克的前員工表示，自己絕不是故意要讓星巴...
47,RUMOR,「EXTRA無糖口香糖」含阿斯巴甜，會致癌
389,NOT_RUMOR,中央氣象局已發佈「尼莎」颱風海上警報，因應颱風來襲，自來水公司第三區管理處呼籲所有自來水用戶...
245,NOT_RUMOR,真逆齡台灣高齡醫學專家陳亮恭醫師，推翻過去陳舊觀念，在《真逆齡》一書中，告訴你抗老大智慧：)...
301,NOT_RUMOR,海倫清桃前天臉書爆炸性發文，沒多久她和苦守寒窯18年的老公戴發奎（葛格）攜手現身自掀她年齡、...
58,RUMOR,人家結婚干你屁事？政治學博士：結婚是兩個人的事，修法卻是兩千萬人的事！政治學博士曰：「人家結...
120,RUMOR,桃園傳吳郭魚羅湖病毒疫情，高雄嚴陣以待；請各位兄弟姐妹、親朋好友最近少碰吳郭魚；目前病毒無藥...
144,RUMOR,夫婦吃隔夜菜中毒，丈夫身亡！這四種「隔夜菜」比毒藥還狠
376,NOT_RUMOR,知識就是力量～知識决定一個人的行動，願與大家一起努力愛地球！✌分解多久需要多長時間紙巾-2-...


## 切成訓練及測試資料

In [6]:
# Draw a seeded 70% sample of the rows for training; every row whose index
# was not drawn becomes the test set. Then report both sizes.
train_df = all_data.sample(frac=0.7, random_state=200)
test_df = all_data[~all_data.index.isin(train_df.index)]
print(len(train_df), len(test_df), sep='\n')

197
84


## 使用 pretrained 的模型

In [7]:
# Name of the pretrained checkpoint used throughout the notebook.
PRETRAINED_MODEL_NAME = 'bert-base-chinese'
# Fetch the tokenizer that belongs to this pretrained model.
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

I0213 03:56:05.841853 139715234146048 tokenization_utils.py:418] loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-vocab.txt from cache at /root/.cache/torch/transformers/8a0c070123c1f794c42a29c6904beb7c1b8715741e235bee04aca2c7636fc83f.9b42061518a39ca00b8b52059fd2bede8daa613f8a8671500e518a8c29de8c00


## 將文章轉換為訓練需要的格式

In [8]:
class FakeNewsDataset(Dataset):
    """Map-style dataset that turns labelled news text into BERT input tensors.

    Every item is a ``(tokens_tensor, segments_tensor, label_tensor)`` triple
    built from one row of ``df``.

    Args:
        df: DataFrame with a ``content`` column (raw text) and a ``label``
            column whose values are ``'NOT_RUMOR'`` or ``'RUMOR'``.
        mode: ``"train"`` or ``"test"``. Both modes yield labelled samples;
            the value is only validated and stored.
        tokenizer: Object providing ``tokenize(text)`` and
            ``convert_tokens_to_ids(tokens)``.
        max_len: Maximum length of the produced token sequence, counting the
            ``[CLS]`` and ``[SEP]`` markers. Longer texts are truncated.

    Raises:
        AssertionError: If ``mode`` is neither ``"train"`` nor ``"test"``.
    """

    def __init__(self, df, mode, tokenizer, max_len=512):
        assert mode in ["train", "test"]
        self.mode = mode
        # Use the frame supplied by the caller. The previous implementation
        # ignored ``df`` and silently read module-level globals instead.
        self.df = df
        self.label_map = {'NOT_RUMOR': 0, 'RUMOR': 1}
        self.len = len(self.df)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, idx):
        """Return the ``idx``-th sample as (token ids, segment ids, label id)."""
        text_a = self.df.content.values[idx]
        label = self.df.label.values[idx]
        label_tensor = torch.tensor(self.label_map[label])

        # Reserve two positions for [CLS] and [SEP] so that the final
        # sequence never exceeds ``max_len`` tokens.
        room_for_text = max(self.max_len - 2, 0)
        tokens_a = self.tokenizer.tokenize(text_a)[:room_for_text]
        word_pieces = ["[CLS]"] + tokens_a + ["[SEP]"]

        # Convert the whole token sequence into vocabulary indices.
        ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)

        # Only one sentence is fed to the model, so every position
        # (including [CLS] and [SEP]) carries segment id 0.
        segments_tensor = torch.zeros(len(word_pieces), dtype=torch.long)

        return (tokens_tensor, segments_tensor, label_tensor)

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return self.len

def create_mini_batch(samples):
    """Collate dataset samples into zero-padded batch tensors.

    Args:
        samples: List of ``(tokens, segments, label)`` triples, where
            ``tokens`` and ``segments`` are 1-D tensors and ``label`` is a
            tensor or ``None``.

    Returns:
        ``(tokens_tensors, segments_tensors, masks_tensors, label_ids)``.
        The first three are padded to the longest sequence in the batch;
        ``label_ids`` is ``None`` when the first sample carries no label.
    """
    token_seqs = []
    segment_seqs = []
    for sample in samples:
        token_seqs.append(sample[0])
        segment_seqs.append(sample[1])

    # Only the first sample is inspected to decide whether labels exist.
    first_label = samples[0][2]
    if first_label is None:
        label_ids = None
    else:
        label_ids = torch.stack([sample[2] for sample in samples])

    # Zero-pad every sequence up to the longest one in the batch.
    tokens_tensors = pad_sequence(token_seqs, batch_first=True)
    segments_tensors = pad_sequence(segment_seqs, batch_first=True)

    # Attention mask: 1 wherever the token id is non-zero (a real token),
    # 0 on the zero padding, so BERT attends only to real tokens.
    masks_tensors = (tokens_tensors != 0).long()

    return tokens_tensors, segments_tensors, masks_tensors, label_ids


def get_predictions(model, dataloader, compute_acc=False):
    """Run ``model`` over ``dataloader`` and collect the predicted class ids.

    The model is switched to evaluation mode for the duration of the call, so
    layers such as dropout do not perturb the predictions, and its previous
    train/eval mode is restored before returning. Gradients are not tracked.

    Args:
        model: Module accepting ``input_ids``, ``token_type_ids`` and
            ``attention_mask`` keyword arguments and returning a sequence
            whose first element is the logits tensor.
        dataloader: Iterable of batches. The first three entries of a batch
            are the token, segment and mask tensors; the fourth, when
            present and not ``None``, holds the labels.
        compute_acc: When true, also compare predictions with the labels.

    Returns:
        ``predictions`` (a 1-D tensor, or ``None`` if the loader yielded no
        batch), or ``(predictions, acc)`` when ``compute_acc`` is true.
        ``acc`` is ``0.0`` if no labelled sample was seen.
    """
    predictions = None
    correct = 0
    total = 0

    # Follow the model to whatever device it lives on rather than assuming
    # a specific GPU index.
    device = next(model.parameters()).device

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Walk through the whole dataset.
            for data in dataloader:
                # Move every tensor of the batch next to the model; a missing
                # label entry (None) is dropped.
                data = [t.to(device) for t in data if t is not None]

                # The first three tensors are tokens, segments and masks;
                # pass them by keyword so they cannot be mixed up.
                tokens_tensors, segments_tensors, masks_tensors = data[:3]
                outputs = model(input_ids=tokens_tensors,
                                token_type_ids=segments_tensors,
                                attention_mask=masks_tensors)

                logits = outputs[0]
                # Pick the class with the highest score.
                _, pred = torch.max(logits, 1)

                # Accumulate the counts needed for the accuracy.
                if compute_acc:
                    labels = data[3]
                    total += labels.size(0)
                    correct += (pred == labels).sum().item()

                # Append the predictions of the current batch.
                if predictions is None:
                    predictions = pred
                else:
                    predictions = torch.cat((predictions, pred))
    finally:
        # Hand the model back in the mode the caller left it in, so calling
        # this from inside a training loop does not disable training.
        model.train(was_training)

    if compute_acc:
        acc = correct / total if total else 0.0
        return predictions, acc
    return predictions

## 印出轉換後結果

In [9]:
# Build a Dataset dedicated to the training samples, tokenised with the
# Chinese BERT tokenizer, and wrap it in a DataLoader that pads each batch.
BATCH_SIZE = 2
trainset = FakeNewsDataset(train_df, 'train', tokenizer=tokenizer)
trainloader = DataLoader(trainset,
                         collate_fn=create_mini_batch,
                         batch_size=BATCH_SIZE)

# Show the three tensors produced for the first training sample.
sample_idx = 0
tokens_tensor, segments_tensor, label_tensor = trainset[sample_idx]
print(tokens_tensor, segments_tensor, label_tensor, sep='\n')

tensor([ 101,  886, 4500, 2641, 6879, 1305, 3022, 2949, 6880, 8024,  679, 1372,
        5543,  775, 1062, 6722, 6752,  733, 1032, 2669, 8024, 1315, 3189, 6629,
         122,  943, 3299, 8024, 6917, 5543,  775,  912, 1164, 1555, 2421, 6554,
        4289, 2835, 2850, 8108, 1039, 4638, 1962, 2434, 8013, 2641, 6879, 1305,
        1062, 1385, 6134, 4850, 8024, 8108, 3299, 8153, 3189, 6629, 5635, 8111,
        3299, 8130, 3189, 3632, 8024, 3022,  733, 1378, 1266, 2949, 6880, 1642,
        2641, 6879, 1305, 8024,  699, 1762,  122, 2207, 3229, 1058, 6752, 6868,
        1059, 2157,  912, 1164, 1555, 2421, 8024, 1086,  886, 4500, 2641, 6879,
        1305, 6554, 6525, 1476, 1565,  510, 7934, 1259,  510,  912, 4534, 5023,
        2900, 2137, 1555, 1501, 8024, 3680,  816, 2218, 1377,  775, 8108, 1039,
        1032, 2669,  511, 2641, 6879, 1305, 1062, 1385, 7674, 2428, 5645,  912,
        1164, 1555, 2421, 1394,  868, 8024, 2972, 1139, 2949, 6880, 6752, 6868,
        1555, 2421, 1377,  775, 6554, 42

## 使用GPU fine-tune BERT模型

In [10]:
# Use the first GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
NUM_LABELS = 2
# Load the same checkpoint the tokenizer was built from, so the model and
# the tokenizer cannot drift apart.
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME, num_labels=NUM_LABELS)
model = model.to(device)

# Accuracy on the training set before any fine-tuning, measured in
# evaluation mode.
model.eval()
_, acc = get_predictions(model, trainloader, compute_acc=True)
print("classification acc:", acc)

# Adam updates every parameter of the classification model.
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)

EPOCHS = 7
for epoch in range(EPOCHS):
    # Re-enter training mode at the start of every epoch because the
    # accuracy check at the end of the previous epoch switched to eval mode.
    model.train()
    running_loss = 0.0
    for data in trainloader:

        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        # Reset the accumulated gradients.
        optimizer.zero_grad()

        # Forward pass; passing labels makes the model return the loss first.
        outputs = model(input_ids=tokens_tensors,
                        token_type_ids=segments_tensors,
                        attention_mask=masks_tensors,
                        labels=labels)

        loss = outputs[0]
        # Backward pass and parameter update.
        loss.backward()
        optimizer.step()

        # Accumulate the loss of the current batch (sum over the epoch).
        running_loss += loss.item()

    # Measure the classification accuracy with dropout disabled; doing this
    # in training mode would make the reported number noisy.
    model.eval()
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, acc: %.3f' % (epoch + 1, running_loss, acc))
model.save_pretrained('fake_news_model')

I0213 03:56:06.845724 139715234146048 configuration_utils.py:254] loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-chinese-config.json from cache at /root/.cache/torch/transformers/8a3b1cfe5da58286e12a0f5d7d182b8d6eca88c08e26c332ee3817548cf7e60a.3767c74c8ed285531d04153fe84a0791672aff52f7249b27df341dbce09b8305
I0213 03:56:06.846560 139715234146048 configuration_utils.py:290] Model config BertConfig {
  "architectures": [
    "BertForMaskedLM"
  ],
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "directionality": "bidi",
  "do_sample": false,
  "eos_token_ids": 0,
  "finetuning_task": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "is_decoder": false,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1
  },
  "layer_norm_eps": 1e-12,
  "length_penalty": 1.0,
  "max_length": 20,
  "ma

classification acc: 0.467005076142132
[epoch 1] loss: 65.654, acc: 0.843
[epoch 2] loss: 44.572, acc: 0.949
[epoch 3] loss: 25.085, acc: 0.944
[epoch 4] loss: 14.896, acc: 1.000
[epoch 5] loss: 4.306, acc: 1.000
[epoch 6] loss: 2.142, acc: 1.000


I0213 03:57:07.954715 139715234146048 configuration_utils.py:118] Configuration saved in fake_news_model/config.json


[epoch 7] loss: 1.476, acc: 1.000


I0213 03:57:08.215232 139715234146048 modeling_utils.py:296] Model weights saved in fake_news_model/pytorch_model.bin


## 拿先前的Pretrained model預測現在的資料

In [11]:
# Build the test set. Its batch size may differ from the one used for
# training; choose whatever fits the available GPU memory.
testset = FakeNewsDataset(test_df, 'test', tokenizer=tokenizer)
testloader = DataLoader(testset, collate_fn=create_mini_batch, batch_size=4)

# Load a previously saved checkpoint and hide the loading logs.
model = BertForSequenceClassification.from_pretrained('fake_news_model_train56')
clear_output()

# Accuracy on the training loader first, then on the test loader.
train_acc, test_acc = (
    get_predictions(model, loader, compute_acc=True)[1]
    for loader in (trainloader, testloader)
)
print('train by 56 training data')
print(train_acc, test_acc)

train by 56 training data
0.7715736040609137 0.7619047619047619


In [12]:
# Load the 'fake_news_model_train56_2' checkpoint, hide the loading logs and
# report its accuracy on the training loader, then on the test loader.
model = BertForSequenceClassification.from_pretrained('fake_news_model_train56_2')
clear_output()
train_acc, test_acc = (
    get_predictions(model, loader, compute_acc=True)[1]
    for loader in (trainloader, testloader)
)
print('train by 56_2 training data')
print(train_acc, test_acc)

train by 56_2 training data
0.7258883248730964 0.6666666666666666


In [13]:
# Load the 'fake_news_model_train134' checkpoint, hide the loading logs and
# report its accuracy on the training loader, then on the test loader.
model = BertForSequenceClassification.from_pretrained('fake_news_model_train134')
clear_output()
train_acc, test_acc = (
    get_predictions(model, loader, compute_acc=True)[1]
    for loader in (trainloader, testloader)
)
print('train by 134 training data')
print(train_acc, test_acc)

train by 134 training data
0.8426395939086294 0.8452380952380952


In [14]:
# Load the 'fake_news_model_train140' checkpoint, hide the loading logs and
# report its accuracy on the training loader, then on the test loader.
model = BertForSequenceClassification.from_pretrained('fake_news_model_train140')
clear_output()
train_acc, test_acc = (
    get_predictions(model, loader, compute_acc=True)[1]
    for loader in (trainloader, testloader)
)
print('train by 140 training data')
print(train_acc, test_acc)

train by 140 training data
0.8426395939086294 0.8095238095238095


In [15]:
# Load the 'fake_news_model_train169' checkpoint, hide the loading logs and
# report its accuracy on the training loader, then on the test loader.
model = BertForSequenceClassification.from_pretrained('fake_news_model_train169')
clear_output()
train_acc, test_acc = (
    get_predictions(model, loader, compute_acc=True)[1]
    for loader in (trainloader, testloader)
)
print('train by 169 training data')
print(train_acc, test_acc)

train by 169 training data
0.8375634517766497 0.8214285714285714


In [16]:
# Load the 'fake_news_model_train225' checkpoint, hide the loading logs and
# report its accuracy on the training loader, then on the test loader.
model = BertForSequenceClassification.from_pretrained('fake_news_model_train225')
clear_output()
train_acc, test_acc = (
    get_predictions(model, loader, compute_acc=True)[1]
    for loader in (trainloader, testloader)
)
print('train by 225 training data')
print(train_acc, test_acc)

train by 225 training data
0.9390862944162437 0.9404761904761905
