In [3]:
# Basic
import time
import os
import pandas as pd
from transformers import BertTokenizer

# PyTorch
import torch
from torch.utils.data import Dataset,random_split
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

# IMDB
from keras.datasets import imdb

In [4]:
# Load the pre-trained tokenizer that ships with the BERT checkpoint
PRETRAINED_MODEL_NAME = "bert-base-uncased"  # English checkpoint, case-insensitive
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)
vocab = tokenizer.vocab
print("dict size", len(vocab))

# Peek at a random handful of entries from the tokenizer vocabulary
import random
random_tokens = random.sample(list(vocab), 10)
random_ids = list(map(vocab.__getitem__, random_tokens))

header_format = "{0:20}{1:15}"
row_format = "{0:15}{1:10}"
print(header_format.format("token", "index"))
print("-" * 25)
for token, token_id in zip(random_tokens, random_ids):  # show the sampled entries
  print(row_format.format(token, token_id))

dict size 30522
token               index          
-------------------------
vijay               17027
proponent           22488
##rigue             27611
csi                 22174
rt                  19387
##tial              20925
contributes         16605
claw                15020
del                  3972
antilles            27695


In [5]:
# Keep only the 10,000 most frequent words of the training corpus;
# rarer words are dropped by the loader.
train_split, test_split = imdb.load_data(num_words=10000)
train_data, train_labels = train_split
test_data, test_labels = test_split

In [6]:
# Download the IMDB vocabulary: word_index maps word -> index
word_index = imdb.get_word_index()

# Swap keys and values: reverse_word_index maps index -> word
reverse_word_index = dict(zip(word_index.values(), word_index.keys()))

# Decode one review back into text. Indices are shifted by 3 because
# 0, 1 and 2 are reserved for "padding", "start of sequence" and "unknown";
# anything that cannot be looked up is rendered as '?'.
def read_IMDB_text(train_data):
  """Return the space-joined words for a sequence of IMDB word indices."""
  words = []
  for word_id in train_data:
    words.append(reverse_word_index.get(word_id - 3, '?'))
  return ' '.join(words)

In [7]:
# Build the train/test DataFrames: decoded review text plus its label
df_train = pd.DataFrame({
    "TRAIN_text": [read_IMDB_text(sequence) for sequence in train_data],
    "TRAIN_label": train_labels,
})
df_test = pd.DataFrame({
    "TEST_text": [read_IMDB_text(sequence) for sequence in test_data],
    "TEST_label": test_labels,
})

for frame in (df_train, df_test):
  display(frame.head())

Unnamed: 0,TRAIN_text,TRAIN_label
0,? this film was just brilliant casting locatio...,1
1,? big hair big boobs bad music and a giant saf...,0
2,? this has to be one of the worst films of the...,0
3,? the ? ? at storytelling the traditional sort...,1
4,? worst mistake of my life br br i picked this...,0


Unnamed: 0,TEST_text,TEST_label
0,? please give this one a miss br br ? ? and th...,0
1,? this film requires a lot of patience because...,1
2,? many animation buffs consider ? ? the great ...,1
3,? i generally love this type of movie however ...,0
4,? like some other people wrote i'm a die hard ...,1


In [8]:
# Build the Dataset
class IMDB_Dataset(Dataset):
  """IMDB reviews tokenized for single-sentence BERT classification.

  Args:
    mode: "train" or "test"; selects the module-level `df_train` / `df_test`
      frame. Column 0 is read as the review text, column 1 as the label.
    tokenizer: object providing `tokenize(text)` and
      `convert_tokens_to_ids(tokens)` (the BERT tokenizer in this notebook).
    maxlen: maximum number of word pieces kept per review. [CLS] and [SEP]
      are added on top, so returned sequences have at most `maxlen + 2` ids.
      Lower it if memory is tight.
  """

  def __init__(self, mode, tokenizer, maxlen=300):
    assert mode in ["train", "test"]
    self.mode = mode
    # Explicit branch instead of eval(): same frames, no dynamic code execution.
    self.df = df_train if mode == "train" else df_test
    self.len = len(self.df)
    self.maxlen = maxlen
    self.tokenizer = tokenizer

  def __getitem__(self, idx):
    """Return (tokens_tensor, segments_tensor, label_tensor, origin_text, origin_label).

    `label_tensor` is None in "test" mode; `origin_label` is always the raw
    value stored in the frame.
    """
    # Scalar positional access. `.iloc[idx][0]` builds a whole row Series and
    # then relies on the deprecated integer-key-as-position lookup.
    origin_text = self.df.iloc[idx, 0]
    origin_label = self.df.iloc[idx, 1]

    if self.mode == "test":
        label_tensor = None
    else:
        label_tensor = torch.tensor(origin_label)

    # [CLS] + truncated word pieces + [SEP]
    tokens_a = self.tokenizer.tokenize(origin_text)
    word_pieces = ["[CLS]"] + tokens_a[:self.maxlen] + ["[SEP]"]

    # Convert the whole token sequence to vocabulary ids
    ids = self.tokenizer.convert_tokens_to_ids(word_pieces)
    tokens_tensor = torch.tensor(ids)

    # Only one sentence per sample, so every position gets segment id 0
    segments_tensor = torch.zeros(len(word_pieces), dtype=torch.long)

    return (tokens_tensor, segments_tensor, label_tensor, origin_text, origin_label)

  def __len__(self):
    return self.len

In [9]:
# Instantiate one Dataset per split, sharing the same tokenizer
trainset, testset = (
    IMDB_Dataset(split, tokenizer=tokenizer) for split in ("train", "test")
)

In [10]:
# Pick an arbitrary sample
sample_idx = 2

# Fetch the converted id tensors from the Dataset built above
sample = trainset[sample_idx]
tokens_tensor, segments_tensor, label_tensor, origin_text, origin_label = sample

# Map tokens_tensor back to its word pieces
tokens = tokenizer.convert_ids_to_tokens(tokens_tensor.tolist())

sample_report = f"""[原始文本]
句子：{origin_text}
分類  ：{origin_label}

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：{tokens_tensor[0:20]}

segments_tensor：{segments_tensor[0:20]}

label_tensor   ：{label_tensor}

"""
print(sample_report)

[原始文本]
句子：? this has to be one of the worst films of the 1990s when my friends i were watching this film being the target audience it was aimed at we just sat watched the first half an hour with our jaws touching the floor at how bad it really was the rest of the time everyone else in the theatre just started talking to each other leaving or generally crying into their popcorn that they actually paid money they had ? working to watch this feeble excuse for a film it must have looked like a great idea on paper but on film it looks like no one in the film has a clue what is going on crap acting crap costumes i can't get across how ? this is to watch save yourself an hour a bit of your life
分類  ：0

--------------------

[Dataset 回傳的 tensors]
tokens_tensor  ：tensor([ 101, 1029, 2023, 2038, 2000, 2022, 2028, 1997, 1996, 5409, 3152, 1997,
        1996, 4134, 2043, 2026, 2814, 1045, 2020, 3666])

segments_tensor：tensor([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

label_tenso

  origin_text = self.df.iloc[idx][0] # 原始文本
  origin_label = self.df.iloc[idx][1]      # 原始分類
  text = self.df.iloc[idx][0]


In [11]:
def create_mini_batch(samples):
    """Collate Dataset samples into zero-padded batch tensors.

    Each sample is a tuple whose first three items are the token ids, the
    segment ids and the label tensor (or None when unlabeled).

    Returns:
        (tokens_tensors, segments_tensors, masks_tensors, label_ids), where
        `label_ids` is None if the first sample carries no label.
    """
    # Labeled batches (the training set) stack their labels; unlabeled ones yield None
    if samples[0][2] is None:
        label_ids = None
    else:
        label_ids = torch.stack([sample[2] for sample in samples])

    token_seqs = []
    segment_seqs = []
    for sample in samples:
        token_seqs.append(sample[0])
        segment_seqs.append(sample[1])

    # Zero-pad every sequence up to the longest one in this batch
    tokens_tensors = pad_sequence(token_seqs, batch_first=True)
    segments_tensors = pad_sequence(segment_seqs, batch_first=True)

    # Attention mask: 1 wherever the token id is not the zero padding,
    # so BERT only attends to real tokens
    masks_tensors = (tokens_tensors != 0).long()

    return tokens_tensors, segments_tensors, masks_tensors, label_ids

In [12]:
BATCH_SIZE = 64

# Training batches are reshuffled every epoch; test batches keep dataset order
trainloader = DataLoader(
    trainset,
    batch_size=BATCH_SIZE,
    collate_fn=create_mini_batch,
    shuffle=True,
)
testloader = DataLoader(
    testset,
    batch_size=BATCH_SIZE,
    collate_fn=create_mini_batch,
    shuffle=False,
)

# Pull a single batch to sanity-check the collated shapes
data = next(iter(trainloader))
tokens_tensors, segments_tensors, masks_tensors, label_ids = data

batch_report = f"""
tokens_tensors.shape   = {tokens_tensors.shape} 
{tokens_tensors}
------------------------
segments_tensors.shape = {segments_tensors.shape}
{segments_tensors}
------------------------
masks_tensors.shape    = {masks_tensors.shape}
{masks_tensors}
------------------------
label_ids.shape        = {label_ids.shape}
{label_ids}
"""
print(batch_report)


tokens_tensors.shape   = torch.Size([64, 302]) 
tensor([[  101,  1029,  1045,  ...,     0,     0,     0],
        [  101,  1029,  2206,  ...,  1998,  9969,   102],
        [  101,  1029, 12435,  ...,     0,     0,     0],
        ...,
        [  101,  1029,  1996,  ...,     0,     0,     0],
        [  101,  1029,  1045,  ...,     0,     0,     0],
        [  101,  1029, 13970,  ...,     0,     0,     0]])
------------------------
segments_tensors.shape = torch.Size([64, 302])
tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]])
------------------------
masks_tensors.shape    = torch.Size([64, 302])
tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])
---------

  origin_text = self.df.iloc[idx][0] # 原始文本
  origin_label = self.df.iloc[idx][1]      # 原始分類
  text = self.df.iloc[idx][0]


In [13]:
from transformers import BertForSequenceClassification

# Two output classes, matching the 0/1 labels of the IMDB frames
NUM_LABELS = 2
model = BertForSequenceClassification.from_pretrained(
    PRETRAINED_MODEL_NAME,
    num_labels=NUM_LABELS,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [14]:
def get_predictions(model, dataloader, compute_acc=False):
    """Run `model` over `dataloader` and return the predicted class ids.

    The model is switched to eval mode for the duration of the call, so
    train-only layers such as dropout do not perturb the predictions. Its
    previous train/eval mode is restored before returning.

    Args:
        model: module called as
            `model(input_ids=..., token_type_ids=..., attention_mask=...)`
            whose output's first element holds the logits.
        dataloader: iterable of batches
            `(tokens, segments, masks, labels_or_None)`.
        compute_acc: when True, batches must carry labels and the accuracy
            is returned alongside the predictions.

    Returns:
        `predictions` (1-D tensor, or None if the dataloader yielded nothing),
        or `(predictions, acc)` when `compute_acc` is True. `acc` is 0.0 when
        no labeled example was seen.
    """
    batch_predictions = []
    correct = 0
    total = 0

    # Follow the model's own device instead of assuming "cuda:0"
    device = next(model.parameters()).device

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            # Walk through the whole dataset
            for data in dataloader:
                # Move every tensor to the model's device; unlabeled batches
                # carry None in the label slot, which is dropped here
                data = [t.to(device) for t in data if t is not None]

                # The first three tensors are tokens, segments and masks;
                # pass them by keyword so argument order cannot be confused
                tokens_tensors, segments_tensors, masks_tensors = data[:3]
                outputs = model(input_ids=tokens_tensors,
                                token_type_ids=segments_tensors,
                                attention_mask=masks_tensors)

                logits = outputs[0]
                pred = logits.argmax(dim=1)

                # Accumulate counts for the classification accuracy
                if compute_acc:
                    labels = data[3]
                    total += labels.size(0)
                    correct += (pred == labels).sum().item()

                batch_predictions.append(pred)
    finally:
        # Hand the model back in the mode the caller left it in
        model.train(was_training)

    # Concatenate once instead of growing a tensor batch by batch
    predictions = torch.cat(batch_predictions) if batch_predictions else None

    if compute_acc:
        acc = correct / total if total else 0.0
        return predictions, acc
    return predictions

In [None]:
# Run the model on the GPU when one is available, then report the
# classification accuracy on the training set before any fine-tuning
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
print("device:", device)

model = model.to(device)
acc = get_predictions(model, trainloader, compute_acc=True)[1]
print("classification acc:", acc)

In [None]:
%%time

# 訓練模式
model.train()

# 使用 Adam Optim 更新整個分類模型的參數
optimizer = torch.optim.Adam(model.parameters(), lr=1e-5)


EPOCHS = 6
for epoch in range(EPOCHS):
    
    print('epoch:',epoch)
    running_loss = 0.0
    for data in trainloader:
        
        tokens_tensors, segments_tensors, \
        masks_tensors, labels = [t.to(device) for t in data]

        # 將參數梯度歸零
        optimizer.zero_grad()
        
        # forward pass
        outputs = model(input_ids=tokens_tensors, 
                        token_type_ids=segments_tensors, 
                        attention_mask=masks_tensors, 
                        labels=labels)

        loss = outputs[0]
        # backward
        loss.backward()
        optimizer.step()


        # 紀錄當前 batch loss
        running_loss += loss.item()
        
    # 計算分類準確率
    _, acc = get_predictions(model, trainloader, compute_acc=True)

    print('[epoch %d] loss: %.3f, acc: %.3f' %
          (epoch + 1, running_loss, acc))

In [None]:
# Build the test set. The batch size may differ from the one used for
# training; pick it according to the available GPU memory.
testset = IMDB_Dataset("test", tokenizer=tokenizer)
testloader = DataLoader(testset, batch_size=256, collate_fn=create_mini_batch)

# Inference must run in eval mode: after the training cell the model may
# still have dropout enabled, which makes predictions non-deterministic
model.eval()

# Predict the test set with the classification model
predictions = get_predictions(model, testloader)
predictions