In [1]:
# Mount Google Drive inside the Colab runtime at /content/drive/.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [2]:
# Install the Hugging Face transformers package (the captured run below resolved to version 3.2.0).
!pip install transformers
import torch
import sys
import re
import os
import random
import numpy as np
import pandas as pd
from torch import optim
from sklearn.metrics import accuracy_score
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import BertModel, BertTokenizer

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/f4/9f93f06dd2c57c7cd7aa515ffbf9fcfd8a084b92285732289f4a5696dd91/transformers-3.2.0-py3-none-any.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 2.8MB/s 
[?25hCollecting sentencepiece!=0.1.92
[?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
[K     |████████████████████████████████| 1.1MB 15.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 22.4MB/s 
[?25hCollecting tokenizers==0.8.1.rc2
[?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl 

> init_seeds    --> 固定亂數種子（random seed），讓每次訓練的結果可以重現，以利判斷訓練好壞 <br>
> loadInputFile --> 讀取檔案並整理資料 <br>
> CRFFormatData --> 將資料編排成 BIO 標記格式

In [4]:
def init_seeds(seed=0):
    """Apply one seed to every random number generator used in this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable and seeds Python's
    ``random`` module, NumPy's global generator and PyTorch (default
    generator, current GPU, and all GPUs).

    :param seed: integer seed handed to every generator (default 0).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    seeders = (
        random.seed,                 # Python standard-library generator
        np.random.seed,              # NumPy global generator
        torch.manual_seed,           # PyTorch default generator
        torch.cuda.manual_seed,      # PyTorch generator of the current GPU
        torch.cuda.manual_seed_all,  # PyTorch generators of all GPUs
    )
    for apply_seed in seeders:
        apply_seed(seed)


def loadInputFile(path):
    r"""Read an annotated corpus file and split it into texts and annotations.

    The file content is split on the separator
    ``'\n\n--------------------\n\n'``; the piece after the last separator is
    discarded. Inside each remaining piece the first line is the article
    text, the second line is skipped (treated as the annotation header), and
    every following non-blank line is a tab-separated annotation
    ``article_id, start_pos, end_pos, entity_text, entity_type``.

    :param path: path of the corpus file (UTF-8, an optional BOM is removed).
    :return: tuple ``(trainingset, position, mentions)`` where
        - ``trainingset`` is the list of article texts,
        - ``position`` is one flat list holding all annotation fields
          (five consecutive string entries per annotation),
        - ``mentions`` maps ``entity_text`` to ``entity_type`` (a later
          annotation with the same text overwrites an earlier one).
    :raises IndexError: if an annotation line has fewer than five fields.
    """
    trainingset = list()  # article texts: [content, content, ...]
    position = list()  # flat: [article_id, start_pos, end_pos, entity_text, entity_type, ...]
    mentions = dict()  # mentions[entity_text] = entity_type

    # Read from the `path` argument (not from a notebook global); the
    # 'utf-8-sig' codec drops a leading byte-order mark when there is one.
    with open(path, 'r', encoding='utf-8-sig') as f:
        file_text = f.read()

    articles = file_text.split('\n\n--------------------\n\n')[:-1]
    for article in articles:
        lines = article.split('\n')
        trainingset.append(lines[0])
        # lines[1] is skipped as the header row; annotations start at lines[2].
        for line in lines[2:]:
            if not line.strip():
                continue  # tolerate stray blank lines between annotations
            annot = line.split('\t')
            position.extend(annot)
            mentions[annot[3]] = annot[4]

    return trainingset, position, mentions

def CRFFormatData(trainingset, position):
    """Turn articles and their span annotations into character-level BIO lines.

    Every non-space character of an article becomes one line
    ``'<char> <label>\\n'``. Characters inside an annotated span are labelled
    ``B-<entity_type>`` (first emitted character of the span) or
    ``I-<entity_type>`` (remaining characters); all other characters are
    labelled ``O``. Space characters are dropped. After each article a single
    ``'\\n'`` line is appended as an end-of-article marker.

    Annotations are matched to articles by comparing the annotation's
    ``article_id`` with the article's index in ``trainingset``. Within an
    article they are processed in the order they appear in ``position``; an
    annotation that starts before the end of the previously emitted entity is
    skipped.

    :param trainingset: list of article strings.
    :param position: flat list with five entries per annotation:
        ``article_id, start_pos, end_pos, entity_text, entity_type``.
    :return: list of output lines as described above.

    Side effect: prints a progress message for every tenth article index.
    """
    # Bucket the annotations by article once instead of rescanning the whole
    # flat list for every article; insertion order inside a bucket is kept.
    spans_by_article = {}
    for base in range(0, len(position), 5):
        span = (position[base + 1], position[base + 2], position[base + 4])
        spans_by_article.setdefault(int(position[base]), []).append(span)

    outputfile = []
    for article_id, article in enumerate(trainingset):
        cursor = 0  # end offset of the last entity that was emitted
        for raw_start, raw_end, entity_type in spans_by_article.get(article_id, ()):
            start_pos = int(raw_start)
            end_pos = int(raw_end)
            if start_pos < cursor:
                continue  # overlaps the previously emitted entity

            # Text between the previous entity and this one is outside any entity.
            outputfile.extend(
                ch + ' O\n' for ch in article[cursor:start_pos] if ch != ' '
            )

            # The first character actually written carries the B- tag, so an
            # entity whose span starts with a space still opens with B-.
            prefix = 'B-'
            for ch in article[start_pos:end_pos]:
                if ch == ' ':
                    continue
                outputfile.append(ch + ' ' + prefix + entity_type + '\n')
                prefix = 'I-'
            cursor = end_pos

        # Remainder of the article after the last entity.
        outputfile.extend(ch + ' O\n' for ch in article[cursor:] if ch != ' ')

        # End-of-article marker.
        outputfile.append('\n')

        if article_id % 10 == 0:
            print('Total complete articles:', article_id)

    return outputfile

* 因為直接使用 Bert tokenizer 對整個句子做 tokenize 的話，專業用語中連續的英文字母會被整串變成一個 [UNK]
> 例如：ALP 在 BIO 格式中應該要對應到 O O O，然而 Bert tokenizer 會直接把 ALP 變成一個 [UNK]，造成 token 數量跟 BIO label 的長度不相同。<br>

* 因此，用 seperate_words_bio 將已經整理好的 CRF 格式資料分割成 words 跟 BIO label 兩個不同的 List，同時在每個字後面增加一個空格。如此一來 Bert tokenizer 就不會把整串英文字母變成一個 [UNK]，而是將每個英文字母各自 tokenize（例如 ALP 會變成三個 [UNK]），長度就能跟 BIO label 對齊。



In [189]:
def seperate_words_bio(data, max_chars=1000):
    """Split CRF-style lines into space-joined text samples and label lists.

    Each element of ``data`` is either ``'<char> <label>\\n'`` or a bare
    ``'\\n'`` that closes the current sample. For every sample the characters
    are concatenated with one trailing space each, and the labels are
    collected in a parallel list.

    A sample is also closed once its text has reached ``max_chars``
    characters; the row that triggers the split starts the next sample, so no
    character is dropped. Rows left over after the last ``'\\n'`` are
    returned as a final sample.

    :param data: iterable of lines as produced by ``CRFFormatData``.
    :param max_chars: text length (characters plus their trailing spaces) at
        which a sample is closed early. Default 1000.
    :return: tuple ``(token, bio_label)``: list of sample strings and the
        parallel list of label lists.
    """
    token = []
    bio_label = []
    token_temp = ""
    bio_temp = []

    for row in data:
        if row == '\n':
            # End-of-article marker: close the sample even when it is empty.
            token.append(token_temp)
            bio_label.append(bio_temp)
            token_temp = ""
            bio_temp = []
            continue

        if token_temp and len(token_temp) >= max_chars:
            # Length limit reached: close the sample, then keep processing
            # this row as the first entry of the next sample.
            token.append(token_temp)
            bio_label.append(bio_temp)
            token_temp = ""
            bio_temp = []

        fields = row.strip('\n').split(' ')
        token_temp = token_temp + fields[0] + " "
        bio_temp.append(fields[1])

    if bio_temp:
        # Input did not end with '\n': keep the unfinished sample.
        token.append(token_temp)
        bio_label.append(bio_temp)

    return token, bio_label

In [223]:
class NerDataset(Dataset):
    """PyTorch dataset yielding BERT inputs and BIO labels per text sample.

    The constructor converts the corpus to CRF-style lines with
    ``CRFFormatData`` and splits them into samples with the supplied
    ``seperate_words_bio`` callable. Each item is built on access by
    ``__getitem__``.
    """

    def __init__(self, mode, sen_len, trainingset, position, seperate_words_bio, bert_tokenizer):
        """
        :param mode: stored as ``self.mode``; not read elsewhere in this class.
        :param sen_len: maximum number of tokens / labels kept per sample
            (the two boundary markers are added on top of this).
        :param trainingset: list of article strings, passed to ``CRFFormatData``.
        :param position: flat annotation list, passed to ``CRFFormatData``.
        :param seperate_words_bio: callable mapping the CRF-style lines to
            ``(texts, label_lists)``.
        :param bert_tokenizer: object providing ``tokenize`` and
            ``convert_tokens_to_ids``.
        """
        self.mode = mode
        self.data = CRFFormatData(trainingset, position)
        self.maxlen = sen_len
        self.seperate_words_bio = seperate_words_bio  # splitter producing texts and labels
        self.bert_tokenizer = bert_tokenizer

        # Build the text samples and their matching BIO label lists.
        self.tokens, self.bio_label = self.seperate_words_bio(self.data)
        self.len = len(self.tokens)  # number of samples

    def __getitem__(self, index):
        """Return ``(tokens_tensor, segments_tensor, label_out, origin_text)``.

        - ``tokens_tensor``: 1-D tensor of token ids for
          ``[CLS] + first maxlen tokens + [SEP]``.
        - ``segments_tensor``: 1-D long tensor of zeros with the same length.
        - ``label_out``: list ``["[strart]"] + first maxlen labels + ["[end]"]``.
        - ``origin_text``: the untokenized sample text.
        """
        origin_text = self.tokens[index]

        text_a = self.bert_tokenizer.tokenize(origin_text)
        label = self.bio_label[index]

        # Convert the whole token sequence to ids and wrap it in a tensor.
        word_pieces = ["[CLS]"]
        word_pieces += text_a[:self.maxlen] + ["[SEP]"]
        len_a = len(word_pieces)
        ids = self.bert_tokenizer.convert_tokens_to_ids(word_pieces)
        tokens_tensor = torch.tensor(ids)

        # Labels stay a plain list because they are not fed to BERT.
        # NOTE(review): the "[strart]" spelling is kept as-is; code outside
        # this class may rely on the exact string — confirm before renaming.
        label_out = ["[strart]"]
        label_out += label[:self.maxlen] + ["[end]"]

        # Single-sentence input: every position belongs to sentence A, which
        # BERT encodes as segment id 0 (1 is reserved for a second sentence).
        segments_tensor = torch.zeros(len_a, dtype=torch.long)

        return tokens_tensor, segments_tensor, label_out, origin_text

    def __len__(self):
        """Number of samples in the dataset."""
        return self.len

In [234]:
# Collate function for the PyTorch DataLoader, which fetches one batch at a time.
# collate_fn decides how several samples are merged into one batch for the model.
# After padding to a common length, attention must be limited to non-pad positions.
def create_mini_batch(samples):
    """
    Merge dataset samples into padded batch tensors.

    :param samples: list of NerDataset items, each indexable as
            - tokens_tensor  : sample[0]
            - segments_tensor: sample[1]
            - label_out      : sample[2] (not used here)
            - origin_text    : sample[3] (not used here)
    :return: the three tensors handed to BERT
            - tokens_tensors  : (batch_size, max_seq_len_in_batch)
            - segments_tensors: (batch_size, max_seq_len_in_batch)
            - masks_tensors   : (batch_size, max_seq_len_in_batch)
    """
    token_seqs = []
    segment_seqs = []
    for sample in samples:
        token_seqs.append(sample[0])
        segment_seqs.append(sample[1])

    # Zero-pad every sequence up to the longest one in this batch.
    tokens_tensors = pad_sequence(token_seqs, batch_first=True)
    segments_tensors = pad_sequence(segment_seqs, batch_first=True)

    # Attention mask: 1 wherever the token id is non-zero, 0 on the zero
    # padding, so that BERT only attends to the non-zero positions.
    masks_tensors = (tokens_tensors != 0).to(torch.long)

    return tokens_tensors, segments_tensors, masks_tensors

In [246]:
# Hyperparameters
# sen_len: passed to NerDataset as the maximum number of tokens/labels kept per
#          sample (the two boundary markers are added on top of it).
# batch_size: number of samples per DataLoader batch.
sen_len = 500
batch_size = 16

In [243]:
# Pre-processing: read the annotated corpus file.
file_path = '/content/drive/My Drive/aicup/SampleData_deid.txt'
trainingset, position, mentions = loadInputFile(file_path)

# Seed the random number generators, choose the device and load the tokenizer.
init_seeds()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
PRETRAINED_MODEL_NAME = "bert-base-chinese"
tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)

# Wrap the corpus in the PyTorch Dataset.
trainset = NerDataset(
    mode="train",
    sen_len=sen_len,
    trainingset=trainingset,
    position=position,
    seperate_words_bio=seperate_words_bio,
    bert_tokenizer=tokenizer,
)

# Hold out 5% of the samples as the validation split.
total_size = len(trainset)
val_size = int(total_size * 0.05)
trainset, valset = random_split(trainset, [total_size - val_size, val_size])
print('trainset size:', len(trainset))
print('valset size:', len(valset))

Total complete articles: 0
Total complete articles: 10
Total complete articles: 20
trainset size: 103
valset size: 5


In [247]:
# Both loaders share the batch size and the collate function; only the
# training loader shuffles its samples.
loader_kwargs = dict(batch_size=batch_size, collate_fn=create_mini_batch)
train_loader = DataLoader(trainset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(valset, shuffle=False, **loader_kwargs)

##### 上面是tokenized words
##### 中間是Bert的word embeddings
##### 下面是對應到的BIO

In [195]:
# Fetch the first sample of the training split. NerDataset.__getitem__ yields
# (token ids, segment ids, BIO labels with boundary markers, original spaced text).
tokens_tensor, segments_tensor, label, origin_text = trainset[0]

['[CLS]', '醫', '師', '：', '你', '有', '做', '超', '音', '波', '嘛', '，', '那', '我', '們', '來', '看', '報', '告', '，', '有', '些', '部', '分', '有', '紅', '字', '耶', '。', '民', '眾', '：', '紅', '字', '是', '甚', '麼', '意', '思', '？', '醫', '師', '：', '就', '是', '肝', '功', '能', '有', '比', '較', '高', '，', '肝', '功', '能', '6', '8', '，', '就', '是', '這', '個', '[UNK]', '[UNK]', '[UNK]', '是', '6', '8', '，', '這', '樣', '比', '較', '高', '，', '正', '常', '應', '是', '5', '0', '以', '下', '，', '另', '外', '就', '是', '你', '之', '前', '說', '你', '有', '[UNK]', '肝', '，', '但', '是', '你', '[UNK]', '肝', '已', '經', '好', '了', '耶', '。', '民', '眾', '：', '它', '會', '自', '動', '修', '復', '阿', '。', '醫', '師', '：', '你', '有', '抗', '體', '了', '阿', '，', '所', '以', '你', '[UNK]', '肝', '已', '經', '沒', '帶', '原', '了', '耶', '。', '民', '眾', '：', '我', '以', '前', '被', '關', '的', '時', '候', '，', '就', '有', '在', '固', '定', '驗', '血', '，', '那', '時', '候', '說', '有', '[UNK]', '肝', '。', '醫', '師', '：', '阿', '你', '現', '在', '已', '經', '有', '保', '護', '的', '抗', '體', '了', '。', '但', '是', '你', '現', '在', '有

##### 分別要餵給BertModel的三個input應該就是下面這樣

In [249]:
# Walk through every training batch, move its tensors to `device`, and print
# the batch index followed by that batch's attention mask.
for i, data in enumerate(train_loader):
      print(i)
      # create_mini_batch returns (tokens, segments, masks) in this order.
      tokens_tensors, segments_tensors, masks_tensors = [t.to(device) for t in data]
      print(masks_tensors)

tensor([[ 101, 1962, 8024,  ..., 5517, 7128,  102],
        [ 101,  100,  100,  ..., 1343, 6525,  102],
        [ 101,  943, 5052,  ..., 2205, 2205,  102],
        ...,
        [ 101, 2253, 8038,  ...,  784, 7939,  102],
        [ 101,  800,  738,  ..., 5839,  511,  102],
        [ 101, 4707, 8038,  ...,    0,    0,    0]])
0
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        ...,
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0]])
tensor([[ 101, 7015, 2374,  ..., 1914, 5857,  102],
        [ 101, 8038, 1962,  ..., 2218, 1391,  102],
        [ 101, 3221, 3173,  ...,    0,    0,    0],
        ...,
        [ 101, 1762, 6857,  ...,    0,    0,    0],
        [ 101, 6123, 2802,  ..., 2582, 7938,  102],
        [ 101, 4707, 8038,  ...,    0,    0,    0]])
1
tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1,