## BERT

In [1]:
import copy
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.autograd import Variable
from torch.utils import data
import tensorflow as tf
from tqdm import tqdm,trange
from transformers import *
# from ner_eval import classification_report,csv_report,f1_score
from seqeval.scheme import IOB2
from seqeval.metrics import classification_report,f1_score

from sklearn.model_selection import KFold

In [27]:
data_train = pd.read_csv("BCSMM4H_train_IOB2_all.txt",sep = '\t')
data_dev = pd.read_csv("BC_dev_IOB2_all.txt",sep = '\t')

In [39]:
data_train = data_train.fillna('NA')
data_dev = data_dev.fillna('NA')

In [40]:
class SentenceGetter(object):
    
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                     s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence#").apply(agg_func)
        self.sentences = [s for s in self.grouped]
    
    def get_next(self):
        try:
            s = self.grouped[self.n_sent]
            self.n_sent += 1
            return s
        except:
            return None

In [41]:
getter = SentenceGetter(data_train)
dev_getter = SentenceGetter(data_dev)

In [42]:
sentences = [[word[0] for word in sentence] for sentence in getter.sentences]
dev_sentences = [[word[0] for word in sentence] for sentence in dev_getter.sentences]

In [43]:
labels = [[s[1] for s in sent] for sent in getter.sentences]
dev_labels = [[s[1] for s in sent] for sent in dev_getter.sentences]

In [44]:
tag_values = list(set(data_train["Tag"].values))
tag_values.append("PAD")
tag_values

['O', 'B-', 'I-', 'PAD']

In [45]:
bs = 16
model_ty = 'bert-base-cased'
# model_ty = 'bert-base-multilingual-cased'
# tokenizer = BertTokenizer.from_pretrained('bert-base-chinese')
tokenizer = BertTokenizer.from_pretrained(model_ty, do_lower_case=False)

In [46]:
def tokenize_and_preserve_labels(sentence, text_labels):
    tokenized_sentence = []
    labels = []

    for word, label in zip(sentence, text_labels):
        tokenized_word = tokenizer.tokenize(word)
        n_subwords = len(tokenized_word)

        tokenized_sentence.extend(tokenized_word)
        if n_subwords > 1 and 'B-' in label:
            labels.extend([label])
            _ = 'I-' + label.split('B-')[1]
            if _ not in tag_values:
                tag_values.append(_)
                print(_)
            labels.extend([_] * (n_subwords-1))
        else:
            labels.extend([label] * n_subwords)
    return tokenized_sentence, labels

In [47]:
tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(sentences, labels)
]
print('done')
dev_tokenized_texts_and_labels = [
    tokenize_and_preserve_labels(sent, labs)
    for sent, labs in zip(dev_sentences, dev_labels)
]

done


In [48]:
tag2idx = {t: i for i, t in enumerate(tag_values)}
tag2idx

{'O': 0, 'B-': 1, 'I-': 2, 'PAD': 3}

In [49]:
tokenized_texts = [token_label_pair[0] for token_label_pair in tokenized_texts_and_labels]
tokenized_labels = [token_label_pair[1] for token_label_pair in tokenized_texts_and_labels]

dev_tokenized_texts = [token_label_pair[0] for token_label_pair in dev_tokenized_texts_and_labels]
dev_tokenized_labels = [token_label_pair[1] for token_label_pair in dev_tokenized_texts_and_labels]

In [50]:
def check_post_entity(pre_tag,tag):
    error_entity = False
    if pre_tag == 'O' and 'I-' in tag:
        error_entity = True
    if error_entity:
        print(idx+1)
        print(pre_tag,tag)
def inspect_label(lal):
    for n,_ in enumerate(lal):
        if n != 0:
            check_post_entity(lal[n-1],lal[n])
for idx,t_label in enumerate(dev_tokenized_labels):
    inspect_label(t_label)

In [51]:
idx = 3
print(dev_sentences[idx])
print(dev_labels[idx])
print(dev_tokenized_texts[idx],len(dev_tokenized_texts[idx]))
print(dev_tokenized_labels[idx],len(dev_tokenized_labels[idx]))

['Bruh', 'do', 'you', 'wan', 'na', 'fight', 'nah', 'man', 'it', "'s", 'problem']
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
['B', '##ru', '##h', 'do', 'you', 'wa', '##n', 'na', 'fight', 'na', '##h', 'man', 'it', "'", 's', 'problem'] 16
['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O'] 16


In [52]:
input_ids = [tokenizer.convert_tokens_to_ids(txt) for txt in tokenized_texts]
dev_input_ids = [tokenizer.convert_tokens_to_ids(txt) for txt in dev_tokenized_texts]
tags = [[tag2idx.get(l) for l in lab] for lab in tokenized_labels]
dev_tags = [[tag2idx.get(l,tag2idx['O']) for l in lab] for lab in dev_tokenized_labels]

In [53]:
from torch.utils.data import Dataset
class BertNerDataset(Dataset):
    def __init__(self,sentences,labels, word_pad_idx, tag_pad_idx, max_len = 500):
        self.sentences = sentences
        self.labels = labels
        self.word_pad_idx = word_pad_idx
        self.tag_pad_idx = tag_pad_idx
        self.max_len = max_len
    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, index):
        return (self.sentences[index],self.labels[index])
        
    def collate_fn(self, datasets):
        sentences = [dataset[0] for dataset in datasets]
        labels = [dataset[1] for dataset in datasets]
        max_sent = max([len(data) for data in sentences])
        max_len = max([min(len(sentence), self.max_len) for sentence in sentences])
        pad_sentence = []
        pad_label = []
        for sentence,label in zip(sentences,labels):
            
            if len(sentence) > max_len:
#                 print('asd')
                pad_sentence.append(sentence[:max_len])
                pad_label.append(label[:max_len])
                attention_masks = [[float(i != 0.0) for i in ii] for ii in pad_sentence]
            else:
#                 print('zxc')
                pad_sentence.append(sentence+[self.word_pad_idx]*(max_len-len(sentence)))
                pad_label.append(label+[self.tag_pad_idx]*(max_len-len(label)))
                attention_masks = [[float(i != 0.0) for i in ii] for ii in pad_sentence]
        return torch.LongTensor(pad_sentence), torch.LongTensor(pad_label),torch.tensor(attention_masks)

## 使用 trainingset+validset 去做 10-fold CV

In [54]:
from torch.utils.data import TensorDataset, DataLoader , SubsetRandomSampler ,ConcatDataset,Sampler
from transformers import get_linear_schedule_with_warmup
bs = 16
k_folds = 10
kfold = KFold(n_splits=k_folds, shuffle=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 10
max_grad_norm = 3.0

tr_dataset = BertNerDataset(input_ids,tags,tokenizer.convert_tokens_to_ids('[PAD]'),tag2idx['PAD'])
va_dataset = BertNerDataset(dev_input_ids,dev_tags,tokenizer.convert_tokens_to_ids('[PAD]'),tag2idx['PAD'])
dataset = ConcatDataset([tr_dataset, va_dataset])

In [None]:
All_Fold_score = []
for fold, (train_ids, test_ids) in enumerate(kfold.split(dataset)):
    print(f'FOLD {fold}')
    print('--------------------------------')
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)
    train_dataloader = DataLoader(dataset, batch_size=bs,
                                  collate_fn=tr_dataset.collate_fn,
                                  sampler=train_subsampler)
    valid_dataloader = DataLoader(dataset, batch_size=bs,
                                  collate_fn=va_dataset.collate_fn,
                                  sampler=test_subsampler)

    total_steps = len(train_dataloader) * epochs
    model = BertForTokenClassification.from_pretrained(
        model_ty,
        num_labels=len(tag2idx),
        output_attentions = False,
        output_hidden_states = False
    )
    optimizer = AdamW(model.parameters(),lr=3e-5,eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )
    model.cuda()
    all_loader = {
    "train" : train_dataloader,
    "valid" : valid_dataloader,
    }
    Fold_score = []
    for _ in trange(epochs, desc="Epoch"):
        for loader in all_loader:
            total_loss = 0
            # Training loop
            predictions , true_labels = [], []
            for step, batch in enumerate(all_loader[loader]):
                if loader == 'train':
                    model.train()
                else:
                    model.eval()
                batch = tuple(t.to(device) for t in batch)
                b_input_ids, b_labels ,b_input_mask = batch
                model.zero_grad()
                outputs = model(b_input_ids, token_type_ids=None,
                                attention_mask=b_input_mask, labels=b_labels)

                loss = outputs[0]
                loss.backward()
                total_loss += loss.item()

                if loader == 'train':
                    torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                logits = outputs[1].detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()
                predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
                true_labels.extend(label_ids)

            avg_loss = total_loss / len(all_loader[loader])
            print(f"Average {loader} loss: {avg_loss}")
            pred_tags,valid_tags = [],[]
            for p, l in zip(predictions, true_labels):
                _p = []
                _l = []
                for p_i, l_i in zip(p, l):
                    if tag_values[l_i] != "PAD":
                        _p.append(tag_values[p_i])
                        _l.append(tag_values[l_i])
                pred_tags.append(_p)
                valid_tags.append(_l)
            print(f"{loader} F1-Score: {f1_score(valid_tags , pred_tags,scheme = IOB2)}")
            if loader == 'valid':
                Fold_score.append(f1_score(valid_tags , pred_tags,scheme = IOB2))
    if Fold_score != []:
        All_Fold_score.append(Fold_score)
#             if _ == (epochs - 1):
#                 print(classification_report(valid_tags, pred_tags,mode='strict',scheme = IOB2))

FOLD 0
--------------------------------


Some weights of the model checkpoint at bert-base-cased were not used when initializing BertForTokenClassification: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cas

Average train loss: 0.010023663633511403




train F1-Score: 0.6213314244810307
Average valid loss: 0.00864833544458802
valid F1-Score: 0.7517619420516837


Epoch:  10%|███████▎                                                                 | 1/10 [08:37<1:17:40, 517.87s/it]

Average train loss: 0.004190742562078843
train F1-Score: 0.8119650418956663
Average valid loss: 0.006593502619000899
valid F1-Score: 0.8166259168704157


Epoch:  20%|██████████████▌                                                          | 2/10 [17:08<1:08:30, 513.83s/it]

In [None]:
All_score = len(All_Fold_score[0]) * [0]
for score in All_Fold_score:
    All_score = np.sum([All_score,score], axis = 0)
All_score = np.round((All_score / 10),2)     

In [None]:
All_score

## 單用 trainingset 去做 10-fold CV

In [None]:
kfold = KFold(n_splits=k_folds, shuffle=True)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
epochs = 10
max_grad_norm = 3.0

tr_dataset = BertNerDataset(input_ids,tags,tokenizer.convert_tokens_to_ids('[PAD]'),tag2idx['PAD'])

In [None]:
All_Fold_score = []
for fold, (train_ids, test_ids) in enumerate(kfold.split(tr_dataset)):
    print(f'FOLD {fold}')
    print('--------------------------------')
    train_subsampler = torch.utils.data.SubsetRandomSampler(train_ids)
    test_subsampler = torch.utils.data.SubsetRandomSampler(test_ids)
    train_dataloader = DataLoader(dataset, batch_size=bs,
                                  collate_fn=tr_dataset.collate_fn,
                                  sampler=train_subsampler)
    valid_dataloader = DataLoader(dataset, batch_size=bs,
                                  collate_fn=va_dataset.collate_fn,
                                  sampler=test_subsampler)

    total_steps = len(train_dataloader) * epochs
    model = BertForTokenClassification.from_pretrained(
        model_ty,
        num_labels=len(tag2idx),
        output_attentions = False,
        output_hidden_states = False
    )
    optimizer = AdamW(model.parameters(),lr=3e-5,eps=1e-8)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=0,
        num_training_steps=total_steps
    )
    model.cuda()
    all_loader = {
    "train" : train_dataloader,
    "valid" : valid_dataloader,
    }
    Fold_score = []
    for _ in trange(epochs, desc="Epoch"):
        for loader in all_loader:
            total_loss = 0
            # Training loop
            predictions , true_labels = [], []
            for step, batch in enumerate(all_loader[loader]):
                if loader == 'train':
                    model.train()
                else:
                    model.eval()
                batch = tuple(t.to(device) for t in batch)
                b_input_ids, b_labels ,b_input_mask = batch
                model.zero_grad()
                outputs = model(b_input_ids, token_type_ids=None,
                                attention_mask=b_input_mask, labels=b_labels)

                loss = outputs[0]
                loss.backward()
                total_loss += loss.item()

                if loader == 'train':
                    torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
                    optimizer.step()
                    scheduler.step()
                logits = outputs[1].detach().cpu().numpy()
                label_ids = b_labels.to('cpu').numpy()
                predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
                true_labels.extend(label_ids)

            avg_loss = total_loss / len(all_loader[loader])
            print(f"Average {loader} loss: {avg_loss}")
            pred_tags,valid_tags = [],[]
            for p, l in zip(predictions, true_labels):
                _p = []
                _l = []
                for p_i, l_i in zip(p, l):
                    if tag_values[l_i] != "PAD":
                        _p.append(tag_values[p_i])
                        _l.append(tag_values[l_i])
                pred_tags.append(_p)
                valid_tags.append(_l)
            print(f"{loader} F1-Score: {f1_score(valid_tags , pred_tags,scheme = IOB2)}")
            if loader == 'valid':
                Fold_score.append(f1_score(valid_tags , pred_tags,scheme = IOB2))
    if Fold_score != []:
        All_Fold_score.append(Fold_score)
#             if _ == (epochs - 1):
#                 print(classification_report(valid_tags, pred_tags,mode='strict',scheme = IOB2))

In [None]:
All_score = len(All_Fold_score[0]) * [0]
for score in All_Fold_score:
    All_score = np.sum([All_score,score], axis = 0)
All_score = np.round((All_score / 10) , 2)
All_score

## 全部的 trainingset 訓練 model 後, 透過 validset 評估 model

In [None]:
model = BertForTokenClassification.from_pretrained(
    model_ty,
    num_labels=len(tag2idx),
    output_attentions = False,
    output_hidden_states = False
)
optimizer = AdamW(model.parameters(),lr=3e-5,eps=1e-8)
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps
    )
model.cuda()
train_dataloader = DataLoader(tr_dataset, batch_size=bs,
                            collate_fn=tr_dataset.collate_fn)
valid_dataloader = DataLoader(va_dataset, batch_size=bs,
                            collate_fn=va_dataset.collate_fn)
all_loader = {
    "train" : train_dataloader,
    "valid" : valid_dataloader,
}

In [None]:
F_Score = []
for _ in trange(epochs, desc="Epoch"):
    for loader in all_loader:
        total_loss = 0
        # Training loop
        predictions , true_labels = [], []
        for step, batch in enumerate(all_loader[loader]):
            if loader == 'train':
                model.train()
            else:
                model.eval()
            batch = tuple(t.to(device) for t in batch)
            b_input_ids, b_labels ,b_input_mask = batch
            model.zero_grad()
            outputs = model(b_input_ids, token_type_ids=None,
                            attention_mask=b_input_mask, labels=b_labels)

            loss = outputs[0]
            loss.backward()
            total_loss += loss.item()

            if loader == 'train':
                torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=max_grad_norm)
                optimizer.step()
                scheduler.step()
            logits = outputs[1].detach().cpu().numpy()
            label_ids = b_labels.to('cpu').numpy()
            predictions.extend([list(p) for p in np.argmax(logits, axis=2)])
            true_labels.extend(label_ids)

        avg_loss = total_loss / len(all_loader[loader])
        print(f"Average {loader} loss: {avg_loss}")
        pred_tags,valid_tags = [],[]
        for p, l in zip(predictions, true_labels):
            _p = []
            _l = []
            for p_i, l_i in zip(p, l):
                if tag_values[l_i] != "PAD":
                    _p.append(tag_values[p_i])
                    _l.append(tag_values[l_i])
            pred_tags.append(_p)
            valid_tags.append(_l)
        print(f"{loader} F1-Score: {f1_score(valid_tags , pred_tags,scheme = IOB2)}")
        if loader == 'valid':
            F_Score.append(f1_score(valid_tags , pred_tags,scheme = IOB2))

In [None]:
np.round(np.array(F_Score),2)