# BigBird Baseline

Kake - 02/10  
Bigbird based model

**Reference:**  
Articles: [BigBird論文](https://arxiv.org/abs/2007.14062),   
　　　　[BERT論文](https://arxiv.org/abs/1810.04805) ([日本語要約](https://qiita.com/omiita/items/72998858efc19a368e50))  
HuggingFace: https://huggingface.co/google/bigbird-roberta-base  
SourseCode: https://www.kaggle.com/cdeotte/pytorch-bigbird-ner-cv-0-615

In [112]:
%reset -f

import os, sys, tqdm, sklearn, sklearn.metrics
import numpy as np
import pandas as pd

import torch, torch.utils
import transformers

np.random.seed(123)

print("Done Loading Libraries")
print("Versions".center(22, "="))
print(f"Python: {sys.version}")
print(f"NumPy: {np.__version__}")
print(f"PyTorch: {torch.__version__}")
print(f"Transformers: {transformers.__version__}")

Done Loading Libraries
Python: 3.7.12 (default, Jan 15 2022, 18:48:18) 
[GCC 7.5.0]
NumPy: 1.19.5
PyTorch: 1.10.0+cu111
Transformers: 4.16.2


## 1. modelのダウンロード

モデルパラメータがローカルに存在しない時に、HuggingFaceからダウンロードする

In [113]:
DOWNLOADED_MODEL_PATH = "/content/model/bigbird_001"
MODEL_NAME = 'google/bigbird-roberta-base'
if DOWNLOADED_MODEL_PATH == None:
    DOWNLOADED_MODEL_PATH = "models"
if DOWNLOADED_MODEL_PATH == "models":
    os.makedirs("models", exist_ok=True)

    # tokenizerのダウンロード
    # tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)
    # tokenizer.save_pretrained("models")

    # configのダウンロード
    config_model = transformers.AutoConfig.from_pretrained(MODEL_NAME)
    config_model.num_labels = 15
    config_model.save_pretrained("models")

    # model weightのダウンロード
    backbone = transformers.AutoModelForTokenClassification.from_pretrained(MODEL_NAME, config=config_model)
    backbone.save_pretrained("models")

In [114]:
config = {'model_name': MODEL_NAME,   
         'max_length': 1024,
         'train_batch_size':4,
         'valid_batch_size':4,
         'epochs':5,
         'learning_rates': [2.5e-5, 2.5e-5, 2.5e-6, 2.5e-6, 2.5e-7],
         'max_grad_norm':10}

## 2. DatasetとDataLoaderの準備

### DataをDataframeで読み込む

In [115]:
train_label_df = pd.read_csv("./data/train.csv")

### TrainSetをNERラベルに置き換える

それぞれの単語がどの役割を持つかのラベル  
すでにファイルが存在するときはそれを読み込むだけ

In [116]:
LOAD_TOKEN_FROM = "./data"
if not LOAD_TOKEN_FROM:
    all_entities = []
    for k, items in enumerate(train_text_df.iterrows()):
        # NERラベルのないダミーラベルを作る
        total = items[1]["text"].split().__len__()
        entities = ["O"]*total

        for j in train_label_df[train_label_df["id"] == items[1]["id"]].iterrows():
            # discourseタイプを読み込む
            discourse = j[1]["discourse_type"]
            list_idx = [int(x) for x in j[1]["predictionstring"].split(" ")]

            # 開始タグをつける
            entities[list_idx[0]] = f"B-{discourse}"

            for k in list_idx[1:]:
                entities[k] = f"I-{discourse}"
        all_entities.append(entities)
    train_text_df["entities"] = all_entities
    train_text_df.to_csv("./data/train_NER.csv", index=False)
else:
    from ast import literal_eval
    train_text_df = pd.read_csv(f"{LOAD_TOKEN_FROM}/train_NER.csv")
    train_text_df.entities = train_text_df.entities.apply(lambda x: literal_eval(x))
print(train_text_df.shape)
train_text_df.head()

(15594, 3)


Unnamed: 0,id,text,entities
0,3321A3E87AD3,I do agree that some students would benefit fr...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
1,DFEAEC512BAB,Should students design a summer project for sc...,"[O, O, O, O, O, O, O, O, B-Position, I-Positio..."
2,2E4AFCD3987F,"Dear State Senator\n\n,\n\nIn the ruels of vot...","[O, O, O, O, B-Position, I-Position, I-Positio..."
3,EB6C2AF20BFE,People sometimes have a different opinion than...,"[B-Lead, I-Lead, I-Lead, I-Lead, I-Lead, I-Lea..."
4,A91A08E523D5,"Dear senator,\n\nAs you know the Electoral Col...","[O, O, B-Lead, I-Lead, I-Lead, I-Lead, I-Lead,..."


token化するためのlookup辞書を作る

In [117]:
lookups = ['O', 'B-Lead', 'I-Lead', 'B-Position', 'I-Position', 'B-Claim', 'I-Claim', 'B-Counterclaim', 'I-Counterclaim', 'B-Rebuttal', 'I-Rebuttal', 'B-Evidence', 'I-Evidence', 'B-Concluding Statement', 'I-Concluding Statement']

labels_to_ids = {v:k for k, v in enumerate(lookups)}
ids_to_labels = {k:v for k, v in enumerate(lookups)}

In [70]:
labels_to_ids

{'B-Claim': 5,
 'B-Concluding Statement': 13,
 'B-Counterclaim': 7,
 'B-Evidence': 11,
 'B-Lead': 1,
 'B-Position': 3,
 'B-Rebuttal': 9,
 'I-Claim': 6,
 'I-Concluding Statement': 14,
 'I-Counterclaim': 8,
 'I-Evidence': 12,
 'I-Lead': 2,
 'I-Position': 4,
 'I-Rebuttal': 10,
 'O': 0}

### Datasetクラスの実装

PyTorchで実装しているためDatasetを作る。

sourceではtoken化の際に`.split(is_split_into_word=True)`を用いている。これは`\n`を取り除くので、新しい段落を認識させたいときは[別の方法](https://www.kaggle.com/cdeotte/tensorflow-longformer-ner-cv-0-633?scriptVersionId=83615733)を使う必要がある

In [118]:
class dataset(torch.utils.data.Dataset):
    def __init__(self, dataframe, tokenizer, max_len, get_wids):
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.get_wids = get_wids    # for validation
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, index):
        item = self.pull_item(index)
        return item

    def pull_item(self, index):
        # テキストとラベルを取得する
        text = self.data.text[index]
        word_labels = self.data.entities[index] if not self.get_wids else None

        # tokenize
        encoding = self.tokenizer(text.split(), is_split_into_words=True, padding="max_length", truncation=True, max_length=self.max_len)
        word_ids = encoding.word_ids()

        # ターゲットを作る
        if not self.get_wids:
            previous_word_idx = None
            label_ids = []
            for word_idx in word_ids:
                if word_idx is None:
                    label_ids.append(-100)
                elif word_idx != previous_word_idx:
                    label_ids.append(labels_to_ids[word_labels[word_idx]])
                else:
                    label_ids.append(labels_to_ids[word_labels[word_idx]])
                previous_word_idx = word_idx
            encoding["labels"] = label_ids
        
        # torchTensorに変換
        item = {key: torch.as_tensor(val) for key, val in encoding.items()}
        if self.get_wids:
            word_ids2 = [w if w is not None else -1 for w in word_ids]
            item["wids"] = torch.as_tensor(word_ids2)
        return item

今回はとりあえず90%:10%のホールドアウト法で行う。(後で5-fold-CVもやります...)

In [119]:
IDs = train_label_df.id.unique()
print(f"There are", len(IDs), "train texts. Now we will split 90:10 for hold-out validation")

train_idx = np.random.choice(np.arange(len(IDs)), int(0.9*len(IDs)), replace=False)
val_idx = np.setdiff1d(np.arange(len(IDs)), train_idx)

There are 15594 train texts. Now we will split 90:10 for hold-out validation


In [129]:
data = train_text_df[["id", "text", "entities"]]
train_data = data.loc[data["id"].isin(IDs[train_idx]), ["text", "entities"]].reset_index(drop=True)
val_data = data.loc[data["id"].isin(IDs[val_idx])].reset_index(drop=True)

print(f"Full Dataset: {data.shape}")
print(f"Train Dataset: {train_data.shape}")
print(f"Val Dataset: {val_data.shape}")

tokenizer = transformers.AutoTokenizer.from_pretrained(DOWNLOADED_MODEL_PATH)
train_dataset = dataset(train_data, tokenizer, config["max_length"], False)
val_dataset = dataset(val_data, tokenizer, config["max_length"], True)

Full Dataset: (15594, 3)
Train Dataset: (14034, 2)
Val Dataset: (1560, 3)


### DataLoaderの実装

In [121]:
# TRAIN DATASET AND VALID DATASET
train_params = {'batch_size': config['train_batch_size'], 'shuffle': True, 'num_workers': 2, 'pin_memory':True}

val_params = {'batch_size': config['valid_batch_size'], 'shuffle': False, 'num_workers': 2, 'pin_memory':True}

train_dataloader = torch.utils.data.DataLoader(train_dataset, **train_params)
val_dataloader = torch.utils.data.DataLoader(val_dataset, **val_params)

## 3. Modelの訓練

ロードしたモデルは`batch_size=4`, `learning_rate=[5e-5, 5e-5, 5e-6, 5e-6, 5e-7]`で訓練されている。  
ちょっと色々サボってる(特にvalidation関連)

In [75]:
def train_model(model, dataloader, optimizer, config, epochs):
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    print(f"Device:", device)
    model.to(device)

    logs = []

    for epoch in range(epochs):
        epoch_loss = 0.0
        epoch_acc = 0.0
        iters = 0

        for g in optimizer.param_groups:
            g["lr"] = config["learning_rates"][epoch]

        phase = "train"
        if phase == "train":
            model.train()
        with tqdm.tqdm(dataloader, desc=f"Epoch {epoch + 1}/{epochs}") as t:
            for batch in t:
                iters += 1

                ids = batch["input_ids"].to(device, dtype = torch.long)
                mask = batch["attention_mask"].to(device, dtype=torch.long)
                labels = batch["labels"].to(device, dtype=torch.long)

                loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=False)

                epoch_loss += loss.item()

                # Accuracyの計算
                flatted_labels = labels.view(-1)    # [batch_size * seq_len, ]
                active_logits = tr_logits.view(-1, model.num_labels)    # [batch_size * seqlen, num_labels]
                flatted_predictions = torch.argmax(active_logits, axis=1)   # [batch_size * seq_len, ]

                # only compute accuracy at active labels
                active_accuracy = labels.view(-1) != -100 # shape (batch_size, seq_len)

                labels = torch.masked_select(flatted_labels, active_accuracy)
                predictions = torch.masked_select(flatted_predictions, active_accuracy)

                tr_acc = sklearn.metrics.accuracy_score(labels.cpu().numpy(), predictions.cpu().numpy())
                epoch_acc += tr_acc

                # 勾配クリッピング(Normで)
                torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=config["max_grad_norm"])

                # バックプロパゲーション
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()

                t.set_postfix_str(f"Loss: {loss.item():.4f}, Acc: {tr_acc:.4f}")

        torch.cuda.empty_cache()

        print(f"Epoch Loss: {epoch_loss / iters}, Epoch Acc: {epoch_acc / iters}")
        logs.append({"epoch": epoch + 1, "train_loss": epoch_loss / iters})
        df = pd.DataFrame(logs)
        df.to_csv("/content/logs/log_bigbird_001.csv")
        

In [76]:
config_model = transformers.AutoConfig.from_pretrained(DOWNLOADED_MODEL_PATH + "/config.json")
model = transformers.AutoModelForTokenClassification.from_pretrained(DOWNLOADED_MODEL_PATH + "/pytorch_model.bin", config=config_model)
optimizer = torch.optim.Adam(params=model.parameters(), lr=config["learning_rates"][0])

In [77]:
import warnings
warnings.simplefilter('ignore')
torch.cuda.empty_cache()
train_model(model, train_dataloader, optimizer, config, config["epochs"])

Device: cuda:0


Epoch 1/5: 100%|██████████| 3509/3509 [51:00<00:00,  1.15it/s, Loss: 1.0310, Acc: 0.6197]


Epoch Loss: 0.7663042591223862, Epoch Acc: 0.7489396775532061


Epoch 2/5: 100%|██████████| 3509/3509 [50:58<00:00,  1.15it/s, Loss: 0.2593, Acc: 0.9056]


Epoch Loss: 0.6004733942272861, Epoch Acc: 0.7928424021601809


Epoch 3/5: 100%|██████████| 3509/3509 [50:56<00:00,  1.15it/s, Loss: 0.6050, Acc: 0.7646]


Epoch Loss: 0.4668715739116841, Epoch Acc: 0.8358751960630695


Epoch 4/5: 100%|██████████| 3509/3509 [50:56<00:00,  1.15it/s, Loss: 0.3531, Acc: 0.8821]


Epoch Loss: 0.4368573189393242, Epoch Acc: 0.8452717076901172


Epoch 5/5: 100%|██████████| 3509/3509 [50:58<00:00,  1.15it/s, Loss: 0.2200, Acc: 0.9362]

Epoch Loss: 0.41067998352664664, Epoch Acc: 0.8540176892133634





In [78]:
torch.save(model.state_dict(), f'/content/model/bigbird_v001.pth')

## 4. 評価

In [130]:
def inference(model, batch):
    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    ids = batch["input_ids"].to(device)
    mask = batch["attention_mask"].to(device)
    outputs = model(ids, attention_mask=mask, return_dict=False)
    all_preds = torch.argmax(outputs[0], axis=-1).cpu().numpy()

    predictions = []
    for k, text_preds in enumerate(all_preds):
        token_preds = [ids_to_labels[i] for i in text_preds]

        prediction = []
        word_ids = batch["wids"][k].numpy()
        previous_word_idx = -1
        for idx, word_idx in enumerate(word_ids):
            if word_idx == -1:
                pass
            elif word_idx != previous_word_idx:
                prediction.append(token_preds[idx])
                previous_word_idx = word_idx
        predictions.append(prediction)
    return predictions

In [131]:
def get_predictions(model, dataframe, dataloader):
    model.eval()

    y_pred2 = []
    for batch in tqdm.tqdm(dataloader):
        labels = inference(model, batch)
        y_pred2.extend(labels)
    
    final_pred2 = []
    for i in range(len(dataframe)):
        idx = dataframe.id.values[i]
        pred = y_pred2[i]       # "B", "I"などを残す
        j = 0
        while j < len(pred):
            cls = pred[j]
            if cls == "O": j += 1
            else: cls = cls.replace("B", "I")
            end = j + 1
            while end < len(pred) and pred[end] == cls:
                end += 1
            
            if cls != "O" and cls != ""and end - j > 7:
                final_pred2.append((idx, cls.replace("I-", ""), " ".join(map(str, list(range(j, end))))))
            j = end
    
    oof = pd.DataFrame(final_pred2)
    oof.columns = ["id", "class", "predictionstring"]
    return oof

スコアの算出を行う

In [136]:
def calc_overlap(row):
    set_pred = set(row.predictionstring_pred.split(" "))
    set_groundtruth = set(row.predictionstring_gt.split(" "))
    len_pred = len(set_pred)
    len_groundtruth = len(set_groundtruth)
    inter = len(set_groundtruth.intersection(set_pred))
    overlap1 = inter / len_groundtruth
    overlap2 = inter / len_pred
    return [overlap1, overlap2]

def score_feedback_comp(pred_df, gt_df):
    gt_df = gt_df[["id", "discourse_type", "predictionstring"]].reset_index(drop=True).copy()
    pred_df = pred_df[["id", "class", "predictionstring"]].reset_index(drop=True).copy()
    gt_df["gt_id"] = gt_df.index
    pred_df["pred_id"] = pred_df.index
    
    # 1. 全ての正解ラベルと予測を比べる
    joined = pred_df.merge(gt_df, left_on=["id", "class"], right_on=["id", "discourse_type"], how="outer", suffixes=("_pred", "_gt"))
    joined["predictionstring_gt"] = joined["predictionstring_gt"].fillna(" ")
    joined["predictionstring_pred"] = joined["predictionstring_pred"].fillna(" ")
    joined["overlaps"] = joined.apply(calc_overlap, axis=1)

    # 2. 正解と予測のオーバラップが>=0.5でかつ予測と正解のオーバーラップが>=0.5の時TPとする
    joined["overlap1"] = joined["overlaps"].apply(lambda x: eval(str(x))[0])
    joined["overlap2"] = joined["overlaps"].apply(lambda x: eval(str(x))[1])

    joined["potential_TP"] = (joined["overlap1"] >= 0.5) & (joined["overlap2"] >= 0.5)
    joined["max_overlap"] = joined[["overlap1", "overlap2"]].max(axis=1)
    tp_pred_ids = joined.query("potential_TP").sort_values("max_overlap", ascending=False).groupby(["id", "predictionstring_gt"]).first()["pred_id"].values

    # 正解ラベルと一致する予測がない時にFN, 予測と一致する正解ラベルがない時にFPとする
    fp_pred_ids = [p for p in joined["pred_id"].unique() if p not in tp_pred_ids]

    matched_gt_ids = joined.query("potential_TP")["gt_id"].unique()
    unmatched_gt_ids = [c for c in joined["gt_id"].unique() if c not in matched_gt_ids]

    TP = len(tp_pred_ids)
    FP = len(fp_pred_ids)
    FN = len(unmatched_gt_ids)
    # mycroF1スコアを計算
    my_f1_score = TP / (TP + 0.5 * (FP + FN))
    return my_f1_score

スコアを計算して表示してみる

In [137]:
# validation target
val = train_label_df.loc[train_label_df["id"].isin(IDs[val_idx])]

config_model = transformers.AutoConfig.from_pretrained(DOWNLOADED_MODEL_PATH + "/config.json")
model = transformers.AutoModelForTokenClassification.from_pretrained(DOWNLOADED_MODEL_PATH + "/pytorch_model.bin", config=config_model)
model.load_state_dict(torch.load('/content/model/bigbird_v001.pth'))
print('Model loaded.')

torch.cuda.empty_cache()
oof = get_predictions(model, val_data, val_dataloader)

f1s = []
CLASSES = oof["class"].unique()
for c in CLASSES:
    pred_df = oof.loc[oof["class"] == c].copy()
    gt_df = val.loc[val["discourse_type"] == c].copy()
    f1 = score_feedback_comp(pred_df, gt_df)
    print(c, f1)
    f1s.append(f1)
print()
print("Overall:", np.mean(f1s))

Model loaded.


100%|██████████| 390/390 [01:40<00:00,  3.89it/s]


Lead 0.7608267716535433
Position 0.6227303295225286
Evidence 0.6541214972386992
Counterclaim 0.48088360237892946
Rebuttal 0.38215102974828374
Concluding Statement 0.8070417673455299
Claim 0.501190611180406

Overall: 0.6012779441525601
