# BigBird Large

In [14]:
%reset -f

import os, sys, tqdm, sklearn, sklearn.metrics
import numpy as np
import pandas as pd

import torch, torch.utils
import transformers

import utils.preprocessing as preprocessing
import utils.dataset as dataset
import utils.metrics as metrics

# Report the interpreter and key library versions so a run can be reproduced.
print("Done Loading Libraries")
print("Versions".center(22, "="))
_library_versions = {
    "Python": sys.version,
    "NumPy": np.__version__,
    "PyTorch": torch.__version__,
    "Transformers": transformers.__version__,
}
print("\n".join(f"{_name}: {_ver}" for _name, _ver in _library_versions.items()))

Done Loading Libraries
Python: 3.8.0 (default, Jan 20 2022, 17:32:02) 
[Clang 13.0.0 (clang-1300.0.27.3)]
NumPy: 1.22.1
PyTorch: 1.10.2
Transformers: 4.16.2


## 1. Configuration

In [15]:
# Hub identifier of the pretrained checkpoint and the local directory that
# holds the tokenizer / config / weights for it.
MODEL_NAME = "google/bigbird-roberta-large"
MODEL_PATH = "./models/bigbird-roberta-large"

# True  -> the model cell fetches everything from MODEL_NAME and saves it locally.
# False -> the model cell loads the local copy.
MODEL_DOWNLOAD = False

# True  -> the dataset cell calls preprocessing.make_NER_dataframe.
# False -> the dataset cell reads an existing k-fold file.
# (The name is spelled this way because the dataset cell reads it under this name.)
DATA_DAWNLOAD = False

# Run tag; used as the suffix of the saved weights file.
VERSION = "001"

config = dict(
    model_name=MODEL_NAME,
    max_length=1024,        # passed to the datasets as max_len
    train_batch_size=4,
    valid_batch_size=4,
    epochs=5,
    # One learning rate per epoch; the training loop indexes this list by epoch.
    learning_rates=[2.5e-5, 2.5e-5, 2.5e-6, 2.5e-6, 2.5e-7],
    max_grad_norm=10,       # max_norm for gradient-norm clipping
)

## 2. Modelの準備

In [16]:
# Obtain the tokenizer, the model config and the token-classification model.
# Both branches bind the same names (tokenizer, config_model, model) so the
# cells below work regardless of MODEL_DOWNLOAD, and both use MODEL_PATH as the
# single source of truth for the local directory.
if MODEL_DOWNLOAD:
    os.makedirs(MODEL_PATH, exist_ok=True)

    # Download the tokenizer and keep a local copy.
    tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_NAME, add_prefix_space=True)
    tokenizer.save_pretrained(MODEL_PATH)

    # Download the config and set the size of the classification head.
    config_model = transformers.AutoConfig.from_pretrained(MODEL_NAME)
    config_model.num_labels = 15
    config_model.save_pretrained(MODEL_PATH)

    # Download the model weights and keep a local copy.
    model = transformers.AutoModelForTokenClassification.from_pretrained(MODEL_NAME, config=config_model)
    model.save_pretrained(MODEL_PATH)
    backbone = model  # previous name for the downloaded model, kept as an alias

    print("Done download!!")
else:
    # Load from the directory written by save_pretrained rather than from
    # individual files, so the load does not depend on the weight file's name.
    tokenizer = transformers.AutoTokenizer.from_pretrained(MODEL_PATH)
    config_model = transformers.AutoConfig.from_pretrained(MODEL_PATH)
    model = transformers.AutoModelForTokenClassification.from_pretrained(MODEL_PATH, config=config_model)
    print("Done Load a Modal!!")

Done Load a Modal!!


## 3. Datasetの準備

In [17]:
# Optionally (re)build the k-fold NER data, then always load the selected fold.
# Previously the frames were only assigned when DATA_DAWNLOAD was False, so the
# cells below failed with NameError after a rebuild.
# NOTE(review): assumes make_NER_dataframe writes the files that
# read_kfold_file reads from "./data" (same directory and n_splits) — confirm.
if DATA_DAWNLOAD:
    preprocessing.make_NER_dataframe("./data", n_splits=10)

train_df, val_df, lookups = preprocessing.read_kfold_file("./data", fold=7, n_splits=10)

In [18]:
# Wrap the two frames of the selected fold as datasets.
# The validation dataset must be built from val_df; it was previously built
# from train_df, which made validation run on the training data.
train_dataset = dataset.CustomDataset(train_df, lookups, tokenizer, max_len=config["max_length"], val=False)
val_dataset = dataset.CustomDataset(val_df, lookups, tokenizer, max_len=config["max_length"], val=True)

In [19]:
# Loader settings shared by both splits.
_loader_common = {'num_workers': 2, 'pin_memory': True}

# Training batches are reshuffled; validation batches keep dataset order.
train_params = {'batch_size': config['train_batch_size'], 'shuffle': True, **_loader_common}
val_params = {'batch_size': config['valid_batch_size'], 'shuffle': False, **_loader_common}

train_dataloader = torch.utils.data.DataLoader(train_dataset, **train_params)
val_dataloader = torch.utils.data.DataLoader(val_dataset, **val_params)

## 4. Train Model

In [22]:
def _masked_token_accuracy(logits, labels, num_labels, ignore_index=-100):
    """Return the fraction of non-ignored tokens whose argmax prediction equals the label.

    Positions whose label equals ``ignore_index`` are excluded. Returns 0.0
    when no position is active, so one fully-masked batch cannot turn the
    running average into NaN.
    """
    flat_labels = labels.view(-1)                                                  # [batch_size * seq_len]
    flat_predictions = torch.argmax(logits.detach().view(-1, num_labels), dim=1)   # [batch_size * seq_len]
    active = flat_labels != ignore_index
    n_active = int(active.sum().item())
    if n_active == 0:
        return 0.0
    n_correct = int((flat_predictions[active] == flat_labels[active]).sum().item())
    return n_correct / n_active


def train_model(model, dataloader, optimizer, config, epochs=None):
    """Fine-tune a token-classification model and log per-epoch loss and accuracy.

    Args:
        model: Module exposing ``num_labels`` and callable as
            ``model(input_ids=..., attention_mask=..., labels=..., return_dict=False)``,
            returning ``(loss, logits)``. It is moved to the selected device
            and trained in place.
        dataloader: Iterable of batches; each batch is indexed with the keys
            "input_ids", "attention_mask" and "labels" and must yield tensors.
        optimizer: Optimizer over the model parameters. The learning rate of
            every param group is overwritten at the start of each epoch.
        config: Dict with "epochs", "learning_rates" (one rate per epoch) and
            "max_grad_norm". An optional "log_path" overrides the CSV location.
        epochs: Optional override for ``config["epochs"]``.

    Returns:
        None. The epoch log is rewritten to the CSV file after every epoch.
    """
    num_epochs = config["epochs"] if epochs is None else epochs
    learning_rates = config["learning_rates"]
    log_path = config.get("log_path", "./logs/log_bigbird-roberta-large_001.csv")

    # Make sure the log directory exists before the first epoch finishes;
    # otherwise a whole epoch of training is lost to a failing to_csv.
    log_dir = os.path.dirname(log_path)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)

    device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
    print("Device:", device)
    model.to(device)

    logs = []

    for epoch in range(num_epochs):
        epoch_loss = 0.0
        epoch_acc = 0.0
        iters = 0

        # Manual per-epoch schedule. If there are more epochs than listed
        # rates, keep using the last rate instead of raising IndexError.
        lr = learning_rates[min(epoch, len(learning_rates) - 1)]
        for group in optimizer.param_groups:
            group["lr"] = lr

        model.train()
        with tqdm.tqdm(dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}") as t:
            for batch in t:
                iters += 1

                ids = batch["input_ids"].to(device, dtype=torch.long)
                mask = batch["attention_mask"].to(device, dtype=torch.long)
                labels = batch["labels"].to(device, dtype=torch.long)

                loss, tr_logits = model(input_ids=ids, attention_mask=mask, labels=labels, return_dict=False)

                epoch_loss += loss.item()

                # Accuracy over active labels only (label != -100).
                epoch_acc += _masked_token_accuracy(tr_logits, labels, model.num_labels)

                # Backpropagation.
                optimizer.zero_grad()
                loss.backward()

                # Gradient clipping by global norm.
                torch.nn.utils.clip_grad_norm_(parameters=model.parameters(), max_norm=config["max_grad_norm"])

                optimizer.step()

                t.set_postfix_str(f"Loss: {epoch_loss/iters:.4f}, Acc: {epoch_acc/iters:.4f}")

        torch.cuda.empty_cache()

        # An empty dataloader yields zero averages instead of ZeroDivisionError.
        steps = max(iters, 1)
        logs.append({"epoch": epoch + 1, "train_loss": epoch_loss / steps, "train_acc": epoch_acc / steps})
        pd.DataFrame(logs).to_csv(log_path)

In [20]:
# Adam over all model parameters. The initial lr is the first scheduled rate;
# train_model overwrites it at the start of every epoch.
optimizer = torch.optim.Adam(params=model.parameters(), lr=config["learning_rates"][0])

torch.cuda.empty_cache()
# train_model reads the epoch count from config; the extra positional argument
# passed here before did not match its four-parameter signature.
train_model(model, train_dataloader, optimizer, config)

In [23]:
# Persist the fine-tuned weights (state_dict only), tagged with the run VERSION.
checkpoint_path = f"{MODEL_PATH}/bigbird-roberta-large_{VERSION}.pth"
torch.save(model.state_dict(), checkpoint_path)