In [1]:
import os

import torch
from torch import save
from torch.optim import AdamW
from sklearn.model_selection import train_test_split

from transformers import BertTokenizerFast, BertForSequenceClassification

from src.dataset import load_dataset, Species, Modification
from src.utils.transformers import encode_seq_bunch, make_dataloader, train_epoch, calculate_acc_dataset

In [2]:
# Pick the compute device at runtime instead of hard-coding 'mps':
# Apple's Metal backend is still preferred when present, then CUDA, and
# finally the CPU so the notebook runs on machines without an accelerator.
if torch.backends.mps.is_available():
    DEVICE = 'mps'
elif torch.cuda.is_available():
    DEVICE = 'cuda'
else:
    DEVICE = 'cpu'

# Checkpoint name passed to `from_pretrained` for both the tokenizer and the model.
MODEL = 'bert-base-uncased'

# Prefix of the checkpoint file names written by the training loop.
EXPERIMENT_NAME = 'bert-simple'

In [3]:
# Load the fast BERT tokenizer that belongs to the MODEL checkpoint, so the
# token ids match the vocabulary of the model loaded later from the same name.
tokenizer = BertTokenizerFast.from_pretrained(MODEL)

In [4]:
# Load the project dataset selected by Species.human and Modification.psi.
# NOTE(review): the structure of the returned object is defined in src.dataset — confirm there.
dataset = load_dataset(Species.human, Modification.psi)

In [5]:
# Encode the dataset with the tokenizer; the helper returns a pair that is
# unpacked into the encoded sequences and their labels.
# NOTE(review): the meaning of the positional `True` flag is defined in
# src.utils.transformers.encode_seq_bunch — confirm and consider passing it by keyword.
# NOTE(review): the recorded run emitted a tokenizer warning that `max_length` was
# given without `truncation=True`; presumably raised inside the helper — confirm.
sequences, labels = encode_seq_bunch(dataset, tokenizer, True)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [6]:
# Fixed seed so the train/validation/test partition is the same on every
# Restart & Run All; without it each run evaluates on a different test set.
SPLIT_SEED = 42

# First hold out 20% of the data as the test set ...
x_train, x_test, y_train, y_test = train_test_split(
    sequences, labels, test_size=0.2, random_state=SPLIT_SEED
)
# ... then take 20% of the remaining 80% (16% of the total) as the validation set.
x_train, x_val, y_train, y_val = train_test_split(
    x_train, y_train, test_size=0.2, random_state=SPLIT_SEED
)

In [7]:
# Wrap each (inputs, labels) split in a dataloader. The generator is consumed
# left to right, so the loaders are built in the order train, test, validation.
train_dataloader, test_dataloader, val_dataloader = (
    make_dataloader(inputs, targets)
    for inputs, targets in (
        (x_train, y_train),
        (x_test, y_test),
        (x_val, y_val),
    )
)

In [8]:
# Load the pretrained encoder with a freshly initialised 2-label classification
# head and move it to DEVICE in the same statement. `.to()` returns the module
# itself, and ending the cell on an assignment keeps the cell output empty.
model = BertForSequenceClassification.from_pretrained(MODEL, num_labels=2).to(DEVICE)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# AdamW over all model parameters (encoder and classification head alike)
# with a learning rate of 2e-5.
optimizer = AdamW(model.parameters(), lr=2e-5)

In [10]:
# Best-so-far bookkeeping read and updated by the training loop:
# both accuracies start at zero, and no checkpoint file has been written yet.
old_train_acc = old_val_acc = 0
old_model_name = ''

In [11]:
# Running count of epochs trained so far; the training loop increments it once
# per epoch and embeds it in the checkpoint file name.
# NOTE(review): presumably kept in its own cell so the training cell can be
# re-run to continue training without resetting the count — confirm.
TOTAL_EPOCHS = 0

In [12]:
# Number of epochs trained each time this cell is executed. TOTAL_EPOCHS keeps
# counting across executions, so re-running the cell continues the numbering.
EPOCHS_PER_RUN = 5

for _ in range(EPOCHS_PER_RUN):
    TOTAL_EPOCHS += 1

    train_acc, val_acc = train_epoch(TOTAL_EPOCHS, DEVICE, model, optimizer, train_dataloader, val_dataloader)

    # A checkpoint is written only when BOTH the training and the validation
    # accuracy beat the best values recorded so far.
    if train_acc > old_train_acc and val_acc > old_val_acc:
        # `.2f` gives a fixed two-decimal value; a bare `.2` means two
        # significant digits and can fall back to exponent notation.
        new_model_name = f'{EXPERIMENT_NAME}_ep-{TOTAL_EPOCHS}_tacc-{train_acc:.2f}_vacc-{val_acc:.2f}.pt'

        # Write the new checkpoint BEFORE removing the previous one, so a failed
        # save (disk full, interrupt, ...) never leaves us without any checkpoint.
        # NOTE(review): this serialises the whole model object rather than a
        # state_dict; kept unchanged so existing loading code keeps working.
        save(model, new_model_name)

        # Remove the superseded checkpoint. Skip it when there is none yet, when
        # it has the same name as the file just written (that would delete the
        # new checkpoint), or when it was already removed by hand.
        has_previous = old_model_name != '' and old_model_name != new_model_name
        if has_previous and os.path.exists(old_model_name):
            os.unlink(old_model_name)

        old_val_acc = val_acc
        old_train_acc = train_acc
        old_model_name = new_model_name

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  0%|          | 0/159 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [None]:
# Evaluate on the held-out test split; the returned value is shown as the cell output.
# NOTE(review): `model` is the in-memory model after the last executed epoch,
# which is not necessarily the checkpoint the training loop saved as best —
# load that file first if the best checkpoint is what should be reported.
calculate_acc_dataset(DEVICE, model, test_dataloader)