In [1]:
import torch
import pytorch_lightning as pl

from processors import MnliProcessor
from bert_base_model import LightningBertForSequenceClassification

num_gpus = -1 if torch.cuda.is_available() else None

In [2]:
# prepare hyperparameters

max_steps = -1 # if -1 then calculate number of training steps based on the length of the train set
len_train_set = 392702

gradient_accumulation_steps = 1
learning_rate = 2e-5
weight_decay = 0.0
adam_epsilon = 1e-8
warmup_proportion = 0

num_train_epochs = 5
batch_size = 32

if max_steps > 0:
    num_train_epochs = max_steps // (len_train_set // gradient_accumulation_steps) + 1
    num_training_steps = max_steps
else:
    num_training_steps = len_train_set // gradient_accumulation_steps * num_train_epochs
    
warmup_steps = num_training_steps // num_train_epochs * warmup_proportion

In [3]:
num_training_steps

1963510

In [4]:
hparams = { 'learning_rate': learning_rate,
            'adam_epsilon': adam_epsilon,
            'weight_decay': weight_decay,
            'warmup_steps': warmup_steps,
            'num_training_steps': num_training_steps,
            'batch_size': batch_size,
           }

model = LightningBertForSequenceClassification(processor=MnliProcessor(), hparams=hparams)

In [5]:
processor = model.get_processor()

In [6]:
train_dataset, _ = processor.load_and_cache_examples("data/MNLI", example_set='train')

In [7]:
val_dataset, _ = processor.load_and_cache_examples("data/MNLI", example_set='dev')

In [8]:
test_dataset, _ = processor.load_and_cache_examples("data/MNLI", example_set='test')

In [9]:
model.set_train_dataset(train_dataset)
model.set_val_dataset(val_dataset)
model.set_test_dataset(test_dataset)

In [10]:
from pytorch_lightning.logging import TensorBoardLogger

save_root_path ='resources/models/MNLI_on_lightning/'
tensor_logger = TensorBoardLogger(save_dir= save_root_path + 'logs', version=10, name='mnli_finetuning')
checkpoint_save_path = save_root_path + 'checkpoints/'

In [11]:
from pytorch_lightning.callbacks import ModelCheckpoint

checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_save_path,
    verbose=True,
    monitor='val_loss',
    mode='min'
)

  f"Checkpoint directory {filepath} exists and is not empty with save_top_k != 0."


In [12]:
amp_opt_level='O1' # https://nvidia.github.io/apex/amp.html#opt-levels
max_grad_norm = 1.0

In [13]:
trainer = pl.Trainer(default_save_path=checkpoint_save_path, logger=tensor_logger, gpus=num_gpus,
                     max_epochs = num_train_epochs, amp_level=amp_opt_level, gradient_clip_val=max_grad_norm,
                     max_steps = num_training_steps, checkpoint_callback=checkpoint_callback)

In [14]:
trainer.fit(model)

HBox(children=(FloatProgress(value=0.0, description='Validation sanity check', layout=Layout(flex='2'), max=5.…



HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(flex='2'), max=1.0), HTML(value='')), …

HBox(children=(FloatProgress(value=0.0, description='Validating', layout=Layout(flex='2'), max=307.0, style=Pr…

HBox(children=(FloatProgress(value=0.0, description='Validating', layout=Layout(flex='2'), max=307.0, style=Pr…

HBox(children=(FloatProgress(value=0.0, description='Validating', layout=Layout(flex='2'), max=307.0, style=Pr…

HBox(children=(FloatProgress(value=0.0, description='Validating', layout=Layout(flex='2'), max=307.0, style=Pr…

HBox(children=(FloatProgress(value=0.0, description='Validating', layout=Layout(flex='2'), max=307.0, style=Pr…




1

In [15]:
trainer.tqdm_metrics

{'val_loss': 0.7116624712944031, 'avg_val_acc': 0.8278528451919556}

In [16]:
trainer.save_checkpoint(save_root_path + 'training_checkpoint')

torch.save(model.state_dict(), save_root_path + 'pytorch_model.bin')
with open(save_root_path + 'bert_config.json', 'w') as f:
    f.write(model.bert.config.to_json_string())
model.tokenizer.save_pretrained(save_root_path)

('models/MNLI_on_lightning/vocab.txt',
 'models/MNLI_on_lightning/special_tokens_map.json',
 'models/MNLI_on_lightning/added_tokens.json')

In [17]:
trainer.test(model)

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=308.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.8330)}
----------------------------------------------------------------------------------------------------



In [18]:
trainer.tqdm_metrics

{'val_loss': 0.7116624712944031,
 'avg_val_acc': 0.8278528451919556,
 'avg_test_acc': 0.8329951167106628}