In [1]:
import torch
import pytorch_lightning as pl

from processors import MnliProcessor
from bert_base_model import LightningBertForSequenceClassification
from firebert_fve import FireBERT_FVE
from firebert_fse import FireBERT_FSE

Instructions for updating:
non-resource variables are not supported in the long term


## Base MNLI model from the paper

In [2]:
# Checkpoint of the base model tuned on MNLI.
model_dir = 'resources/models/MNLI/pytorch_model.bin'

# The only hyperparameter set for this classifier is the batch size.
hparams = {'batch_size': 32}

# Build the classifier from the checkpoint, paired with the MNLI processor.
model = LightningBertForSequenceClassification(
    load_from=model_dir,
    processor=MnliProcessor(),
    hparams=hparams,
)

In [3]:
# Score the base model on the MNLI 'dev' (validation) split.
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/MNLI",
    example_set='dev',
)
model.set_test_dataset(dataset)

# gpus=1 when CUDA is available, otherwise gpus=0.
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Last expression of the cell: show the metrics gathered by the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=307.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.8398), 'avg_test_f1': tensor(0.8331, dtype=torch.float64)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.8397846221923828, 'avg_test_f1': 0.8331295236055348}

In [4]:
# Score the base model on the MNLI 'test' split.
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/MNLI",
    example_set='test',
)
model.set_test_dataset(dataset)

# gpus=1 when CUDA is available, otherwise gpus=0.
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Last expression of the cell: show the metrics gathered by the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=308.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.8425), 'avg_test_f1': tensor(0.8352, dtype=torch.float64)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.8425324559211731, 'avg_test_f1': 0.8351874185857123}

In [5]:
# Score the base model on the adversarial dev samples ('adv_dev' split).
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/MNLI",
    example_set='adv_dev',
)
model.set_test_dataset(dataset)

# gpus=1 when CUDA is available, otherwise gpus=0.
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Last expression of the cell: show the metrics gathered by the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=235.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.0371), 'avg_test_f1': tensor(0.0355, dtype=torch.float64)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.03710106387734413, 'avg_test_f1': 0.035549824956334554}

In [6]:
# Score the base model on the adversarial test samples ('adv_test' split).
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/MNLI",
    example_set='adv_test',
)
model.set_test_dataset(dataset)

# gpus=1 when CUDA is available, otherwise gpus=0.
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Last expression of the cell: show the metrics gathered by the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=237.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.0287), 'avg_test_f1': tensor(0.0273, dtype=torch.float64)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.02874472551047802, 'avg_test_f1': 0.027315982757506095}

## MNLI model fine-tuned with PyTorch Lightning

In [7]:
# Checkpoint of our own version of the model tuned on the MNLI task.
model_dir = 'resources/models/MNLI_on_lightning/pytorch_model.bin'

# The only hyperparameter set for this classifier is the batch size.
hparams = {'batch_size': 32}

# Build the classifier from the checkpoint, paired with the MNLI processor.
model = LightningBertForSequenceClassification(
    load_from=model_dir,
    processor=MnliProcessor(),
    hparams=hparams,
)

In [8]:
# Score the Lightning-tuned model on the MNLI 'dev' (validation) split.
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/MNLI",
    example_set='dev',
)
model.set_test_dataset(dataset)

# gpus=1 when CUDA is available, otherwise gpus=0.
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Last expression of the cell: show the metrics gathered by the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=307.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.8279), 'avg_test_f1': tensor(0.8211, dtype=torch.float64)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.8278528451919556, 'avg_test_f1': 0.821133296087285}

In [9]:
# Score the Lightning-tuned model on the MNLI 'test' split.
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/MNLI",
    example_set='test',
)
model.set_test_dataset(dataset)

# gpus=1 when CUDA is available, otherwise gpus=0.
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Last expression of the cell: show the metrics gathered by the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=308.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.8330), 'avg_test_f1': tensor(0.8259, dtype=torch.float64)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.8329951167106628, 'avg_test_f1': 0.8259453206441073}

In [10]:
# Score the Lightning-tuned model on the adversarial dev samples ('adv_dev' split).
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/MNLI",
    example_set='adv_dev',
)
model.set_test_dataset(dataset)

# gpus=1 when CUDA is available, otherwise gpus=0.
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Last expression of the cell: show the metrics gathered by the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=235.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.4900), 'avg_test_f1': tensor(0.4733, dtype=torch.float64)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.490026593208313, 'avg_test_f1': 0.47326726672573016}

In [11]:
# Score the Lightning-tuned model on the adversarial test samples ('adv_test' split).
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/MNLI",
    example_set='adv_test',
)
model.set_test_dataset(dataset)

# gpus=1 when CUDA is available, otherwise gpus=0.
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Last expression of the cell: show the metrics gathered by the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=237.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.5012), 'avg_test_f1': tensor(0.4867, dtype=torch.float64)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.5011747479438782, 'avg_test_f1': 0.48670233734742874}

## FIVE on base MNLI model

In [2]:
# Load the *base* MNLI checkpoint (the same file the base-model section uses,
# not a co-tuned one); it is wrapped in FireBERT_FVE below.
model_dir = 'resources/models/MNLI/pytorch_model.bin'

# Hyperparameters handed to FireBERT_FVE.
# NOTE(review): 'std' carries a float artifact (8.139999999999995), presumably
# the raw output of a parameter search — confirm before rounding it.
hparams = {
    'batch_size': 8,
    'use_USE': False,
    'stop_words': True,
    'perturb_words': 1,
    'verbose': False,
    'vote_avg_logits': True,
    'std': 8.139999999999995,
    'vector_count': 8,
}

# Instantiate the FVE model on top of the base checkpoint.
model = FireBERT_FVE(
    load_from=model_dir,
    processor=MnliProcessor(),
    hparams=hparams,
)

In [3]:
# Score the FVE model on the MNLI 'dev' (validation) split.
# Here the examples returned by the processor are kept and passed to the model.
dataset, examples = model.get_processor().load_and_cache_examples(
    "data/MNLI",
    example_set='dev',
)
model.set_test_dataset(dataset, examples=examples)

# gpus=1 when CUDA is available, otherwise gpus=0.
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Last expression of the cell: show the metrics gathered by the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=1227.0, style=Prog…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.7479), 'avg_test_f1': tensor(0.6972)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.7479188442230225, 'avg_test_f1': 0.6971950531005859}

In [4]:
# Score the FVE model on the MNLI 'test' split.
dataset, examples = model.get_processor().load_and_cache_examples(
    "data/MNLI",
    example_set='test',
)
# Pass the examples by keyword, as the dev-split cell does, so the call does
# not depend on the positional order of set_test_dataset's parameters.
model.set_test_dataset(dataset, examples=examples)

# gpus=1 when CUDA is available, otherwise gpus=0.
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Last expression of the cell: show the metrics gathered by the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=1229.0, style=Prog…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.7563), 'avg_test_f1': tensor(0.7081)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.7563059329986572, 'avg_test_f1': 0.7081065773963928}

In [5]:
# Score the FVE model on the adversarial dev samples ('adv_dev' split).
dataset, examples = model.get_processor().load_and_cache_examples(
    "data/MNLI",
    example_set='adv_dev',
)
# Pass the examples by keyword, as the dev-split cell does, so the call does
# not depend on the positional order of set_test_dataset's parameters.
model.set_test_dataset(dataset, examples=examples)

# gpus=1 when CUDA is available, otherwise gpus=0.
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Last expression of the cell: show the metrics gathered by the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=937.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.6029), 'avg_test_f1': tensor(0.5400)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.60285484790802, 'avg_test_f1': 0.5400167107582092}

In [6]:
# Score the FVE model on the adversarial test samples ('adv_test' split).
dataset, examples = model.get_processor().load_and_cache_examples(
    "data/MNLI",
    example_set='adv_test',
)
# Pass the examples by keyword, as the dev-split cell does, so the call does
# not depend on the positional order of set_test_dataset's parameters.
model.set_test_dataset(dataset, examples=examples)

# gpus=1 when CUDA is available, otherwise gpus=0.
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Last expression of the cell: show the metrics gathered by the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=947.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.6321), 'avg_test_f1': tensor(0.5664)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.6320837736129761, 'avg_test_f1': 0.5663667917251587}

## FuSE on base MNLI model

In [7]:
# Load the *base* MNLI checkpoint (the same file the base-model section uses,
# not a co-tuned one); it is wrapped in FireBERT_FSE below.
model_dir = 'resources/models/MNLI/pytorch_model.bin'

# Hyperparameters handed to FireBERT_FSE.
# NOTE(review): batch_size is 1 here, and the recorded outputs for this section
# show avg_test_f1 identical to avg_test_acc — presumably F1 is averaged per
# batch; confirm before comparing F1 across sections.
hparams = {
    'use_USE': True,
    'USE_method': "filter",
    'USE_multiplier': 20,
    'stop_words': True,
    'perturb_words': 2,
    'candidates_per_word': 8,
    'total_alternatives': 14,
    'match_pos': True,
    'batch_size': 1,
    'verbose': False,
    'vote_avg_logits': True,
}

# Instantiate the FSE model on top of the base checkpoint.
model = FireBERT_FSE(
    load_from=model_dir,
    processor=MnliProcessor(),
    hparams=hparams,
)

INFO:absl:Using scratch/tf_cache to cache modules.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [8]:
# Score the FSE model on the MNLI 'dev' (validation) split.
# Here the examples returned by the processor are kept and passed to the model.
dataset, examples = model.get_processor().load_and_cache_examples(
    "data/MNLI",
    example_set='dev',
)
model.set_test_dataset(dataset, examples=examples)

# gpus=1 when CUDA is available, otherwise gpus=0.
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Last expression of the cell: show the metrics gathered by the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=9815.0, style=Prog…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.7145), 'avg_test_f1': tensor(0.7145)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.7145186066627502, 'avg_test_f1': 0.7145186066627502}

In [9]:
# Score the FSE model on the MNLI 'test' split.
dataset, examples = model.get_processor().load_and_cache_examples(
    "data/MNLI",
    example_set='test',
)
# Pass the examples by keyword, as the dev-split cell does, so the call does
# not depend on the positional order of set_test_dataset's parameters.
model.set_test_dataset(dataset, examples=examples)

# gpus=1 when CUDA is available, otherwise gpus=0.
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Last expression of the cell: show the metrics gathered by the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=9832.0, style=Prog…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.7250), 'avg_test_f1': tensor(0.7250)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.7249796390533447, 'avg_test_f1': 0.7249796390533447}

In [10]:
# Score the FSE model on the adversarial dev samples ('adv_dev' split).
dataset, examples = model.get_processor().load_and_cache_examples(
    "data/MNLI",
    example_set='adv_dev',
)
# Pass the examples by keyword, as the dev-split cell does, so the call does
# not depend on the positional order of set_test_dataset's parameters.
model.set_test_dataset(dataset, examples=examples)

# gpus=1 when CUDA is available, otherwise gpus=0.
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Last expression of the cell: show the metrics gathered by the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=7490.0, style=Prog…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.5893), 'avg_test_f1': tensor(0.5893)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.589319109916687, 'avg_test_f1': 0.589319109916687}

In [11]:
# Score the FSE model on the adversarial test samples ('adv_test' split).
dataset, examples = model.get_processor().load_and_cache_examples(
    "data/MNLI",
    example_set='adv_test',
)
# Pass the examples by keyword, as the dev-split cell does, so the call does
# not depend on the positional order of set_test_dataset's parameters.
model.set_test_dataset(dataset, examples=examples)

# gpus=1 when CUDA is available, otherwise gpus=0.
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Last expression of the cell: show the metrics gathered by the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=7574.0, style=Prog…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.5939), 'avg_test_f1': tensor(0.5939)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.5938737988471985, 'avg_test_f1': 0.5938737988471985}

## Co-tuned MNLI model from FACT

In [12]:
# Checkpoint of our co-tuned model.
model_dir = 'resources/models/co-tuned_MNLI_on_lightning_final_filter/pytorch_model.bin'

# The only hyperparameter set for this classifier is the batch size.
hparams = {'batch_size': 32}

# Build the classifier from the checkpoint, paired with the MNLI processor.
model = LightningBertForSequenceClassification(
    load_from=model_dir,
    processor=MnliProcessor(),
    hparams=hparams,
)

In [13]:
# Score the co-tuned model on the MNLI 'dev' (validation) split.
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/MNLI",
    example_set='dev',
)
model.set_test_dataset(dataset)

# gpus=1 when CUDA is available, otherwise gpus=0.
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Last expression of the cell: show the metrics gathered by the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=307.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.8240), 'avg_test_f1': tensor(0.8167)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.8240068554878235, 'avg_test_f1': 0.8167411088943481}

In [14]:
# Score the co-tuned model on the MNLI 'test' split.
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/MNLI",
    example_set='test',
)
model.set_test_dataset(dataset)

# gpus=1 when CUDA is available, otherwise gpus=0.
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Last expression of the cell: show the metrics gathered by the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=308.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.8273), 'avg_test_f1': tensor(0.8205)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.8273133039474487, 'avg_test_f1': 0.8205042481422424}

In [15]:
# Score the co-tuned model on the adversarial dev samples ('adv_dev' split).
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/MNLI",
    example_set='adv_dev',
)
model.set_test_dataset(dataset)

# gpus=1 when CUDA is available, otherwise gpus=0.
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Last expression of the cell: show the metrics gathered by the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=235.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.7846), 'avg_test_f1': tensor(0.7754)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.7845744490623474, 'avg_test_f1': 0.7753534913063049}

In [16]:
# Score the co-tuned model on the adversarial test samples ('adv_test' split).
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/MNLI",
    example_set='adv_test',
)
model.set_test_dataset(dataset)

# gpus=1 when CUDA is available, otherwise gpus=0.
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Last expression of the cell: show the metrics gathered by the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=237.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.8003), 'avg_test_f1': tensor(0.7908)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.8003212213516235, 'avg_test_f1': 0.7907801270484924}