In [1]:
import torch
import pytorch_lightning as pl

from processors import ImdbProcessor
from bert_base_model import LightningBertForSequenceClassification
from firebert_fve import FireBERT_FVE
from firebert_fse import FireBERT_FSE

Instructions for updating:
non-resource variables are not supported in the long term


## Base IMDB model from the paper

In [2]:
# Path to the weights of the base model tuned on IMDB.
model_dir = 'resources/models/IMDB/pytorch_model.bin'

# Hyperparameters: only the batch size is set for this model.
hparams = dict(batch_size=32)

# Build the classifier from the stored weights, using the IMDB processor.
model = LightningBertForSequenceClassification(
    load_from=model_dir,
    processor=ImdbProcessor(),
    hparams=hparams,
)

In [3]:
# Evaluate the loaded model on the IMDB validation ('dev') split.
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/IMDB",
    example_set='dev',
)
model.set_test_dataset(dataset)

# Request one GPU when CUDA is available, otherwise none (gpus=0).
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Cell output: the trainer's tqdm_metrics after the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=157.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.9021), 'avg_test_f1': tensor(0.8991)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.9020700454711914, 'avg_test_f1': 0.899067759513855}

In [4]:
# Evaluate the loaded model on the IMDB 'test' split.
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/IMDB",
    example_set='test',
)
model.set_test_dataset(dataset)

# Request one GPU when CUDA is available, otherwise none (gpus=0).
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Cell output: the trainer's tqdm_metrics after the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=157.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.9064), 'avg_test_f1': tensor(0.9035)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.906449019908905, 'avg_test_f1': 0.9035440683364868}

In [5]:
# Evaluate the base model on the adversarial dev samples ('adv_dev').
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/IMDB",
    example_set='adv_dev',
)
model.set_test_dataset(dataset)

# Request one GPU when CUDA is available, otherwise none (gpus=0).
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Cell output: the trainer's tqdm_metrics after the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=130.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.0046), 'avg_test_f1': tensor(0.0044)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.004567307885736227, 'avg_test_f1': 0.004415193106979132}

In [6]:
# Evaluate the base model on the adversarial test samples ('adv_test').
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/IMDB",
    example_set='adv_test',
)
model.set_test_dataset(dataset)

# Request one GPU when CUDA is available, otherwise none (gpus=0).
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Cell output: the trainer's tqdm_metrics after the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=129.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.0022), 'avg_test_f1': tensor(0.0021)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.0021802326664328575, 'avg_test_f1': 0.002114164875820279}

## IMDB model tuned with PyTorch Lightning

In [7]:
# Path to the weights of our own version of the model tuned on the IMDB task.
model_dir = 'resources/models/IMDB_on_lightning/pytorch_model.bin'

# Hyperparameters: only the batch size is set for this model.
hparams = dict(batch_size=32)

# Build the classifier from the stored weights, using the IMDB processor.
model = LightningBertForSequenceClassification(
    load_from=model_dir,
    processor=ImdbProcessor(),
    hparams=hparams,
)

In [8]:
# Evaluate the loaded model on the IMDB validation ('dev') split.
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/IMDB",
    example_set='dev',
)
model.set_test_dataset(dataset)

# Request one GPU when CUDA is available, otherwise none (gpus=0).
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Cell output: the trainer's tqdm_metrics after the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=157.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.9086), 'avg_test_f1': tensor(0.9066)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.9086385369300842, 'avg_test_f1': 0.9065592288970947}

In [9]:
# Evaluate the loaded model on the IMDB 'test' split.
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/IMDB",
    example_set='test',
)
model.set_test_dataset(dataset)

# Request one GPU when CUDA is available, otherwise none (gpus=0).
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Cell output: the trainer's tqdm_metrics after the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=157.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.9045), 'avg_test_f1': tensor(0.9015)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.9044585824012756, 'avg_test_f1': 0.9014690518379211}

In [10]:
# Evaluate the loaded model on the adversarial dev samples ('adv_dev').
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/IMDB",
    example_set='adv_dev',
)
model.set_test_dataset(dataset)

# Request one GPU when CUDA is available, otherwise none (gpus=0).
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Cell output: the trainer's tqdm_metrics after the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=130.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.8215), 'avg_test_f1': tensor(0.8136)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.8215236663818359, 'avg_test_f1': 0.8135903477668762}

In [11]:
# Evaluate the loaded model on the adversarial test samples ('adv_test').
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/IMDB",
    example_set='adv_test',
)
model.set_test_dataset(dataset)

# Request one GPU when CUDA is available, otherwise none (gpus=0).
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Cell output: the trainer's tqdm_metrics after the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=129.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.8266), 'avg_test_f1': tensor(0.8164)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.8265503644943237, 'avg_test_f1': 0.8163754343986511}

## FIVE on base IMDB model

In [12]:
# Path to the weights of the base IMDB model. This is the same checkpoint
# used for the base-model evaluation, not a co-tuned one.
model_dir = 'resources/models/IMDB/pytorch_model.bin'

# Hyperparameters passed to FireBERT_FVE.
# NOTE(review): the meaning of 'std' and 'vector_count' is defined in
# firebert_fve, which is not visible here — confirm before changing them.
hparams = {
    'batch_size': 8,
    'use_USE': False,
    'stop_words': True,
    'perturb_words': 1,
    'verbose': False,
    'vote_avg_logits': True,
    'std': 2.29,
    'vector_count': 10,
}

# Build the FVE model from the base weights, using the IMDB processor.
model = FireBERT_FVE(
    load_from=model_dir,
    processor=ImdbProcessor(),
    hparams=hparams,
)

In [13]:
# Evaluate the loaded model on the IMDB validation ('dev') split.
# The examples returned alongside the dataset are handed to the model too.
dataset, examples = model.get_processor().load_and_cache_examples(
    "data/IMDB",
    example_set='dev',
)
model.set_test_dataset(dataset, examples=examples)

# Request one GPU when CUDA is available, otherwise none (gpus=0).
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Cell output: the trainer's tqdm_metrics after the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=625.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.8958), 'avg_test_f1': tensor(0.8796)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.895799994468689, 'avg_test_f1': 0.879625678062439}

In [14]:
# Evaluate the loaded model on the IMDB 'test' split.
dataset, examples = model.get_processor().load_and_cache_examples(
    "data/IMDB",
    example_set='test',
)
# Pass the examples by keyword so the call does not depend on parameter order.
model.set_test_dataset(dataset, examples=examples)

# Request one GPU when CUDA is available, otherwise none (gpus=0).
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Cell output: the trainer's tqdm_metrics after the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=625.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.8840), 'avg_test_f1': tensor(0.8669)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.8840000033378601, 'avg_test_f1': 0.8669165968894958}

In [15]:
# Evaluate the loaded model on the adversarial dev samples ('adv_dev').
dataset, examples = model.get_processor().load_and_cache_examples(
    "data/IMDB",
    example_set='adv_dev',
)
# Pass the examples by keyword so the call does not depend on parameter order.
model.set_test_dataset(dataset, examples=examples)

# Request one GPU when CUDA is available, otherwise none (gpus=0).
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Cell output: the trainer's tqdm_metrics after the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=518.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.6238), 'avg_test_f1': tensor(0.5933)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.6237934231758118, 'avg_test_f1': 0.5932821035385132}

In [16]:
# Evaluate the loaded model on the adversarial test samples ('adv_test').
dataset, examples = model.get_processor().load_and_cache_examples(
    "data/IMDB",
    example_set='adv_test',
)
# Pass the examples by keyword so the call does not depend on parameter order.
model.set_test_dataset(dataset, examples=examples)

# Request one GPU when CUDA is available, otherwise none (gpus=0).
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Cell output: the trainer's tqdm_metrics after the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=513.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.6202), 'avg_test_f1': tensor(0.5862)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.620224118232727, 'avg_test_f1': 0.5862066745758057}

## FuSE on base IMDB model

In [17]:
# Path to the weights of the base IMDB model. This is the same checkpoint
# used for the base-model evaluation, not a co-tuned one.
model_dir = 'resources/models/IMDB/pytorch_model.bin'

# Hyperparameters passed to FireBERT_FSE.
# NOTE(review): the semantics of these settings are defined in firebert_fse,
# which is not visible here — confirm before changing them.
hparams = {
    'use_USE': True,
    'USE_method': "filter",
    'USE_multiplier': 17,
    'stop_words': True,
    'perturb_words': 3,
    'candidates_per_word': 13,
    'total_alternatives': 12,
    'match_pos': True,
    'batch_size': 1,
    'verbose': False,
    'vote_avg_logits': True,
}

# Build the FSE model from the base weights, using the IMDB processor.
model = FireBERT_FSE(
    load_from=model_dir,
    processor=ImdbProcessor(),
    hparams=hparams,
)

INFO:absl:Using scratch/tf_cache to cache modules.


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


INFO:tensorflow:Saver not created because there are no variables in the graph to restore


In [18]:
# Evaluate the loaded model on the IMDB validation ('dev') split.
# NOTE(review): the saved output of this cell reports avg_test_acc of about
# 0.497, far below the roughly 0.90 the same checkpoint reaches when evaluated
# without FSE — confirm this is expected before relying on the number.
dataset, examples = model.get_processor().load_and_cache_examples(
    "data/IMDB",
    example_set='dev',
)
model.set_test_dataset(dataset, examples=examples)

# Request one GPU when CUDA is available, otherwise none (gpus=0).
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Cell output: the trainer's tqdm_metrics after the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=5000.0, style=Prog…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.4974), 'avg_test_f1': tensor(0.4974)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.4973999857902527, 'avg_test_f1': 0.4973999857902527}

In [19]:
# Evaluate the loaded model on the IMDB 'test' split.
# NOTE(review): the saved output of this cell reports avg_test_acc of about
# 0.503, far below the roughly 0.91 the same checkpoint reaches when evaluated
# without FSE — confirm this is expected before relying on the number.
dataset, examples = model.get_processor().load_and_cache_examples(
    "data/IMDB",
    example_set='test',
)
# Pass the examples by keyword so the call does not depend on parameter order.
model.set_test_dataset(dataset, examples=examples)

# Request one GPU when CUDA is available, otherwise none (gpus=0).
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Cell output: the trainer's tqdm_metrics after the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=5000.0, style=Prog…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.5030), 'avg_test_f1': tensor(0.5030)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.503000020980835, 'avg_test_f1': 0.503000020980835}

In [20]:
# Evaluate the loaded model on the adversarial dev samples ('adv_dev').
dataset, examples = model.get_processor().load_and_cache_examples(
    "data/IMDB",
    example_set='adv_dev',
)
# Pass the examples by keyword so the call does not depend on parameter order.
model.set_test_dataset(dataset, examples=examples)

# Request one GPU when CUDA is available, otherwise none (gpus=0).
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Cell output: the trainer's tqdm_metrics after the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=4141.0, style=Prog…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.7687), 'avg_test_f1': tensor(0.7687)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.7686549425125122, 'avg_test_f1': 0.7686549425125122}

In [21]:
# Evaluate the loaded model on the adversarial test samples ('adv_test').
dataset, examples = model.get_processor().load_and_cache_examples(
    "data/IMDB",
    example_set='adv_test',
)
# Pass the examples by keyword so the call does not depend on parameter order.
model.set_test_dataset(dataset, examples=examples)

# Request one GPU when CUDA is available, otherwise none (gpus=0).
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Cell output: the trainer's tqdm_metrics after the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=4101.0, style=Prog…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.7744), 'avg_test_f1': tensor(0.7744)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.7744452357292175, 'avg_test_f1': 0.7744452357292175}

## Co-tuned IMDB model from FACT

In [22]:
# Path to the weights of our co-tuned IMDB model.
model_dir = 'resources/models/co-tuned_IMDB_on_lightning_final_filter/pytorch_model.bin'

# Hyperparameters: only the batch size is set for this model.
hparams = dict(batch_size=32)

# Build the classifier from the stored weights, using the IMDB processor.
model = LightningBertForSequenceClassification(
    load_from=model_dir,
    processor=ImdbProcessor(),
    hparams=hparams,
)

In [23]:
# Evaluate the loaded model on the IMDB validation ('dev') split.
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/IMDB",
    example_set='dev',
)
model.set_test_dataset(dataset)

# Request one GPU when CUDA is available, otherwise none (gpus=0).
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Cell output: the trainer's tqdm_metrics after the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=157.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.9009), 'avg_test_f1': tensor(0.8979)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.9008758068084717, 'avg_test_f1': 0.8978500962257385}

In [24]:
# Evaluate the loaded model on the IMDB 'test' split.
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/IMDB",
    example_set='test',
)
model.set_test_dataset(dataset)

# Request one GPU when CUDA is available, otherwise none (gpus=0).
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Cell output: the trainer's tqdm_metrics after the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=157.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.9001), 'avg_test_f1': tensor(0.8968)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.900079607963562, 'avg_test_f1': 0.8968209624290466}

In [25]:
# Evaluate the loaded model on the adversarial dev samples ('adv_dev').
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/IMDB",
    example_set='adv_dev',
)
model.set_test_dataset(dataset)

# Request one GPU when CUDA is available, otherwise none (gpus=0).
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Cell output: the trainer's tqdm_metrics after the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=130.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.8832), 'avg_test_f1': tensor(0.8801)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.8831730484962463, 'avg_test_f1': 0.8800737857818604}

In [26]:
# Evaluate the loaded model on the adversarial test samples ('adv_test').
dataset, _ = model.get_processor().load_and_cache_examples(
    "data/IMDB",
    example_set='adv_test',
)
model.set_test_dataset(dataset)

# Request one GPU when CUDA is available, otherwise none (gpus=0).
trainer = pl.Trainer(gpus=int(torch.cuda.is_available()))
trainer.test(model)

# Cell output: the trainer's tqdm_metrics after the test run.
trainer.tqdm_metrics

HBox(children=(FloatProgress(value=0.0, description='Testing', layout=Layout(flex='2'), max=129.0, style=Progr…

----------------------------------------------------------------------------------------------------
TEST RESULTS
{'avg_test_acc': tensor(0.8721), 'avg_test_f1': tensor(0.8666)}
----------------------------------------------------------------------------------------------------



{'avg_test_acc': 0.8721414804458618, 'avg_test_f1': 0.8666198253631592}