In [1]:
# Make the parent directory ('../') importable by appending it to sys.path.
# Presumably this is the repository root, so that the later
# `from src.thermostat import Thermopack` can be resolved — confirm the
# notebook's working directory before relying on it.
import sys
sys.path.append('../')

# Load a Thermostat dataset using the Hugging Face datasets library

This will use the dataset script ("thermostat.py") in the "thermostat" directory.
In this example, we use the test set of the "imdb-bert-lgxa" configuration.
This refers to Layer Gradient x Activation (LGxA) explanations of the predictions by a BERT model that has been fine-tuned on the IMDb (train) dataset and evaluated on the IMDb test dataset.
In other words, we load the 25k examples of the IMDb test set together with the BERT predictions and the feature attributions from a Layer Gradient x Activation explainer.

In [7]:
from datasets import load_dataset

# Load the "test" split of the "imdb-bert-lgxa" configuration of the
# "thermostat" dataset. The result is bound to `data` and reused by later cells.
data = load_dataset("thermostat", "imdb-bert-lgxa", split="test")

Using the latest cached version of the module from /home/nfel/.cache/huggingface/modules/datasets_modules/datasets/thermostat/82ada9d63d3c6b421a4ade89adc656b856fe9924abbc5cc94f20d472f5c71e99 (last modified on Sat Jun 12 17:18:24 2021) since it couldn't be found locally at thermostat/thermostat.py or remotely (FileNotFoundError).
Reusing dataset thermostat (/home/nfel/.cache/huggingface/datasets/thermostat/imdb-bert-lgxa/1.0.0/82ada9d63d3c6b421a4ade89adc656b856fe9924abbc5cc94f20d472f5c71e99)


## Now let's inspect a single instance of the loaded dataset
Here, we will stick to the functionality that the datasets library already supplies us with.
For readability purposes, we will not print the whole content of that instance.
Instead, we show only the first few entries of the attributions and of the input_ids.

In [26]:
# `pprint` is not used in this cell; it is imported here for later cells.
from pprint import pprint

# Take the first example of the loaded split.
instance = data[0]

# Print the available field names, then a preview of each field.
# The two list-valued fields are sliced so the output stays short.
print(f'Keys: {instance.keys()}\n')
print(f'Index: {instance["idx"]}')
print(f'Input IDs (first 15): {instance["input_ids"][:15]}')
print(f'Attributions (first 4): {instance["attributions"][:4]}')
print(f'True label: {instance["label"]}')
print(f'Predictions (logits): {instance["predictions"]}')

Keys: dict_keys(['attributions', 'idx', 'input_ids', 'label', 'predictions'])

Index: 0
Input IDs (first 15): [101, 2092, 1010, 1045, 7166, 2000, 3422, 3152, 2005, 2028, 1997, 2093, 4436, 1012, 6854]
Attributions (first 4): [-0.18760254979133606, -0.0315956249833107, 0.04854373633861542, 0.00658783596009016]
True label: 1
Predictions (logits): [-3.4371631145477295, 4.042327404022217]


# Okay, but these are just numbers. Can we make this a bit more readable?
Of course! First, let's select a small subset using the datasets ".select" function:

In [41]:
# Keep only the rows at indices 0-19 of `data` (a 20-row subset).
lgxa_head = data.select(range(20))

Next, we can import the "Thermopack" class from our accompanying library. It inherits all properties from a Hugging Face Dataset, but also instantiates the tokenizer of the downstream model and automatically decodes the Input IDs to words.

In [42]:
from src.thermostat import Thermopack
# Wrap the 20-row subset in a Thermopack.
tp = Thermopack(lgxa_head)
# Printing the pack shows a short summary of dataset, model and explainer
# (see the recorded output below).
print(tp)

IMDb dataset, BERT model, Layer Gradient x Activation explanations
Explainer: LayerGradientXActivation
Model: textattack/bert-base-uncased-imdb
Dataset: imdb



In [48]:
# Pretty-print the public attributes of the Thermopack: every entry of its
# instance dict whose name does not begin with an underscore.
pprint(
    {
        attr: value
        for attr, value in vars(tp).items()
        if not attr.startswith('_')
    }
)

{'dataset': Dataset({
    features: ['label', 'idx', 'attributions', 'predictions', 'input_ids'],
    num_rows: 20
}),
 'dataset_name': 'imdb',
 'explainer_name': 'LayerGradientXActivation',
 'label_names': ['neg', 'pos'],
 'model_name': 'textattack/bert-base-uncased-imdb',
 'tokenizer': PreTrainedTokenizerFast(name_or_path='textattack/bert-base-uncased-imdb', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}),
 'units': [<src.thermostat.data.dataset_utils.Thermounit object at 0x7f4078574cd0>,
           <src.thermostat.data.dataset_utils.Thermounit object at 0x7f3f1ad97fd0>,
           <src.thermostat.data.dataset_utils.Thermounit object at 0x7f3f1ae3d4c0>,
           <src.thermostat.data.dataset_utils.Thermounit object at 0x7f3f1ae3d490>,
           <src.thermostat.data.dataset_utils.Thermounit object at 0x7f3f189c0100>,
           <sr

In [55]:
# Take the first element of the pack and pretty-print its public attributes,
# leaving out the 'heatmap', 'instance' and 'tokens' entries.
tu0 = tp[0]
pprint(
    {
        attr: value
        for attr, value in vars(tu0).items()
        if not (attr.startswith('_') or attr in ['heatmap', 'instance', 'tokens'])
    }
)

{'dataset_name': 'imdb',
 'explainer_name': 'LayerGradientXActivation',
 'index': 0,
 'model_name': 'textattack/bert-base-uncased-imdb',
 'predicted_label': {'index': 1, 'name': 'pos'},
 'text': 'well, i tend to watch films for one of three reasons. unfortunately, '
         'there are no transformers in this film, so i can recommend it only '
         'on comedy value and pretty women ( read girls ) < br / > < br / > '
         'yes, it is funny, i know this due to the number of people in the '
         'cinema who were laughing on a regular basis throughout. personally '
         'though, i loved it for laura fraser, who imho is fit!',
 'tokenizer': PreTrainedTokenizerFast(name_or_path='textattack/bert-base-uncased-imdb', vocab_size=30522, model_max_len=512, is_fast=True, padding_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}),
 'true_label': {'index': 1, 'name': 'pos'}}


In [56]:
# Render the first unit and keep the result in `heatmap`.
# `jupyter=True` presumably selects notebook-friendly output — confirm against
# the `render` implementation.
heatmap = tu0.render(jupyter=True)
# A bare expression on the last line makes the notebook display the result.
heatmap

# Compare heatmaps from different models

In [76]:
# Load the "mnli-electra-occ" and "mnli-bert-occ" configurations (test split),
# keep the rows at indices 0-19 of each, and wrap each subset in a Thermopack.
#
# The first configuration is "mnli-electra-occ", so its objects are named
# `electra*`. The previous `xlnet*` names did not match the loaded
# configuration; they are kept as aliases because later cells still refer to
# `xlnet`.
electra = load_dataset("thermostat", "mnli-electra-occ", split="test")
electra_head = electra.select(range(20))
tp_electra = Thermopack(electra_head)

# Backward-compatible aliases for the old, misleading names.
xlnet, xlnet_head, tp_xlnet = electra, electra_head, tp_electra

bert = load_dataset("thermostat", "mnli-bert-occ", split="test")
bert_head = bert.select(range(20))
tp_bert = Thermopack(bert_head)

Using the latest cached version of the module from /home/nfel/.cache/huggingface/modules/datasets_modules/datasets/thermostat/82ada9d63d3c6b421a4ade89adc656b856fe9924abbc5cc94f20d472f5c71e99 (last modified on Sat Jun 12 17:18:24 2021) since it couldn't be found locally at thermostat/thermostat.py or remotely (FileNotFoundError).


Downloading and preparing dataset thermostat/mnli-electra-occ (download: Unknown size, generated: Unknown size, post-processed: Unknown size, total: Unknown size) to /home/nfel/.cache/huggingface/datasets/thermostat/mnli-electra-occ/1.0.0/82ada9d63d3c6b421a4ade89adc656b856fe9924abbc5cc94f20d472f5c71e99...


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Downloading', layout=Layout(width='20px…




HBox(children=(FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0), HTML(value=''…

Dataset thermostat downloaded and prepared to /home/nfel/.cache/huggingface/datasets/thermostat/mnli-electra-occ/1.0.0/82ada9d63d3c6b421a4ade89adc656b856fe9924abbc5cc94f20d472f5c71e99. Subsequent calls will reuse this data.


HBox(children=(FloatProgress(value=0.0, description='Downloading', max=916.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=112.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=301.0, style=ProgressStyle(description_…




Using the latest cached version of the module from /home/nfel/.cache/huggingface/modules/datasets_modules/datasets/thermostat/82ada9d63d3c6b421a4ade89adc656b856fe9924abbc5cc94f20d472f5c71e99 (last modified on Sat Jun 12 17:18:24 2021) since it couldn't be found locally at thermostat/thermostat.py or remotely (FileNotFoundError).
Reusing dataset thermostat (/home/nfel/.cache/huggingface/datasets/thermostat/mnli-bert-occ/1.0.0/82ada9d63d3c6b421a4ade89adc656b856fe9924abbc5cc94f20d472f5c71e99)


In [80]:
# NOTE(review): despite its name, `xlnet` was loaded from the
# "mnli-electra-occ" configuration.
# This displays every input ID of its first instance; the long run of trailing
# 0s in the recorded output is presumably padding — confirm.
xlnet[0]['input_ids']

[101,
 1999,
 1996,
 2168,
 3277,
 1010,
 1037,
 6254,
 4709,
 4106,
 4953,
 1996,
 2833,
 1998,
 4319,
 3447,
 1005,
 1055,
 7360,
 2058,
 19332,
 10196,
 1011,
 4820,
 15001,
 1998,
 5610,
 3238,
 9098,
 3688,
 2001,
 2405,
 1998,
 7928,
 2020,
 7303,
 1012,
 102,
 1037,
 6254,
 2001,
 2405,
 2055,
 1996,
 17473,
 1005,
 1055,
 7360,
 2058,
 15001,
 1998,
 7928,
 2020,
 2025,
 3039,
 1012,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,

In [79]:
# Display every input ID of the first instance of the "mnli-bert-occ" split.
# NOTE(review): the recorded token IDs differ from those shown for `xlnet[0]`,
# so index 0 does not appear to be the same example in both datasets — confirm
# before comparing heatmaps by position.
bert[0]['input_ids']

[101,
 2021,
 3728,
 1010,
 1996,
 12143,
 2038,
 26588,
 1012,
 102,
 1996,
 12143,
 2038,
 2042,
 4359,
 3728,
 1012,
 102,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0