In [1]:
import torch

from transformers import AutoTokenizer, GenerationConfig
from models.bart_extractor import BartExtractor, ExtractedFactLoss
from dataset.msc_summary_turns import MSC_Turns
from dataset.msc_summary import MSC_Summaries
from metrics.terp import TerpMetric
from metrics.nli import NLIMetric

import utils.logging as logging

In [2]:
# Emit SPAM-level log messages (e.g. the "Generate: ..." traces of the model).
logging.set_log_level(logging.SPAM)

# Settings for dataset
datadir = '/Users/FrankVerhoef/Programming/PEX/data/'
basedir = 'msc/msc_personasummary/'
bart_base = 'facebook/bart-base'
sessions = [1]
len_context = 2
speaker_prefixes = ["<other>", "<self>"]
nofact_token = '' #'<nofact>'
add_tokens = None #speaker_prefixes + [nofact_token]
test_samples = 20
subset = 'train'

# config for TerpMetric
JAVA_HOME = "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk/Contents/Home"
TERPDIR = "/Users/FrankVerhoef/Programming/terp/"
TMPDIR = "/Users/FrankVerhoef/Programming/PEX/output/"
TerpMetric.set(terp_dir=TERPDIR, java_home=JAVA_HOME, tmp_dir=TMPDIR)

# Settings for model
checkpoint_dir = '/Users/FrankVerhoef/Programming/PEX/checkpoints/'
load = 'trained_nll05_bart'

# Setup
tokenizer = AutoTokenizer.from_pretrained(bart_base)
if add_tokens is not None:
    num_added_toks = tokenizer.add_tokens(add_tokens)
# With an empty nofact_token the EOS token id doubles as the "no fact" marker.
nofact_token_id = tokenizer.convert_tokens_to_ids(nofact_token) if nofact_token != '' else tokenizer.eos_token_id
assert nofact_token_id != tokenizer.unk_token_id, "nofact_token '{}' must be known token".format(nofact_token)

model = BartExtractor(bart_base=bart_base, nofact_token_id=nofact_token_id)
# Keep the embedding matrix in sync with the (possibly extended) tokenizer vocabulary.
model.bart.resize_token_embeddings(len(tokenizer))
# This notebook only runs inference: switch every submodule to eval mode so
# dropout (0.1 in the BART config) cannot perturb generation or metrics.
model.eval()

MSC_Turns.set(tokenizer=tokenizer, len_context=len_context, speaker_prefixes=speaker_prefixes, nofact_token=nofact_token)
msc_turns = MSC_Turns(basedir=datadir + basedir, sessions=sessions, subset=subset, max_samples=10)

logging.info("Loading model from {}".format(checkpoint_dir + load))
# NOTE(review): torch.load unpickles the file, so only load trusted checkpoints;
# consider weights_only=True if the installed torch version supports it.
# Kept as the last expression so the cell still displays the key-matching result.
model.load_state_dict(torch.load(checkpoint_dir + load, map_location=torch.device('cpu')))

2023-09-02 20:28:48,035 INFO     | Loading model from /Users/FrankVerhoef/Programming/PEX/checkpoints/trained_nll05_bart


<All keys matched successfully>

In [3]:
# Display the configuration object of the wrapped BART model.
model.bart.config

BartConfig {
  "_name_or_path": "facebook/bart-base",
  "activation_dropout": 0.1,
  "activation_function": "gelu",
  "add_bias_logits": false,
  "add_final_layer_norm": false,
  "architectures": [
    "BartModel"
  ],
  "attention_dropout": 0.1,
  "bos_token_id": 0,
  "classif_dropout": 0.1,
  "classifier_dropout": 0.0,
  "d_model": 768,
  "decoder_attention_heads": 12,
  "decoder_ffn_dim": 3072,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 6,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "early_stopping": true,
  "encoder_attention_heads": 12,
  "encoder_ffn_dim": 3072,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 6,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "id2label": {
    "0": "LABEL_0",
    "1": "LABEL_1",
    "2": "LABEL_2"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "LABEL_0": 0,
    "LABEL_1": 1,
    "LABEL_2": 2
  },
  "max_position_embeddings": 1024,
  "model_ty

In [4]:
# Display the default generation settings attached to the wrapped BART model.
model.bart.generation_config

GenerationConfig {
  "_from_model_config": true,
  "bos_token_id": 0,
  "decoder_start_token_id": 2,
  "early_stopping": true,
  "eos_token_id": 2,
  "forced_bos_token_id": 0,
  "forced_eos_token_id": 2,
  "no_repeat_ngram_size": 3,
  "num_beams": 4,
  "pad_token_id": 1,
  "transformers_version": "4.27.4"
}

In [5]:
# Print the first three samples of the turn-level dataset.
for i in (0, 1, 2):
    print(msc_turns[i])

("<self>Really? Do you do anything to assist you with coming out?\n<other>I've tried bringing it up before, but something always gets in the way.", '')
('<self>Flying is really fun, but clouds are amazing to look at\n<other>I agree. I like having a picnic and looking at the cloud shapes.', 'I enjoy picnics.')
("<self>Oh, what is that? I'm not sure I know what you are talking about?\n<other>Ll, I'm off of alot of coffee. Sorry. I can tell this will be interesting", 'I drink a lot of coffee.')


In [6]:
# TerpMetric is already configured in the setup cell with identical values,
# so that configuration is not repeated here.

# set config for NLIMetric
NLIMetric.set(nli_model='facebook/bart-large-mnli', device='cpu', batch_size=8)

# Keyword arguments forwarded to msc_turns.evaluate().
eval_kwargs = {
    'generation_config': {
        "max_new_tokens": 20,
    },
    'device': 'cpu', 
    'log_interval': 10
}

# Report the subset that is actually evaluated instead of a fixed "testdata" label.
logging.info("Evaluating model on {} samples of {} data in {} with arguments {}".format(len(msc_turns), subset, basedir, eval_kwargs))
eval_stats, result_dict = msc_turns.evaluate(model, **eval_kwargs)

2023-09-02 20:28:52,410 INFO     | Evaluating model on 10 samples of testdata in msc/msc_personasummary/ with arguments {'generation_config': {'max_new_tokens': 20}, 'device': 'cpu', 'log_interval': 10}
2023-09-02 20:28:53,295 SPAM     | Generate: pred_fact=tensor([False])
2023-09-02 20:28:53,296 SPAM     | Generate: gen_out=tensor([[2, 0, 2]])
context:     <self>Really? Do you do anything to assist you with coming out?
<other>I've tried bringing it up before, but something always gets in the way.
target:      
prediction:  
----------------------------------------
2023-09-02 20:28:54,023 SPAM     | Generate: pred_fact=tensor([True])
2023-09-02 20:28:54,024 SPAM     | Generate: gen_out=tensor([[    2,     0,   100,   101,   519,    10, 22297,     4,     2]])
context:     <self>Flying is really fun, but clouds are amazing to look at
<other>I agree. I like having a picnic and looking at the cloud shapes.
target:      I enjoy picnics.
prediction:  I like having a picnic.
-----------------

In [7]:
# Display the evaluation statistics currently bound to `eval_stats`.
eval_stats

{'acc': 0.8999999761581421,
 'f1': 0.9230769276618958,
 'precision': 0.8571428656578064,
 'recall': 1.0,
 'cm': [[3, 1], [0, 6]],
 'nli_predictions': 0.7534764965031562,
 'nli_targets': 0.9005989134311676,
 'nli_preds_to_targets': 0.5029634467937285,
 'nli_targets_to_preds': 0.8067612039546171,
 'numwords_factor': 0.9877344877344877,
 'bleu_2': 0.4276726543903351,
 'bleu_4': 0.31776177883148193,
 'terp': 0.5354940493901571,
 'rouge1_fmeasure': 0.5622718930244446,
 'rouge1_precision': 0.6261904835700989,
 'rouge1_recall': 0.5313853025436401,
 'rouge2_fmeasure': 0.3629629611968994,
 'rouge2_precision': 0.42499998211860657,
 'rouge2_recall': 0.32777777314186096,
 'rougeL_fmeasure': 0.5426640510559082,
 'rougeL_precision': 0.5984126925468445,
 'rougeL_recall': 0.5162338018417358}

In [8]:
# Configure the summary dataset class with the shared tokenizer settings,
# then load a small slice of the session-1 test split.
MSC_Summaries.set(
    tokenizer=tokenizer,
    speaker_prefixes=speaker_prefixes,
    nofact_token=nofact_token,
)
msc_summaries = MSC_Summaries(basedir=datadir + basedir, session=1, subset="test", max_samples=5)  # alternatively: test_samples

In [9]:
# Print the first three samples of the summary dataset.
for i in (0, 1, 2):
    print(msc_summaries[i])

(['<self>Hi how are you?\n<other>Great, yourself?', "<self>Good, do you like to paint? I do!\n<other>I'm not much of a painter, I prefer horses. What do you paint?", '<self>People playing music since I like to do that as well!\n<other>My ancestor had a song written about him. He was an american civil war general.', '<self>Wow that is really interesting.\n<other>Do you like fruit? I love to eat fruit.', "<self>I do like fruit! It's not expensive and it helps me to live within a budget.\n<other>Budgets are important! Especially when dealing with such large animals as horses.", "<self>Oh I'm sure. I love animals, like people!\n<other>I like people, unlike my father. He rarely smiles which makes socializing difficult.", "<self>Sounds like my boss. I wish I didn't have one.\n<other>Oh, what do you do? Bosses can be tough to deal with."], "I like horses.\nI'm descended from a civil war general.\nI love fruit.")
(['<self>Hello, just got back from fishing, how are you?\n<other>Fine. I love to 

In [10]:
# Evaluate the model on the summary dataset, restricted to the 'ter' metric.
eval_kwargs = dict(
    metrics=['ter'],
    device='cpu',
    log_interval=10,
    generation_config=dict(max_new_tokens=20),
)
eval_stats = msc_summaries.evaluate(model, **eval_kwargs)

2023-09-02 20:29:06,466 INFO     | Start evaluation of model BartExtractor on metrics: ter
2023-09-02 20:29:09,727 SPAM     | Generate: pred_fact=tensor([False,  True,  True,  True, False,  True, False])
2023-09-02 20:29:09,728 SPAM     | Generate: gen_out=tensor([[    2,     0,     2,     1,     1,     1,     1,     1,     1,     1,
             1,     1],
        [    2,     0,   100,   101,  8087,     4,     2,     1,     1,     1,
             1,     1],
        [    2,     0,  2387, 40701,    56,    10,  2214,  1982,    59,   123,
             4,     2],
        [    2,     0,   100,   657,  6231,     4,     2,     1,     1,     1,
             1,     1],
        [    2,     0,     2,     1,     1,     1,     1,     1,     1,     1,
             1,     1],
        [    2,     0,   100,   218,    75,   101,    82,     4,     2,     1,
             1,     1],
        [    2,     0,     2,     1,     1,     1,     1,     1,     1,     1,
             1,     1]])
Utterances: 
	<self>H

In [11]:
# Tokenize variants of the same phrase that differ only in the whitespace
# around a newline, plus the literal string "<nofact>", and show the tokens.
s1, s2, s3 = "I go there", "I go\nthere", "I go\n there"
s4, s5, s6 = "I go \nthere", "I go \n there", "<nofact>"
encoded_utterances = tokenizer(
    text=[s1, s2, s3, s4, s5, s6],
    return_tensors='pt',
    padding=True,
)
print(encoded_utterances)
for enc in encoded_utterances['input_ids']:
    print(tokenizer.convert_ids_to_tokens(enc))


{'input_ids': tensor([[    0,   100,   213,    89,     2,     1,     1],
        [    0,   100,   213, 50118,  8585,     2,     1],
        [    0,   100,   213, 50118,    89,     2,     1],
        [    0,   100,   213,  1437, 50118,  8585,     2],
        [    0,   100,   213,  1437, 50118,    89,     2],
        [    0, 41552,   282,  1116,  7257, 15698,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1]])}
['<s>', 'I', 'Ġgo', 'Ġthere', '</s>', '<pad>', '<pad>']
['<s>', 'I', 'Ġgo', 'Ċ', 'there', '</s>', '<pad>']
['<s>', 'I', 'Ġgo', 'Ċ', 'Ġthere', '</s>', '<pad>']
['<s>', 'I', 'Ġgo', 'Ġ', 'Ċ', 'there', '</s>']
['<s>', 'I', 'Ġgo', 'Ġ', 'Ċ', 'Ġthere', '</s>']
['<s>', '<', 'n', 'of', 'act', '>', '</s>']


In [12]:
# Tokenize variants of a two-turn utterance that differ in speaker prefixes
# and in the whitespace/newline between the turns, and show the tokens.
s0, s0b = "I go there\nWhy", "I go there Why"
s1, s2 = "<self>I go there <other>Why", "<self>I go there\n<other>Why"
s3, s4 = "<self>I go there\n<other> Why", "<self> I go there\n <other> Why"
s5 = "<self>I go there \n <other> Why"
encoded_utterances = tokenizer(
    text=[s0, s0b, s1, s2, s3, s4, s5],
    return_tensors='pt',
    padding=True,
)
print(encoded_utterances)
for enc in encoded_utterances['input_ids']:
    print(tokenizer.convert_ids_to_tokens(enc))


{'input_ids': tensor([[    0,   100,   213,    89, 50118,  7608,     2,     1,     1,     1,
             1,     1,     1,     1],
        [    0,   100,   213,    89,  2612,     2,     1,     1,     1,     1,
             1,     1,     1,     1],
        [    0, 41552, 13367, 15698,   100,   213,    89, 28696,  7443, 15698,
          7608,     2,     1,     1],
        [    0, 41552, 13367, 15698,   100,   213,    89, 50118, 41552,  7443,
         15698,  7608,     2,     1],
        [    0, 41552, 13367, 15698,   100,   213,    89, 50118, 41552,  7443,
         15698,  2612,     2,     1],
        [    0, 41552, 13367, 15698,    38,   213,    89, 50118, 28696,  7443,
         15698,  2612,     2,     1],
        [    0, 41552, 13367, 15698,   100,   213,    89,  1437, 50118, 28696,
          7443, 15698,  2612,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [13]:
# Show how plain whitespace splitting segments three of the variants.
print(s1.split(), s2.split(), s5.split(), sep="\n")

['<self>I', 'go', 'there', '<other>Why']
['<self>I', 'go', 'there', '<other>Why']
['<self>I', 'go', 'there', '<other>', 'Why']


In [14]:
# Two short <self>/<other> exchanges of different length, tokenized as one
# padded batch; the batch is displayed as the cell output.
s1 = (
    "<self> Are you settling in the city at all "
    "or do you still really miss the country? "
    "<other> I am settling in, but I really miss it."
)
s2 = (
    "<self> Are you settling in the city? "
    "<other> No, I really miss it."
)
encoded_utterances = tokenizer(
    text=[s1, s2],
    return_tensors='pt',
    padding=True,
)
encoded_utterances


{'input_ids': tensor([[    0, 41552, 13367, 15698,  3945,    47, 15433,    11,     5,   343,
            23,    70,    50,   109,    47,   202,   269,  2649,     5,   247,
           116, 28696,  7443, 15698,    38,   524, 15433,    11,     6,    53,
            38,   269,  2649,    24,     4,     2],
        [    0, 41552, 13367, 15698,  3945,    47, 15433,    11,     5,   343,
           116, 28696,  7443, 15698,   440,     6,    38,   269,  2649,    24,
             4,     2,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [15]:
# Append extra padding columns to the tokenized batch and generate from the
# padded ids without passing an attention mask. The pad id and batch size are
# taken from the tokenizer / batch instead of being hard-coded as 1 and 2.
n_extra_pad = 10
n_rows = encoded_utterances['input_ids'].shape[0]
input_ids = torch.cat(
    [
        encoded_utterances['input_ids'],
        torch.full((n_rows, n_extra_pad), tokenizer.pad_token_id, dtype=torch.long),
    ],
    dim=1,
)
# Matching mask for the padded ids (zeros over the extra padding); currently
# not passed to generate, see the commented-out argument below.
attn_mask = torch.cat(
    [
        encoded_utterances['attention_mask'],
        torch.zeros((n_rows, n_extra_pad), dtype=torch.long),
    ],
    dim=1,
)
pred_tokens_2 = model.generate(
    input_ids=input_ids.to('cpu'), 
    # attention_mask=attn_mask.to('cpu'),    # attention_mask is not necessary, is defined within the generate function
    min_length=2,
    max_new_tokens=20, 
    num_beams=1,
    do_sample=False,
    # De-duplicate: eos and nofact ids may coincide.
    forced_eos_token_id=list(set([tokenizer.eos_token_id, model.nofact_token_id]))
)
tokenizer.batch_decode(pred_tokens_2)

2023-09-02 20:29:23,258 SPAM     | Generate: pred_fact=tensor([True, True])
2023-09-02 20:29:23,259 SPAM     | Generate: gen_out=tensor([[    2,     0,   100,   524, 15433,    11,     5,   343,     4,     2],
        [    2,     0,   100,   524,  1375,     7,     5,   343,     4,     2]])


['</s><s>I am settling in the city.</s>',
 '</s><s>I am moving to the city.</s>']

In [16]:
# Inspect only the token ids of the current `encoded_utterances` batch.
encoded_utterances['input_ids']

tensor([[    0, 41552, 13367, 15698,  3945,    47, 15433,    11,     5,   343,
            23,    70,    50,   109,    47,   202,   269,  2649,     5,   247,
           116, 28696,  7443, 15698,    38,   524, 15433,    11,     6,    53,
            38,   269,  2649,    24,     4,     2],
        [    0, 41552, 13367, 15698,  3945,    47, 15433,    11,     5,   343,
           116, 28696,  7443, 15698,   440,     6,    38,   269,  2649,    24,
             4,     2,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1]])

In [17]:
# Inspect the full tokenizer output (token ids and attention mask).
encoded_utterances

{'input_ids': tensor([[    0, 41552, 13367, 15698,  3945,    47, 15433,    11,     5,   343,
            23,    70,    50,   109,    47,   202,   269,  2649,     5,   247,
           116, 28696,  7443, 15698,    38,   524, 15433,    11,     6,    53,
            38,   269,  2649,    24,     4,     2],
        [    0, 41552, 13367, 15698,  3945,    47, 15433,    11,     5,   343,
           116, 28696,  7443, 15698,   440,     6,    38,   269,  2649,    24,
             4,     2,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [18]:
# Greedy generation from the tokenized batch; every tensor of the tokenizer
# output (input_ids and attention_mask) is forwarded as a keyword argument.
pred_tokens = model.generate(
    **{name: tensor.to('cpu') for name, tensor in encoded_utterances.items()},
    min_length=2,
    max_new_tokens=20,
    num_beams=1,
    do_sample=False,
)

2023-09-02 20:29:23,811 SPAM     | Generate: pred_fact=tensor([True, True])
2023-09-02 20:29:23,811 SPAM     | Generate: gen_out=tensor([[    2,     0,   100,   524, 15433,    11,     5,   343,     4,     2],
        [    2,     0,   100,   524,  1375,     7,     5,   343,     4,     2]])


In [19]:
# Display the generated token ids.
pred_tokens

tensor([[    2,     0,   100,   524, 15433,    11,     5,   343,     4,     2],
        [    2,     0,   100,   524,  1375,     7,     5,   343,     4,     2]])

In [20]:
# Decode each generated sequence back to text (special tokens are kept).
[tokenizer.decode(token_ids) for token_ids in pred_tokens]

['</s><s>I am settling in the city.</s>',
 '</s><s>I am moving to the city.</s>']

In [21]:
# Display the padding token id stored in the BART config.
model.bart.config.pad_token_id

1

In [22]:
# Loss object used in the experiments below.
criterion = ExtractedFactLoss(
    nofact_token_id=nofact_token_id,
    ignore_index=-100,
    lm_weight=0.5,
)

In [23]:
# Random stand-ins for model scores `p` (shape 2x3x8) and target ids `t`
# (shape 2x3). A dedicated seeded generator makes the values reproducible on
# every run without touching torch's global RNG state.
demo_rng = torch.Generator().manual_seed(0)
p = torch.randint(0, 100, (2, 3, 8), generator=demo_rng).float()
# NOTE(review): the upper bound is exclusive, so index 7 of the last dimension
# of `p` is never used as a target — confirm that this is intended.
t = torch.randint(0, 7, (2, 3), generator=demo_rng)

In [24]:
# Display the current score tensor and target tensor side by side.
p, t

(tensor([[[67., 35., 31., 28., 79., 76., 92., 83.],
          [ 0., 45., 21., 48., 90., 17., 44., 97.],
          [12., 38., 22., 20., 90., 50., 72., 65.]],
 
         [[46., 32., 10., 43., 92., 96., 97., 59.],
          [52.,  1., 26., 36., 69.,  7.,  2., 58.],
          [80., 61., 45., 27., 20., 25., 18., 83.]]]),
 tensor([[0, 6, 0],
         [2, 0, 0]]))

In [25]:
# Swap the last two axes of `p` before applying the criterion's NLL loss
# (assumes the loss expects the class dimension second — TODO confirm).
criterion.nllloss(p.transpose(1, 2), t)

tensor([[-67., -44., -12.],
        [-10., -52., -80.]])

In [26]:
# Hand-written scores for one sequence of two positions over four classes;
# the second target equals -100, the ignore_index configured on the criterion.
p = torch.tensor(
    [[[-6, 0, -5, 7],
      [-.5, -.5, -8, 0]]],
    dtype=torch.float32,
)
t = torch.tensor([[0, -100]])

In [27]:
# Display the ignore_index attribute of the criterion's NLL loss.
criterion.nllloss.ignore_index

-100