In [1]:
import torch

from transformers import AutoTokenizer
from models.bart_extractor import BartExtractor, ExtractedFactLoss
from dataset.msc_summary_turns import MSC_Turns
from dataset.msc_summary import MSC_Summaries
from metrics.terp import TerpMetric

import utils.logging as logging

In [2]:
logging.set_log_level(logging.SPAM)

# Settings for dataset
datadir = '/Users/FrankVerhoef/Programming/PEX/data/'
basedir = 'msc/msc_personasummary/'
sessions = [1]
len_context = 2
speaker_prefixes = ["<other>", "<self>"]
nofact_token = '<nofact>'
add_tokens = speaker_prefixes + [nofact_token]
test_samples = 20
subset = 'train'

# config for TerpMetric
JAVA_HOME = "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk/Contents/Home"
TERPDIR = "/Users/FrankVerhoef/Programming/terp/"
TMPDIR = "/Users/FrankVerhoef/Programming/PEX/output/"
TerpMetric.set(terp_dir=TERPDIR, java_home=JAVA_HOME, tmp_dir=TMPDIR)

# Settings for model
checkpoint_dir = '/Users/FrankVerhoef/Programming/PEX/checkpoints/'
load = 'trained_bart'

# Setup
tokenizer = AutoTokenizer.from_pretrained('facebook/bart-large-cnn')
if add_tokens is not None:
    num_added_toks = tokenizer.add_tokens(add_tokens)
nofact_token_id = tokenizer.convert_tokens_to_ids(nofact_token) if nofact_token != '' else tokenizer.eos_token_id
assert nofact_token_id != tokenizer.unk_token_id, "nofact_token '{}' must be known token".format(nofact_token)

model = BartExtractor(bart_base='facebook/bart-large-cnn', nofact_token_id=nofact_token_id)
model.bart.resize_token_embeddings(len(tokenizer))

MSC_Turns.set(tokenizer=tokenizer, len_context=len_context, speaker_prefixes=speaker_prefixes, nofact_token=nofact_token)
msc_turns = MSC_Turns(basedir=datadir + basedir, sessions=sessions, subset=subset, max_samples=10)

logging.info("Loading model from {}".format(checkpoint_dir + load))
model.load_state_dict(torch.load(checkpoint_dir + load, map_location=torch.device('cpu')))

2023-05-29 20:52:43,314 INFO     | Loading model from /Users/FrankVerhoef/Programming/PEX/checkpoints/trained_bart


<All keys matched successfully>

In [10]:
from collections import Counter
token_counter = Counter()

batch_size = 32
for session in [[1]]:
    msc_turns = MSC_Turns(basedir=datadir + basedir, sessions=sessions, subset=subset)
    for i in range(len(msc_turns)):
        tokens = tokenizer.tokenize(msc_turns[i][1])
        # print(tokens)
        token_counter.update(tokens) #({tokens[0]: 1})
token_counter_sorted = sorted(token_counter.items(), key=lambda x: x[1], reverse=True)
token_counter_sorted


[('.', 44464),
 ('I', 30688),
 ('<nofact>', 20769),
 ('Ġa', 11176),
 ('ĠI', 9453),
 ('Ġhave', 7593),
 ('Ġlike', 7288),
 ('Ġto', 6935),
 ('Ġam', 6723),
 ('Ġin', 4319),
 ('Ġthe', 3502),
 ('Ġmy', 3213),
 ('My', 3175),
 ('Ġis', 3145),
 ('Ġand', 2979),
 ('Ġlove', 2889),
 ('Ġwork', 2070),
 ('Ġnot', 1836),
 ('Ġlive', 1741),
 ('Ġof', 1561),
 ('Ġat', 1497),
 ("'t", 1473),
 ('Ġfavorite', 1380),
 ('Ġdo', 1332),
 ('Ġfor', 1317),
 ('Ġdon', 1294),
 ('Ġan', 1174),
 ('Ġmusic', 1126),
 ('Ġwith', 1122),
 (',', 1037),
 ('Ġon', 1008),
 ('Ġplay', 995),
 ("'m", 984),
 ('Ġenjoy', 918),
 ('ĠMy', 851),
 ('Ġschool', 767),
 ('Ġgo', 767),
 ('Ġwant', 740),
 ('Ġfrom', 717),
 ('Ġjob', 717),
 ('Ġwas', 700),
 ('Ġare', 665),
 ('Ġdog', 642),
 ('Ġfood', 625),
 ('Ġas', 624),
 ('Ġkids', 623),
 ('Ġdogs', 622),
 ('Ġbe', 595),
 ('Ġyears', 574),
 ('Ġown', 570),
 ('Ġbeen', 543),
 ('Ġcollege', 534),
 ('Ġread', 518),
 ('Ġtwo', 506),
 ('Ġmarried', 503),
 ('Ġeat', 499),
 ('Ġold', 479),
 ('Ġlot', 463),
 ('Ġme', 433),
 ('Ġparents', 4

In [None]:
model.bart.config

In [None]:
model.bart.generation_config

In [3]:
for i in range(3):
    print(msc_turns[i])

('<self>I do not, I like to dance though <other>Dancing is cool. I dance when I work out sometimes.', 'I like to dance and work out.')
('<self>Absolutely, last time I was in a mall was for senior prom photos! <other>Wow. Where are you from?', '<nofact>')
('<self>Where do you want to visit? <other>Anywhere with a lot of hiking trails!', '<nofact>')


In [3]:
# config for TerpMetric
JAVA_HOME = "/opt/homebrew/opt/openjdk/libexec/openjdk.jdk/Contents/Home"
TERPDIR = "/Users/FrankVerhoef/Programming/terp/"
TMPDIR = "/Users/FrankVerhoef/Programming/PEX/output/"
TerpMetric.set(terp_dir=TERPDIR, java_home=JAVA_HOME, tmp_dir=TMPDIR)

eval_kwargs = {'device': 'cpu', 'log_interval': 10, 'decoder_max': 20}

logging.info("Evaluating model on {} samples of testdata in {} with arguments {}".format(len(msc_turns), basedir, eval_kwargs))
eval_stats = msc_turns.evaluate(model, **eval_kwargs)

2023-05-18 23:42:55,529 INFO     | Evaluating model on 10 samples of testdata in msc/msc_personasummary/ with arguments {'device': 'cpu', 'log_interval': 10, 'decoder_max': 20}
2023-05-18 23:42:57,287 SPAM     | Generate: pred_fact=tensor([True])
2023-05-18 23:42:57,288 SPAM     | Generate: gen_out=tensor([[   2,    0,    0,    0,  100,   33,   57,   11,    5,  831,    4,   38,
           21,   11,    5, 3835,   77,   38,   21, 3240,    2]])
context:     <self>My mother work as a nurse <other>Have you ever been in the military? I was when I was younger
target:      I was in the military when I was young.
prediction:  I have been in the military. I was in the army when I was younger
----------------------------------------
2023-05-18 23:42:58,329 SPAM     | Generate: pred_fact=tensor([True])
2023-05-18 23:42:58,330 SPAM     | Generate: gen_out=tensor([[   2,    0,    0,    0,  100,  173,  608, 4861,  751,    4,    2]])
context:     <self>It is working with kids, yours <other>I work doin

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


2023-05-18 23:43:14,421 SPAM     | TERp output
Loading parameters from /Users/FrankVerhoef/Programming/terp/data/terpa.param
Loading parameters from /Users/FrankVerhoef/Programming/terp/data/data_loc.param
"/Users/FrankVerhoef/Programming/PEX/output/hyp.trans" was successfully parsed as Trans text
"/Users/FrankVerhoef/Programming/PEX/output/ref.trans" was successfully parsed as Trans text
Creating Segment Phrase Tables From DB
Processing [sys][000000][000000]
Processing [sys][000000][000001]
Processing [sys][000000][000002]
Processing [sys][000000][000003]
Processing [sys][000000][000004]
Processing [sys][000000][000005]
Finished Calculating TERp
Total TER: 0,63 (29,48 / 47,00)



In [4]:
eval_stats

{'acc': 0.800000011920929,
 'f1': 0.8571428656578064,
 'precision': 0.8571428656578064,
 'recall': 0.8571428656578064,
 'cm': [[2, 1], [1, 6]],
 'bleu_2': 0.3472660183906555,
 'bleu_4': 0.2208770364522934,
 'bert_f1': 0.6035879502693812,
 'terp': 0.5634329319000244,
 'rouge1_fmeasure': 0.48769572377204895,
 'rouge1_precision': 0.465277761220932,
 'rouge1_recall': 0.6134259104728699,
 'rouge2_fmeasure': 0.3181818425655365,
 'rouge2_precision': 0.3095238208770752,
 'rouge2_recall': 0.4482323229312897,
 'rougeL_fmeasure': 0.47380685806274414,
 'rougeL_precision': 0.4541666507720947,
 'rougeL_recall': 0.5949074029922485}

In [5]:
MSC_Summaries.set(tokenizer=tokenizer, speaker_prefixes=speaker_prefixes, nofact_token=nofact_token)
msc_summaries = MSC_Summaries(
    basedir=datadir + basedir, 
    session=1, 
    subset="test",   
    max_samples=test_samples      
)

In [6]:
for i in range(3):
    print(msc_summaries[i])

(['<other> Hi, tracy here. I love people and fast cars. You sing? <self> Hi tracy. Amanda here. I enjoy going to the gym alongside my vegan diet.', "<other> I see. I love being helpful and I paint art. Are you an artist? <self> I don't sing, but love art. My lawyer husband is an artist as well.", '<other> Get out! I am a paralegal. I love helping people too. You bake? <self> Whoa! Funny you say that, my jeep is loaded with baked goods right now!', '<other> Great! I would love to paint that picture. You dance? <self> If you call zumba dancing! Lol! What is your favorite vacations spot?', '<other> Niagra falls. It is an artist paradise. You like to travel? <self> I love traveling, especially to the mountains. They are a good workout!', '<other> Me too! I like fast cars and new people. Nice to meet you. <self> You as well. I love vehicles that can tackle rough terrain, like my jeep wrangler.'], 'My name is Amanda.\nI enjoy going to the gym.\nI am a vegan.\nI have a husband who is a lawyer

In [7]:
eval_kwargs = {'metrics': 'ter', 'device': 'cpu', 'log_interval': 10, 'decoder_max': 20}
eval_stats = msc_summaries.evaluate(model, **eval_kwargs)

2023-05-18 23:43:51,383 INFO     | Start evaluation of model BartExtractor on metrics: t,e,r
2023-05-18 23:43:57,331 SPAM     | Generate: pred_fact=tensor([ True,  True, False,  True,  True,  True])
2023-05-18 23:43:57,332 SPAM     | Generate: gen_out=tensor([[    2,     0,     0,     0,   100,   101,   164,     7,     5,  6545,
             4,    38,  3529,    10, 15848,  5626,     4,     2,     1],
        [    2,     0,     0,     0,   100,   657,  1808,     4,    38,   218,
            75,  7884,     4,    38,    33,    10,  1623,     4,     2],
        [    2,     0,     0,     0, 50267,     2,     1,     1,     1,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1],
        [    2,     0,     0,     0,     4,    38,   101,     7,  3836,     4,
             2,     1,     1,     1,     1,     1,     1,     1,     1],
        [    2,     0,     0,     0,   100,   101,     7,  1504,     4,    38,
           101,     7,   213,     7,     5,  9787,     4,    

KeyError: 't'

In [13]:
s1 = "I go there"
s2 = "I go\nthere"
s3 = "I go\n there"
s4 = "I go \nthere"
s5 = "I go \n there"
encoded_utterances = tokenizer(text=[s1, s2, s3, s4, s5], return_tensors='pt', padding=True)
print(encoded_utterances)
for enc in encoded_utterances['input_ids']:
    print(tokenizer.convert_ids_to_tokens(enc))


{'input_ids': tensor([[    0,   100,   213,    89,     2,     1,     1],
        [    0,   100,   213, 50118,  8585,     2,     1],
        [    0,   100,   213, 50118,    89,     2,     1],
        [    0,   100,   213,  1437, 50118,  8585,     2],
        [    0,   100,   213,  1437, 50118,    89,     2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1]])}
['<s>', 'I', 'Ġgo', 'Ġthere', '</s>', '<pad>', '<pad>']
['<s>', 'I', 'Ġgo', 'Ċ', 'there', '</s>', '<pad>']
['<s>', 'I', 'Ġgo', 'Ċ', 'Ġthere', '</s>', '<pad>']
['<s>', 'I', 'Ġgo', 'Ġ', 'Ċ', 'there', '</s>']
['<s>', 'I', 'Ġgo', 'Ġ', 'Ċ', 'Ġthere', '</s>']


In [16]:
s0 = "I go there\nWhy"
s0b = "I go there Why"
s1 = "<self>I go there <other>Why"
s2 = "<self>I go there\n<other>Why"
s3 = "<self>I go there\n<other> Why"
s4 = "<self> I go there\n <other> Why"
s5 = "<self>I go there \n <other> Why"
encoded_utterances = tokenizer(text=[s0, s0b, s1, s2, s3, s4, s5], return_tensors='pt', padding=True)
print(encoded_utterances)
for enc in encoded_utterances['input_ids']:
    print(tokenizer.convert_ids_to_tokens(enc))


{'input_ids': tensor([[    0,   100,   213,    89, 50118,  7608,     2,     1,     1,     1,
             1],
        [    0,   100,   213,    89,  2612,     2,     1,     1,     1,     1,
             1],
        [    0, 50266,   100,   213,    89,  1437, 50265,  7608,     2,     1,
             1],
        [    0, 50266,   100,   213,    89, 50118, 50265,  7608,     2,     1,
             1],
        [    0, 50266,   100,   213,    89, 50118, 50265,  2612,     2,     1,
             1],
        [    0, 50266,    38,   213,    89, 50118,  1437, 50265,  2612,     2,
             1],
        [    0, 50266,   100,   213,    89,  1437, 50118,  1437, 50265,  2612,
             2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 

In [17]:
print(s1.split())
print(s2.split())
print(s5.split())

['<self>I', 'go', 'there', '<other>Why']
['<self>I', 'go', 'there', '<other>Why']
['<self>I', 'go', 'there', '<other>', 'Why']


In [7]:
s1 = "<self> Are you settling in the city at all or do you still really miss the country? <other> I am settling in, but I really miss it."
s2 = "<self> Are you settling in the city? <other> No, I really miss it."
encoded_utterances = tokenizer(text=[s1, s2], return_tensors='pt', padding=True)
encoded_utterances


{'input_ids': tensor([[    0, 50266,  3945,    47, 15433,    11,     5,   343,    23,    70,
            50,   109,    47,   202,   269,  2649,     5,   247,   116,  1437,
         50265,    38,   524, 15433,    11,     6,    53,    38,   269,  2649,
            24,     4,     2],
        [    0, 50266,  3945,    47, 15433,    11,     5,   343,   116,  1437,
         50265,   440,     6,    38,   269,  2649,    24,     4,     2,     1,
             1,     1,     1,     1,     1,     1,     1,     1,     1,     1,
             1,     1,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
         1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0]])}

In [None]:
input_ids = torch.cat([encoded_utterances['input_ids'], torch.ones(20, dtype=torch.long).view(2, 10)], dim=1)
attn_mask = torch.cat([encoded_utterances['attention_mask'], torch.zeros(20, dtype=torch.long).view(2,10)], dim=1)
pred_tokens_2 = model.generate(
    input_ids=input_ids.to('cpu'), 
    # attention_mask=attn_mask.to('cpu'),    # attention_mask is not necessary, is defined within the generatie function
    min_length=2,
    max_new_tokens=20, 
    num_beams=1,
    do_sample=False,
    forced_eos_token_id=list(set([tokenizer.eos_token_id, model.nofact_token_id]))
)
tokenizer.batch_decode(pred_tokens_2)

In [None]:
encoded_utterances['input_ids']

In [None]:
encoded_utterances

In [None]:
pred_tokens = model.generate(
    input_ids=encoded_utterances['input_ids'].to('cpu'), 
    attention_mask=encoded_utterances['attention_mask'].to('cpu'),
    min_length=2,
    max_new_tokens=20, 
    num_beams=1,
    do_sample=False,
)

In [None]:
pred_tokens

In [None]:
tokenizer.batch_decode(pred_tokens)

In [None]:
model.bart.config.pad_token_id

In [None]:
criterion = ExtractedFactLoss(nofact_token_id=nofact_token_id, ignore_index=-100, lm_weight=0.5)

In [None]:
p = torch.randint(0, 100, (2,3,8)).float()
t = torch.randint(0,7, (2,3))

In [None]:
p, t

In [None]:
criterion.nllloss(p.permute(0,2,1), t)

In [None]:
p = torch.tensor([[[-6, 0, -5, 7], [-.5, -.5, -8, 0]]]).float()
t = torch.tensor([[0, -100]])

In [None]:
criterion.nllloss.ignore_index

In [None]:
criterion.nllloss.reduction='mean'