## Notebook to test and experiment with KnowledgeGroundedDecoder model

In [None]:
import torch
import torch.nn as nn
from transformers import AutoTokenizer, PretrainedConfig, GenerationConfig
import utils.logging as logging
logging.set_log_level("DEBUG")

In [None]:
from models.knowledge_grounded_generator.kg_model import KnowledgeGroundedDecoder, KG_loss
from models.knowledge_grounded_generator.kg_utils import ConceptGraph
from dataset.msc_kg_sessions import KG_enriched_MSC_Session

### Define a mini test dataset

In [None]:
# This is a small handmade (mini-)dataset, to facilitate testing

class Mini_dataset:
    """Tiny hand-made dialogue dataset, to facilitate testing.

    Each item is a ``(context, response)`` pair of strings. The context is
    prefixed with ``speaker_prefixes[1]`` and the response with
    ``speaker_prefixes[0]``.

    Args:
        speaker_prefixes: two-element sequence of strings
            ``[prefix_for_response, prefix_for_context]``. Defaults to two
            empty strings, i.e. no prefixes.
    """

    def __init__(self, speaker_prefixes=None):
        # Build the default inside the call: a list literal in the signature
        # would be one shared object mutated across all instances.
        if speaker_prefixes is None:
            speaker_prefixes = ['', '']
        self.speaker_prefixes = speaker_prefixes
        self.data = [
            {
                "text": "Hi, how are you doing?",
                "labels": ["I'm good, how are you?"],
            }, {
                "text": "Shall we play soccer?",
                "labels": ["It is fun and a great sport to play as a team"],
            }, {
                "text": "The dinner was great, but now I want to go home.",
                "labels": ["Yes, the food was delicious"],
            }
        ]

    def __getitem__(self, i):
        """Return the ``(context, response)`` pair at index ``i``, with speaker prefixes prepended."""
        x = self.speaker_prefixes[1] + self.data[i]['text']
        # Only the first label of each sample is used as the response
        y = self.speaker_prefixes[0] + self.data[i]['labels'][0]
        return x, y

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

### Test impact of left versus right padding with just GPT2 model

In [None]:
from transformers import AutoModelForCausalLM

In [None]:
# Pretrained GPT-2 language model used for the padding experiments below
gpt2 = AutoModelForCausalLM.from_pretrained("gpt2")

# Tokenizer with the library's default padding side; the eos token is reused as pad token
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Second tokenizer for the same model, explicitly configured to pad on the left
tok_padleft = AutoTokenizer.from_pretrained("gpt2", padding_side='left')
tok_padleft.pad_token = tok_padleft.eos_token

In [None]:
# Display the configuration of the loaded GPT-2 model
gpt2.config

In [None]:
# Generation settings shared by the GPT-2 experiments:
# no sampling and a single beam, at most 20 newly generated tokens.
generation_config=GenerationConfig(
    pad_token_id=tokenizer.pad_token_id,
    output_hidden_states=True,
    use_cache=True,
    num_beams=1,
    do_sample=False,
    max_new_tokens=20
)

Test with very simple input tensor

In [None]:
# Baseline: a batch of three single-token sequences without any padding.
# view(3,1) only succeeds if " I you we" tokenizes into exactly three ids.
ids=torch.tensor(tokenizer(" I you we").input_ids, dtype=torch.long).view(3,1)
attn=torch.ones((3,1), dtype=torch.long)
pos=torch.zeros((3,1), dtype=torch.long)
# ids, attn, pos
logits=gpt2(input_ids=ids, attention_mask=attn, position_ids=pos).logits
gen_out = gpt2.generate(input_ids=ids, attention_mask=attn, generation_config=generation_config)
# First five logits at the single position of each sequence
logits[:, 0, :5]

In [None]:
# Left padding: each row has one masked-out token (id 50256, presumably the eos/pad id) BEFORE the real token.
ids=torch.tensor([[50256, 314], [50256, 345], [50256, 356]], dtype=torch.long).view(3,2)
attn=torch.tensor([[0, 1], [0, 1], [0, 1]], dtype=torch.long).view(3,2)
# Position ids are all zero, so the real token keeps position 0 as in the unpadded run
pos=torch.zeros((3,2), dtype=torch.long)
# ids, attn, pos
logits2=gpt2(input_ids=ids, attention_mask=attn, position_ids=pos).logits
gen_out2 = gpt2.generate(input_ids=ids, attention_mask=attn, generation_config=generation_config)
# First five logits at the position of the real token (index 1)
logits2[:, 1, :5]

In [None]:
# Number of logits of the real token that differ by more than 0.001 between the left-padded and the unpadded run
((logits2[:, 1, :]-logits[:, 0, :]).abs() > 0.001).sum().item()

Since the outputs `logits` and `logits2` are (almost) identical, it appears that left padding combined with an attention mask works.

In [None]:
# Right padding: each row has one masked-out token (id 50256, presumably the eos/pad id) AFTER the real token.
ids=torch.tensor([[314, 50256], [345, 50256], [356, 50256]], dtype=torch.long).view(3,2)
attn=torch.tensor([[1, 0], [1, 0], [1, 0]], dtype=torch.long).view(3,2)
pos=torch.zeros((3,2), dtype=torch.long)
# ids, attn, pos
logits3=gpt2(input_ids=ids, attention_mask=attn, position_ids=pos).logits
gen_out3 = gpt2.generate(input_ids=ids, attention_mask=attn, generation_config=generation_config)
# First five logits at the real token (index 0) and at the padding token (index 1)
logits3[:, 0, :5], logits3[:, 1, :5]

The two tensors are different. So even though the attention value for those tokens is zero, and the token itself is the padding token, the forward function still generates a different output.

Conclusion: left padding and right padding give different results from the forward function, even when the position ids and the attention mask are adjusted to 'correct' for the differences.

Test with short sentence

In [None]:
# Encode one short sentence, then build a left-padded and a right-padded
# variant by hand (five pad tokens, masked out in the attention mask).
sentence = "The weather is"
enc = tokenizer(sentence, return_tensors='pt')

n_pad = 5
pad_ids = torch.full((1, n_pad), tokenizer.pad_token_id, dtype=torch.long)
pad_mask = torch.zeros((1, n_pad), dtype=torch.long)

# Left padding: the pad tokens come BEFORE the sentence
enc_pad_left = {
    'input_ids': torch.cat([pad_ids, enc.input_ids], dim=1),
    'attention_mask': torch.cat([pad_mask, enc.attention_mask], dim=1)
}
# Right padding: the pad tokens come AFTER the sentence
enc_pad_right = {
    'input_ids': torch.cat([enc.input_ids, pad_ids], dim=1),
    'attention_mask': torch.cat([enc.attention_mask, pad_mask], dim=1)
}
enc, enc_pad_left, enc_pad_right


In [None]:
# Generate a continuation for the unpadded encoding and for both padded variants
gen_out = gpt2.generate(**enc, generation_config=generation_config)
gen_padleft = gpt2.generate(**enc_pad_left, generation_config=generation_config)
gen_padright = gpt2.generate(**enc_pad_right, generation_config=generation_config)
gen_out, gen_padleft, gen_padright

In [None]:
# Decode the three generated sequences back to text for comparison
resp = tokenizer.batch_decode(gen_out)
resp_padleft = tokenizer.batch_decode(gen_padleft)
resp_padright = tokenizer.batch_decode(gen_padright)
resp, resp_padleft, resp_padright

Test with mini dataset

In [None]:
# Collect every (context, response) pair of the mini dataset (no speaker prefixes)
testdata = Mini_dataset()
text_batch = []
for sample_idx in range(len(testdata)):
    text_batch.append(testdata[sample_idx])
text_batch

In [None]:
# Split the batch into contexts (x) and responses (y) and encode the contexts with `tokenizer`
x, y = zip(*text_batch)
encoded_x = tokenizer(text=x, padding=True, return_tensors='pt')

# Running count of attended tokens per row; its last column is the sequence length
mask_cumsum = torch.cumsum(encoded_x.attention_mask, dim=1)
lens = mask_cumsum[:, -1]

# Position of each token counted over attended tokens only (never below 0)
position_ids = (mask_cumsum - 1).clip(0)
position_ids = position_ids[:, -encoded_x.input_ids.shape[1]:]
encoded_x.input_ids.shape, lens, encoded_x, position_ids

In [None]:
# Encode the same contexts with the left-padding tokenizer; an eos token is appended to every context first
encoded_x_padleft = tok_padleft(text=[item + tokenizer.eos_token for item in x], padding=True, return_tensors='pt')
# encoded_x_padleft.input_ids = torch.cat([encoded_x_padleft.input_ids, torch.full((3,1), tokenizer.eos_token_id)], dim=1)
# encoded_x_padleft.attention_mask = torch.cat([encoded_x_padleft.attention_mask, torch.ones((3,1))], dim=1)

# Position of each token counted over attended tokens only (never below 0)
position_ids_padleft = (torch.cumsum(encoded_x_padleft.attention_mask, dim=1) - 1).clip(0)
position_ids_padleft = position_ids_padleft[:, -encoded_x_padleft.input_ids.shape[1]:]
encoded_x_padleft.input_ids.shape, encoded_x_padleft, position_ids_padleft

In [None]:
# Forward pass on encoded_x with explicit position ids; show the first 10 logits at the last position of every row
fwd = gpt2(**encoded_x, position_ids=position_ids)
fwd.logits[:, -1, :10]

In [None]:
# Forward pass on the left-padded batch with explicit position ids; show the first 10 logits at the last position
fwd_left = gpt2(**encoded_x_padleft, position_ids=position_ids_padleft)
fwd_left.logits[:, -1, :10]

The logits for the last token of each sequence in the batch are different, despite passing attention_mask and position_ids.

In [None]:
# Second sentence: logits at index 5 of the encoded_x run versus the second-to-last position of the left-padded run
fwd.logits[1, 5, :10], fwd_left.logits[1, -2, :10]

In [None]:
# Third sentence: logits at index 12 of the encoded_x run versus the second-to-last position of the left-padded run
fwd.logits[2, 12, :10], fwd_left.logits[2, -2, :10]

Comparing the 6th token of the second sentence (right padded) with the last token of the second sentence before the eos_token (left padded) --> equal !!
The same holds for the token at index 12 of the third sentence.

In [None]:
# Generate continuations for the batch encoded with `tokenizer`
gen_out = gpt2.generate(**encoded_x, generation_config=generation_config)
gen_out

In [None]:
# Generate continuations for the left-padded batch
gen_out_padleft = gpt2.generate(**encoded_x_padleft, generation_config=generation_config)
gen_out_padleft

In [None]:
# Decode the sequences generated from encoded_x
response = tokenizer.batch_decode(gen_out)
response

In [None]:
# Decode the sequences generated from the left-padded batch
response_padleft = tokenizer.batch_decode(gen_out_padleft)
response_padleft

### Test with DialoGPT

In [None]:
from transformers import AutoModelForCausalLM

In [None]:
# DialoGPT-small with a left-padding tokenizer
dialogpt_tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small", padding_side='left')
dialogpt = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")
# Reuse this tokenizer's OWN eos token as pad and bos token, so the cell does
# not depend on whichever object the global name `tokenizer` currently holds.
dialogpt_tokenizer.pad_token = dialogpt_tokenizer.eos_token
dialogpt_tokenizer.bos_token = dialogpt_tokenizer.eos_token

In [None]:
# Display the configuration of the loaded DialoGPT model
dialogpt.config

In [None]:
# Rebuild the mini dataset without speaker prefixes and collect all (context, response) pairs
testdata = Mini_dataset()
text_batch = [testdata[i] for i in range(len(testdata))]
text_batch

In [None]:
# Generation settings for the DialoGPT experiments: no sampling, a single beam, at most 20 new tokens.
# NOTE(review): pad_token_id is read from the GPT-2 `tokenizer`, not from dialogpt_tokenizer — confirm both use the same id.
generation_config=GenerationConfig(
    pad_token_id=tokenizer.pad_token_id,
    output_hidden_states=True,
    use_cache=True,
    num_beams=1,
    do_sample=False,
    max_new_tokens=20
)

In [None]:
# Encode the contexts (each followed by an eos token) with the DialoGPT tokenizer
x, y = zip(*text_batch)
encoded_x_dgpt = dialogpt_tokenizer(text=[item + tokenizer.eos_token for item in x], padding=True, return_tensors='pt')
encoded_x_dgpt.input_ids.shape, encoded_x_dgpt

In [None]:
# Element-wise difference between the DialoGPT encoding and the left-padded GPT-2 encoding (all zeros means identical ids)
encoded_x_dgpt.input_ids - encoded_x_padleft.input_ids

In [None]:
# Generate responses for the whole batch with DialoGPT
gen_out_dgpt = dialogpt.generate(**encoded_x_dgpt, generation_config=generation_config)
gen_out_dgpt

In [None]:
# Decode the DialoGPT output.
# NOTE(review): decoding uses the GPT-2 `tokenizer`, not dialogpt_tokenizer — assumes both share a vocabulary; confirm.
response_dgpt = tokenizer.batch_decode(gen_out_dgpt)
response_dgpt

In [None]:
# Generate a response for each context separately (batch size 1), so that no padding is involved
gen_out_list = [dialogpt.generate(**dialogpt_tokenizer(item + tokenizer.eos_token, return_tensors='pt'), generation_config=generation_config) for item in x]
gen_out_list

In [None]:
# Decode each individually generated sequence
resp_list = [tokenizer.batch_decode(g) for g in gen_out_list]
resp_list

### Now use same mini dataset, but with speaker prefixes added before the utterances

In [None]:
# Mini dataset with speaker prefixes: '<me>' is put in front of responses, '<you>' in front of contexts
testdata = Mini_dataset(['<me>', '<you>'])
text_batch = [testdata[i] for i in range(len(testdata))]
text_batch

### Now use same mini dataset, but with extra tokens added to tokenizer

In [None]:
# Register the speaker prefixes as extra tokens in both left-padding tokenizers
speaker_prefixes = ['<me>', '<you>']
tok_padleft.add_tokens(speaker_prefixes)
dialogpt_tokenizer.add_tokens(speaker_prefixes)

# Each model is resized to the tokenizer that feeds it. `tokenizer` never got
# the extra tokens, so its length would leave dialogpt without embeddings for them.
gpt2.resize_token_embeddings(len(tok_padleft))
dialogpt.resize_token_embeddings(len(dialogpt_tokenizer))

# Ids assigned to the new tokens by each tokenizer
tok_padleft.convert_tokens_to_ids(speaker_prefixes), dialogpt_tokenizer.convert_tokens_to_ids(speaker_prefixes)

In [None]:
x, y = zip(*text_batch)
encoded_x = tokenizer(text=x, padding=True, return_tensors='pt')
encoded_x.input_ids.shape, encoded_x

In [None]:
gen_out2 = gpt2.generate(
    **encoded_x,
    generation_config=GenerationConfig(
        pad_token_id=gpt2.config.eos_token_id,
        output_hidden_states=True,
        use_cache=True,
        num_beams=1,
        do_sample=False,
        max_new_tokens=20
    ))
gen_out2

In [None]:
tokenizer.batch_decode(gen_out2)

In [None]:
# Encode three single-word inputs as a padded batch and inspect the result
x = ('I', 'You', 'We')
encoded_x = tokenizer(text=x, padding=True, return_tensors='pt')
encoded_x.input_ids.shape, encoded_x

### Set up KnowledgeGroundedDecoder model

In [None]:
# Set up the model
# NOTE: Tokenizer uses LEFT padding

# lm = "microsoft/DialoGPT-small"
lm = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(lm, padding_side='left')
# The eos token doubles as pad token and as bos token
tokenizer.pad_token = tokenizer.eos_token
tokenizer.bos_token = tokenizer.eos_token
speaker_prefixes = None # ['<self>', '<other>']
add_tokens = None # speaker_prefixes
if add_tokens is not None:
    num_added_toks = tokenizer.add_tokens(add_tokens)

opt = {
    "lm": lm,
    # Pass the integer id, as the key name says; the original passed the token STRING (tokenizer.bos_token).
    # NOTE(review): confirm that KnowledgeGroundedDecoder reads this option as an id.
    "bos_token_id": tokenizer.bos_token_id,
    "num_hops": 2,
    "aggregate_method": "max",
    "alpha": 0.7,
    "beta": 0.2,
    "gamma": 0.33,
    'fixed_lm': False,
    'block_src': True,
    'gate': 0.0 # Gate=0.0 means output should be equal to regular GPT2 output
}

model = KnowledgeGroundedDecoder(opt, tokenizer, config=PretrainedConfig())
# Keep the embedding matrix in sync with the tokenizer (matters when tokens were added above)
model.gpt2model.resize_token_embeddings(len(tokenizer))

In [None]:
# Display the configuration of the language model wrapped by the KnowledgeGroundedDecoder
model.gpt2model.config

In [None]:
# Set up the dataset
# NOTE(review): both directories are absolute, machine-specific paths.

kg_datadir = '/users/FrankVerhoef/Programming/PEX/data/kg_data/'
basedir = '/Users/FrankVerhoef/Programming/PEX/data/msc/msc_dialogue/'

# Options handed to the dataset class
opt_dataset = dict(
    kg_datadir=kg_datadir,
    dataset_concepts='dataset_concepts.txt',
    kg='kg.graph',
    speaker_prefixes=speaker_prefixes,
    include_persona=False,
    max_concepts=256,
    max_triples=768,
    max_branch=64,
    overlapping_concepts="excl-src-in-tgt",
    num_hops=2,
)

# Load the knowledge graph and reduce it using the dataset concepts file
kg = ConceptGraph(path=opt_dataset['kg_datadir'], graph=opt_dataset['kg'])
kg.build_reduced_graph(opt_dataset['kg_datadir'] + opt_dataset['dataset_concepts'])

# Validation subset of one MSC session, enriched with knowledge-graph information
dataset = KG_enriched_MSC_Session(
    opt_dataset,
    basedir=basedir,
    sessions=['1-both-revised-no_cands'],
    subset='valid',
    tokenizer=tokenizer,
    kg=kg,
    max_samples=None,
    batch_format="huggingface",
    batch_pad_id=tokenizer.pad_token_id,
)

### First test with small dataset

In [None]:
# This is a small handmade (mini-)dataset, to facilitate testing

class Mini_dataset:
    """Three hand-written samples; each item is ``(text, labels)`` with ``labels`` a list of strings.

    NOTE(review): this definition shadows the earlier ``Mini_dataset`` class
    in this notebook, which accepts speaker prefixes and returns a single label string.
    """

    def __init__(self):
        samples = (
            ("I like my mother and sister. It is good to be with them.",
             "Your family is important since birth"),
            ("Shall we play soccer?",
             "It is fun and a great sport to play as a team"),
            ("The dinner was great, but now I want to go home.",
             "Yes, the food was delicious"),
        )
        self.data = [{"text": text, "labels": [label]} for text, label in samples]

    def __getitem__(self, i):
        """Return the text and the list of labels of sample ``i``."""
        sample = self.data[i]
        return sample['text'], sample['labels']

    def __len__(self):
        """Return the number of samples."""
        return len(self.data)

testdata = Mini_dataset()

# Enrich the minidataset with information about related concepts, from the knowledge graph
# Each element becomes (text, labels, kg_info), where kg_info is the result of dataset._get_kg_info(text, labels)
enriched = [(*testdata[i], dataset._get_kg_info(*testdata[i])) for i in range(len(testdata))]
enriched

In [None]:
# Check the concept_token_ids that are marked with label==1 occur in the target sentence
# Only the first sample is inspected: its kg_info is enriched[0][2]

tokenizer.decode([
    c_id 
    for c_id, label in zip(enriched[0][2]['concept_token_ids'], enriched[0][2]['concept_labels'])
    if label == 1
])

In [None]:
# Check the output of the tokenizer
# NOTE: tensors are LEFT-padded
# Only the texts (element 0 of each sample) are encoded here

tokenizer(text=[testdata[i][0] for i in range(len(testdata))], padding=True, return_tensors='pt')

In [None]:
# Convert the dataset items to a batch

batch = dataset.batchify(enriched)
# The batch unpacks into the encoded inputs, the encoded labels and the knowledge-graph input
inputs, labels, kg_input = batch
# Padded sequence length of the inputs
L = inputs.input_ids.shape[1]
input_ids = inputs.input_ids

# Check if the output equals result in previous cell
print(input_ids.shape)
print(input_ids)

In [None]:
# Use the input_ids and the KG-input to generate text
# A bos token is appended to every row; the column is sized from the batch
# itself instead of a hard-coded 3, so the cell works for any batch size.
bos_column = torch.full((input_ids.shape[0], 1), tokenizer.bos_token_id, dtype=input_ids.dtype)
output = model.generate(
    inputs=torch.cat([input_ids, bos_column], dim=1),
    kg_input=kg_input,
    generation_config=GenerationConfig(
        pad_token_id=model.gpt2model.config.eos_token_id,
        output_hidden_states=True,
        use_cache=True,
        num_beams=1,
        do_sample=False,
        max_new_tokens=10
    )
)

# Check output size: dimension 1 must be at most 10 tokens larger (=max_new_tokens)
print(output.shape)

# The newly generated tokens are concatenated after the original input_ids
for context, out in zip(enriched, output):
    print("Context:  ", context[0])
#     print("Label:    ", context[1])
    print("Tensor:   ", out)
    # batch_decode on a single row decodes every token id separately
    print("Response: ", dataset.tokenizer.batch_decode(out))
    print("-" * 20)

In [None]:
# Same generation settings, but now using the generate function of gpt2model directly
# (no kg_input; scores are returned as well).
# NOTE(review): unlike the model.generate cell above, no bos token is appended to input_ids here — confirm this is intended.
output = model.gpt2model.generate(
    inputs=input_ids,
    generation_config=GenerationConfig(
        pad_token_id=model.gpt2model.config.eos_token_id,
        output_hidden_states=True,
        output_scores=True,
        use_cache=True,
        num_beams=1,
        do_sample=False,
        max_new_tokens=10,
        return_dict_in_generate=True
    )
)
# With return_dict_in_generate=True the result is a dict-like object; the token ids are in .sequences
print(output.keys())
for context, out in zip(enriched, output.sequences):
    print("Context:  ", context[0])
#     print("Label:    ", context[1])
    print("Tensor:   ", out)
    print("Response: ", dataset.tokenizer.batch_decode(out))
    print("-" * 20)


In [None]:
# output.scores is a tuple with one element per generated token (10 here);
# each element holds the scores of the whole batch for that step.
# Stacking along a new dim 1 gives (batch, steps, vocab). Concatenating along
# dim 0 and reshaping to (3, 10, -1) would mix the steps of different sequences.
scores = torch.stack(output.scores, dim=1)
print(scores.shape)

# Ids of the five highest-scoring tokens per sequence and per generation step
top_5_indices = torch.topk(scores, k=5, dim=2, sorted=True).indices
print(top_5_indices)
for sequence in top_5_indices:
    for top5 in sequence:
        print(' '.join("{:10s}".format(token) for token in tokenizer.convert_ids_to_tokens(top5)))
    print()

In [None]:
# Use forward to generate logits to determine the next token

output = model.forward(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    kg_input=kg_input
)
print(output.logits.shape)
print(inputs.input_ids)
print(inputs.attention_mask)
print(output.last_hidden_state.shape)
# Highest-scoring token id at every position
print(output.logits.argmax(dim=-1))

# The next token of a sequence is determined by the last hidden state of the last token of each sequence
print(tokenizer.batch_decode(output.logits[:, -1, :].argmax(dim=-1)))

In [None]:
# This is the same check, but using the forward function of the gpt2model directly

attention_mask = inputs.attention_mask
# Position of each token counted over attended tokens only (never below 0), so left padding does not shift positions
position_ids = (torch.cumsum(attention_mask, dim=1) - 1).clip(0)
position_ids = position_ids[:, -input_ids.shape[1]:]
output = model.gpt2model.forward(
    input_ids=input_ids,
    attention_mask=attention_mask,
    position_ids=position_ids
)
# Highest-scoring token id at every position
print(output.logits.argmax(dim=-1))

### Now test with trained model and MSC dataset

In [None]:
# Loss for the knowledge-grounded model; pad tokens are ignored and alpha/beta are taken from the model options
criterion = KG_loss(ignore_index=tokenizer.pad_token_id, invalid=-1, alpha=opt['alpha'], beta=opt['beta'])

In [None]:
# Unshuffled loader over the MSC dataset, three samples per batch, collated by dataset.batchify
valid_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=3, shuffle=False, collate_fn=dataset.batchify)

In [None]:
# Take the first batch of the loader and inspect it
batch = next(iter(valid_loader))
batch

In [None]:
# Show the batch as text: batch[0] holds the encoded inputs, batch[1] the encoded labels
print("--- input ---")
inp = tokenizer.batch_decode(batch[0].input_ids)
for decoded_input in inp:
    print(decoded_input)

print("--- labels ---")
lbl = tokenizer.batch_decode(batch[1].input_ids)
for decoded_label in lbl:
    print(decoded_label)

In [None]:
# Validation step on the batch, before the checkpoint is loaded
model.valid_step(batch, criterion=criterion, device='cpu')

In [None]:
# Load trained weights into the model, mapped to the CPU.
# NOTE(review): absolute, machine-specific path; torch.load unpickles the file, so only load trusted checkpoints.
checkpoint_dir = '/users/FrankVerhoef/Programming/PEX/checkpoints/'
load = 'test_kgg'
logging.info("Loading model from {}".format(checkpoint_dir + load))
model.load_state_dict(torch.load(checkpoint_dir + load, map_location=torch.device('cpu')))

In [None]:
# Same validation step, now with the loaded weights
model.valid_step(batch, criterion=criterion, device='cpu')

In [None]:

# Manual forward pass and loss computation on the concatenation of inputs and labels
inputs, labels, kg_input = batch

with torch.no_grad():
    output = model.forward(
        input_ids=torch.cat([inputs.input_ids, labels.input_ids], dim=1),
        attention_mask=torch.cat([inputs.attention_mask, labels.attention_mask], dim=1),
        kg_input=kg_input
    )
    len_labels = labels.input_ids.shape[1]
    # Only the last len_labels positions (the label part of the sequence) enter the loss.
    # NOTE(review): logits at the label positions are compared with the labels at the SAME positions;
    # whether a one-token shift is needed depends on KnowledgeGroundedDecoder.forward / KG_loss — confirm.
    loss, gen_loss, triple_loss, gate_loss = criterion(
        output.logits[:, -len_labels:], labels.input_ids, 
        output.triple_prob[:, -len_labels:], kg_input.triple_labels, 
        output.gate[:, -len_labels:], kg_input.gate_labels
    )

# Highest-scoring token id at each label position
pred = output.logits[:, -len_labels:].argmax(dim=-1)


In [None]:
# Predicted token ids at the label positions
pred

In [None]:
# Encoded labels of the batch, for comparison with pred
labels

In [None]:
# Decode the predicted token ids and print one response per line
print("--- responses ---")
resp = tokenizer.batch_decode(pred)
for response_text in resp:
    print(response_text)

In [None]:
# LM accuracy
# A token counts as correct when the prediction equals the label; the attention mask zeroes out masked positions
token_correct = labels['input_ids'].eq(pred) * labels['attention_mask']
# Fraction of correct tokens among all attended label tokens
token_acc = (token_correct.sum() / labels['attention_mask'].sum()).item() 
token_acc

In [None]:
# Use the input_ids and the KG-input to generate text
gen_output = model.generate(
    inputs=inputs.input_ids,
    kg_input=kg_input,
    generation_config=GenerationConfig(
        pad_token_id=model.gpt2model.config.eos_token_id,
        output_hidden_states=True,
        use_cache=True,
        num_beams=1,
        do_sample=False,
        # decoder_start_token_id=tokenizer.convert_tokens_to_ids('<self>'),
        max_new_tokens=5
    )
)

# Check output size: dimension 1 must be at most 5 tokens larger (=max_new_tokens)
print(gen_output.shape)

# The newly generated tokens are concatenated after the original input_ids
for inp, lbl, out in zip(inputs.input_ids, labels.input_ids, gen_output):
    print("Context:  ", inp)
    print("Label:    ", lbl)
    print("Tensor:   ", out)
    # batch_decode on a single row decodes every token id separately
    print("Response: ", dataset.tokenizer.batch_decode(out))
    print("-" * 20)

In [None]:
# Input ids with the first label token of each row appended as an extra column
torch.cat([inputs.input_ids, labels.input_ids[:, 0].view(-1, 1)], dim=1)

In [None]:
# Show the inputs and the labels of the current batch as text
print("--- input ---")
inp = tokenizer.batch_decode(inputs.input_ids)
for decoded_input in inp:
    print(decoded_input)

print("--- labels ---")
lbl = tokenizer.batch_decode(labels.input_ids)
for decoded_label in lbl:
    print(decoded_label)

In [None]:
# Validation step on the same batch
model.valid_step(batch, criterion=criterion, device='cpu')

In [None]:
# Decode the predicted token ids (pred) and print one response per line
print("--- responses ---")
resp = tokenizer.batch_decode(pred)
for response_text in resp:
    print(response_text)

In [None]:
# Last two token ids of every generated sequence
gen_output[:,-2:]

In [None]:
# Shape of the batch of input ids
inputs.input_ids.shape

In [None]:
# Id the tokenizer returns for '<self>'.
# NOTE(review): add_tokens is None in the model set-up cell, so '<self>' was presumably never added — confirm what id comes back.
tokenizer.convert_tokens_to_ids('<self>')

### Test DialoGPT (example from Huggingface)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch


# NOTE(review): this rebinds the global names `tokenizer` and `model`, which earlier cells use for
# the GPT-2 tokenizer and the KnowledgeGroundedDecoder; re-running earlier cells afterwards picks up these objects.
tokenizer = AutoTokenizer.from_pretrained("microsoft/DialoGPT-medium")
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-medium")


In [None]:

# Interactive chat of five turns; every turn extends the running chat history
for step in range(5):
    # Read the user's line, terminate it with the eos token and encode it as a PyTorch tensor
    user_text = input(">> User:")
    new_user_input_ids = tokenizer.encode(user_text + tokenizer.eos_token, return_tensors='pt')

    # From the second turn on, the new input is appended to the chat history
    if step > 0:
        bot_input_ids = torch.cat([chat_history_ids, new_user_input_ids], dim=-1)
    else:
        bot_input_ids = new_user_input_ids
    print(bot_input_ids)

    # Generate a response; history plus response is limited to a total length of 1000 tokens
    chat_history_ids = model.generate(bot_input_ids, max_length=1000, pad_token_id=tokenizer.eos_token_id)

    # Print only the tokens generated in this turn (everything after the bot input)
    reply_ids = chat_history_ids[:, bot_input_ids.shape[-1]:][0]
    print("DialoGPT: {}".format(tokenizer.decode(reply_ids, skip_special_tokens=True)))
