In [None]:
from phi.modeling_phi import PhiForCausalLM

from transformers import AutoTokenizer, AutoConfig

# The tokenizer does not depend on the patched config, so it is loaded first.
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")

# Stock phi-1.5 config with three extra attributes attached; the config is
# handed to the project-local PhiForCausalLM below.
# NOTE(review): presumably these fields configure the knowledge adapter — confirm in phi/modeling_phi.py.
config = AutoConfig.from_pretrained("microsoft/phi-1_5", trust_remote_code=True)
config.know_layer = [5, 8, 11, 14, 17, 20, 23]
config.enc_dim = 768
config.know_type = "kformer"

model = PhiForCausalLM.from_pretrained("microsoft/phi-1_5", config=config)

In [None]:
# Smoke test: tokenize a fixed prompt, generate with the model loaded in the
# previous cell and display the decoded sequences.
prompt = """Write a detailed analogy between mathematics and a lighthouse.

Answer:"""
inputs = tokenizer(prompt, return_tensors = "pt")
out = model.generate(**inputs, max_length = 200)
# Last expression of the cell, so the decoded strings are rendered as output.
tokenizer.batch_decode(out)

In [None]:
import datasets

# Load the dataset stored on disk under "msmacro_wellformed_split" and convert
# its "train" split to a pandas DataFrame.
df = datasets.load_from_disk("msmacro_wellformed_split")
df_pd = df["train"].to_pandas()

In [None]:
from modeling_gpt2 import GPT2LMHeadModel
from transformers import AutoTokenizer, AutoConfig 

# gpt2-medium config with extra attributes attached before it is passed to the
# project-local GPT2LMHeadModel.
# NOTE(review): presumably these select and configure the cross-attention adapter — confirm in modeling_gpt2.py.
config = AutoConfig.from_pretrained("gpt2-medium")
config.hidden_dropout = 0.1
config.know_proj_bias = False
config.know_layer = [5, 8, 11, 14, 17, 20, 23]
config.know_type = "gated_cross"

model = GPT2LMHeadModel.from_pretrained("gpt2-medium", config=config)

In [13]:
from datasets import load_dataset
from torch.utils.data import DataLoader
from train_utils.eval import EvalCollator
from transformers import AutoTokenizer, AutoModel

# Decoder tokenizer (gpt2-medium) reuses its EOS token as the padding token;
# the encoder tokenizer comes from roberta-base.
dec_tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")
dec_tokenizer.pad_token = dec_tokenizer.eos_token
enc_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# SQuAD validation split; the `answers` struct is collapsed to its first answer text.
dataset = load_dataset("squad", split="validation")
dataset = dataset.map(lambda x: {"answers": x["answers"]["text"][0]})

# Batches of 4, collated by the project's EvalCollator.
loader = DataLoader(
    dataset,
    batch_size=4,
    collate_fn=EvalCollator(
        dec_tokenizer,
        enc_tokenizer,
        mode="q",
        context_enc=True,
        cover_labels=False,
        context_column="qc",
        answer_column="answers",
    ),
)

In [14]:
# Pull the first batch from `loader`; each batch unpacks into three parts.
loss_batch, gen_batch, answers = next(iter(loader))

In [43]:
from modeling_gpt2 import GPT2LMHeadModel
from transformers import AutoConfig, AutoModel
# gpt2-medium config with the extra attributes set before loading the
# project-local GPT2LMHeadModel.
config = AutoConfig.from_pretrained("gpt2-medium")
config.know_type = "gated_cross"
config.know_pos = "mlp"
config.know_layer = [5, 8, 11, 14, 17, 20, 23]
config.know_proj_bias = False
config.hidden_dropout = 0.1

# Load order is kept (encoder, then decoder) so the recorded warnings keep their order.
enc_model2 = AutoModel.from_pretrained("roberta-base")
dec_model2 = GPT2LMHeadModel.from_pretrained("gpt2-medium", config=config)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2-medium and are newly initialized: ['h.5.crossattention.attention.proj_k.weight', 'h.23.crossattention.attention.proj_o.weight', 'h.11.crossattention.attention.proj_o.weight', 'h.5.ln_cross_attn.weight', 'h.14.crossattention.attention.proj_v.weight', 'h.14.crossattention.attn_gate', 'h.20.crossattention.attention.proj_q.weight', 'h.11.crossattention.attention.proj_v.weight', 'h.17.crossattention.attention.proj_v.weight', 'h.11.crossattention.attn_gate', 'h.14.crossattention.attention.proj_q.weight', 'h.17.crossattention.attention.proj_q.weight', 'h.8.crossattention.attention.proj_k.weight', 'h.23.ln_cross_attn.weigh

In [47]:
# `nn` used to be available here only through leftover kernel state (it is
# imported in a later cell); import it locally so the cell survives a restart.
from torch import nn

# NOTE(review): `batch` is created by a cell that appears further down in the
# notebook — this cell still depends on out-of-order execution.
enc_states = enc_model2(batch["input_ids"], batch["attention_mask"]).last_hidden_state

# Project 768 -> 1024 with a freshly initialised (untrained) linear layer, so
# the projected states differ on every run.
enc_states = nn.Linear(768, 1024)(enc_states)

# Displayed value: the decoder loss when the projected encoder states are supplied.
dec_model2(
    input_ids=batch["decoder_input_ids"],
    attention_mask=batch["decoder_attention_mask"],
    encoder_hidden_states=enc_states,
    labels=batch["labels"],
    encoder_attention_mask=batch["attention_mask"],
).loss

tensor(4.2498, grad_fn=<NllLossBackward0>)

In [57]:
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer,AutoConfig,TrainingArguments
from tokenizers.processors import TemplateProcessing
import wandb
from omegaconf import OmegaConf
import os

from transformers import EncoderDecoderConfig, AutoModel
from eval_utils.loading_utils import load_encdec_model
from modeling_gpt2 import GPT2LMHeadModel

from train_utils.utils import prepare_dataset, CustomCollator, prompt_qca, prompt_qa, prompt_article, prompt_article_summary, prompt_qc_enc
from train_utils.eval import evaluate
from train_utils.Trainer import CustomTrainer, UpdateOutputDirCallback, AdditionalEvalCallback


# SECURITY: the API key must not live in the notebook. It is read from the
# WANDB_API_KEY environment variable; when that is unset, `key=None` lets wandb
# fall back to its own credential lookup.
# The key that was previously committed here should be revoked and rotated.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

# Run configuration (model / data / adapter settings) comes from the YAML file.
args = OmegaConf.load("configs/default.yaml")


# Shorthand names for the settings read from `args`.
train_enc_dec = args.model_args.is_enc_dec
freeze_decoder = args.model_args.freeze_decoder
freeze_encoder = args.model_args.freeze_encoder
create_labels = args.data_args.cover_labels
encoder_model = args.model_args.encoder_name
decoder_checkpoint = args.model_args.decoder_name

# Select the prompt builder for the configured prompt type.
if args.data_args.prompt_type == "qc":
    prompt = prompt_qca
elif args.data_args.prompt_type == "q":
    prompt = prompt_qa
elif args.data_args.prompt_type == "article":
    prompt = prompt_article
elif args.data_args.prompt_type == "article_summary":
    prompt = prompt_article_summary
else:
    # Without this branch an unknown type left `prompt` unset, or silently
    # reused whatever an earlier cell had bound to that name (e.g. a plain
    # prompt string), which only fails much later inside `prepare_dataset`.
    raise ValueError(f"Unsupported data_args.prompt_type: {args.data_args.prompt_type!r}")


# Base decoder config, then copy the adapter settings from `args` onto it.
config = AutoConfig.from_pretrained(args.model_args.decoder_base_name, trust_remote_code=True)
config.know_type = args.model_args.adapter_args.adapter_type
config.know_pos = args.model_args.adapter_args.know_pos
config.know_norm = args.model_args.adapter_args.know_norm
config.know_proj_bias = args.model_args.adapter_args.proj_bias
config.hidden_dropout = args.model_args.adapter_args.hidden_dropout
config.enc_dim = args.model_args.adapter_args.enc_dim
# Convert the OmegaConf node to a plain Python container before storing it.
config.know_layer = OmegaConf.to_container(args.model_args.adapter_args.know_layer)



# Decoder tokenizer; its EOS token doubles as the padding token.
tokenizer = AutoTokenizer.from_pretrained(args.model_args.decoder_base_name)
tokenizer.pad_token = tokenizer.eos_token

# Post-processor template: every single sequence is wrapped as "<bos> $A <eos>".
tokenizer._tokenizer.post_processor = TemplateProcessing(
    single=tokenizer.bos_token + " $A " + tokenizer.eos_token,
    special_tokens=[
        (tokenizer.bos_token, tokenizer.bos_token_id),
        (tokenizer.eos_token, tokenizer.eos_token_id),
    ],
)

# Build `model` (and `enc_tokenizer`) in one of two ways:
#   * `decoder_checkpoint` is a local directory whose config is an
#     EncoderDecoderConfig -> load the saved encoder-decoder model as a whole;
#   * otherwise -> load a decoder and, if `train_enc_dec` is set, wrap it with
#     an encoder in a CustomEncoderDecoderModel.
# `conf` is bound by the walrus expression but not used again in this cell.
if os.path.isdir(decoder_checkpoint) and isinstance((conf:=AutoConfig.from_pretrained(decoder_checkpoint)), EncoderDecoderConfig):
    print("---- Loading Encoder Decoder Model ----	")
    model, enc_tokenizer, dec_tokenizer, train_conf = load_encdec_model(decoder_checkpoint, enc_model_class= AutoModel, dec_model_class= GPT2LMHeadModel)
else:
    # Pick the decoder class from the base model name; the project-local
    # classes are imported lazily, only for the branch that needs them.
    if "phi" in args.model_args.decoder_base_name:
        from phi.modeling_phi import PhiForCausalLM
        model = PhiForCausalLM.from_pretrained(decoder_checkpoint, config=config)
    elif "gpt2" in args.model_args.decoder_base_name:
        from modeling_gpt2 import GPT2LMHeadModel
        model = GPT2LMHeadModel.from_pretrained(decoder_checkpoint, config=config)
    else:
        from transformers import AutoModelForCausalLM
        model = AutoModelForCausalLM.from_pretrained(decoder_checkpoint, config=config)

    if train_enc_dec:
        from transformers import EncoderDecoderConfig, AutoModel, AutoConfig, AutoTokenizer
        from train_utils.EncoderDecoder import CustomEncoderDecoderModel

        # A directory containing "config_sentence_transformers.json" is loaded
        # through the project's PrefixEncoder; anything else through AutoModel.
        if os.path.isdir(encoder_model) and "config_sentence_transformers.json" in os.listdir(encoder_model):
            print("---- loading Sentence Transformer Encoder ----")
            from train_utils.encoder import PrefixEncoder
            enc_model, enc_tokenizer = PrefixEncoder.from_sentenc_checkpoint(encoder_model)
        else:
            enc_model = AutoModel.from_pretrained(encoder_model)
            enc_tokenizer = AutoTokenizer.from_pretrained(encoder_model)


        # The decoder entry is first built from the stock "gpt2" config and is
        # then replaced by the actual decoder config on the following line.
        config = EncoderDecoderConfig(**{"encoder": enc_model.config.to_dict(), "decoder": AutoConfig.from_pretrained("gpt2").to_dict()})
        config.decoder= model.config
        model = CustomEncoderDecoderModel(encoder=enc_model, decoder=model, config = config)
    else:
        # Decoder-only run: no encoder tokenizer is needed.
        enc_tokenizer = None

# Freeze the encoder only when the model actually has one. A decoder-only model
# has no `.encoder`, and the previous unguarded loop raised AttributeError when
# `freeze_encoder` was set for such a run (same hasattr pattern as the decoder below).
if freeze_encoder and hasattr(model, "encoder"):
    for p in model.encoder.parameters():
        p.requires_grad = False

if freeze_decoder:
    # For an encoder-decoder wrapper freeze its `.decoder`; for a plain
    # decoder-only model freeze the model itself.
    _decoder = model.decoder if hasattr(model, "decoder") else model
    # Parameters whose name contains any of these substrings stay trainable.
    _trainable_markers = ("proj_k", "proj_v", "gated_attn", "cross")
    for n, p in _decoder.named_parameters():
        if not any(marker in n for marker in _trainable_markers):
            p.requires_grad = False

# Attach the run configuration to the model object.
model.args = args

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: C:\Users\Daniel/.netrc


Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2-medium and are newly initialized: ['h.5.crossattention.attention.proj_k.weight', 'h.23.crossattention.attention.proj_o.weight', 'h.11.crossattention.attention.proj_o.weight', 'h.5.ln_cross_attn.weight', 'h.14.crossattention.attention.proj_v.weight', 'h.14.crossattention.attn_gate', 'h.20.crossattention.attention.proj_q.weight', 'h.11.crossattention.attention.proj_v.weight', 'h.17.crossattention.attention.proj_v.weight', 'h.11.crossattention.attn_gate', 'h.14.crossattention.attention.proj_q.weight', 'h.17.crossattention.attention.proj_q.weight', 'h.8.crossattention.attention.proj_k.weight', 'h.23.ln_cross_attn.weight', 'h.23.crossattention.attention.proj_k.weight', 'h.8.crossattention.attention.proj_v.weight', 'h.5.crossattention.attention.proj_v.weight', 'h.23.crossattention.attention.proj_q.weight', 'h.14.ln_cross_attn.weight', 'h.11.ln_cross_attn.bias', 'h.20.crossattention.attention.proj_k.weight'

In [37]:
# Use the on-disk dataset when the configured path is a directory; otherwise
# resolve the same string through `load_dataset`.
if os.path.isdir(args.data_args.dataset_path):
    dataset = load_from_disk(args.data_args.dataset_path)
else:
    dataset = load_dataset(args.data_args.dataset_path)

if "squad" in args.data_args.dataset_path:
    if (
        args.data_args.context_column == "answer_sentence"
        and "answer_sentence" not in dataset["train"].column_names
    ):
        # The column is requested but missing: derive it from the context and
        # the start offset of the first answer.
        from train_utils.utils import extract_sentence

        dataset = dataset.map(
            lambda x: {"answer_sentence": extract_sentence(x["context"], x["answers"]["answer_start"][0])}
        )

    # Keep only the first answer text, then strip every field of each example.
    dataset = dataset.map(lambda x: {"answers": x["answers"]["text"][0]})
    dataset = dataset.map(lambda x: {k: v.strip() for k, v in x.items()})
elif "cnn" in args.data_args.dataset_path:
    column_names = dataset["train"].column_names
    # Whitespace-token truncation: first 100 tokens of "highlights", first 200
    # of "article_half"; the original columns are dropped in the second pass.
    dataset = dataset.map(
        lambda x: {"summary": [" ".join(entry.split()[:100]) for entry in x["highlights"]]},
        batched=True,
    )
    dataset = dataset.map(
        lambda x: {"article": [" ".join(entry.split()[:200]) for entry in x["article_half"]]},
        batched=True,
        remove_columns=column_names,
    )

# Tokenize / format every split with the project's `prepare_dataset`, replacing
# all columns currently present in the train split.
df_qca = dataset.map(
    prepare_dataset,
    fn_kwargs={
        "prompt": prompt,
        "tokenizer": tokenizer,
        "create_labels": create_labels,
        "enc_tokenizer": enc_tokenizer,
        "context_enc": train_enc_dec,
        "context_column": args.data_args.context_column,
        "answer_column": args.data_args.answer_column,
        "enc_prompt": prompt_qc_enc if args.data_args.context_column == "qc" else None,
        "num_prefix_token": model.encoder.num_prefix_token if train_enc_dec and hasattr(model.encoder, "num_prefix_token") else 0,
    },
    batched=True,
    remove_columns=dataset["train"].column_names,
)

Map: 100%|██████████| 87599/87599 [00:10<00:00, 8029.70 examples/s]
Map: 100%|██████████| 87599/87599 [00:21<00:00, 4123.17 examples/s]
Map: 100%|██████████| 10570/10570 [00:02<00:00, 4201.12 examples/s]


In [38]:
from torch.utils.data import DataLoader

# Loader over the prepared train split: batches of 3, collated by the
# project's CustomCollator with both tokenizers.
loader = DataLoader(df_qca["train"], batch_size=3, collate_fn=CustomCollator(tokenizer, enc_tokenizer = enc_tokenizer))

In [39]:
# Take the first batch from `loader` for the manual forward passes below.
batch = next(iter(loader))

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [48]:
# Baseline: loss of the decoder alone on `batch`, with no encoder states passed in.
model.decoder(input_ids = batch["decoder_input_ids"],
              attention_mask = batch["decoder_attention_mask"],
              labels = batch["labels"]).loss

tensor(4.2498)

In [56]:
# Imports first: `nn` was previously used here without being imported in this
# cell (it only worked through leftover kernel state), and `torch` was imported
# in the middle of the cell.
import torch
from torch import nn

# Encoder states for the batch, projected 768 -> 1024 with a freshly
# initialised (untrained) linear layer.
enc_states = model.encoder(
    input_ids=batch["input_ids"],
    attention_mask=batch["attention_mask"],
).last_hidden_state
enc_states = nn.Linear(768, 1024)(enc_states)

# Uniform-random tensor with the same shape and dtype as the projected states;
# this, not `enc_states`, is what the decoder receives below.
enc_states_rand = torch.rand_like(enc_states)

# Displayed value: decoder loss given the random encoder states.
# NOTE(review): the recorded output of this cell is `tensor(nan)` — worth investigating.
model.decoder(
    input_ids=batch["decoder_input_ids"],
    attention_mask=batch["decoder_attention_mask"],
    labels=batch["labels"],
    encoder_hidden_states=enc_states_rand,
    encoder_attention_mask=batch["attention_mask"],
).loss

tensor(nan, grad_fn=<NllLossBackward0>)

In [59]:
# Full forward pass on `batch`, then backpropagate the loss so that the
# parameters' `.grad` fields are populated for inspection.
model(**batch).loss.backward()

In [61]:
# Print name and stored gradient of every parameter whose name contains "gate".
for n, p in model.named_parameters():
    if "gate" not in n:
        continue
    print(n)
    print(p.grad)

decoder.transformer.h.5.crossattention.attn_gate
tensor([-0.0132])
decoder.transformer.h.8.crossattention.attn_gate
tensor([0.0058])
decoder.transformer.h.11.crossattention.attn_gate
tensor([0.0018])
decoder.transformer.h.14.crossattention.attn_gate
tensor([-0.0041])
decoder.transformer.h.17.crossattention.attn_gate
tensor([0.0036])
decoder.transformer.h.20.crossattention.attn_gate
tensor([0.0002])
decoder.transformer.h.23.crossattention.attn_gate
tensor([-0.0030])


In [2]:
from eval_utils.loading_utils import load_encdec_model
from transformers import AutoModel
from modeling_gpt2 import GPT2LMHeadModel

# Load a saved encoder-decoder checkpoint through the project's loader, which
# returns the model, both tokenizers and a fourth value bound to `train_conf`.
model, enc_tokenizer, dec_tokenizer, train_conf = load_encdec_model("checkpoints/checkpoint-5475_test_cros_only_gated/", enc_model_class= AutoModel, dec_model_class= GPT2LMHeadModel)

In [4]:
import pandas as pd

# Display the pickled evaluation output. The result is not assigned to a name.
# NOTE(review): unpickling executes arbitrary code — only load files produced by this project.
pd.read_pickle("checkpoints/eval_output.pkl")

Unnamed: 0,gen_batch,loss_batch,loss,generated,reference,answer_logits,gen_logits,adapter_attn,adapter_mean,exact_match,...,solution_present,bleu,rouge,bert_score,id,batch_id,decoder_loss,decoder_generated,decoder_answer_logits,decoder_gen_logits
0,"{'decoder_input_ids': [50256, 13828, 5134, 107...","{'decoder_input_ids': [50256, 13828, 5134, 107...",0.670158,New England Patriots,Denver Broncos,"[[9.86386, 9.707694, 6.367945, 4.290615, 6.433...","[[-83.98282, -84.14194, -87.02244, -89.86123, ...",,,0,...,0,"{'bleu': 0.0, 'precisions': [0.0, 0.0, 0.0, 0....","{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, ...","{'precision': [0.866573691368103], 'recall': [...",0,0,2.963347,The New England Patriots.\n\nThe Patriots won...,"[[-31.766266, -36.457012, -37.86325, -40.3044,...","[[-113.81351, -114.37096, -116.42429, -118.155..."
1,"{'decoder_input_ids': [50256, 13828, 5134, 107...","{'decoder_input_ids': [50256, 13828, 5134, 107...",0.670158,New Orleans Saints\nAnswer: New Orleans Saint...,Carolina Panthers,"[[-7.804997, -9.273602, -13.947753, -14.553748...","[[-76.7976, -76.62437, -79.1884, -82.336716, -...",,,0,...,0,"{'bleu': 0.0, 'precisions': [0.0, 0.0, 0.0, 0....","{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, ...","{'precision': [0.7514113187789917], 'recall': ...",1,0,2.963347,The New England Patriots.\n\nThe Patriots won...,"[[-35.658188, -40.345284, -42.434048, -43.9317...","[[-110.67171, -111.104675, -112.957855, -114.8..."
2,"{'decoder_input_ids': [50256, 50256, 50256, 50...","{'decoder_input_ids': [50256, 8496, 750, 3115,...",0.670158,New Orleans\nAnswer: Super Bowl 50 was held i...,"Santa Clara, California","[[-21.079342, -19.898584, -24.010008, -28.6909...","[[-53.4031, -52.51433, -57.01871, -57.266476, ...",,,0,...,0,"{'bleu': 0.0, 'precisions': [0.018181818181818...","{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, ...","{'precision': [0.7864809036254883], 'recall': ...",2,0,2.963347,In the United States.\n\nThe Super Bowl was h...,"[[-69.505455, -72.853065, -77.54536, -79.49573...","[[-98.00317, -98.19375, -101.61287, -102.07279..."
3,"{'decoder_input_ids': [50256, 50256, 50256, 50...","{'decoder_input_ids': [50256, 13828, 5134, 107...",0.670158,New England Patriots\n\nAnswer: New England P...,Denver Broncos,"[[1.0915244, 0.23584893, -2.9224572, -4.822131...","[[-78.3024, -77.840324, -81.18579, -84.52092, ...",,,0,...,0,"{'bleu': 0.0, 'precisions': [0.0, 0.0, 0.0, 0....","{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, ...","{'precision': [0.7575564384460449], 'recall': ...",3,0,2.963347,The New England Patriots.\n\nThe Patriots won...,"[[-40.301838, -44.63461, -46.536575, -48.8318,...","[[-117.03637, -117.29913, -119.36191, -121.301..."
4,"{'decoder_input_ids': [50256, 2061, 3124, 373,...","{'decoder_input_ids': [50256, 2061, 3124, 373,...",1.075779,white,gold,"[[13.914301, 13.181443, 10.802457, 7.491674, 8...","[[-78.75067, -79.23787, -78.86608, -83.4689, -...",,,0,...,0,"{'bleu': 0.0, 'precisions': [0.0, 0.0, 0.0, 0....","{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, ...","{'precision': [0.9015482664108276], 'recall': ...",0,1,3.473165,Black.\n\nWhat color was used to emphasize th...,"[[-29.425053, -35.962685, -35.373478, -38.9149...","[[-97.693825, -98.63494, -98.22438, -100.67448..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199,"{'decoder_input_ids': [50256, 50256, 50256, 50...","{'decoder_input_ids': [50256, 818, 644, 614, 7...",1.871112,2011\nAnswer: 2011\nAnswer: 2011\nAnswer: 201...,1978,"[[38.77231, 38.684677, 35.875156, 31.979534, 3...","[[-157.15797, -156.95673, -160.17026, -160.888...",,,0,...,0,"{'bleu': 0.0, 'precisions': [0.0, 0.0, 0.0, 0....","{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, ...","{'precision': [0.6898300051689148], 'recall': ...",3,49,3.407411,1999.\n\nThe NFL's first 16-game regular seas...,"[[3.700407, -1.9532802, -2.7853441, -6.6790795...","[[-145.63051, -145.64934, -148.73933, -148.765..."
200,"{'decoder_input_ids': [50256, 8241, 550, 262, ...","{'decoder_input_ids': [50256, 8241, 550, 262, ...",1.265450,Carolina Panthers\nAnswer: Carolina Panthers\...,Carolina Panthers,"[[-5.6393676, -6.2195373, -10.557196, -11.2515...","[[-87.54248, -87.23083, -89.15566, -92.85159, ...",,,0,...,1,"{'bleu': 0.0, 'precisions': [0.041666666666666...","{'rouge1': 0.10526315789473684, 'rouge2': 0.05...","{'precision': [0.74830561876297], 'recall': [0...",0,50,3.763394,The Packers.\n\nThe Packers had the best reco...,"[[-59.192986, -63.50041, -66.40582, -67.6587, ...","[[-113.9354, -114.28802, -116.65331, -118.2434..."
201,"{'decoder_input_ids': [50256, 2437, 867, 17782...","{'decoder_input_ids': [50256, 2437, 867, 17782...",1.265450,five\nAnswer: five\nAnswer: five\nAnswer: fiv...,Ten,"[[15.71775, 13.562393, 13.220857, 9.056921, 13...","[[-147.28789, -147.46153, -147.65422, -149.279...",,,0,...,0,"{'bleu': 0.0, 'precisions': [0.0, 0.0, 0.0, 0....","{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, ...","{'precision': [0.6993039846420288], 'recall': ...",1,50,3.763394,\n\nThe Panthers went to the Pro Bowl in each ...,"[[-27.30269, -33.06101, -34.622677, -37.73731,...","[[-131.08069, -131.3189, -132.47438, -133.3770..."
202,"{'decoder_input_ids': [50256, 2437, 867, 17782...","{'decoder_input_ids': [50256, 2437, 867, 17782...",1.265450,five\nAnswer: five\nAnswer: five\nAnswer: fiv...,eight,"[[10.745259, 10.183949, 8.075109, 3.6677494, 8...","[[-144.49245, -144.50192, -144.66614, -146.638...",,,0,...,0,"{'bleu': 0.0, 'precisions': [0.0, 0.0, 0.0, 0....","{'rouge1': 0.0, 'rouge2': 0.0, 'rougeL': 0.0, ...","{'precision': [0.6994197368621826], 'recall': ...",2,50,3.763394,\n\nThe Panthers designated 11 players to the ...,"[[-55.743523, -60.843544, -62.542656, -66.7097...","[[-129.40627, -129.72078, -130.62915, -131.725..."


In [5]:
from datasets import load_dataset

# Load SQuAD. Note that `df` is a DatasetDict here, not a DataFrame.
df = load_dataset("squad")

In [6]:
# Display the splits and features of the loaded dataset.
df

DatasetDict({
    train: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 87599
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 10570
    })
})

In [3]:
# Print name and current value of every parameter whose name contains "gate".
for n, p in model.named_parameters():
    if "gate" not in n:
        continue
    print(n)
    print(p)

decoder.transformer.h.5.crossattention.attn_gate
Parameter containing:
tensor([-0.0336], requires_grad=True)
decoder.transformer.h.8.crossattention.attn_gate
Parameter containing:
tensor([-0.0369], requires_grad=True)
decoder.transformer.h.11.crossattention.attn_gate
Parameter containing:
tensor([-0.0686], requires_grad=True)
decoder.transformer.h.14.crossattention.attn_gate
Parameter containing:
tensor([0.0792], requires_grad=True)
decoder.transformer.h.17.crossattention.attn_gate
Parameter containing:
tensor([-0.0314], requires_grad=True)
decoder.transformer.h.20.crossattention.attn_gate
Parameter containing:
tensor([-0.0364], requires_grad=True)
decoder.transformer.h.23.crossattention.attn_gate
Parameter containing:
tensor([0.0500], requires_grad=True)


In [52]:
# Display the shape of the projected encoder states.
enc_states.shape

torch.Size([3, 179, 1024])

In [40]:
# Loss of the full model on `batch`.
# NOTE(review): the recorded output is `tensor(nan)` — worth investigating.
model(**batch).loss

tensor(nan, grad_fn=<NllLossBackward0>)

In [29]:
from torch import nn

# Compare the loss with and without encoder states on `loss_batch`.
# NOTE(review): `enc_model` is only assigned in a cell further down in the
# visible notebook, so this cell depends on out-of-order execution.
# Freshly initialised (untrained) 768 -> 1024 projection.
proj_layer = nn.Linear(768, 1024)

enc_states = enc_model(input_ids = loss_batch["input_ids"], attention_mask = loss_batch["attention_mask"]).last_hidden_state

enc_states = proj_layer(enc_states)

# Forward pass with the projected encoder states supplied.
out = model(input_ids = loss_batch["decoder_input_ids"], 
            attention_mask = loss_batch["decoder_attention_mask"], 
            encoder_hidden_states = enc_states, 
            encoder_attention_mask = loss_batch["attention_mask"],
            labels = loss_batch["labels"])

# Same inputs without any encoder states.
dec_loss = model(input_ids = loss_batch["decoder_input_ids"], 
            attention_mask = loss_batch["decoder_attention_mask"], 
            labels = loss_batch["labels"]).loss

# NOTE(review): the recorded output shows both losses as exactly
# 2.964246988296509, i.e. the encoder states did not change the loss in that run.
print(f"dec loss {dec_loss}")
print(f"enc dec loss {out.loss}")

dec loss 2.964246988296509
enc dec loss 2.964246988296509


In [None]:
from torch import nn
from transformers import AutoModel, AutoTokenizer

# Decoder-side tokenizer (gpt2-medium) and encoder-side model and tokenizer (roberta-base).
tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")
enc_model = AutoModel.from_pretrained("roberta-base")
enc_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# The same sentence is tokenized once per tokenizer.
test = tokenizer("This is a test", return_tensors="pt")
test_enc = enc_tokenizer("This is a test", return_tensors="pt")

# Encoder states, projected 768 -> 1024 with a fresh linear layer.
enc_states = enc_model(**test_enc).last_hidden_state
proj_test = nn.Linear(768, 1024)
enc_states = proj_test(enc_states)

# Forward pass of `model` (defined in an earlier cell) with the encoder states attached.
model(
    **test,
    encoder_hidden_states=enc_states,
    encoder_attention_mask=test_enc["attention_mask"],
)

In [None]:
from transformers import AutoConfig

import torch



from phi.adapters import SharedAttention



# Instantiate the project's SharedAttention from `config`, which must already
# exist from an earlier cell — this cell imports AutoConfig but never uses it.
attn = SharedAttention(config)

# Two random (5, 6, 1024) inputs for the call in the next cell (unseeded).
hidden_dim= 1024
feed_forward_hidden_states = torch.rand(5,6,hidden_dim)
cross_attn_out = torch.rand(5,6,hidden_dim)

In [None]:
# Shape of the first element returned by SharedAttention for the random inputs.
attn(feed_forward_hidden_states, cross_attn_out)[0].shape

In [None]:
max_positions = 512
# Lower-triangular boolean mask, viewed as (1, 1, max_positions, max_positions)
# so it broadcasts over the two leading dimensions of the score tensor.
bias = torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
    1, 1, max_positions, max_positions
)


def apply_causal_mask(attn_scores):
    """
    Mask out future positions in a tensor of attention scores (adapted from GPT-2 attention).

    The last two dimensions of `attn_scores` are taken as (query_length, key_length).
    Entries where the causal mask is False are replaced by the smallest finite
    value of the scores' dtype; all other entries are returned unchanged.

    Raises:
        ValueError: if key_length exceeds `max_positions`, the size of the
            precomputed mask.
    """
    query_length, key_length = attn_scores.size(-2), attn_scores.size(-1)
    if key_length > max_positions:
        raise ValueError(
            f"key_length {key_length} exceeds the precomputed causal mask size {max_positions}"
        )

    # `bias` is a plain module-level tensor (not a registered buffer), so it
    # does not follow the model to another device; move the slice explicitly.
    causal_mask = bias[:, :, key_length - query_length : key_length, :key_length].to(attn_scores.device)

    # The fill value must be a tensor of the same dtype and device as the
    # scores, otherwise `torch.where` raises a dtype/device mismatch error.
    mask_value = torch.full(
        [],
        torch.finfo(attn_scores.dtype).min,
        dtype=attn_scores.dtype,
        device=attn_scores.device,
    )
    return torch.where(causal_mask, attn_scores, mask_value)

def _split_heads(tensor, num_heads, attn_head_size):
    """
    Splits hidden_size dim into attn_head_size and num_heads
    """
    new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
    tensor = tensor.view(new_shape)
    return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)

def _merge_heads(tensor, num_heads, attn_head_size):
    """
    Merges attn_head_size dim and num_attn_heads dim into hidden_size
    """
    tensor = tensor.permute(0, 2, 1, 3).contiguous()
    new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
    return tensor.view(new_shape)

In [None]:
# Display the shape of `out` (whatever an earlier cell last bound to that name).
out.shape

In [None]:
# NOTE(review): `concat_values` is not assigned in any visible cell; this fails on a fresh kernel.
concat_values.shape

In [None]:
# Sum over the slice [0, 0, :] of `concat_scores`.
# NOTE(review): `concat_scores` is not assigned in any visible cell.
concat_scores[0,0,:].sum()

In [None]:
# Shape after concatenating the two random inputs along the last dimension.
torch.concat([feed_forward_hidden_states, cross_attn_out], dim = -1).shape

In [None]:
# Mean of the "f1" entry of each row's `bert_score` dict.
# NOTE(review): in the visible cells `df` is last bound to a DatasetDict, and the
# displayed eval frame stores its bert_score values as lists — confirm that `df`
# is the eval DataFrame and that "f1" is a scalar before trusting this number.
df.bert_score.apply(lambda x: x["f1"]).mean()

In [None]:
# Display the whole train split as a DataFrame (the notebook front end truncates the rendering).
dataset["train"].to_pandas()

In [None]:
!pip install "git+https://github.com/AIPHES/DiscoScore.git"

In [None]:
from disco_score import DiscoScorer

# DiscoScore scorer running on CPU with bert-base-uncased.
disco_scorer = DiscoScorer(device='cpu', model_name='bert-base-uncased')

In [None]:
# One system summary and one list of reference summaries for it.
system = ["Paul Merson has restarted his row with andros townsend after the Tottenham midfielder was brought on with only seven minutes remaining in his team 's 0-0 draw with burnley. Townsend was brought on in the 83rd minute for Tottenham as they drew 0-0 against Burnley ."]

references = [["Paul Merson has restarted his row with burnley on sunday. Townsend was brought on in the 83rd minute for tottenham. Andros Townsend scores england 's equaliser in their 1-1 friendly draw. Townsend hit a stunning equaliser for england against italy."]]
# NOTE(review): this immediately overwrites the references defined above, so the
# system text is scored against itself — presumably a sanity check; remove one of the two.
references = [system]
# Lower-case both sides before scoring, then print DS_Focus_NN per system/reference pair.
for s, refs in zip(system, references):
   s = s.lower()
   refs = [r.lower() for r in refs]
   print(disco_scorer.DS_Focus_NN(s, refs))

In [None]:
# Feed 15 dummy token ids (0..14) with an all-ones mask through the encoder and show the output shape.
model.encoder(input_ids = torch.tensor([[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]]), attention_mask = torch.tensor([[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]])).last_hidden_state.shape

In [None]:
# Tokenize two sentences of different length as one padded batch.
enc_input = enc_tokenizer(["hellow world this is a test", "this is a test"], return_tensors = "pt", padding = True)

In [None]:
# Display the encoder's `num_prefix_token`; elsewhere the notebook guards this attribute with hasattr.
model.encoder.num_prefix_token

In [None]:
import torch
# Dummy encoder and decoder inputs (token ids 0..14, all-ones masks) for a
# forward pass through the full encoder-decoder model.
test_input = {"input_ids": torch.tensor([[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]]), "attention_mask": torch.tensor([[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]])}
test_input2 = {"decoder_input_ids": torch.tensor([[0,1,2,3,4,5,6,7,8,9,10,11,12,13,14]]), "decoder_attention_mask": torch.tensor([[1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]])}
model(**test_input, **test_input2)

In [None]:
# FIXME: this cell held only an unfinished `from` statement, which is a
# SyntaxError when executed. The intended import cannot be recovered from the
# notebook — complete it or delete the cell.

In [None]:
# Query the output embeddings of `enc_model.encoder`.
enc_model.encoder.get_output_embeddings()

In [None]:


# Elsewhere in the notebook PrefixEncoder is only imported inside a conditional
# branch of the model-building cell, so the name may not exist here; import it
# explicitly to make this cell self-contained.
from train_utils.encoder import PrefixEncoder

PrefixEncoder("checkpoints/2725_prefix10/")

In [None]:
# Display the "token_embeddings" entry of `out`.
# NOTE(review): no visible cell assigns an `out` that has this key — hidden kernel state.
out["token_embeddings"]

In [None]:
# NOTE(review): `trainer` is not created in any visible cell.
trainer.enc_tokenizer

In [None]:
# NOTE(review): `trainer` is not created in any visible cell.
trainer.tokenizer

In [None]:
from eval_utils.loading_utils import load_encdec_model
from transformers import AutoModel, AutoConfig
from modeling_gpt2 import GPT2LMHeadModel

# Load another saved encoder-decoder checkpoint; the model classes are passed positionally here.
model, enc_tokenizer, dec_tokenizer, train_conf = load_encdec_model("checkpoints/checkpoint-10950_589/", AutoModel, GPT2LMHeadModel)

In [None]:
# Reuse the EOS token as the padding token on the loaded decoder tokenizer.
dec_tokenizer.pad_token = dec_tokenizer.eos_token

In [None]:
# Take one batch from `loader` (expected to unpack into three parts, as with the
# EvalCollator loaders) and generate up to 30 new tokens for the generation inputs.
batch = next(iter(loader))
loss_batch, gen_batch, answers = batch
out = model.generate(**gen_batch, max_new_tokens = 30, eos_token_id = dec_tokenizer.eos_token_id)

In [None]:
# Reference answers of the batch, for comparison with the generations.
answers

In [None]:
# Decode the generated ids, dropping special tokens.
dec_tokenizer.batch_decode(out, skip_special_tokens=True)

In [None]:
import torch

dec_tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")

# Tokenize a fixed prompt for the decoder.
prompt = """Write a detailed analogy between mathematics and a lighthouse."""
input_example = dec_tokenizer(prompt, return_tensors="pt")

# Random (1, 5, 1024) encoder states with an all-ones attention mask.
encoder_states = torch.rand(1, 5, 1024)
encoder_attn_mask = torch.ones(1, 5).long()

# Forward pass with the random encoder states attached.
model(
    **input_example,
    encoder_hidden_states=encoder_states,
    encoder_attention_mask=encoder_attn_mask,
)

In [None]:
from datasets import load_from_disk

# Load the dataset stored on disk under "squad_with_answer_sentence".
dataset = load_from_disk("squad_with_answer_sentence")

In [None]:
# Display the splits and features of the loaded dataset.
dataset

In [None]:
from datasets import DatasetDict
from datasets import load_from_disk

dataset = load_from_disk("msmacro_wellformed")

# Hold out 30,000 examples of the original train split as a test set; the
# fixed seed makes the split reproducible.
split = dataset["train"].train_test_split(test_size=30_000, seed=42)

# The original "dev" split becomes "validation".
dataset_split = DatasetDict(
    {
        "train": split["train"],
        "validation": dataset["dev"],
        "test": split["test"],
    }
)

In [None]:
# Persist the new split; other cells load it back by this name.
dataset_split.save_to_disk("msmacro_wellformed_split")

In [None]:
# Display the dataset currently bound to `dataset`.
dataset

In [None]:
from train_utils.utils import prepare_dataset, prompt_q
from datasets import load_dataset, load_from_disk
from transformers import AutoTokenizer
from tokenizers.processors import TemplateProcessing


# GPT-2 tokenizer: EOS doubles as the padding token, and every single sequence
# is wrapped as "<bos> $A <eos>" by the post-processor.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer._tokenizer.post_processor = TemplateProcessing(
    single=tokenizer.bos_token + " $A " + tokenizer.eos_token,
    special_tokens=[
        (tokenizer.bos_token, tokenizer.bos_token_id),
        (tokenizer.eos_token, tokenizer.eos_token_id),
    ],
)

# Keep only the first answer text, then strip every field of each example.
dataset = load_from_disk("squad_with_answer_sentence")
dataset = dataset.map(lambda x: {"answers": x["answers"]["text"][0]})
dataset = dataset.map(lambda x: {k: v.strip() for k, v in x.items()})

# Decoder-only preparation: no encoder tokenizer, no label creation.
df_qca = dataset.map(
    prepare_dataset,
    fn_kwargs={
        "prompt": prompt_q,
        "tokenizer": tokenizer,
        "create_labels": False,
        "enc_tokenizer": None,
        "context_enc": False,
        "context_column": "context",
    },
    batched=True,
)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_from_disk
from train_utils.eval import EvalCollator
from torch.utils.data import DataLoader

# `TemplateProcessing` is used below but was only imported by other cells.
from tokenizers.processors import TemplateProcessing

# Available fine-tuned checkpoints and the prompt mode paired with each.
# These used to be four successive overwrites of `checkpoint` / `mode`, of which
# only the last pair took effect; the selection is now explicit.
CHECKPOINT_RUNS = {
    "cosmic-butterfly-490": ("checkpoints/gpt2_run_cosmic-butterfly-490/checkpoint-5475/", "q"),
    "bumbling-energy-489": ("checkpoints/gpt2_run_bumbling-energy-489/checkpoint-5475/", "qc"),
    "amber-donkey-491": ("checkpoints/gpt2_q_msmacro_run_amber-donkey-491/checkpoint-7659/", "q"),
    "solar-sponge-492": ("checkpoints/gpt2_qc_msmacro_run_solar-sponge-492/checkpoint-7659/", "qc"),
}
SELECTED_RUN = "solar-sponge-492"  # same pair the last overwrite used to select
checkpoint, mode = CHECKPOINT_RUNS[SELECTED_RUN]

model = AutoModelForCausalLM.from_pretrained(checkpoint)

# GPT-2 tokenizer: EOS doubles as the padding token, and every single sequence
# is wrapped as "<bos> $A <eos>" by the post-processor.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token
tokenizer._tokenizer.post_processor = TemplateProcessing(
    single=tokenizer.bos_token + " $A " + tokenizer.eos_token,
    special_tokens=[
        (tokenizer.bos_token, tokenizer.bos_token_id),
        (tokenizer.eos_token, tokenizer.eos_token_id),
    ],
)

dataset = load_from_disk("msmacro_wellformed_split")

# Decoder-only evaluation: no encoder tokenizer, no encoder-side context.
collate_fn = EvalCollator(tokenizer, None, mode=mode, context_enc=False, cover_labels=False)
loader = DataLoader(dataset["validation"], batch_size=2, collate_fn=collate_fn)

loss_batch, gen_batch, answers = next(iter(loader))

In [None]:
# Generate up to 30 new tokens for the batch and display the decoded text (special tokens included).
out = model.generate(**gen_batch, max_new_tokens = 30, eos_token_id = tokenizer.eos_token_id)
tokenizer.batch_decode(out)

In [None]:
from datasets import load_from_disk

# Load the processed CNN dataset and build whitespace-truncated text columns:
# "summary" = first 100 tokens of "highlights", "article" = first 200 tokens of
# "article_half". The second map drops all originally present columns.
dataset = load_from_disk("cnn_processed_50_150_300_max_all_cols")

column_names = dataset["train"].column_names
dataset = dataset.map(lambda x: {"summary": [" ".join(entry.split()[:100]) for entry in x["highlights"]]}, batched=True)
dataset = dataset.map(lambda x: {"article": [" ".join(entry.split()[:200]) for entry in x["article_half"]]}, batched=True, remove_columns=column_names)

In [None]:
from datasets import load_from_disk

# Reload the saved MS MARCO split from disk.
dataset = load_from_disk("msmacro_wellformed_split")

In [None]:
from evaluate import load
# Smoke test of the three metrics on a two-item toy example; the second
# prediction differs from its reference by a single character.
bertscore = load("bertscore")
bleu = load("bleu")
rouge = load("rouge")
predictions = ["hello there", "general konobi"]
references = ["hello there", "general kenobi"]
results = bertscore.compute(predictions=predictions, references=references, lang="en")
results_bleu = bleu.compute(predictions=predictions, references=references)
results_rouge = rouge.compute(predictions=predictions, references=references)


In [None]:
from datasets import load_from_disk

dataset = load_from_disk("cnn_processed_50_150_300_max_all_cols")
dataset = dataset.remove_columns(["input_ids", "enc_input_ids"])
dataset = dataset.map(lambda x: {"article_init": [" ".join(entry.split()[:30]) for entry in x["article"]]}, batched=True)

In [None]:
dataset.save_to_disk("cnn_processed_50_150_300_max_init_gen")

In [None]:
from train_utils.eval import EvalCollator
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from datasets import load_from_disk

dataset = load_from_disk("cnn_processed_50_150_300_max_init_gen")
column_names = dataset["train"].column_names
column_names.remove("article_init")
dataset = dataset.map(lambda x: {"summary": [" ".join(entry.split()[:100]) for entry in x["highlights"]]}, batched=True)
dataset = dataset.map(lambda x: {"article": [" ".join(entry.split()[:200]) for entry in x["article_half"]]}, batched=True, remove_columns=column_names)

tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

collate_fn = EvalCollator(tokenizer, None, mode = "article", answer_column="article")
loader = DataLoader(dataset["train"], batch_size = 100, collate_fn = collate_fn)

In [None]:
from train_utils.utils import prepare_dataset, prompt_q, prompt_article
from datasets import load_dataset, load_from_disk

dataset = load_from_disk("cnn_processed_50_150_300_max_init_gen")
column_names = dataset["train"].column_names
dataset = dataset.map(lambda x: {"summary": [" ".join(entry.split()[:100]) for entry in x["highlights"]]}, batched=True)
dataset = dataset.map(lambda x: {"article": [" ".join(entry.split()[:200]) for entry in x["article_half"]]}, batched=True, remove_columns=column_names)

df_qca = dataset.map(prepare_dataset, 
                     fn_kwargs={"prompt": prompt_article, 
                                "tokenizer": tokenizer, 
                                "create_labels" : False, 
                                "enc_tokenizer": None, 
                                "context_enc": False, 
                                "context_column": "summary",
                                "answer_column": "article"}, 
                     batched=True, 
                     remove_columns=dataset["train"].column_names)

# Indices resulting in really long input sequences
#indices_to_drop = [60486, 69092, 98277, 157444, 173621]

#def filter_indices(row, index):
#    return index not in indices_to_drop

#df_qca = df_qca.filter(filter_indices, with_indices=True)

In [None]:
import os 
from glob import glob
from omegaconf import OmegaConf
from transformers import AutoTokenizer, AutoModelForCausalLM
from train_utils.eval import  evaluate
from phi.modeling_phi import PhiForCausalLM

CHECKPOINT_FOLDER = "checkpoints"
CHECKPOINT_PATHS = ["phi_qc_squad_run_treasured-salad-497"]
OUTPUT_FOLDER = ""

# Evaluation output keyed by run name, so that evaluating several runs does not
# silently discard everything but the last result.
eval_results = {}

for checkpoint in CHECKPOINT_PATHS:
    # Each run folder is expected to contain at least one "checkpoint-<step>/"
    # directory. Sorting makes the choice deterministic (glob order depends on the
    # filesystem); note the sort is lexicographic, not by step number.
    pattern = os.path.join(CHECKPOINT_FOLDER, checkpoint, "checkpoint-*/")
    matches = sorted(glob(pattern))
    if not matches:
        raise FileNotFoundError(f"No checkpoint directory matches {pattern!r}")
    full_path = matches[0]

    # The training configuration is read back from the checkpoint directory and
    # drives tokenizer choice, model class and evaluation settings below.
    args = OmegaConf.load(os.path.join(full_path, "model_config.yaml"))

    tokenizer = AutoTokenizer.from_pretrained(args.model_args.decoder_base_name)
    enc_tokenizer = AutoTokenizer.from_pretrained(args.model_args.encoder_name) if args.model_args.is_enc_dec else None

    # Phi checkpoints need the project's own model class; everything else goes
    # through the generic auto class.
    if "phi" in args.model_args.decoder_base_name:
        model = PhiForCausalLM.from_pretrained(full_path)
    else:
        model = AutoModelForCausalLM.from_pretrained(full_path)

    eval_result = evaluate(model = model, 
                          tokenizer = tokenizer,
                          enc_tokenizer = enc_tokenizer,
                          dataset_path = args.data_args.dataset_path,
                          prompt_type = args.data_args.prompt_type,
                          context_enc = args.model_args.is_enc_dec,
                          cover_labels = args.data_args.cover_labels,
                          context_column=args.data_args.context_column,
                          # Older configs have no answer_column entry; fall back to "answers".
                          answer_column = args.data_args.answer_column if "answer_column" in args.data_args else "answers",
                          run_decoder_only = args.model_args.is_enc_dec,
                          max_batches = -1)
    eval_results[checkpoint] = eval_result
    



In [None]:
args.data_args.answer_column if "answer_column" in args.data_args else "answers"

In [None]:
from disco_score import DiscoScorer

disco_scorer = DiscoScorer(device='cpu', model_name='bert-base-uncased')
print(disco_scorer.DS_Focus_NN(s, refs)) # FocusDiff 

In [None]:
results

In [None]:
from train_utils.utils import prepare_dataset, prompt_article, prompt_article_summary
from transformers import AutoTokenizer
from tokenizers.processors import TemplateProcessing

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
tokenizer.pad_token = tokenizer.eos_token
tokenizer._tokenizer.post_processor = TemplateProcessing(
    single= tokenizer.bos_token + " $A " + tokenizer.eos_token,
    special_tokens=[(tokenizer.bos_token, tokenizer.bos_token_id),(tokenizer.eos_token, tokenizer.eos_token_id)],
)

df_qca = dataset["train"].map(prepare_dataset, fn_kwargs={"prompt": prompt_article, 
                                                 "tokenizer": tokenizer, 
                                                 "create_labels" : False, 
                                                 "enc_tokenizer": tokenizer, 
                                                 "context_enc": True, 
                                                 "context_column": "summary",
                                                 "answer_column": "article",
                                                 "apply_tokenization":True}, batched=True)

In [None]:
# Row indices whose prompts produce really long input sequences.
indices_to_drop = [
    60486,
    69092,
    98277,
    157444,
    173621,
]


def filter_indices(row, index):
    """Predicate for ``Dataset.filter(..., with_indices=True)``.

    Args:
        row: The dataset example; not inspected.
        index: Position of the example in the dataset.

    Returns:
        ``False`` for indices listed in ``indices_to_drop`` (the row is removed),
        ``True`` otherwise (the row is kept).
    """
    is_dropped = index in indices_to_drop
    return not is_dropped

filtered_dataset = df_qca.filter(filter_indices, with_indices=True)


In [None]:
df_pd = df_qca.to_pandas()
df_pd["len"] = df_pd.decoder_input_ids.apply(len)

In [None]:
subset = df_pd[df_pd.len > 500]

In [None]:
df_pd.len.describe()

In [None]:
list(subset.index)

In [None]:
df_qca.to_pandas().decoder_input_ids.apply(len).describe()

In [None]:
tokenizer.batch_decode(df_qca.to_pandas().input_ids[0])

In [None]:
print(df_qca.to_pandas().text[0])

In [None]:
df_qca.to_pandas().input_ids.apply(len).describe()

In [None]:
dataset["validation"].to_pandas().article


In [None]:
df_qca.to_pandas()["input_ids"][0]

In [None]:
" ".join(tokenizer.batch_decode(df_qca.to_pandas()["input_ids"][0]))

In [None]:
from phi.modeling_phi import PhiForCausalLM
from transformers import AutoTokenizer

model = PhiForCausalLM.from_pretrained("checkpoints/phi_qc_squad_run_treasured-salad-497/checkpoint-5475/")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
tokenizer.pad_token = tokenizer.eos_token

In [None]:
from train_utils.eval import evaluate

out = evaluate(model, tokenizer, enc_tokenizer =None, dataset_path = "squad", prompt_type = "qc", batch_size = 1, max_batches = 10)

In [None]:
out

In [None]:
df_pd["id_len"].describe()

In [None]:
df_pd[df_pd.id_len >500].iloc[0].question.strip()

In [None]:
from train_utils.eval import EvalCollator
from torch.utils.data import DataLoader
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
tokenizer.pad_token = tokenizer.eos_token

enc_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

eval_collator = EvalCollator(tokenizer, enc_tokenizer, mode = "q", context_enc=True, cover_labels=True)

eval_loader = DataLoader(dataset_split["validation"], batch_size = 1, collate_fn = eval_collator)

In [None]:
loss_batch, gen_batch, answers = next(iter(eval_loader))

In [None]:
loss_batch["decoder_input_ids"]

In [None]:
loss_batch["labels"]

#### Testing Implementations
---

In [None]:
from phi.configuration_phi import PhiConfig
from phi.modeling_phi import PhiForCausalLM
from transformers import AutoTokenizer, AutoModel

config = PhiConfig(n_layer=3)
config.know_type = "kformer"
config.enc_dim = 1024
config.know_layer = [0, 1, 2]

dummy_model = PhiForCausalLM(config)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
tokenizer.pad_token = tokenizer.eos_token

enc = AutoModel.from_pretrained("roberta-base")
enc_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [None]:
from transformers import EncoderDecoderModel, AutoModel, EncoderDecoderConfig, AutoConfig

conf = EncoderDecoderConfig(**{"encoder": enc.config.to_dict(), "decoder": AutoConfig.from_pretrained("gpt2").to_dict()})
conf.decoder= dummy_model.config

model = EncoderDecoderModel(encoder = enc, decoder= dummy_model, config=conf)


In [None]:
# Forward slash instead of "\": "\c" is an invalid string escape (a warning in
# current Python, slated to become an error) and a backslash is only a path
# separator on Windows. "/" works on every platform and matches the other
# checkpoint paths in this notebook.
model = EncoderDecoderModel.from_pretrained("checkpoints/checkpoint-5475_encdec")

In [None]:
model.save_pretrained("test_enc_dec")

In [None]:
from datasets import load_dataset
from utils import prepare_dataset, prompt_qa

dataset = load_dataset("squad")
dataset = dataset.map(lambda x: {"answers": x["answers"]["text"][0]})

df_qa = dataset["validation"].map(prepare_dataset, 
                     fn_kwargs={"prompt": prompt_qa, "tokenizer": tokenizer, "create_labels" : True, "enc_tokenizer": enc_tokenizer, "context_enc": True}, 
                     batched=True, 
                     remove_columns=dataset["train"].column_names)

from torch.utils.data import DataLoader
from utils import CustomCollator

loader = DataLoader(df_qa, batch_size=2, collate_fn= CustomCollator(dec_tokenizer= tokenizer, enc_tokenizer= enc_tokenizer))
batch = next(iter(loader))

#model(**batch)

In [None]:
dummy_model(input_ids = dec_input["input_ids"], 
            encoder_hidden_states=enc_hidden_state, 
            encoder_attention_mask=enc_input["attention_mask"])

#### Prepare Squad dataset
---

In [None]:
from datasets import load_dataset

dataset = load_dataset("squad")
dataset = dataset.map(lambda x: {"answers": x["answers"]["text"][0]})

In [None]:
prompt_qca = """\
{context}
{question}

Answer: {answers}"""

prompt_qc  = """\
{context}
{question}

Answer:"""

prompt_qa = """\
{question}

Answer: {answers}"""

def prepare_dataset(examples, prompt, tokenizer, create_labels = False, return_answers = False):
    """Render a batch of QA examples into prompts and tokenize them.

    Intended for ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch dict with the list-valued columns ``"question"``,
            ``"context"`` and ``"answers"`` (all three are read, even if the
            prompt template only uses some of them).
        prompt: ``str.format`` template; may reference ``{question}``,
            ``{context}`` and ``{answers}``.
        tokenizer: Callable taking a list of strings and
            ``return_attention_mask=False`` and returning a mapping with an
            ``"input_ids"`` list of token-id lists.
        create_labels: If True, add a ``"labels"`` entry in which everything but
            the trailing answer tokens is set to -100 (ignored by the loss).
        return_answers: If True, add the raw answer strings under ``"answer"``.

    Returns:
        The tokenizer output, extended with ``"labels"`` and/or ``"answer"``.
    """
    input_text = [
        prompt.format(question = q, context = c, answers = a)
        for q, c, a in zip(examples["question"], examples["context"], examples["answers"])
    ]
    input_ids = tokenizer(input_text, return_attention_mask=False)

    if create_labels:
        # The answer is tokenized on its own only to learn how many trailing
        # tokens of the full prompt belong to it.
        answer_ids = tokenizer(examples["answers"], return_attention_mask=False)["input_ids"]
        labels = []
        for ids, answer in zip(input_ids["input_ids"], answer_ids):
            # Number of leading (non-answer) tokens. Slicing from the front with
            # this count - instead of `ids[-len(answer):]` - keeps an empty answer
            # from selecting the whole sequence (`ids[-0:]`), which used to yield
            # labels twice as long as the input.
            n_masked = max(len(ids) - len(answer), 0)
            labels.append([-100] * n_masked + list(ids[n_masked:]))
        input_ids.update({"labels": labels})

    if return_answers:
        input_ids.update({"answer": examples["answers"]})
    return input_ids
    

In [None]:
df_qca = dataset.map(prepare_dataset, fn_kwargs={"prompt": prompt_qca, "tokenizer": tokenizer, "create_labels" : True}, batched=True, remove_columns=dataset["train"].column_names)
df_qa = dataset.map(prepare_dataset, fn_kwargs={"prompt": prompt_qa, "tokenizer": tokenizer, "create_labels" : True}, batched=True, remove_columns=dataset["train"].column_names)

In [None]:
from torch.nn.utils.rnn import pad_sequence
import torch

class CustomCollator:
    """Pads decoder-only examples into a batch of tensors.

    Each example is a mapping with an ``"input_ids"`` list and optionally a
    ``"labels"`` list. Sequences are right-padded to the longest one in the batch.
    """

    def __init__(self, dec_tokenizer,enc_tokenizer = None):
        """
        Args:
            dec_tokenizer: Tokenizer whose ``pad_token_id`` pads ``input_ids``.
            enc_tokenizer: Optional encoder tokenizer; only its ``pad_token_id``
                is stored (it is not used by ``__call__``).
        """
        self.enc_pad_token_id = enc_tokenizer.pad_token_id if enc_tokenizer is not None else None
        self.dec_pad_token_id = dec_tokenizer.pad_token_id
        # Label value for padded positions; -100 is the ignore index of the loss.
        self.IGNORE_INDEX = -100

    def __call__(self, batch):
        """Collate a list of examples.

        Returns:
            Dict with ``"input_ids"``, a boolean ``"attention_mask"`` and
            ``"labels"``. All three are ``None`` when the examples carry no
            ``"input_ids"``.
        """
        input_ids, labels, attention_mask  = None, None, None
        if "input_ids" in batch[0]:
            sequences = [torch.tensor(item['input_ids']) for item in batch]
            input_ids = pad_sequence(sequences, batch_first=True, padding_value = self.dec_pad_token_id)
            if "labels" in batch[0]:
                labels = pad_sequence([torch.tensor(item['labels']) for item in batch], batch_first=True, padding_value = self.IGNORE_INDEX)
            else:
                # No explicit labels: train on every real token, ignore padding.
                labels = pad_sequence(sequences, batch_first=True, padding_value = self.IGNORE_INDEX)

            # Build the mask from the true sequence lengths rather than from
            # `input_ids != pad_id`: when the pad token is also the BOS/EOS token
            # (pad_token = eos_token), a token-identity test would mask out real
            # BOS/EOS tokens as well.
            lengths = torch.tensor([len(seq) for seq in sequences])
            positions = torch.arange(input_ids.size(1))
            attention_mask = positions.unsqueeze(0) < lengths.unsqueeze(1)

        return {"input_ids": input_ids, 
                "attention_mask" : attention_mask, 
                "labels": labels}

In [None]:
from torch.utils.data import DataLoader

loader= DataLoader(df_qca["train"], batch_size=2, collate_fn=CustomCollator(tokenizer))
next(iter(loader))

#### Train Script
---

In [None]:
from transformers import Trainer, TrainingArguments
# paged_adamw_8bit

# Training configuration. Only the Trainer object is built here; this cell does
# not start training.
training_args= TrainingArguments(
    # 4 sequences per device x 4 accumulation steps = 16 sequences per optimizer
    # step on each device.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    warmup_steps=500,
    num_train_epochs=1,
    learning_rate=1e-4,
    fp16=True,
    logging_steps=100,
    # Evaluate and write a checkpoint once at the end of every epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): absolute, user-specific path - adjust before running elsewhere.
    output_dir="/netscratch/roeder/phi_train",
    # 8-bit AdamW; presumably requires the bitsandbytes package - TODO confirm.
    optim="adamw_bnb_8bit"
)
# NOTE(review): `dummy_model` appears to be the small test model built in the
# "Testing Implementations" section rather than a pretrained checkpoint - confirm
# this is the model that should be trained.
trainer = Trainer(
    model=dummy_model,
    args=training_args,
    train_dataset=df_qca["train"],
    eval_dataset=df_qca["validation"],
    tokenizer=tokenizer,
    data_collator=CustomCollator(tokenizer),
)

#### Evaluate 
---

In [None]:
from phi.modeling_phi import PhiForCausalLM
from transformers import AutoTokenizer, AutoModel, AutoConfig
from tokenizers.processors import TemplateProcessing

model_path = "checkpoints/checkpoint-5475_boseos/"
model_path = "checkpoints/checkpoint-5475_boseos_qa_full/"

config = AutoConfig.from_pretrained("microsoft/phi-1_5", trust_remote_code = True)
config.know_type = "gated_cross_attn"
config.enc_dim = 2048
config.know_layer =[5,8,11,14,17,20,23]
config.know_proj_bias = False

model = PhiForCausalLM.from_pretrained(model_path, config = config)
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5", padding_side="left")
tokenizer.pad_token = tokenizer.eos_token
tokenizer._tokenizer.post_processor = TemplateProcessing(
    single= tokenizer.bos_token + " $A " + tokenizer.eos_token,
    special_tokens=[(tokenizer.bos_token, tokenizer.bos_token_id),(tokenizer.eos_token, tokenizer.eos_token_id)],
)

enc_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

In [None]:
for n,p in model.transformer.named_parameters():
    if "gated_attn" not in n:
        p.requires_grad = False

In [None]:
out = model(input_ids = loss_batch["decoder_input_ids"], attention_mask = loss_batch["decoder_attention_mask"], encoder_hidden_states = torch.rand(3,10,2048), encoder_attention_mask = torch.ones(3,10), labels = loss_batch["labels"])



In [None]:
out.loss.backward()

In [None]:
from transformers import get_linear_schedule_with_warmup

In [None]:
from datasets import load_dataset

dataset = load_dataset("ms_marco", "v2.1")

In [None]:
model.transformer.h[8].gated_attn.attn_gate.grad

In [None]:
from train_utils.EncoderDecoder import CustomEncoderDecoderModel

#checkpoint_path = "checkpoints\checkpoint-2737"
#checkpoint_path = "checkpoints\checkpoint-8211_enc_dec_fullprecsion-1e-4_8batch_4gradacc"
#model_path = "checkpoints/checkpoint-5000_low_loss_polar_dragon/"
checkpoint_path= "checkpoints/run_scarlet-bee-459/checkpoint-5475"
checkpoint_path = "checkpoints/run_leafy-plasma-470/checkpoint-5475/"

model = CustomEncoderDecoderModel.from_pretrained(checkpoint_path)

from transformers import AutoTokenizer

enc_tokenizer = AutoTokenizer.from_pretrained("roberta-base")
tokenizer = AutoTokenizer.from_pretrained("microsoft/phi-1_5")
tokenizer.pad_token = tokenizer.eos_token
from tokenizers.processors import TemplateProcessing
tokenizer._tokenizer.post_processor = TemplateProcessing(
    single= tokenizer.bos_token + " $A " + tokenizer.eos_token,
    special_tokens=[(tokenizer.bos_token, tokenizer.bos_token_id),(tokenizer.eos_token, tokenizer.eos_token_id)],
)

model.config.decoder_start_token_id = tokenizer.bos_token_id

# Freeze every decoder parameter except those whose name contains "proj_k" or
# "proj_v", so only those projections stay trainable.
# (The original loop header contained a stray "[" and did not parse.)
for n, p in model.decoder.named_parameters():
    if "proj_k" not in n and "proj_v" not in n:
        p.requires_grad = False

from train_utils.eval import EvalCollator
from torch.utils.data import DataLoader
from datasets import load_dataset

dataset = load_dataset("squad")
dataset = dataset["validation"].map(lambda x: {"answers": x["answers"]["text"][0]})
loader = DataLoader(dataset, batch_size = 3, collate_fn = EvalCollator(tokenizer,enc_tokenizer, mode = "q", context_enc = True, cover_labels = True, context_column = "answers"))

loss_batch, gen_batch, answers = next(iter(loader))


In [None]:
hidden_states = model.encoder(loss_batch["input_ids"], attention_mask = loss_batch["attention_mask"])[0]

out = model.enc_to_dec_proj(hidden_states)
v = model.decoder.transformer.h[11].proj_v(out)
k = model.decoder.transformer.h[11].proj_k(out)
print(hidden_states.mean())
print(out.mean())
print(v.mean())
print(k.mean())

In [None]:
from torch import nn
class Attention(nn.Module):
    """Single-head scaled dot-product attention with learned q/k/v projections.

    Queries are projected from the decoder width (``config.n_embd``), keys and
    values from the encoder width (``config.enc_dim``); all three end up
    ``config.n_embd`` wide.

    Config attributes read: ``n_embd``, ``enc_dim``, ``know_proj_bias`` (whether
    the projections have a bias) and ``attn_pdrop`` (dropout on the attention
    weights).
    """

    def __init__(self, config) -> None:
        super().__init__()
        # The query is the decoder hidden state (GatedCrossAttentionBlock adds the
        # attention output back onto it), so its input width is n_embd, not
        # enc_dim. Using enc_dim here only worked when both widths were equal.
        self.proj_q = nn.Linear(config.n_embd, config.n_embd, bias= config.know_proj_bias)
        self.proj_k = nn.Linear(config.enc_dim, config.n_embd, bias= config.know_proj_bias)
        self.proj_v = nn.Linear(config.enc_dim, config.n_embd, bias= config.know_proj_bias)
        self.attn_dropout = nn.Dropout(config.attn_pdrop)

    def forward(self, query, key, value, attention_mask=None):
        """Attend from ``query`` over ``key``/``value``.

        Args:
            query: Tensor ``(..., tgt_len, n_embd)``.
            key: Tensor ``(..., src_len, enc_dim)``.
            value: Tensor ``(..., src_len, enc_dim)``.
            attention_mask: Optional tensor that is *added* to the raw scores
                before the softmax, so it must be additive (0 to keep, large
                negative to mask) and broadcastable to ``(..., tgt_len, src_len)``.
                NOTE(review): GatedCrossAttentionBlock forwards its
                ``encoder_attention_mask`` unchanged - confirm callers already
                pass the additive form rather than a 0/1 padding mask.

        Returns:
            ``(attn_output, attn_weights)`` with shapes ``(..., tgt_len, n_embd)``
            and ``(..., tgt_len, src_len)``. The returned weights are post-dropout.
        """
        query = self.proj_q(query)
        key = self.proj_k(key)
        value = self.proj_v(value) 

        attn_weights = torch.matmul(query, key.transpose(-1, -2))

        # Scale by sqrt(d); the scale is materialised in the scores' dtype/device.
        attn_weights = attn_weights / torch.full(
            [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device
        )

        if attention_mask is not None:
            # Apply the (additive) attention mask
            attn_weights = attn_weights + attention_mask

        attn_weights = nn.functional.softmax(attn_weights, dim=-1)

        # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise
        attn_weights = attn_weights.type(value.dtype)
        attn_weights = self.attn_dropout(attn_weights)

        attn_output = torch.matmul(attn_weights, value)

        return attn_output, attn_weights


class GatedCrossAttentionBlock(nn.Module):
    """Cross-attention whose contribution is scaled by a learned tanh gate.

    The gate parameter starts at 0, so ``tanh(gate) == 0`` and a freshly
    constructed block returns its input hidden states unchanged.
    """

    def __init__(self, config):
        super().__init__()
        # Scalar gate (shape [1]), initialised to zero.
        self.attn_gate = nn.Parameter(torch.zeros(1))
        self.attention = Attention(config)

    def forward(
        self,
        hidden_states,
        attention_mask,
        encoder_hidden_states,
        encoder_attention_mask
    ):
        """Add gated cross-attention over the encoder states to ``hidden_states``.

        Args:
            hidden_states: Decoder states; used as the attention query and as
                the residual.
            attention_mask: Accepted for interface compatibility; not used.
            encoder_hidden_states: Used as both attention key and value.
            encoder_attention_mask: Passed through to the attention module.

        Returns:
            ``(updated_hidden_states, attention_weights)``.
        """
        cross_out, cross_weights = self.attention(
            hidden_states,
            encoder_hidden_states,
            encoder_hidden_states,
            encoder_attention_mask,
        )
        gate = self.attn_gate.tanh()
        return cross_out * gate + hidden_states, cross_weights

In [None]:
from phi.adapters import GatedCrossAttentionBlock

In [None]:
import pandas as pd
pd.read_pickle("checkpoints/run_leafy-plasma-470/eval_output.pkl")

In [None]:
model.enc_to_dec_proj.bias

In [None]:
(hidden_states[0][0] - hidden_states[0][4]).mean()

In [None]:
model.decoder.transformer.h[11].proj_v.bias is None

In [None]:
res = model.encoder(loss_batch["input_ids"], attention_mask = loss_batch["attention_mask"])[0][0][1] - model.encoder(loss_batch["input_ids"], attention_mask = loss_batch["attention_mask"])[0][1][1]

In [None]:
res.mean()

In [None]:
model.decoder.transformer.h[0].mixer.Wqkv.weight.requires_grad

In [None]:
from train_utils.eval import evaluate

output = evaluate(model, 
                  tokenizer, 
                  enc_tokenizer, 
                  prompt_type = "q", 
                  context_enc = True, 
                  cover_labels = True, 
                  context_column = "answers", 
                  run_decoder_only = True,
                  max_batches = 10)

In [None]:
from torch.utils.data import DataLoader
from datasets import load_dataset
dataset = load_dataset("squad")
dataset = dataset.map(lambda x: {"answers": x["answers"]["text"][0]})
from train_utils.eval import EvalCollator

loader = DataLoader(dataset["validation"], batch_size= 4, collate_fn= EvalCollator(tokenizer, enc_tokenizer, mode = "q", context_enc=True,))

In [None]:
loss_batch, gen_batch, answers = next(iter(loader))

In [None]:
decoder_batch = {k.replace("decoder_",""):v for k,v in loss_batch.items() if "decoder" in k or "labels" in k}
decoder_gen_batch = {k.replace("decoder_",""):v for k,v in gen_batch.items() if "decoder" in k or "labels" in k}
decoder_gen_batch

In [None]:
out = model.decoder(**decoder_batch)

In [None]:
# Get the logits for only the answer tokens
len_answers = [len(t) for t in tokenizer(answers).input_ids]
answer_logits = [logits[-l_answer:,:] for logits, l_answer in zip(out.logits, len_answers)]

answer_logits[0].detach().cpu().numpy()

In [None]:
from datasets import load_dataset
from utils import prepare_dataset, prompt_qc, prompt_q, prompt_qa, CustomCollator

dataset = load_dataset("squad")
dataset = dataset.map(lambda x: {"answers": x["answers"]["text"][0]})
samples_qc = dataset["validation"].map(prepare_dataset, 
                                       fn_kwargs={"prompt": prompt_qc, "tokenizer": tokenizer, "apply_tokenization":False}, 
                                       batched=True, 
                                       remove_columns=dataset["train"].column_names)

samples_q_c = dataset["validation"].map(prepare_dataset, 
                                       fn_kwargs={"prompt": prompt_qa, "tokenizer": tokenizer, "apply_tokenization":True, "enc_tokenizer": enc_tokenizer, "context_enc": True, "create_labels": False}, 
                                       batched=True, 
                                       remove_columns=dataset["train"].column_names)

from torch.utils.data import DataLoader

loader = DataLoader(samples_q_c, batch_size = 1, collate_fn=CustomCollator(tokenizer, enc_tokenizer= enc_tokenizer))

In [None]:
# Decoder only
out = model.decoder.generate(input_ids = batch["decoder_input_ids"][:,:-3], max_new_tokens = 3, eos_token_id = tokenizer.eos_token_id)
tokenizer.batch_decode(out)

In [None]:
import torch
def dec_ids_to_gen_input(tensor, sep_token_id=25, pad_token_id=50256):
    """Cut off the answer and apply left-side padding.

    Every row is truncated right after its first ``sep_token_id`` and then
    left-padded with ``pad_token_id`` so that all rows end on the separator and
    have equal length.

    Args:
        tensor: 2-D integer tensor of token ids, one sequence per row.
        sep_token_id: Token marking the end of the prompt. The default 25 is
            presumably the ":" of "Answer:" in the GPT-2 style vocabulary used
            here - verify when switching tokenizers.
        pad_token_id: Token used for left padding (default 50256, which this
            notebook decodes as "<|endoftext|>").

    Returns:
        Tensor of shape ``(rows, longest_prompt)`` with the dtype and device of
        ``tensor``.

    Raises:
        ValueError: If a row contains no ``sep_token_id``.
    """
    if tensor.size(0) == 0:
        return tensor.new_empty((0, 0))

    seq_len = tensor.size(1)
    is_sep = tensor == sep_token_id
    if not bool(is_sep.any(dim=1).all()):
        raise ValueError(f"Every row must contain the separator token id {sep_token_id}")

    # Index of the first separator in each row. Looking the position up per row
    # (instead of zipping rows with the flat result of torch.where) keeps rows and
    # cut points aligned even when a row holds several separator tokens.
    positions = torch.arange(seq_len, device=tensor.device).expand_as(tensor)
    cut_ids = positions.masked_fill(~is_sep, seq_len).min(dim=1).values
    max_length = int(cut_ids.max())

    samples = []
    for token_ids, cut_id in zip(tensor, cut_ids.tolist()):
        # new_full keeps the padding on the same device and dtype as the input.
        left_padding = token_ids.new_full((max_length - cut_id,), pad_token_id)
        input_tokens = token_ids[:cut_id + 1]
        samples.append(torch.cat([left_padding, input_tokens]))
    return torch.stack(samples)


batch = next(iter(loader))

# Decode the reference answers from a *copy* of the labels. Writing the pad id
# into batch["labels"] itself would erase the -100 ignore markers, and the loss
# printed below would then also be computed over prompt and padding positions.
labels = batch["labels"].clone()
labels[labels == -100] = 50256
answers = [label.split("Answer:")[1].replace("<|endoftext|>","") for label in tokenizer.batch_decode(labels)]

model.eval()
# Generation is prompted with the decoder ids cut after "Answer:" and left-padded.
out = model.generate(input_ids=batch["input_ids"], decoder_input_ids = dec_ids_to_gen_input(batch["decoder_input_ids"]), max_new_tokens=30, eos_token_id = tokenizer.eos_token_id)
print(f"Loss: {model(**batch).loss}")
print(f"Prompt: {tokenizer.batch_decode(batch['decoder_input_ids'][:,:-3])}")
print(f"Generated Text: {tokenizer.batch_decode(out, skip_special_tokens=True)}")
print(f"Answers:  {answers}")

In [None]:
input_enc_dec = samples_q_c.to_pandas().iloc[0].to_dict()
decoder_input_ids = list(input_enc_dec["decoder_input_ids"])
decoder_input_ids.insert(0, tokenizer.bos_token_id)
import torch 
input_enc_dec = {"decoder_input_ids" :torch.tensor(decoder_input_ids).unsqueeze(0), "input_ids": torch.tensor(input_enc_dec["input_ids"]).unsqueeze(0)}

output = model.generate(input_ids = input_enc_dec["input_ids"], decoder_input_ids = input_enc_dec["decoder_input_ids"], max_new_tokens=30, eos_token_id = tokenizer.eos_token_id)
tokenizer.batch_decode(output)

In [None]:
from transformers import pipeline

pipe = pipeline(task = "text-generation", 
                model=model, 
                tokenizer=tokenizer, 
                max_new_tokens = 30, 
                return_full_text = False, 
                stop_sequence= tokenizer.eos_token,
                prefix = tokenizer.eos_token,
                batch_size = 2)

input_text = samples_qc["text"]
labels = samples_qc["answer"]

outputs = pipe(input_text[:5])
outputs

In [None]:
output = model.generate(tokenizer(input_text[0], return_tensors="pt")["input_ids"][:,1:-1], max_new_tokens=30, eos_token_id = tokenizer.eos_token_id)
tokenizer.batch_decode(output)

In [None]:
import pandas as pd

df = pd.read_csv("train_eval_phi_output_new.csv")
df2 = pd.read_csv("train_eval_phi_output.csv")

In [None]:
df2.head(10)

### DFKI LM benchmarking prompts

In [None]:
from datasets import load_dataset
splits = ['xquad.ar', 'xquad.de', 'xquad.zh', 'xquad.vi', 'xquad.en', 'xquad.es', 'xquad.hi', 'xquad.el', 'xquad.th', 'xquad.tr', 'xquad.ru', 'xquad.ro']
dataset = load_dataset("xquad", splits[4])

In [None]:
dataset

In [None]:
df_pd = dataset["validation"].to_pandas()
df_pd.answers = df_pd.apply(lambda x: x["answers"]["text"][0], axis = 1)

In [None]:
sample = df_pd.iloc[0]

In [None]:
user_prompt = """\
Context: {context}

Question: {question}

Answer:"""

llama2_prompt = """\
<s>[INST] <<SYS>>
{system_prompt}
<</SYS>>

{user_msg_1} [/INST] {model_answer_1} </s><s>[INST] {user_msg_2} [/INST]"""

In [None]:
system_prompt = "Your are a helpful assitant that extracts answers from a context passage given a question."
user_msg_1 = user_prompt.format(context = sample.context, question = sample.question)
model_answer = sample.answers
user_msg2 = user_prompt.format(context = "blub", question = "blub")


llama2_prompt = llama2_prompt.format(system_prompt = system_prompt, user_msg_1 = user_msg_1, model_answer_1 = model_answer, user_msg_2 = user_msg2)

print(llama2_prompt)

In [None]:
one_shot_prompt = """\
<s>[INST] <<SYS>>
Your are a helpful assitant that extracts answers from a context passage given a question.
<</SYS>>

Context: The Panthers defense gave up just 308 points, ranking sixth in the league, while also leading the NFL in interceptions with 24 and boasting four Pro Bowl selections. Pro Bowl defensive tackle Kawann Short led the team in sacks with 11, while also forcing three fumbles and recovering two. Fellow lineman Mario Addison added 6½ sacks. The Panthers line also featured veteran defensive end Jared Allen, a 5-time pro bowler who was the NFL's active career sack leader with 136, along with defensive end Kony Ealy, who had 5 sacks in just 9 starts. Behind them, two of the Panthers three starting linebackers were also selected to play in the Pro Bowl: Thomas Davis and Luke Kuechly. Davis compiled 5½ sacks, four forced fumbles, and four interceptions, while Kuechly led the team in tackles (118) forced two fumbles, and intercepted four passes of his own. Carolina's secondary featured Pro Bowl safety Kurt Coleman, who led the team with a career high seven interceptions, while also racking up 88 tackles and Pro Bowl cornerback Josh Norman, who developed into a shutdown corner during the season and had four interceptions, two of which were returned for touchdowns.

Question: How many points did the Panthers defense surrender?

Answer: [/INST] 308 </s><s>[INST] Context: {{context}}

Question: {{question}}

Answer: [/INST]"""

zero_shot_prompt = """\
<s>[INST] <<SYS>>
Your are a helpful assitant that extracts answers from a context passage given a question.
<</SYS>>

Context: {{context}}

Question: {{question}}

Answer: [/INST]"""

default_prompt = """\
Context: {{context}}

Question: {{question}}

Answer:"""

In [None]:
default_prompt

In [None]:
one_shot_prompt

### Results
---
- Prompting format only affects llama-chat model?
- One-shot chat: 0.3 exact, 0.83 solution present
- 0-shot chat: 0.0 exact, 0.85 solution present
- default prompt chat: 0.0 exact, 0.86 solution present
- 0-shot llama: 0.0 exact, 0.63 solution present
- 1-shot llama: 0.

In [None]:
from train_utils.eval import EvalCollator
from datasets import load_dataset
from torch.utils.data import DataLoader

dataset = load_dataset("squad")
dataset = dataset["validation"].map(lambda x: {"answers": x["answers"]["text"][0]})

collate_fn = EvalCollator(tokenizer, enc_tokenizer, mode = "q", context_enc = True, cover_labels = True, context_column="answers")

loader = DataLoader(dataset, batch_size = 4, collate_fn=collate_fn)

In [None]:
loss_batch, gen_batch, answers = next(iter(loader))

In [None]:
input_ids = loss_batch["decoder_input_ids"][0]
attn_mask = loss_batch["decoder_attention_mask"][0]
input_ids = input_ids[attn_mask == 1]

In [None]:
gen = model.generate(**gen_batch, max_new_tokens=30, eos_token_id = tokenizer.eos_token_id, output_scores  = True, return_dict_in_generate = True)

In [None]:
torch.stack(gen.scores)[:,0,:].shape

In [None]:
import pandas as pd

pd.read_pickle("checkpoints/run_leafy-plasma-470/eval_output.pkl")

In [None]:
torch.stack(gen.scores).argmax(dim= -1).t()

In [None]:
gen.scores[1].argmax(-1)

In [None]:
gen.sequences

In [None]:
tokenizer.batch_decode(gen.sequences)

In [None]:
import pandas as pd

df = pd.read_pickle("eval_output.pkl")

#pd.read_csv("eval_output.csv")

In [None]:
batch = {k:torch.tensor(v).unsqueeze(0) for k,v in df["gen_batch"][0].items()}
out_test = model.generate(**batch, max_new_tokens=30, eos_token_id = tokenizer.eos_token_id)

In [None]:
tokenizer.batch_decode(out_test)

In [None]:
from modeling_gpt2 import GPT2LMHeadModel, GPT2Config
from transformers import AutoModel, AutoTokenizer
from tokenizers.processors import TemplateProcessing
from torch import nn

class EncoderDecoder(nn.Module):
    """Minimal encoder-decoder wrapper: encode, then feed the encoder states to
    the decoder as cross-attention input."""

    def __init__(self, encoder, decoder):
        """
        Args:
            encoder: Module called with ``input_ids`` and ``attention_mask`` whose
                output exposes ``last_hidden_state``.
            decoder: Module called with decoder inputs plus
                ``encoder_hidden_states`` / ``encoder_attention_mask`` / ``labels``.
        """
        super().__init__()
        self.encoder = encoder
        self.decoder = decoder
    
    def forward(self, batch):
        """Run encoder and decoder on one collated batch.

        Args:
            batch: Mapping with ``input_ids`` and ``attention_mask`` (encoder
                side), ``decoder_input_ids`` and ``decoder_attention_mask``
                (decoder side) and optionally ``labels``.

        Returns:
            Whatever the decoder returns.
        """
        # Use the wrapped sub-modules. The previous version referenced the
        # notebook-level globals `encoder` / `decoder`, ignoring the modules
        # passed to the constructor.
        enc_hidden_state = self.encoder(input_ids = batch["input_ids"], 
                                        attention_mask = batch["attention_mask"]).last_hidden_state
        
        # Labels are optional so the wrapper can also be used without a loss.
        labels = batch["labels"] if "labels" in batch else None

        out = self.decoder(input_ids = batch["decoder_input_ids"], 
                           attention_mask = batch["decoder_attention_mask"],
                           encoder_hidden_states = enc_hidden_state, 
                           encoder_attention_mask = batch["attention_mask"], 
                           labels = labels)
        return out

def freeze_decoder(model):
    """Disable gradients for every parameter whose name lacks "cross".

    Parameters with "cross" anywhere in their name keep their current
    ``requires_grad`` setting. The model is modified in place.
    """
    to_freeze = [
        param
        for name, param in model.named_parameters()
        if "cross" not in name
    ]
    for param in to_freeze:
        param.requires_grad = False

config = GPT2Config.from_pretrained("gpt2")
config.add_cross_attention = True
config.cross_attn_layer_idx = [11]

decoder = GPT2LMHeadModel.from_pretrained("gpt2", config = config)
encoder = AutoModel.from_pretrained("roberta-base")

dec_tokenizer = AutoTokenizer.from_pretrained("gpt2")
dec_tokenizer.pad_token = dec_tokenizer.eos_token
dec_tokenizer._tokenizer.post_processor = TemplateProcessing(
    single= dec_tokenizer.bos_token + " $A " + dec_tokenizer.eos_token,
    special_tokens=[(dec_tokenizer.bos_token, dec_tokenizer.bos_token_id),(dec_tokenizer.eos_token, dec_tokenizer.eos_token_id)],
)
enc_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

freeze_decoder(decoder)

model  = EncoderDecoder(encoder = encoder,decoder = decoder)

In [None]:
past_length = 0
input_ids = dec_tokenizer("hello this is me and my cat and we be doing a thing", return_tensors = "pt")["input_ids"]

device = "cpu"



def get_embeds(decoder, input_ids, past_length=0):
    """Return the decoder's input embeddings (token + position) for ``input_ids``.

    Args:
        decoder: Model exposing ``transformer.wte`` (token embedding) and
            ``transformer.wpe`` (position embedding).
        input_ids: Integer tensor of token ids; the last dimension is the
            sequence axis.
        past_length: Position of the first token, i.e. how many positions to
            skip. Defaults to 0. This used to be read from a notebook-level
            global, which made the function depend on hidden state.

    Returns:
        ``wte(input_ids) + wpe(position_ids)``.
    """
    seq_len = input_ids.size(-1)
    # Positions are created on the inputs' device: the embedding lookup needs ids
    # and weights on the same device anyway, and this does not require the
    # decoder object to expose a `.device` attribute.
    position_ids = torch.arange(past_length, seq_len + past_length, dtype=torch.long, device=input_ids.device)
    position_ids = position_ids.unsqueeze(0)
    inputs_embeds = decoder.transformer.wte(input_ids)
    position_embeds = decoder.transformer.wpe(position_ids)
    hidden_states = inputs_embeds + position_embeds
    return hidden_states

In [None]:
input_ids = dec_tokenizer("The Space Needle is in the city of ", return_tensors = "pt")["input_ids"]

out =model.decoder(input_ids = input_ids[:,:-1])

In [None]:
dec_tokenizer.decode(out.logits.squeeze()[-1].argmax())

In [None]:
out = model.decoder.generate(input_ids = input_ids[:,:-1], max_new_tokens=30, eos_token_id = dec_tokenizer.eos_token_id)

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Baseline: stock GPT-2 on the same prompt. This rebinds the notebook-level `model`.
model = AutoModelForCausalLM.from_pretrained("gpt2")
tokenizer = AutoTokenizer.from_pretrained("gpt2")

input_ids = tokenizer("The Space Needle is in the city of ", return_tensors = "pt")["input_ids"]
# Generation uses the library defaults; the result is not decoded in this cell.
out = model.generate(input_ids = input_ids)

In [None]:
from tokenizers.processors import TemplateProcessing
from transformers import AutoModelForCausalLM, AutoTokenizer
# Candidate checkpoints; only the LAST assignment takes effect, the first two are
# kept as a record of previously inspected runs.
model_path = "checkpoints/gpt2_q_squad_run_cosmic-butterfly-490/checkpoint-5475/"
model_path = "checkpoints/gpt2medium_q_squad_run_summer-cloud-510/checkpoint-5475/"
model_path = "checkpoints/gpt2medium_run_good-puddle-513/checkpoint-5475/"


model = AutoModelForCausalLM.from_pretrained(model_path)
tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Wrap every single sequence as "<bos> text <eos>" during tokenization.
tokenizer._tokenizer.post_processor = TemplateProcessing(
    single= tokenizer.bos_token + " $A " + tokenizer.eos_token,
    special_tokens=[(tokenizer.bos_token, tokenizer.bos_token_id),(tokenizer.eos_token, tokenizer.eos_token_id)],
)

from datasets import load_dataset

dataset = load_dataset("squad")
# Flatten the answers field to the text of the first listed answer.
dataset = dataset.map(lambda x: {"answers": x["answers"]["text"][0]})

from train_utils.eval import EvalCollator

# Decoder-only collator (no encoder tokenizer, no context encoding).
# NOTE(review): exact meaning of mode="q" / cover_labels is defined in EvalCollator — confirm there.
collate_fn = EvalCollator(tokenizer, enc_tokenizer= None, mode = "q", context_enc = False, cover_labels = False)
from torch.utils.data import DataLoader

# Calculate indices for the last 15 rows
dataset_train = dataset["train"]
total_rows = len(dataset["train"])
last_15_indices = list(range(total_rows - 15, total_rows))

# Select the last 15 rows, keeping it as a Hugging Face dataset
last_15_rows = dataset_train.select(last_15_indices)

# With batch_size=10 the first batch pulled below holds 10 of the 15 rows.
loader = DataLoader(last_15_rows, batch_size = 10, collate_fn=collate_fn)
# The collator yields a (loss batch, generation batch, answers) triple.
batch_loss, batch_gen, answers = next(iter(loader))

In [None]:
from eval_utils.loading_utils import load_encdec_model, load_batches_from_evaldf
from transformers import AutoModel
from modeling_gpt2 import GPT2LMHeadModel

model, enc_tokenizer, dec_tokenizer, train_conf = load_encdec_model("checkpoints/gpt2medium-q-enc-squad_run_rich-night-520/checkpoint-5475", AutoModel, GPT2LMHeadModel)

In [None]:
import pandas as pd
# Stored evaluation output of the rich-night-520 run.
df = pd.read_pickle("checkpoints/gpt2medium-q-enc-squad_run_rich-night-520/eval_output.pkl")
# Batches for entry 0: enc-dec loss/generation inputs and their decoder-only counterparts.
loss_batch, gen_batch, dec_loss_batch, dec_gen_batch = load_batches_from_evaldf(df, 0)

In [None]:
from nethook import TraceDict
import torch


def _first_tensor_outputs(trace):
    """Map traced layer names to their output tensor (first item for tuple outputs)."""
    collected = {}
    for layer_name, record in trace.items():
        if not hasattr(record, "output"):
            continue
        layer_output = record.output
        collected[layer_name] = layer_output if isinstance(layer_output, torch.Tensor) else layer_output[0]
    return collected


# Encoder states before and after the encoder-to-decoder projection.
last_hidden_state_init = model.encoder(
    input_ids=loss_batch["input_ids"],
    attention_mask=loss_batch["attention_mask"],
).last_hidden_state
last_hidden_state = model.enc_to_dec_proj(last_hidden_state_init)

# Every module except cross-attention / internal-dropout ones, minus the first two entries.
layer_names = [
    module_name
    for module_name, _module in model.named_modules()
    if "cross" not in module_name and "internal_dropout" not in module_name
][2:]

# Decoder pass conditioned on the projected encoder states.
with TraceDict(model, layer_names) as ret:
    out = model.decoder(
        input_ids=loss_batch["decoder_input_ids"],
        attention_mask=loss_batch["decoder_attention_mask"],
        encoder_hidden_states=last_hidden_state,
        encoder_attention_mask=loss_batch["attention_mask"],
        labels=loss_batch["labels"],
    )

# Decoder-only pass on the decoder-only batch.
with TraceDict(model, layer_names) as ret_dec:
    out_dec = model.decoder(
        input_ids=dec_loss_batch["input_ids"],
        attention_mask=dec_loss_batch["attention_mask"],
        labels=dec_loss_batch["labels"],
    )

ret_f = _first_tensor_outputs(ret)
ret_dec_f = _first_tensor_outputs(ret_dec)

In [None]:
for (k1,v1), (k2,v2) in zip(ret_f.items(), ret_dec_f.items()):
    pass

In [None]:
from transformers import AutoModel

enc_base = AutoModel.from_pretrained("roberta-base")

In [None]:
# Same batch through the pretrained (not fine-tuned) encoder, then through the
# checkpoint's encoder-to-decoder projection.
t_init = enc_base(loss_batch["input_ids"], attention_mask = loss_batch["attention_mask"]).last_hidden_state
t = model.enc_to_dec_proj(t_init)

In [None]:
def _mean_abs_diff_to_token(states, ref_idx=9, n_tokens=30):
    """Mean over the first ``n_tokens`` positions of the mean absolute difference
    between position ``ref_idx`` and each position, for the first batch element."""
    reference = states[0][ref_idx]
    return torch.tensor([(reference - states[0][i]).abs().mean() for i in range(n_tokens)]).mean()


# Pretrained encoder (raw / projected) first, then fine-tuned encoder (raw / projected).
for label, states in (
    ("t_init", t_init),
    ("t", t),
    ("last_h_init", last_hidden_state_init),
    ("last_h", last_hidden_state),
):
    print(f"Mean tensor diff {label} {_mean_abs_diff_to_token(states)}")

In [None]:
from transformers import BertModel, BertTokenizer, AutoTokenizer
import torch
from sklearn.metrics.pairwise import cosine_similarity

# Load pre-trained model and tokenizer

enc_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Function to get embeddings
def get_embedding(text, model, tokenizer):
    """Embed ``text`` as the mean of the model's last hidden states.

    The text is tokenized with ``return_tensors='pt'``, passed to ``model`` as
    keyword arguments, and ``last_hidden_state`` is averaged over dimension 1.
    """
    encoded = tokenizer(text, return_tensors='pt')
    hidden = model(**encoded).last_hidden_state
    return hidden.mean(dim=1)

# Example words
words = ['king', 'queen', 'apple', 'oof', 'table']

# Get embeddings: `embeddings` from the pretrained encoder, `embeddings2` from the
# checkpoint's encoder.
embeddings = [get_embedding(word, enc_base, enc_tokenizer).detach().squeeze().numpy() for word in words]
embeddings2  = [get_embedding(word, model.encoder, enc_tokenizer).detach().squeeze().numpy() for word in words]

# Calculate cosine similarity
# NOTE(review): only `embeddings` (pretrained encoder) is used below; `embeddings2`
# is computed but never compared or plotted in this cell.
cos_sim = cosine_similarity(embeddings)

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt

sns.heatmap(cos_sim, annot=True, xticklabels=words, yticklabels=words)
plt.title('Cosine Similarity between Word Embeddings')
plt.show()


## Investigating Embedding Collapse in Enc model
---
- We see that the vector space of the fine-tuned encoder collapsed: the cosine similarity between its embeddings is 1 and the mean absolute difference is extremely low
- The pca plot shows how the distribution collapsed onto a single principal component. Also consider the scale of the pca
- We are not using tsne or umap as they scale the values for dim reduction thus hiding the true scale

In [None]:
from transformers import AutoModel, AutoTokenizer

enc_base = AutoModel.from_pretrained("roberta-base")
enc_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

from eval_utils.loading_utils import load_encdec_model, load_batches_from_evaldf
from transformers import AutoModel
from modeling_gpt2 import GPT2LMHeadModel

# Only the last assignment of checkpoint_path is used.
checkpoint_path = "checkpoints/gpt2medium-q-enc-squad_run_rich-night-520/checkpoint-5475"
# Checkpoint with "var"-type norm on the cross attention
checkpoint_path = "checkpoints/run_divine-field-529/checkpoint-5475/"

# Rebinds `enc_tokenizer` (loaded above from roberta-base) with the one returned for the checkpoint.
model, enc_tokenizer, dec_tokenizer, train_conf = load_encdec_model(checkpoint_path, AutoModel, GPT2LMHeadModel)

In [None]:
input_text = """DeepMind Technologies Limited,[4] doing business as Google DeepMind, is a British-American artificial intelligence research laboratory which serves as a subsidiary of Google. Founded in the UK in 2010, it was acquired by Google in 2014,[5] The company is based in London, with research centres in Canada,[6] France,[7] Germany and the United States.

Google DeepMind has created neural network models that learn how to play video games in a fashion similar to that of humans,[8] as well as Neural Turing machines (neural networks that can access external memory like a conventional Turing machine),[9] resulting in a computer that loosely resembles short-term memory in the human brain.[10][11]

DeepMind made headlines in 2016 after its AlphaGo program beat a human professional Go player Lee Sedol, a world champion, in a five-game match, which was the subject of a documentary film.[12] A more general program, AlphaZero, beat the most powerful programs playing go, chess and shogi (Japanese chess) after a few days of play against itself using reinforcement learning.[13] In 2020, DeepMind made significant advances in the problem of protein folding with AlphaFold.[14] In July 2022, it was announced that over 200 million predicted protein structures, representing virtually all known proteins, would be released on the AlphaFold database.[15][16] """

input_tokens = enc_tokenizer(input_text, return_tensors="pt")

In [None]:
from eval_utils.eval_tools import get_similarities

def _report_similarities(title, states):
    """Print ``title`` and the averaged similarity statistics of ``states``; return both matrices."""
    print(title)
    diffs, cosines = get_similarities(states)
    print(f"Avg abs dist: {diffs.mean(dim=-1).mean()}")
    print(f"Avg abs cos dist: {cosines.mean(dim=-1).mean()}")
    return diffs, cosines


# Final-layer states of the pretrained encoder for the input text.
base_states = enc_base(**input_tokens).last_hidden_state.squeeze().detach()
base_states_np = base_states.numpy()

# Same input through the checkpoint's encoder.
fine_states = model.encoder(**input_tokens).last_hidden_state.squeeze().detach()
fine_states_np = fine_states.numpy()

diff_matrix, cos_sim = _report_similarities("Pretrained Model", base_states)

print("--------------------------------------------------")

diff_matrix2, cos_sim2 = _report_similarities("Fine-Tuned Model", fine_states)

In [None]:
from eval_utils.eval_tools import plot_tsne, plot_pca

#plot_tsne(base_states_np)
#plot_tsne(fine_states_np)

plot_pca(base_states_np)
plot_pca(fine_states_np)

## Investigating decoder logits
---
- What we observe is that the signal of the encoder, even in a randomly initialized state, is really weak
- The enc signal has almost no effect on the loss and almost no effect on the logits
- Implemented different positions of cross-attn: almost no effect: There is almost no difference in loss between cross attn and no cross attn
- Loss explosion is caused by applying layer norm on residual when applying cross attn

- As an analysis we compute the Jaccard similarity and the rank-biased overlap (RBO) between the rankings of the top 10 logits
- We also extract the logprobs of the first ref token to see how it changes


In [None]:
from transformers import AutoModel, AutoTokenizer, AutoConfig
from eval_utils.loading_utils import load_encdec_model, load_batches_from_evaldf
from transformers import AutoModel
from modeling_gpt2 import GPT2LMHeadModel
import pandas as pd

# Pretrained encoder used as the "not fine-tuned" reference.
enc_base = AutoModel.from_pretrained("roberta-base")
enc_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

#config = AutoConfig.from_pretrained("checkpoints/gpt2medium-q-enc-squad_run_rich-night-520/checkpoint-5475").decoder
#dec_base = GPT2LMHeadModel.from_pretrained("gpt2-medium", config = config)

# Rebinds `enc_tokenizer` with the tokenizer returned for the checkpoint.
model, enc_tokenizer, dec_tokenizer, train_conf = load_encdec_model("checkpoints/gpt2medium-q-enc-squad_run_rich-night-520/checkpoint-5475", AutoModel, GPT2LMHeadModel)
# Replaces the returned decoder tokenizer with a stock gpt2-medium tokenizer
# (no pad token or post-processor is configured on it in this cell).
dec_tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")

In [None]:
import pandas as pd
from eval_utils.loading_utils import load_batches_from_evaldf

# Stored evaluation output of the rich-night-520 run.
df = pd.read_pickle("checkpoints/gpt2medium-q-enc-squad_run_rich-night-520/eval_output.pkl")

# Batches for entry 0: enc-dec loss/generation inputs and their decoder-only counterparts.
loss_batch, gen_batch, dec_loss_batch, dec_gen_batch = load_batches_from_evaldf(df, 0)

In [None]:
import torch
from torch.functional import F
from modeling_gpt2 import GPT2Attention
from torch import nn

dec_conf = model.decoder.config

# Generation settings shared by all four runs so that their scores are comparable.
gen_kwargs = dict(
    max_new_tokens=30,
    eos_token_id=dec_tokenizer.eos_token_id,
    return_dict_in_generate=True,
    output_scores=True,
)

# 1) Default enc_dec forward
output_encdec = model.generate(**gen_batch, **gen_kwargs)

# 2) Encdec with random cross attention: replace the cross-attention module and its
#    layer norm in blocks 7-10 with freshly initialised ones.
# NOTE(review): newly constructed modules start in training mode; if dropout must be
# off for this comparison, call model.eval() after this loop — confirm intent.
for layer_idx in (7, 8, 9, 10):
    dec_block = model.decoder.transformer.h[layer_idx]
    dec_block.crossattention = GPT2Attention(dec_conf, layer_idx=layer_idx, is_cross_attention=True)
    dec_block.ln_cross_attn = nn.LayerNorm(1024, eps=dec_conf.layer_norm_epsilon)
output_encdec_rand = model.generate(**gen_batch, **gen_kwargs)

# 3) With random cross attention and the pretrained (not fine-tuned) encoder.
#    This permanently swaps model.encoder for the rest of the session.
model.encoder = enc_base
output_encdec_base = model.generate(**gen_batch, **gen_kwargs)

# 4) Decoder only
output_dec = model.decoder.generate(**dec_gen_batch, **gen_kwargs)


# Stacking the per-step scores gives (num_tokens, batch_size, vocab_size);
# transpose to shape (batch_size, num_tokens, vocab_size)
softmax_scores_encdec = F.softmax(torch.stack(output_encdec.scores),dim = -1).transpose(0,1)
softmax_scores_dec = F.softmax(torch.stack(output_dec.scores),dim = -1).transpose(0,1)
softmax_scores_encdec_base = F.softmax(torch.stack(output_encdec_base.scores),dim = -1).transpose(0,1)
softmax_scores_encdec_rand = F.softmax(torch.stack(output_encdec_rand.scores),dim = -1).transpose(0,1)

# Top-10 token ids per generated position. The first return value holds softmax
# probabilities (not log-probabilities) and is not used further.
log_prob, index_encdec_base = softmax_scores_encdec_base.topk(10)
log_prob, index_encdec_rand = softmax_scores_encdec_rand.topk(10)
log_prob, index_encdec = softmax_scores_encdec.topk(10)
log_prob, index_dec = softmax_scores_dec.topk(10)


# Truncate all runs to the shortest generation. The token axis is dimension 1;
# len() would return the batch size (dimension 0) and truncate at the wrong length.
min_token = min(
    index_encdec_base.shape[1],
    index_encdec.shape[1],
    index_dec.shape[1],
    index_encdec_rand.shape[1],
)
index_encdec_base = index_encdec_base[:,:min_token,:]
index_encdec_rand = index_encdec_rand[:,:min_token,:]
index_encdec = index_encdec[:,:min_token,:]
index_dec = index_dec[:,:min_token,:]

In [None]:
import numpy as np
from eval_utils.rbo import rbo

def get_ref_logprobs(softmax_scores, first_ref_token_idx):
    """Look up the score of each sample's first reference token at the first generated position.

    Inputs:
    softmax_scores: Tensor of shape (batch_size, num_tokens, vocab_size), i.e. the
        transposed stack of per-step generation scores.
    first_ref_token_idx: One vocabulary id per batch element (the first reference token).

    Returns a list of Python floats, one per batch element, holding
    ``softmax_scores[b][0][first_ref_token_idx[b]]``. The values are whatever the
    input contains (probabilities when it was softmax-normalised).
    """
    return [
        sample_scores[0][token_id].item()
        for sample_scores, token_id in zip(softmax_scores, first_ref_token_idx)
    ]

def calculate_logit_stats(tensor1, tensor2):
    """Compare two sets of top-k token-id rankings position by position.

    Inputs:
    tensor1, tensor2: integer tensors indexed as [sample, position, rank]; both are
        iterated over the first two dimensions of ``tensor1``.

    Returns:
    (jaccard_similarities, rbo_data): two numpy arrays of shape
    (tensor1.shape[0], tensor1.shape[1]). The first holds the Jaccard similarity of
    the two id sets, the second the ``ext`` value of rank-biased overlap with p=0.9.
    """
    # Imported locally under a private alias: notebook cells rebind the global name
    # `rbo` to the score array returned by this function, after which a call through
    # the global would fail with "'numpy.ndarray' object is not callable".
    from eval_utils.rbo import rbo as _rbo_fn

    n_samples, n_positions = tensor1.shape[0], tensor1.shape[1]
    jaccard_similarities = np.zeros((n_samples, n_positions))
    rbo_data = np.zeros((n_samples, n_positions))

    for i in range(n_samples):
        for j in range(n_positions):
            # Ranked id lists for this position (order matters for RBO).
            ranking1 = list(tensor1[i, j].numpy())
            ranking2 = list(tensor2[i, j].numpy())

            # RBO
            rbo_data[i, j] = _rbo_fn(ranking1, ranking2, p = 0.9).ext

            # Jaccard similarity on the unordered id sets; 0 when both are empty.
            set1, set2 = set(ranking1), set(ranking2)
            union = len(set1 | set2)
            jaccard_similarities[i, j] = len(set1 & set2) / union if union != 0 else 0

    return jaccard_similarities, rbo_data

# NOTE(review): in the visible part of the notebook `index_enc_dec2` / `index_dec2` are
# only assigned in a later cell, so this call raises NameError on a fresh
# top-to-bottom run. The assignment below also rebinds the name `rbo` from the
# imported function to the array of RBO scores.
jaccard, rbo = calculate_logit_stats(index_enc_dec2, index_dec2)
#jaccard, rbo = calculate_logit_stats(index_encdec, index_dec)

print(jaccard.mean())
print(rbo.mean())

In [None]:
import pandas as pd
df = pd.read_pickle("checkpoints/eval_output.pkl")

In [None]:
import torch

# Generation logits of the first three evaluation samples, enc-dec and decoder-only.
n_samples = 3
enc_dec_per_sample = [torch.tensor(df.gen_logits[i]) for i in range(n_samples)]
dec_per_sample = [torch.tensor(df.decoder_gen_logits[i]) for i in range(n_samples)]

# torch.stack needs equal lengths, so cut every sample (both variants) to the
# shortest generation instead of only cutting the decoder side to sample 0's length.
common_len = min(len(sample) for sample in enc_dec_per_sample + dec_per_sample)
enc_dec_logits = torch.stack([sample[:common_len] for sample in enc_dec_per_sample])
dec_logits = torch.stack([sample[:common_len] for sample in dec_per_sample])


# Top-10 token ids per position; the returned values themselves are not used.
log_prob, index_enc_dec2 = enc_dec_logits.topk(10)
log_prob, index_dec2 = dec_logits.topk(10)

jaccard, rob = calculate_logit_stats(index_enc_dec2, index_dec2)
print(jaccard.mean())
print(rob.mean())

In [None]:
index_dec.shape

In [None]:
# NOTE(review): `index_enc_dec` is not assigned in the visible cells (only
# `index_encdec` and `index_enc_dec2` are) — confirm which one was meant.
index_enc_dec.numpy()

In [None]:
calculate_logit_stats(index_encdec, index_dec)

In [None]:
rbo.mean()

In [None]:
input_tensor = loss_batch["labels"]

# For each row keep the first label id that is neither -100 nor 25.
# NOTE(review): presumably -100 is the ignored-label marker and 25 a prompt
# separator token id — confirm against the collator.
first_ref_token_idx = []
for label_row in input_tensor:
    candidates = [value.item() for value in label_row if value not in (-100, 25)]
    first_ref_token_idx.append(candidates[0])

# Score of that token at the first generated position, for each of the four runs.
ref_logprob_encdec = get_ref_logprobs(softmax_scores_encdec, first_ref_token_idx)
ref_logprob_dec = get_ref_logprobs(softmax_scores_dec, first_ref_token_idx)
ref_logprob_encdec_base = get_ref_logprobs(softmax_scores_encdec_base, first_ref_token_idx)
ref_logprob_encdec_rand = get_ref_logprobs(softmax_scores_encdec_rand, first_ref_token_idx)

print("Logprob of first ref token:")
for ref_values in (
    ref_logprob_encdec,
    ref_logprob_dec,
    ref_logprob_encdec_base,
    ref_logprob_encdec_rand,
):
    print(ref_values)
print("--------------------------------------------------")

## Analyzing how the encoder signal flows in the model
---
- Created a plot to show how different the activations are between the encdec and the dec-only forward pass
- We see that the differences are there but they are not that pronounced 
- This shows that the signals are indeed affecting the model and are even becoming more prominent towards the end but overall the magnitude of the changes is too low

In [None]:
from transformers import AutoModel, AutoTokenizer, AutoConfig
from eval_utils.loading_utils import load_encdec_model, load_batches_from_evaldf
from transformers import AutoModel
from modeling_gpt2 import GPT2LMHeadModel
import pandas as pd

# Pretrained encoder used as the "not fine-tuned" reference.
enc_base = AutoModel.from_pretrained("roberta-base")
enc_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

#config = AutoConfig.from_pretrained("checkpoints/gpt2medium-q-enc-squad_run_rich-night-520/checkpoint-5475").decoder
#dec_base = GPT2LMHeadModel.from_pretrained("gpt2-medium", config = config)

# Checkpoint under inspection; the alternative run is kept commented out.
checkpoint_path = "checkpoints/gpt2medium-q-enc-squad_run_rich-night-520/checkpoint-5475"
#checkpoint_path = "checkpoints/run_divine-field-529/checkpoint-5475"

# Rebinds `enc_tokenizer` with the tokenizer returned for the checkpoint.
model, enc_tokenizer, dec_tokenizer, train_conf = load_encdec_model(checkpoint_path, AutoModel, GPT2LMHeadModel)
# Replaces the returned decoder tokenizer with a stock gpt2-medium tokenizer.
dec_tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")

In [None]:
import pandas as pd
from eval_utils.loading_utils import load_batches_from_evaldf

df = pd.read_pickle("checkpoints/gpt2medium-q-enc-squad_run_rich-night-520/eval_output.pkl")

loss_batch, gen_batch, dec_loss_batch, dec_gen_batch = load_batches_from_evaldf(df, 0)

In [None]:
# Switch to training mode and immediately back; the model ends up in eval mode.
model = model.train()
model = model.eval()

In [None]:
from nethook import TraceDict
import torch

# Decoder modules whose dotted name has exactly three parts, prefixed with
# "decoder." so the names resolve from the top-level model.
# NOTE(review): presumably these are the transformer blocks ("transformer.h.<i>");
# "decoder.lm_head" is NOT part of this list.
layer_names = ["decoder."+n for n,m in model.decoder.named_modules() if len(n.split(".")) == 3]

# Enc-dec forward pass: keep each traced layer's output tensor
# (first element when the layer returns a tuple).
with TraceDict(model, layer_names) as ret:
    _ = model(**gen_batch)
outputs = {k:v.output if isinstance(v.output, torch.Tensor) else v.output[0] for k,v in ret.items() if hasattr(v, "output")}


# Decoder-only forward pass traced on the same layers.
with TraceDict(model, layer_names) as ret_dec:
    _ = model.decoder(**dec_gen_batch)
outputs_dec = {k:v.output if isinstance(v.output, torch.Tensor) else v.output[0] for k,v in ret_dec.items() if hasattr(v, "output")}


In [None]:
import torch.nn.functional as F

# Turn the traced lm_head logits into probabilities so they are compared on the
# same scale as the other activations. The head is only present when
# "decoder.lm_head" was included in the traced layer names, so skip it otherwise
# instead of raising KeyError.
# NOTE: re-running this cell applies softmax again to already-normalised values;
# re-run the tracing cell first.
for traced_outputs in (outputs, outputs_dec):
    if "decoder.lm_head" in traced_outputs:
        traced_outputs["decoder.lm_head"] = F.softmax(traced_outputs["decoder.lm_head"], dim = -1)

In [None]:
#The abs mean diff between dec only activations and encdec forward pass
output_diff= {k: (v-outputs[k]).abs().mean(dim = -1)  for k,v in outputs_dec.items()}

# The abs mean diff for each activation in the model
differences = [diff[0].detach() for diff in output_diff.values()]
differences = torch.stack(differences).numpy().T

# The text for each token in the sequence
tokens = dec_tokenizer.batch_decode(dec_gen_batch["input_ids"][0])

In [None]:
from matplotlib import pyplot as plt

# Heat-map of the activation differences: one row per token, one column per traced layer.
fig, ax = plt.subplots(figsize=(3.5, 2), dpi=200)
h = ax.pcolor(differences, cmap="Purples", vmin=0.0)
ax.invert_yaxis()
ax.set_yticks([0.5 + i for i in range(len(differences))])
# Label every second column. Ticks and labels are derived from the same index list
# so their counts always match, also for an odd number of columns (for example
# when an lm_head column is appended).
layer_ticks = list(range(0, differences.shape[1], 2))
ax.set_xticks([0.5 + i for i in layer_ticks])
ax.set_xticklabels(layer_ticks, fontsize = 7)
ax.set_yticklabels(tokens, fontsize = 7)
ax.set_xlabel("Layer id", fontsize = 7)
ax.set_title("Difference in layer activations", fontsize = 10)
cb = plt.colorbar(h)
cb.ax.tick_params(labelsize=7)
plt.show()

In [None]:
from nethook import TraceDict
import torch

# Decoder modules whose dotted name has exactly four parts (one level below the
# three-part names), plus the language-model head.
layer_names = ["decoder."+n for n,m in model.decoder.named_modules() if len(n.split(".")) == 4] + ["decoder.lm_head"]

# Enc-dec forward pass: keep each traced layer's output tensor
# (first element when the layer returns a tuple).
with TraceDict(model, layer_names) as ret:
    _ = model(**gen_batch)
outputs = {k:v.output if isinstance(v.output, torch.Tensor) else v.output[0] for k,v in ret.items() if hasattr(v, "output")}


# Decoder-only forward pass traced on the same layers.
with TraceDict(model, layer_names) as ret_dec:
    _ = model.decoder(**dec_gen_batch)
outputs_dec = {k:v.output if isinstance(v.output, torch.Tensor) else v.output[0] for k,v in ret_dec.items() if hasattr(v, "output")}


In [None]:
import pandas as pd

df = pd.read_pickle("checkpoints/eval_output2.pkl")

In [None]:
df["gen_batch"][0]["input_ids"].shape

In [None]:
df[df.f1!=0]

In [None]:
outputs_dec["decoder.lm_head"][:,-1,:].argmax(-1)

In [None]:
# Mean absolute difference per traced layer between decoder-only and enc-dec
# activations, then averaged once more over the last remaining dimension for printing.
output_diff= {k: (v-outputs[k]).abs().mean(dim = -1)  for k,v in outputs_dec.items()}
for k,v in output_diff.items():
    print(f"{k}: {v.mean(dim=-1)}")

 ## Analyzing Differences in loss 
 - Even if we randomly initialize the cross attention modules and use a default pretrained encoder the loss is barely affected
 - This is true no matter where the crossattn block is positioned, i.e. after attention, after mlp, or after everything (final)


In [None]:
from transformers import AutoModel, AutoTokenizer, AutoConfig
from eval_utils.loading_utils import load_encdec_model, load_batches_from_evaldf
from transformers import AutoModel
from modeling_gpt2 import GPT2LMHeadModel
import pandas as pd

# Pretrained encoder used as the "not fine-tuned" reference.
enc_base = AutoModel.from_pretrained("roberta-base")
enc_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

#config = AutoConfig.from_pretrained("checkpoints/gpt2medium-q-enc-squad_run_rich-night-520/checkpoint-5475").decoder
#dec_base = GPT2LMHeadModel.from_pretrained("gpt2-medium", config = config)

# NOTE(review): the model load is commented out, so later cells in this section use
# whatever `model` is left in the kernel from an earlier cell.
#model, enc_tokenizer, dec_tokenizer, train_conf = load_encdec_model("checkpoints/run_divine-field-529/checkpoint-5475", AutoModel, GPT2LMHeadModel)

from tokenizers.processors import TemplateProcessing
dec_tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")
# Reuse the EOS token as the padding token.
dec_tokenizer.pad_token = dec_tokenizer.eos_token
# Wrap every single sequence as "<bos> text <eos>" during tokenization.
dec_tokenizer._tokenizer.post_processor = TemplateProcessing(
    single= dec_tokenizer.bos_token + " $A " + dec_tokenizer.eos_token,
    special_tokens=[(dec_tokenizer.bos_token, dec_tokenizer.bos_token_id),(dec_tokenizer.eos_token, dec_tokenizer.eos_token_id)],
)

In [None]:
import pandas as pd
from eval_utils.loading_utils import load_batches_from_evaldf

df = pd.read_pickle("checkpoints/run_divine-field-529/eval_output.pkl")

loss_batch, gen_batch, dec_loss_batch, dec_gen_batch = load_batches_from_evaldf(df, 0)

In [None]:
from train_utils.utils import prepare_dataset, prompt_qa, prompt_qc_enc
from datasets import load_from_disk

from train_utils.encoder import PrefixEncoder

# Prefix encoder and its tokenizer restored from a sentence-encoder checkpoint.
enc_model, enc_tokenizer = PrefixEncoder.from_sentenc_checkpoint("checkpoints/2725_prefix10/")

dataset = load_from_disk("squad_with_answer_sentence")
# Flatten the answers field to the text of the first listed answer.
dataset = dataset.map(lambda x: {"answers": x["answers"]["text"][0]})
# Strip surrounding whitespace from every column (all values must be strings here).
dataset = dataset.map(lambda x: {k:v.strip() for k,v in x.items()})

# Tokenize prompts for the decoder and the answer sentence for the encoder; the
# original columns are dropped.
# NOTE(review): the checkpoint directory is named "prefix10" while num_prefix_token
# is 5 — confirm the two are meant to differ.
df_qca = dataset.map(prepare_dataset, 
                     fn_kwargs={"prompt": prompt_qa, 
                                "tokenizer": dec_tokenizer, 
                                "create_labels" : False, 
                                "enc_tokenizer": enc_tokenizer, 
                                "context_enc": True, 
                                "context_column": "answer_sentence",
                                "answer_column": "answers",
                                "enc_prompt": prompt_qc_enc,
                                "num_prefix_token": 5}, 
                     batched=True, 
                     remove_columns=dataset["train"].column_names)
from train_utils.utils import CustomCollator
from torch.utils.data import DataLoader
collate_fn = CustomCollator(dec_tokenizer, enc_tokenizer = enc_tokenizer)

In [None]:
from datasets import load_dataset
from train_utils.eval import EvalCollator
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")
# Reuse the EOS token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
enc_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

dataset = load_dataset("squad")
# Flatten the answers field to the text of the first listed answer.
dataset = dataset.map(lambda x: {"answers": x["answers"]["text"][0]})
# NOTE(review): `DataLoader` is not imported in this cell (it relies on an earlier
# cell), and no "qc" column is created on `dataset` here — confirm how EvalCollator
# resolves context_column="qc".
loader = DataLoader(dataset["train"], batch_size = 3, collate_fn = EvalCollator(tokenizer,
                                                                                enc_tokenizer, 
                                                                                mode = "q", 
                                                                                context_enc = True, 
                                                                                cover_labels=True, 
                                                                                context_column = "qc",
                                                                                answer_column = "answers",
                                                                                num_prefix_token = 5))


In [None]:
loss_batch, gen_batch, answers = next(iter(loader))

In [None]:
gen_batch

In [None]:
model.encoder.num_prefix_token

In [None]:
enc_tokenizer.batch_decode(df_qca["validation"].to_pandas().iloc[0].input_ids)

In [None]:
# Training-style loader over the prepared dataset; rebinds `loader` and pulls the first batch.
loader = DataLoader(df_qca["train"], batch_size = 4, collate_fn=collate_fn)
batch = next(iter(loader))

In [None]:
# In training mode, print the loss of the first 11 batches (indices 0 through 10).
model = model.train()
for i , batch in enumerate(loader):
    print(model(**batch).loss)
    if i == 10:break

In [None]:
# Loss on the first 10 stored evaluation entries; the batch variables are rebound on
# every iteration, so they hold entry 9 afterwards.
for i in range(10):
    loss_batch, gen_batch, dec_loss_batch, dec_gen_batch = load_batches_from_evaldf(df, i)
    print(model(**loss_batch).loss)

In [None]:
batch["decoder_input_ids"]

In [None]:
loss_batch["decoder_input_ids"]

In [None]:
model = model.train()
out = model(**loss_batch)
out.loss

In [None]:
batch["decoder_input_ids"]

In [None]:
loss_batch["decoder_input_ids"]

In [None]:
# Training mode: compare the enc-dec loss with the decoder-only loss on the same
# example, then show the per-position argmax tokens of the enc-dec logits.
model.train()
out = model(**loss_batch)
out_dec = model.decoder(**dec_loss_batch)
print(out.loss)
print(out_dec.loss)
dec_tokenizer.batch_decode(out.logits.argmax(dim = -1))

In [None]:
# Dropout probability applied to the encoder and to the decoder's cross-attention.
dropout_v = 0.1

# Encoder: embedding dropout plus the three dropout modules of every layer.
model.encoder.embeddings.dropout.p = dropout_v
for enc_layer in model.encoder.encoder.layer:
    enc_layer.attention.self.dropout.p = dropout_v
    enc_layer.attention.output.dropout.p = dropout_v
    enc_layer.output.dropout.p = dropout_v

# Decoder: set BOTH cross-attention dropouts on every block that has a
# cross-attention module. The previous loop ignored its index and only touched
# attn_dropout of block 7 and resid_dropout of block 8.
for dec_block in model.decoder.transformer.h:
    cross_attn = getattr(dec_block, "crossattention", None)
    if cross_attn is None:
        continue
    cross_attn.attn_dropout.p = dropout_v
    cross_attn.resid_dropout.p = dropout_v

In [None]:
# Eval mode: same comparison of enc-dec loss versus decoder-only loss, followed by
# the per-position argmax tokens of the enc-dec logits.
model.eval()
out = model(**loss_batch)
out_dec = model.decoder(**dec_loss_batch)
print(out.loss)
print(out_dec.loss)
dec_tokenizer.batch_decode(out.logits.argmax(dim = -1))

In [None]:
# Generation with the model in eval mode (up to 30 new tokens, stopping at EOS).
model.eval()
out = model.generate(**gen_batch, max_new_tokens=30, eos_token_id = dec_tokenizer.eos_token_id)
dec_tokenizer.batch_decode(out, skip_special_tokens=True)

In [None]:
# Same generation call with the model left in training mode, for comparison with
# the eval-mode output.
model.train()
out = model.generate(**gen_batch, max_new_tokens=30, eos_token_id = dec_tokenizer.eos_token_id)
dec_tokenizer.batch_decode(out, skip_special_tokens=True)

In [None]:
from transformers import AutoModel, AutoTokenizer, AutoConfig
from modeling_gpt2 import GPT2LMHeadModel

# Decoder blocks that receive the knowledge / cross-attention module.
layer_ids = [1,2,3,4,5,6,7,8,9,10]

config = AutoConfig.from_pretrained("gpt2")
# NOTE(review): the know_* / hidden_dropout fields look like project-specific options
# consumed by the custom GPT2LMHeadModel — confirm their meaning there.
config.know_layer = layer_ids
config.know_type = "crossattn"
config.hidden_dropout = 0.0
config.know_pos = "final"
config.know_norm = "var"
dec_model = GPT2LMHeadModel.from_pretrained("gpt2", config = config)

# Pretrained encoder supplying the hidden states that are cross-attended to.
enc_base = AutoModel.from_pretrained("roberta-base")

dec_tokenizer = AutoTokenizer.from_pretrained("gpt2")

In [None]:
import pandas as pd
from eval_utils.loading_utils import load_batches_from_evaldf

df = pd.read_pickle("checkpoints/gpt2medium-q-enc-squad_run_rich-night-520/eval_output.pkl")

loss_batch, gen_batch, dec_loss_batch, dec_gen_batch = load_batches_from_evaldf(df, 0)

In [None]:
# Override the cross-attention position/norm on every knowledge layer after loading
# (the config above requested know_pos = "final").
for i in layer_ids:
    dec_model.transformer.h[i].cross_attn_pos = "mlp"
    dec_model.transformer.h[i].cross_attn_norm = "var"

hidden_states = enc_base(input_ids = loss_batch["input_ids"], attention_mask = loss_batch["attention_mask"]).last_hidden_state
# Scaling knob for the encoder signal; a factor of 1 leaves the values unchanged.
hidden_states = hidden_states * 1

# Loss with the encoder states fed to the cross-attention layers.
loss_encdec = dec_model(input_ids = loss_batch["decoder_input_ids"],
          attention_mask = loss_batch["decoder_attention_mask"],
          encoder_hidden_states = hidden_states,
          encoder_attention_mask = loss_batch["attention_mask"],
          labels = loss_batch["labels"]).loss

# Loss of the same decoder without any encoder input.
out = dec_model(**dec_loss_batch)
loss_dec = out.loss

print(loss_encdec)
print(loss_dec)

In [None]:
out = dec_model.generate(**dec_gen_batch, max_new_tokens=30, eos_token_id = dec_tokenizer.eos_token_id, return_dict_in_generate = True, output_scores = True)
dec_tokenizer.batch_decode(out.sequences, skip_special_tokens=True)

In [None]:
out = dec_model.generate(input_ids = dec_gen_batch["input_ids"], attention_mask = dec_gen_batch["attention_mask"], encoder_hidden_states = hidden_states, encoder_attention_mask = gen_batch["attention_mask"], max_new_tokens=30, eos_token_id = dec_tokenizer.eos_token_id)
dec_tokenizer.batch_decode(out, skip_special_tokens=True)

In [None]:
from transformers import AutoConfig
checkpoint_path = "checkpoints/run_divine-field-529/checkpoint-5475/"
conf = AutoConfig.from_pretrained(checkpoint_path)

In [None]:
import pandas as pd 

df = pd.read_pickle("checkpoints/eval_output.pkl")

In [None]:
df[df.exact_match != 0]

In [None]:
model.enc_to_dec_proj.bias

In [None]:
def prepare_model_parameters(model):
    """Split the parameters of an encoder-decoder model into six optimizer groups.

    Encoder parameters are "nodecay" when their name contains "bias" or
    "LayerNorm". Decoder parameters are "cross" when their name contains "cross" or
    "attn_gate" (otherwise "backbone") and "nodecay" when it contains "bias" or
    "ln". If the model has an ``enc_to_dec_proj`` attribute, its bias and weight are
    appended to the decoder cross nodecay / decay groups.

    Returns a dict with the keys decoder_backbone_decay, decoder_backbone_nodecay,
    decoder_cross_decay, decoder_cross_nodecay, encoder_decay and encoder_nodecay,
    each mapping to a list of parameters in ``named_parameters`` order.
    """
    def _has_any(name, markers):
        return any(marker in name for marker in markers)

    groups = {
        "decoder_backbone_decay": [],
        "decoder_backbone_nodecay": [],
        "decoder_cross_decay": [],
        "decoder_cross_nodecay": [],
        "encoder_decay": [],
        "encoder_nodecay": [],
    }

    for name, param in model.encoder.named_parameters():
        decay_kind = "nodecay" if _has_any(name, ("bias", "LayerNorm")) else "decay"
        groups[f"encoder_{decay_kind}"].append(param)

    for name, param in model.decoder.named_parameters():
        part = "cross" if _has_any(name, ("cross", "attn_gate")) else "backbone"
        decay_kind = "nodecay" if _has_any(name, ("bias", "ln")) else "decay"
        groups[f"decoder_{part}_{decay_kind}"].append(param)

    # The projection between encoder and decoder is trained with the cross-attention group.
    if hasattr(model, "enc_to_dec_proj"):
        groups["decoder_cross_nodecay"].append(model.enc_to_dec_proj.bias)
        groups["decoder_cross_decay"].append(model.enc_to_dec_proj.weight)
    return groups


def get_group_params(train_type:str, param_dict):
    """Collect the decay / no-decay parameters selected by ``train_type``.

    A bucket family is selected when its tag is a substring of ``train_type``
    or when ``train_type == "full"``: ``"enc"`` selects the encoder buckets,
    ``"cross"`` the decoder cross buckets and ``"dec"`` the decoder backbone
    buckets.

    Selected buckets are *removed* from ``param_dict`` (the caller's dict is
    mutated), so a later call on the same dict cannot hand out the same
    parameters again. Buckets that are missing contribute nothing.

    Args:
        train_type: string describing which parts to train.
        param_dict: dict of bucket name -> list of parameters.

    Returns:
        ``(params_decay, params_nodecay)`` as two lists.
    """
    take_all = train_type == "full"
    # (tag looked up in train_type, decay bucket key, no-decay bucket key)
    bucket_table = (
        ("enc", "encoder_decay", "encoder_nodecay"),
        ("cross", "decoder_cross_decay", "decoder_cross_nodecay"),
        ("dec", "decoder_backbone_decay", "decoder_backbone_nodecay"),
    )

    params_decay, params_nodecay = [], []
    for tag, decay_key, nodecay_key in bucket_table:
        if not (tag in train_type or take_all):
            continue
        params_decay.extend(param_dict.pop(decay_key, []))
        params_nodecay.extend(param_dict.pop(nodecay_key, []))
    return params_decay, params_nodecay

def get_g1_g2(model, train_type1, train_type2):
    """Build the decay / no-decay parameter lists for two training groups.

    The model's parameter buckets are computed once and shared by both
    lookups. Group 1 is resolved first; every bucket it takes is removed from
    the shared dict, so group 2 only receives buckets group 1 did not claim.

    Returns:
        ``(g1_decay, g1_nodecay, g2_decay, g2_nodecay)``.
    """
    shared_buckets = prepare_model_parameters(model)
    first_group = get_group_params(train_type1, shared_buckets)
    second_group = get_group_params(train_type2, shared_buckets)
    return (*first_group, *second_group)


# Both groups request "full". get_group_params pops every bucket it hands out,
# so group 1 receives all parameters and both g2 lists come back empty.
# NOTE(review): relies on `model` being defined by an earlier cell.
g1_decay_params, g1_nodecay_params, g2_decay_params, g2_nodecay_params = get_g1_g2(model,"full", "full")

from torch.optim import AdamW
from optim_scheduler import DifferentialAlignmentSchedulerWithZeroPeriodLRFixed



def get_optimizer_and_scheduler(optim_args,g1_decay_params, g1_nodecay_params, g2_decay_params, g2_nodecay_params):
    """Create an AdamW optimizer with four parameter groups plus its scheduler.

    Optimizer group order: g1 decay, g1 no-decay, g2 decay, g2 no-decay.
    Decay groups use ``optim_args.weight_decay``; no-decay groups use ``0.0``.

    Args:
        optim_args: object exposing ``weight_decay``, ``warmup_steps_g1``,
            ``warmup_steps_g2``, ``lr_g1``, ``total_steps``,
            ``zero_period_steps``, ``zero_period_lr`` and ``decay_type``.
        g1_decay_params / g1_nodecay_params: parameters of the first group.
        g2_decay_params / g2_nodecay_params: parameters of the second group.

    Returns:
        ``(optimizer, scheduler)``.
    """
    # Every group starts at this placeholder learning rate.
    # NOTE(review): presumably the scheduler sets the real per-group rates — confirm in optim_scheduler.
    placeholder_lr = 1e-9

    # (parameters, weight decay) per optimizer group, in group order.
    group_layout = (
        (g1_decay_params, optim_args.weight_decay),
        (g1_nodecay_params, 0.0),
        (g2_decay_params, optim_args.weight_decay),
        (g2_nodecay_params, 0.0),
    )
    optimizer = AdamW(
        [
            {"params": group_params, "weight_decay": group_decay, "lr": placeholder_lr}
            for group_params, group_decay in group_layout
        ]
    )

    schedule_settings = {
        "warmup_steps_g1": optim_args.warmup_steps_g1,
        "warmup_steps_g2": optim_args.warmup_steps_g2,
        "lr_g1": optim_args.lr_g1,
        "total_steps": optim_args.total_steps,
        "zero_period_steps": optim_args.zero_period_steps,
        "zero_period_lr": optim_args.zero_period_lr,
        "decay": optim_args.decay_type,
    }
    scheduler = DifferentialAlignmentSchedulerWithZeroPeriodLRFixed(optimizer, **schedule_settings)
    return optimizer, scheduler

In [None]:
import pandas as pd

# Load a second pickled evaluation output.
# NOTE(review): this rebinds `df`, replacing whatever an earlier cell stored under that name;
# unpickling can execute arbitrary code, so only read trusted files.
df = pd.read_pickle("checkpoints/eval_output_548.pkl")

# Report the mean of the `exact_match` and `f1` columns.
print(df.exact_match.mean())
print(df.f1.mean())

In [None]:
# Hand-recorded numbers; the dict is only displayed, never assigned to a name.
# NOTE(review): keys look like train_type1 + train_type2 as passed to get_g1_g2 and values like
# (mean exact_match, mean f1) — confirm before relying on them.
{"crossenccrossenc": (0.127,0.199), "fullfull": (0.127,0.22), "crosscross": (0.142,0.236), "crossfull": (0.127,0.2157), "crosscrossenc": (0.152, 0.25)}

In [None]:
from types import SimpleNamespace

# Optimizer / scheduler settings for a dry run.
# zero_period_steps (5475) is exactly half of total_steps (10950).
optim_args = SimpleNamespace(
    weight_decay=0.01,
    warmup_steps_g1=500,
    warmup_steps_g2=1000,
    zero_period_steps=5475,
    total_steps=10950,
    lr_g1=1e-4,
    zero_period_lr=0,
    decay_type="linear",
)

# Build the optimizer and scheduler from the parameter lists created in an
# earlier cell; the resulting (optimizer, scheduler) pair is the cell output.
get_optimizer_and_scheduler(
    optim_args,
    g1_decay_params,
    g1_nodecay_params,
    g2_decay_params,
    g2_nodecay_params,
)

In [None]:
from transformers import AutoModel, AutoTokenizer, AutoConfig
from eval_utils.loading_utils import load_encdec_model, load_batches_from_evaldf
from transformers import AutoModel
from modeling_gpt2 import GPT2LMHeadModel
import pandas as pd

#enc_base = AutoModel.from_pretrained("roberta-base")
#enc_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

#config = AutoConfig.from_pretrained("checkpoints/gpt2medium-q-enc-squad_run_rich-night-520/checkpoint-5475").decoder
#dec_base = GPT2LMHeadModel.from_pretrained("gpt2-medium", config = config)

# Load model, both tokenizers and the training config from the checkpoint directory,
# passing AutoModel and GPT2LMHeadModel as the model classes.
model, enc_tokenizer, dec_tokenizer, train_conf = load_encdec_model("checkpoints/run561_checkpoint-10950/", AutoModel, GPT2LMHeadModel)

from tokenizers.processors import TemplateProcessing
# The decoder tokenizer returned above is discarded and replaced by a fresh gpt2-medium tokenizer.
dec_tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")
# Reuse the EOS token as the padding token.
dec_tokenizer.pad_token = dec_tokenizer.eos_token
# Wrap every single sequence as "<bos> sequence <eos>" when encoding.
# NOTE(review): this writes to the private `_tokenizer` attribute of the tokenizer object.
dec_tokenizer._tokenizer.post_processor = TemplateProcessing(
    single= dec_tokenizer.bos_token + " $A " + dec_tokenizer.eos_token,
    special_tokens=[(dec_tokenizer.bos_token, dec_tokenizer.bos_token_id),(dec_tokenizer.eos_token, dec_tokenizer.eos_token_id)],
)

In [None]:
from train_utils.eval import evaluate

# Run the project's evaluation with max_batches=200 and save_logits=False.
# NOTE(review): the positional arguments ("squad_with_answer_sentence", "q", True, False, 1,
# "answer_sentence", "answers", False) are not self-describing — check train_utils.eval.evaluate
# for their meaning and consider passing them by keyword.
output = evaluate(model, dec_tokenizer, enc_tokenizer, "squad_with_answer_sentence", "q", True, False, 1, "answer_sentence", "answers", False, max_batches=200, save_logits=False)

In [None]:
# Decode the "decoder_input_ids" stored in entry 0 of `output.gen_batch` back to text.
dec_tokenizer.batch_decode(output.gen_batch[0]["decoder_input_ids"])

In [None]:
# Inspect the entry at positional index 6 of the evaluation output.
output.iloc[6]

In [None]:
import torch
# Convert every value of `input` to a tensor and add a leading dimension of size 1.
# NOTE(review): `input` shadows the Python builtin and must already be a dict-like object created
# by a cell that is not visible here; on a fresh kernel this line fails, and re-running the cell
# adds another leading dimension each time.
input = {k:torch.tensor(v).unsqueeze(0) for k,v in input.items()}

In [None]:
# Generate up to 30 new tokens from the tensors in `input`, using the decoder tokenizer's EOS id as the end-of-sequence id.
model.generate(**input, max_new_tokens=30, eos_token_id = dec_tokenizer.eos_token_id)

In [None]:
from evaluate import load

# Load the "bleu" metric through the `load` function imported from `evaluate` above.
bleu = load("bleu")

In [None]:
# Quick check of the metric on a single prediction / reference pair.
bleu.compute(predictions =["a "], references=["abc"], )