In [3]:
from typing import Dict 
import transformers

# Adjusted from: https://github.com/tatsu-lab/stanford_alpaca/blob/main/train.py#L65
def smart_tokenizer_and_embedding_resize(
    special_tokens_dict: Dict,
    tokenizer: "transformers.PreTrainedTokenizer",
    model: "transformers.PreTrainedModel",
    cls_update: bool = False,
):
    """Add special tokens to the tokenizer and resize the model embeddings.

    The rows created for the new tokens are initialized in place:

    * input embeddings: with the CLS embedding when ``cls_update`` is true and
      the tokenizer has a ``cls_token_id``; otherwise with the mean of the
      pre-existing rows;
    * output embeddings (if the model has any): with the mean of the
      pre-existing output rows.

    Args:
        special_tokens_dict: Passed unchanged to ``tokenizer.add_special_tokens``.
        tokenizer: Tokenizer to extend (modified in place).
        model: Model whose embeddings are resized to ``len(tokenizer)``
            (modified in place).
        cls_update: Initialize new input rows from the CLS embedding instead
            of the vocabulary mean.

    Returns:
        None.
    """
    # Type annotations above are quoted so they are not evaluated when the
    # function is defined.
    num_new_tokens = tokenizer.add_special_tokens(special_tokens_dict)
    model.resize_token_embeddings(len(tokenizer))

    if num_new_tokens <= 0:
        return

    input_embeddings = model.get_input_embeddings().weight.data
    if cls_update and tokenizer.cls_token_id is not None:
        # For the prefix encoder set CLS embed for new prefix token.
        # clone() so the source row is a copy rather than a view of the
        # tensor being written to.
        cls_embed = input_embeddings[tokenizer.cls_token_id].clone()
        input_embeddings[-num_new_tokens:] = cls_embed
    else:
        input_embeddings_avg = input_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
        input_embeddings[-num_new_tokens:] = input_embeddings_avg

    # Encoder models may not have output embeds. Compare against None
    # explicitly instead of relying on the truthiness of a module object.
    output_embeds = model.get_output_embeddings()
    if output_embeds is None:
        return

    output_embeddings = output_embeds.weight.data
    if output_embeddings.data_ptr() == input_embeddings.data_ptr():
        # Output weights share storage with the input weights (tied). The new
        # rows were already initialized above; averaging here would overwrite
        # the CLS initialization requested through ``cls_update``.
        return

    output_embeddings_avg = output_embeddings[:-num_new_tokens].mean(dim=0, keepdim=True)
    output_embeddings[-num_new_tokens:] = output_embeddings_avg

import pandas as pd

def prepare_dataset(df_pd):
    """Flatten answers and attach same-context distractor answers.

    The input frame is left untouched; all work happens on a copy, so the
    function can be called repeatedly on the same frame.

    Args:
        df_pd: DataFrame with at least the columns ``id``, ``title``,
            ``context`` and ``answers``, where every ``answers`` value
            supports ``value["text"][0]``.

    Returns:
        A new DataFrame without ``id`` and ``title`` in which

        * ``answers`` holds the first answer text of each row, and
        * ``false_answers`` holds the list of answers of the other rows that
          share the same ``context`` and whose answer text differs from the
          row's own answer.

    Raises:
        KeyError: If ``id`` or ``title`` is missing (raised by ``drop``).
    """
    # Copy first: assigning columns on the argument would silently rewrite the
    # caller's DataFrame and make a second call fail on the flattened answers.
    prepared = df_pd.copy()
    prepared["answers"] = prepared["answers"].apply(lambda answer: answer["text"][0])

    # All answers that belong to the same context, in row order.
    answers_by_context = (
        prepared.groupby("context", sort=False)["answers"].apply(list).to_dict()
    )

    # Wrap in a Series so each list is stored as a single object cell.
    prepared["false_answers"] = pd.Series(
        [
            [other for other in answers_by_context[context] if other != answer]
            for context, answer in zip(prepared["context"], prepared["answers"])
        ],
        index=prepared.index,
        dtype=object,
    )
    return prepared.drop(["id", "title"], axis=1)

def create_classification_records(df, context_column = "context"):
    """Expand a QA frame into labelled (context, question, answer) rows.

    ``df`` is first passed through ``prepare_dataset``. For every resulting
    row, at most one negative record (the first entry of ``false_answers``,
    ``label`` 0) is emitted, followed by the positive record (``label`` 1).

    Args:
        df: Frame accepted by ``prepare_dataset``.
        context_column: Column whose value is stored under ``"context"``.

    Returns:
        DataFrame with the columns ``context``, ``question``, ``answers``
        and ``label``.
    """
    prepared = prepare_dataset(df)

    def build_record(row, answer, label):
        return {"context": row[context_column], "question": row["question"], "answers": answer, "label": label}

    records = []
    for _, row in prepared.iterrows():
        # The [:1] slice yields zero or one distractor per question.
        for wrong_answer in row["false_answers"][:1]:
            records.append(build_record(row, wrong_answer, 0))
        records.append(build_record(row, row["answers"], 1))

    return pd.DataFrame(records)

In [4]:
import torch
from torch import nn
from torch.nn import CrossEntropyLoss
from transformers.modeling_outputs import SequenceClassifierOutput

class CustomClassifier(nn.Module):
    """Two-input classifier built on the leading token states of one encoder.

    Both inputs are run through the same ``enc_model``. The first
    ``num_prefix_tokens`` positions of each ``last_hidden_state`` are
    concatenated along the feature axis, projected back to the hidden size,
    passed through ReLU, flattened per sample, and classified.

    Args:
        enc_model: Encoder exposing ``config.hidden_size`` and returning an
            object with ``last_hidden_state`` when called with ``input_ids``
            and ``attention_mask``.
        num_prefix_tokens: Number of leading positions kept from each input.
            The classification layer is sized for exactly this many
            positions, so both inputs must be at least that long.
        num_labels: Size of the logits vector.
    """

    def __init__(self, enc_model, num_prefix_tokens, num_labels=2):
        super().__init__()
        self.enc_model = enc_model
        self.num_labels = num_labels
        self.num_prefix_tokens = num_prefix_tokens

        width = enc_model.config.hidden_size
        # Maps the concatenated (question+context, answer) states back to `width`.
        self.fusion_layer = nn.Linear(2 * width, width)
        # Consumes all kept positions flattened into one vector per sample.
        self.classif_layer = nn.Linear(num_prefix_tokens * width, num_labels)
        self.dropout = nn.Dropout(0.1)

    def _encode_prefix(self, input_ids, attention_mask):
        """Run the encoder and keep only the first ``num_prefix_tokens`` states."""
        encoded = self.enc_model(input_ids = input_ids, attention_mask = attention_mask)
        return encoded.last_hidden_state[:, :self.num_prefix_tokens, :]

    def forward(self, qc_input_ids, qc_attention_mask, a_input_ids, a_attention_mask, labels= None):
        """Compute logits (and the cross-entropy loss when ``labels`` is given).

        Returns:
            ``SequenceClassifierOutput`` with ``loss`` (``None`` without
            labels) and ``logits``; ``hidden_states`` and ``attentions`` are
            always ``None``.
        """
        qc_prefix = self._encode_prefix(qc_input_ids, qc_attention_mask)
        a_prefix = self._encode_prefix(a_input_ids, a_attention_mask)

        fused = torch.cat((qc_prefix, a_prefix), dim=-1)
        fused = torch.relu(self.fusion_layer(fused))
        # One flat feature vector per sample.
        flat = self.dropout(fused.reshape(len(qc_input_ids), -1))
        logits = self.classif_layer(flat)

        if labels is None:
            loss = None
        else:
            loss = CrossEntropyLoss()(logits.view(-1, self.num_labels), labels.view(-1))

        return SequenceClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=None,
            attentions=None,
        )
    
class ClassifierCollator:
    """Collate samples into the keyword arguments of ``CustomClassifier.forward``.

    Every sample is a mapping that can fill both prompt templates via
    ``str.format(**sample)`` and that has a ``"label"`` entry.

    Args:
        tokenizer: Callable invoked as
            ``tokenizer(texts, padding=True, truncation=True, return_tensors="pt")``
            whose result supports ``["input_ids"]`` and ``["attention_mask"]``.
        prompt_qc: Template for the question/context input.
        prompt_a: Template for the answer input.
    """

    def __init__(self, tokenizer, prompt_qc, prompt_a):
        self.tokenizer, self.prompt_qc, self.prompt_a = tokenizer, prompt_qc, prompt_a

    def _encode(self, template, batch):
        """Fill ``template`` for every sample and tokenize the texts together."""
        texts = [template.format(**sample) for sample in batch]
        return self.tokenizer(texts, padding=True, truncation=True, return_tensors="pt")

    def __call__(self, batch):
        encoded = (
            ("qc", self._encode(self.prompt_qc, batch)),
            ("a", self._encode(self.prompt_a, batch)),
        )
        labels = torch.tensor([sample["label"] for sample in batch])

        collated = {}
        for prefix, tokens in encoded:
            for field in ("input_ids", "attention_mask"):
                collated[f"{prefix}_{field}"] = tokens[field]
        collated["labels"] = labels
        return collated

In [6]:
from transformers import AutoModel, AutoTokenizer

# Number of leading token positions the classifier reads from each input.
num_prefix_tokens = 3
# Only num_prefix_tokens-1 new <PREFIX*> tokens are registered here.
# NOTE(review): presumably the remaining position is the tokenizer's own
# leading token (<s> for RoBERTa) — confirm.
special_token_dict = {"additional_special_tokens": [f"<PREFIX{i}>" for i in range(num_prefix_tokens-1)]}

# Encoder and tokenizer come from the same "roberta-base" checkpoint.
enc_model = AutoModel.from_pretrained("roberta-base")
enc_tokenizer = AutoTokenizer.from_pretrained("roberta-base")

# Registers the prefix tokens and resizes the encoder embeddings; with
# cls_update=True the new input rows are requested to start from the CLS embedding.
smart_tokenizer_and_embedding_resize(special_token_dict, enc_tokenizer, enc_model, cls_update=True)
# `model` is rebound by later cells in this notebook (SentenceTransformer, QA model).
model = CustomClassifier(enc_model, num_prefix_tokens=num_prefix_tokens, num_labels=2)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [36]:
from datasets import Dataset, DatasetDict, load_from_disk

# Load the preprocessed dataset from a path relative to the working directory.
dataset = load_from_disk("squad_with_answer_sentence")

# Convert both splits to pandas for the record-building helpers.
df_pd_train = dataset["train"].to_pandas()
df_pd_validation = dataset["validation"].to_pandas()

# One positive row and at most one negative row per question.
df_train = create_classification_records(df_pd_train)
df_val = create_classification_records(df_pd_validation)

# Wrap the labelled frames back into a DatasetDict with train/validation splits.
df_hf = DatasetDict({"train": Dataset.from_pandas(df_train), "validation": Dataset.from_pandas(df_val)})

In [38]:
df_train

Unnamed: 0,context,question,answers,label
0,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,a copper statue of Christ,0
1,"Architecturally, the school has a Catholic cha...",To whom did the Virgin Mary allegedly appear i...,Saint Bernadette Soubirous,1
2,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,Saint Bernadette Soubirous,0
3,"Architecturally, the school has a Catholic cha...",What is in front of the Notre Dame Main Building?,a copper statue of Christ,1
4,"Architecturally, the school has a Catholic cha...",The Basilica of the Sacred heart at Notre Dame...,Saint Bernadette Soubirous,0
...,...,...,...,...
174794,"Kathmandu Metropolitan City (KMC), in order to...",With what Belorussian city does Kathmandu have...,Minsk,1
174795,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,Oregon,0
174796,"Kathmandu Metropolitan City (KMC), in order to...",In what year did Kathmandu create its initial ...,1975,1
174797,"Kathmandu Metropolitan City (KMC), in order to...",What is KMC an initialism of?,Oregon,0


In [5]:
from torch.utils.data import DataLoader

# Both templates start with the same run of <PREFIX*> tokens
# (num_prefix_tokens-1 of them), followed by the task-specific text.
prompt_qc = "".join([f"<PREFIX{i}>" for i in range(num_prefix_tokens-1)])+"Question: {question} Context: {context}"
prompt_a = "".join([f"<PREFIX{i}>" for i in range(num_prefix_tokens-1)])+"Answer: {answers}"

# Stand-alone loader over the training split. It is not handed to the trainer
# created in this notebook (the trainer receives its own collator instance).
loader = DataLoader(df_hf["train"], batch_size = 10, collate_fn=ClassifierCollator(enc_tokenizer, prompt_qc, prompt_a))

In [10]:
from transformers import TrainingArguments
from train_utils.Trainer import CustomTrainer, UpdateOutputDirCallback
import wandb

# Never commit credentials with the notebook. Without an explicit `key`,
# wandb.login() uses the WANDB_API_KEY environment variable or previously
# stored credentials, and otherwise prompts interactively.
# NOTE(review): the API key that used to be hard-coded on this line is exposed
# in the file history and should be revoked and replaced.
wandb.login()


# Training configuration for the classifier.
# 32 samples per device x 32 accumulation steps = 1024 samples per device
# between optimizer updates.
# NOTE(review): output_dir is an absolute, user-specific path; adjust it when
# running on another machine.
training_args = TrainingArguments(per_device_train_batch_size= 32,
                                  gradient_accumulation_steps= 32,
                                  warmup_steps= 500,
                                  num_train_epochs= 2,
                                  learning_rate= 1e-4,
                                  fp16= False,
                                  logging_steps= 100,
                                  evaluation_strategy= "epoch",
                                  save_strategy= "epoch",
                                  output_dir= "/netscratch/roeder/classifier_train",
                                  optim= "adamw_torch",)

# CustomTrainer and UpdateOutputDirCallback come from the project-local
# train_utils package; their behavior is not visible in this notebook.
# The collator turns raw samples into the qc_*/a_*/labels keyword arguments
# that CustomClassifier.forward expects.
trainer = CustomTrainer(
    model=model,
    args= training_args,
    train_dataset=df_hf["train"],
    eval_dataset=df_hf["validation"],
    tokenizer=enc_tokenizer,
    data_collator=ClassifierCollator(enc_tokenizer, prompt_qc, prompt_a),
    callbacks=[UpdateOutputDirCallback()],
)

### SentenceTransformer Training

In [19]:
from datasets import Dataset, DatasetDict, load_from_disk

# Reload the dataset from disk so this section starts from fresh frames.
dataset = load_from_disk("squad_with_answer_sentence")

df_pd_train = dataset["train"].to_pandas()
df_pd_validation = dataset["validation"].to_pandas()

# Flatten the answers and attach the same-context distractors.
df_pd_train = prepare_dataset(df_pd_train)
df_pd_validation = prepare_dataset(df_pd_validation)

# Keep only rows with at least one false answer: CustomDataset.__getitem__
# always takes the first one as the negative example.
df_pd_train = df_pd_train[df_pd_train.false_answers.apply(len)>0]
df_pd_validation = df_pd_validation[df_pd_validation.false_answers.apply(len)>0]

In [7]:
from sentence_transformers import SentenceTransformer, LoggingHandler, util, models, evaluation, losses, InputExample
from torch.utils.data import Dataset, DataLoader
class CustomDataset(Dataset):
    """Serve (query, positive, negative) text triplets from a DataFrame.

    The prompt templates are read from the notebook globals ``prompt_qc`` and
    ``prompt_a`` each time an item is fetched, so both must be defined before
    the dataset is indexed.

    Args:
        dataset: Object supporting ``len()`` and ``.iloc[idx]`` whose rows
            provide ``question``, ``context``, ``answers`` and
            ``false_answers``.
    """

    def __init__(self, dataset):
        self.dataset = dataset

    def __len__(self):
        return len(self.dataset)

    def __getitem__(self, idx):
        """Build the triplet for row ``idx``.

        Returns:
            ``InputExample`` whose ``texts`` are
            ``[query, positive answer, first false answer]``.

        Raises:
            IndexError: If the row has no false answers.
        """
        row = self.dataset.iloc[idx]
        query_text = prompt_qc.format(question=row["question"], context=row["context"])
        pos_text = prompt_a.format(answers=row["answers"])

        false_answers = row["false_answers"]
        if len(false_answers) == 0:
            # Same exception type as plain indexing would raise, but with a
            # message that points at the offending row.
            raise IndexError(f"row {idx} has no false answers to use as a negative example")
        # Only the first false answer is used, so only that one is formatted.
        neg_text = prompt_a.format(answers=false_answers[0])

        return InputExample(texts=[query_text, pos_text, neg_text])

In [30]:
# Same prefix count as in the classifier section; redefined so this section
# does not depend on that cell having run.
num_prefix_tokens  = 3

# NOTE: rebinds `model`, which earlier cells use for the classifier.
model = SentenceTransformer("roberta-base")
model.tokenizer.model_max_length = 512

# First module of the SentenceTransformer pipeline; it owns the tokenizer and
# the underlying transformer (`auto_model`) used below.
word_embedding_model = model._first_module()

# Register the prefix tokens and grow the embedding matrix to the new
# vocabulary size.
# NOTE(review): unlike the classifier path, no explicit initialization of the
# new rows is done here (smart_tokenizer_and_embedding_resize is not called) —
# confirm that this is intended.
tokens = [f"<PREFIX{i}>" for i in range(num_prefix_tokens-1)]
word_embedding_model.tokenizer.add_tokens(tokens, special_tokens=True)
word_embedding_model.auto_model.resize_token_embeddings(len(word_embedding_model.tokenizer))

# Same templates as in the classifier section; CustomDataset reads these globals.
prompt_qc = "".join([f"<PREFIX{i}>" for i in range(num_prefix_tokens-1)])+"Question: {question} Context: {context}"
prompt_a = "".join([f"<PREFIX{i}>" for i in range(num_prefix_tokens-1)])+"Answer: {answers}"

No sentence-transformers model found with name C:\Users\Daniel/.cache\torch\sentence_transformers\roberta-base. Creating a new one with MEAN pooling.
Some weights of RobertaModel were not initialized from the model checkpoint at C:\Users\Daniel/.cache\torch\sentence_transformers\roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [74]:
word_embedding_model.tokenizer

RobertaTokenizerFast(name_or_path='C:\Users\Daniel/.cache\torch\sentence_transformers\roberta-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	3: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50264: AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=False, special=True),
	50265: AddedToken("<PREFIX0>", rstrip=False, lstrip=F

In [31]:
# NOTE: despite its name, `df` is a CustomDataset wrapping the filtered
# training frame, not a DataFrame.
df = CustomDataset(df_pd_train)
# Shuffled loader; each batch holds 2 (query, positive, negative) examples.
loader = DataLoader(df, batch_size=2, shuffle=True)
# Ranking loss bound to the SentenceTransformer created above.
train_loss = losses.MultipleNegativesRankingLoss(model=model)

In [None]:
# Train the SentenceTransformer for one epoch on the triplet loader.
# checkpoint_save_steps is set to the number of batches in `loader`;
# presumably this writes one checkpoint per epoch into "test" — TODO confirm.
model.fit(train_objectives=[(loader, train_loss)],
          epochs=1,
          warmup_steps=200,
          use_amp=True,
          checkpoint_path="test",
          checkpoint_save_steps=len(loader),
          optimizer_params = {'lr': 1e-4},
          )

In [1]:
from transformers import AutoTokenizer

# Scratch check: load a BERT tokenizer to inspect its special tokens.
# `tokenizer` is rebound by a later cell.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
tokenizer.all_special_tokens

['[UNK]', '[SEP]', '[PAD]', '[CLS]', '[MASK]']

In [43]:
# Load model directly
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

# NOTE: rebinds the notebook-level `tokenizer` and `model`; cells above that
# use those names refer to different objects.
tokenizer = AutoTokenizer.from_pretrained("deepset/roberta-base-squad2")
model = AutoModelForQuestionAnswering.from_pretrained("deepset/roberta-base-squad2")

In [72]:
import torch

# Take the first training example as a smoke test for the QA model.
df_pd = dataset["train"].to_pandas()
q = df_pd.iloc[0].question
c = df_pd.iloc[0].context
sample = tokenizer(q, c, return_tensors="pt")

# Inference only: no_grad() avoids building the autograd graph for this
# forward pass; the returned logits are the same.
with torch.no_grad():
    out = model(**sample)
# Start and end positions are picked independently; nothing here enforces
# begin <= end, in which case the slice below is empty.
begin = out.start_logits.argmax()
end = out.end_logits.argmax()

# Decode the token span [begin, end] (inclusive) back to text.
predict_answer_tokens = sample.input_ids[0, begin : end + 1]
tokenizer.decode(predict_answer_tokens)

' Saint Bernadette Soubirous'