In [1]:
import torch
from transformers import LongformerTokenizer, LongformerForQuestionAnswering, AdamW
from torch.utils.data import DataLoader, Dataset
from transformers import pipeline
import os
import random



In [1]:
import json
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

file_path = "./train_converted.jsonl"
tokenizer = AutoTokenizer.from_pretrained(
    "valhalla/longformer-base-4096-finetuned-squadv1"
)
model = AutoModelForQuestionAnswering.from_pretrained(
    "valhalla/longformer-base-4096-finetuned-squadv1", return_dict=False
)



Some weights of the model checkpoint at valhalla/longformer-base-4096-finetuned-squadv1 were not used when initializing LongformerForQuestionAnswering: ['longformer.pooler.dense.bias', 'longformer.pooler.dense.weight']
- This IS expected if you are initializing LongformerForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
tokenized = []
file_path = "./train_converted.jsonl"
from datasets import Dataset
original_data = []
with open(file_path, "r") as file:
    for line in file:
        # Load each line as a JSON object
        data_line = json.loads(line)
        for training_item in data_line:
            original_data.append(training_item)
dataset = Dataset.from_dict(
    {
        "context": [item["context"] for item in original_data],
        "question": [item["question"] for item in original_data],
        "answers": [item["answers"] for item in original_data],
    }
)

In [39]:
dataset[0]

{'context': 'is carts based upon a variety of criteria. IMG also includes a genome annotation pipeline that integrates information from several tools, including  [START_ENT] kegg [END_ENT], Pfam, InterPro, and the Gene Ontology, among others. Users can also type or upload their own gene annotations (called MyIMG gene annotations) ',
 'question': 'Kegg Pipe Organ Builders KEGG',
 'answers': {'answer_start': [161], 'text': ['kegg']}}

In [3]:
def preprocess_training_examples(examples):
    questions = [q.strip() for q in examples["question"]]
    inputs = tokenizer(
        questions,
        examples["context"],
        truncation="only_second",
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    offset_mapping = inputs.pop("offset_mapping")
    sample_map = inputs.pop("overflow_to_sample_mapping")
    answers = examples["answers"]
    start_positions = []
    end_positions = []

    for i, offset in enumerate(offset_mapping):
        sample_idx = sample_map[i]
        answer = answers[sample_idx]
        start_char = answer["answer_start"][0]
        end_char = answer["answer_start"][0] + len(answer["text"][0])
        sequence_ids = inputs.sequence_ids(i)

        # Find the start and end of the context
        idx = 0
        while sequence_ids[idx] != 1:
            idx += 1
        context_start = idx
        while sequence_ids[idx] == 1:
            idx += 1
        context_end = idx - 1

        # If the answer is not fully inside the context, label is (0, 0)
        if offset[context_start][0] > start_char or offset[context_end][1] < end_char:
            start_positions.append(0)
            end_positions.append(0)
        else:
            # Otherwise it's the start and end token positions
            idx = context_start
            while idx <= context_end and offset[idx][0] <= start_char:
                idx += 1
            start_positions.append(idx - 1)

            idx = context_end
            while idx >= context_start and offset[idx][1] >= end_char:
                idx -= 1
            end_positions.append(idx + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions
    return inputs

In [4]:
train_ds = dataset.map(preprocess_training_examples, batched=True, remove_columns=dataset.column_names)

Map:   0%|          | 0/7939 [00:00<?, ? examples/s]

In [5]:
from transformers import AutoModelForQuestionAnswering, TrainingArguments, Trainer
from transformers import default_data_collator

data_collator = default_data_collator
# args = TrainingArguments(
#     f"longformer-finetuned-squad",
#     evaluation_strategy="epoch",
#     learning_rate=2e-5,
#     per_device_train_batch_size=8,
#     num_train_epochs=3,
#     weight_decay=0.01,
# )

args = TrainingArguments(
    "bert-finetuned-squad",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    fp16=True,
)
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    tokenizer=tokenizer,
)

In [6]:
trainer.train()

Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33magazzi-ruben99[0m. Use [1m`wandb login --relogin`[0m to force relogin


  0%|          | 0/2979 [00:00<?, ?it/s]

You're using a LongformerTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


OutOfMemoryError: CUDA out of memory. Tried to allocate 578.00 MiB (GPU 0; 11.76 GiB total capacity; 10.01 GiB already allocated; 91.50 MiB free; 10.93 GiB reserved in total by PyTorch) If reserved memory is >> allocated memory try setting max_split_size_mb to avoid fragmentation.  See documentation for Memory Management and PYTORCH_CUDA_ALLOC_CONF

In [25]:
tokenized[0]

{'input_ids': [[0, 19226, 6149, 35386, 9171, 15195, 268, 229, 7170, 534, 2, 2, 354, 31282, 716, 2115, 10, 3143, 9, 8608, 4, 9206, 534, 67, 1171, 10, 27392, 47760, 4116, 14, 33752, 335, 31, 484, 3270, 6, 217, 1437, 646, 4014, 11328, 1215, 5382, 742, 7321, 6149, 646, 9309, 1215, 5382, 7479, 14475, 424, 6, 3870, 10653, 6, 8, 5, 13120, 13302, 4383, 6, 566, 643, 4, 16034, 64, 67, 1907, 50, 20021, 49, 308, 10596, 47234, 36, 4155, 1308, 3755, 534, 10596, 47234, 43, 1437, 2]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'offset_mapping': [[(0, 0), (0, 2), (2, 4), (5, 9), (10, 15), (16, 21), (21, 24), (25, 26), (26, 28), (28, 29), (0, 0), (0, 0), (0, 2), (3, 8), (9, 14), (15, 19), (20, 21), (22, 29), (30, 32), (33, 41), (41, 42), (43, 45), (45, 46), (47, 51), (52

In [21]:
for i in range(len(train_data)):
  train_data[i]= train_data[i][0]

In [11]:
train_data[0]

{'id': 0,
 'input': 'is carts based upon a variety of criteria. IMG also includes a genome annotation pipeline that integrates information from several tools, including  [START_ENT] kegg [END_ENT], Pfam, InterPro, and the Gene Ontology, among others. Users can also type or upload their own gene annotations (called MyIMG gene annotations) ',
 'output': [{'answer': 'kegg', 'provenance': [{'title': 'kegg'}]}],
 'meta': {'left_context': 'is carts based upon a variety of criteria. IMG also includes a genome annotation pipeline that integrates information from several tools, including ',
  'right_context': ', Pfam, InterPro, and the Gene Ontology, among others. Users can also type or upload their own gene annotations (called MyIMG gene annotations) ',
  'mention': 'kegg'},
 'candidates': ['Kegg Pipe Organ Builders', 'KEGG']}

In [28]:
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
preprocessed_data = []
for datarow in train_data:
    input_text = (
        datarow["input"].replace("[START_ENT]", "<s>").replace("[END_ENT]", "</s>")
        + "[SEP]"
    )
    candidates_string = ""
    for item in datarow["candidates"]:
        candidates_string += item + "[SEP]"
    input_text += candidates_string
    input_text = tokenizer.encode(
        input_text, add_special_tokens=True, return_tensors="pt"
    )
    # start position of answer in candidates_string
    start_pos = candidates_string.find(datarow["output"][0]["answer"])
    # end position of answer in candidates_string
    end_pos = start_pos + len(datarow["output"][0]["answer"])
    preprocessed_data.append(
        {
            "input_ids": input_text["input_ids"],
            "start_position": torch.tensor(start_pos),
            "end_position": torch.tensor(end_pos),
             "attention_maks": input_text["attention_mask"]
        }
    )

In [31]:
# You would need to prepare your training data and dataloader
# Here's a simplified example
from torch.utils.data import DataLoader
from transformers import LongformerForQuestionAnswering, LongformerConfig
import torch.nn.functional as F

# Load the Longformer model for question answering
config = LongformerConfig.from_pretrained("allenai/longformer-base-4096")
model = LongformerForQuestionAnswering.from_pretrained(
    "allenai/longformer-base-4096", config=config
)

# Define optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5)
criterion = torch.nn.CrossEntropyLoss()
# Prepare your training dataset
train_dataloader = DataLoader(preprocessed_data, batch_size=8, shuffle=True)

model.train()
num_epochs = 3  # Adjust the number of epochs as needed

for epoch in range(num_epochs):
    total_start_loss = 0.0
    total_end_loss = 0.0

    for batch in train_dataloader:
        input_ids, start_positions, end_positions = batch

        optimizer.zero_grad()
        outputs = model(input_ids)
        start_logits = outputs.start_logits
        end_logits = outputs.end_logits

        # Calculate the loss for start and end positions
        start_loss = F.cross_entropy(start_logits, start_positions)
        end_loss = F.cross_entropy(end_logits, end_positions)

        loss = start_loss + end_loss
        loss.backward()
        optimizer.step()

        total_start_loss += start_loss.item()
        total_end_loss += end_loss.item()

    average_start_loss = total_start_loss / len(train_dataloader)
    average_end_loss = total_end_loss / len(train_dataloader)
    print(
        f"Epoch {epoch+1}/{num_epochs}, Average Start Loss: {average_start_loss:.4f}, Average End Loss: {average_end_loss:.4f}"
    )

Some weights of LongformerForQuestionAnswering were not initialized from the model checkpoint at allenai/longformer-base-4096 and are newly initialized: ['qa_outputs.weight', 'qa_outputs.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttributeError: 'bool' object has no attribute 'nonzero'

In [None]:
input_text = "However, these rights were largely ignored, as Stroessner ruled under a state of siege for only a few short breaks from 1954 to 1987. Although the  [START_ENT] state of siege [END_ENT] technically only applied to Asunción after 1970, the courts held that anyone charged with security offenses could be brought to the c"
candidates = ["State of Siege", "State of emergency", "War"]

In [31]:
import torch
import torch.nn as nn
from transformers import LongformerModel


class LongformerSoftmax(nn.Module):
    def __init__(self, config):
        super(LongformerSoftmax, self).__init__()
        self.longformer = LongformerModel.from_pretrained(
            "allenai/longformer-base-4096"
        )
        self.tokenizer = LongformerTokenizer.from_pretrained(
            "allenai/longformer-base-4096"
        )
        self.linear = nn.Linear(config.hidden_size, 1)  # Linear layer for each token
        self.softmax = nn.Softmax(
            dim=1
        )  # Apply softmax along the token dimension (dim=1)

    def forward(self, input_text):
        # Tokenize the input text
        input_ids = self.tokenizer(
            input_text, return_tensors="pt", padding=True, truncation=True
        )
        attention_mask = input_ids["attention_mask"]

        # Pass the tokenized input through the Longformer model
        output = self.longformer(input_ids.input_ids, attention_mask, return_dict=True)
        logits = output[0]
        logits = self.linear(logits)
        softmax = self.softmax(logits)
        return torch.squeeze(softmax)

In [42]:
from transformers import LongformerTokenizer, LongformerConfig

# load the tokenizer and the model
tokenizer = LongformerTokenizer.from_pretrained("allenai/longformer-base-4096")
config = LongformerConfig.from_pretrained("allenai/longformer-base-4096")

model = LongformerSoftmax(config)
# encode the input sequence
input_text = "[SEP] After a long fight Superman saved <s> Metropolis </s> [SEP] Metropolis (1927 film) </ec> Metropolis-Hasting algorithm Metropolis (comics) </ec> [SEP]"
encoded = tokenizer.encode(input_text, return_tensors="pt")


# make a prediction with the model
res = model(input_text)
print(torch.argmax(res))
# print the prediction
# print(f"The predicted class for '{input_text}' is {pred}")

tensor(2)


In [41]:
encoded[0]

tensor([    0, 10975,  3388,   510,   742,  4993,    10,   251,  1032, 25143,
         5305,  1437,     0, 25869, 26510,     2,   646,  3388,   510,   742,
         4369, 26510,    36,  1646,  2518,   822,    43, 49703,  3204, 15698,
        25869, 26510,    12,   725, 15374, 17194,  4369, 26510,    36,   175,
         2857, 50017,  3204, 15698,   646,  3388,   510,   742,     2])

In [43]:
tokenizer.convert_ids_to_tokens(encoded[0])

['<s>',
 '[',
 'SE',
 'P',
 ']',
 'ĠAfter',
 'Ġa',
 'Ġlong',
 'Ġfight',
 'ĠSuperman',
 'Ġsaved',
 'Ġ',
 '<s>',
 'ĠMet',
 'ropolis',
 'Ġ',
 '</s>',
 'Ġ[',
 'SE',
 'P',
 ']',
 'ĠMet',
 'ropolis',
 'Ġ(',
 '19',
 '27',
 'Ġfilm',
 ')',
 'Ġ</',
 'ec',
 '>',
 'ĠMet',
 'ropolis',
 '-',
 'H',
 'asting',
 'Ġalgorithm',
 'ĠMet',
 'ropolis',
 'Ġ(',
 'com',
 'ics',
 ')',
 'Ġ</',
 'ec',
 '>',
 'Ġ[',
 'SE',
 'P',
 ']',
 '</s>']

In [21]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained(
    "valhalla/longformer-base-4096-finetuned-squadv1"
)
model = AutoModelForQuestionAnswering.from_pretrained(
    "valhalla/longformer-base-4096-finetuned-squadv1", return_dict=False
)

text = "After a long fight Superman saved Metropolis"
question = "Metropolis (1927 film) Metropolis-Hasting algorithm Metropolis (comics) "
encoding = tokenizer(question, text, return_tensors="pt")
input_ids = encoding["input_ids"]

# default is local attention everywhere
# the forward method will automatically set global attention on question tokens
attention_mask = encoding["attention_mask"]

start_scores, end_scores = model(input_ids, attention_mask=attention_mask)
all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

answer_tokens = all_tokens[torch.argmax(start_scores) : torch.argmax(end_scores) + 1]
answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens))
# output => democratized NLP

Some weights of the model checkpoint at valhalla/longformer-base-4096-finetuned-squadv1 were not used when initializing LongformerForQuestionAnswering: ['longformer.pooler.dense.weight', 'longformer.pooler.dense.bias']
- This IS expected if you are initializing LongformerForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LongformerForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [22]:
print(answer)


