In [9]:
# Load model directly
from transformers import AutoModel, TrainingArguments, Trainer, AutoTokenizer, DataCollatorForLanguageModeling
from datasets import Dataset
import pandas as pd
import torch

model = AutoModel.from_pretrained("bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12")
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [10]:
# Get the dataset
df = pd.read_csv('dataset/Question_1.csv')
abstracts = df['Abstract']
dataset = Dataset.from_dict({'abstracts': abstracts})

print(dataset)

Dataset({
    features: ['abstracts'],
    num_rows: 50
})


In [16]:
def preprocess(text):
    return tokenizer(text['abstracts'], padding=True, truncation=True, max_length=512)

# Tokenize the dataset
tokenizer = AutoTokenizer.from_pretrained("bionlp/bluebert_pubmed_uncased_L-12_H-768_A-12")

tokenized_dataset = dataset.map(preprocess, batched=True)
tokenized_dataset = tokenized_dataset.remove_columns(['abstracts'])
tokenized_dataset.set_format("torch")

print("\n", tokenized_dataset, "\n\n", tokenized_dataset['input_ids'])

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Split the dataset
train_size = 30
validate_size = 10
test_size = 10

train_dataset, validate_dataset, test_dataset = torch.utils.data.random_split(tokenized_dataset, [train_size, validate_size, test_size])

Map: 100%|██████████| 50/50 [00:00<00:00, 2240.00 examples/s]


 Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 50
}) 

 tensor([[  101,  9808, 14428,  ...,     0,     0,     0],
        [  101,  2057,  2359,  ...,     0,     0,     0],
        [  101, 27144,  1999,  ...,  9385,  1024,   102],
        ...,
        [  101,  1996,  2006,  ...,     0,     0,     0],
        [  101,  1996, 13896,  ...,  2005,  1052,   102],
        [  101, 14405, 14127,  ...,     0,     0,     0]])





In [8]:
# Train the model
training_args = TrainingArguments(
    output_dir='./results',          # output directory
    num_train_epochs=3,              # total number of training epochs
    evaluation_strategy="steps",     # evaluation strategy to adopt during training
    per_device_train_batch_size=5,  # batch size per device during training
    per_device_eval_batch_size=5,   # batch size for evaluation
    learning_rate=1e-5,              # learning rate
    weight_decay=0.01,               # strength of weight decay
)

trainer = Trainer(
    model=model,                         # the instantiated 🤗 Transformers model to be trained
    args=training_args,                  # training arguments, defined above
    train_dataset=train_dataset,         # training dataset
    eval_dataset=validate_dataset,        # evaluation dataset
    data_collator=data_collator
)

    trainer.train()

# Evaluate the model
results = trainer.evaluate(test_dataset)
print(results)



TypeError: BertModel.forward() got an unexpected keyword argument 'labels'