In [2]:
import pandas as pd
# Assuming data is in a CSV file, with 'question' and 'label' as columns
df = pd.read_csv('bio_data.csv')

# Check class distribution
print(df['label'].value_counts(), df.dtypes)
# Balance classes if needed
df = df.groupby('label').sample(n=min(df['label'].value_counts()), random_state=42)
# Shuffle the dataset
df = df.sample(frac=1, random_state=42)
df

label
O              3710099
I-TREATMENT      28303
B-TREATMENT       1171
Name: count, dtype: int64 token    object
label    object
dtype: object


Unnamed: 0,token,label
978859,injection,B-TREATMENT
3068274,physical,B-TREATMENT
1197045,anti,B-TREATMENT
2873753,[CLS],I-TREATMENT
1682159,##vic,I-TREATMENT
...,...,...
951003,speech,B-TREATMENT
3068275,therapy,I-TREATMENT
3661476,cat,B-TREATMENT
517449,freed,O


In [3]:
# Assuming your labels are categorical, you need to map them to integers
df['label'] = df['label'].astype('category').cat.codes.astype('int64')
print(df['label'].dtype)  # Should print 'int64'
df.columns


int64


Index(['token', 'label'], dtype='object')

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-v1.1')
def encode_data(tokenizer, questions, max_length):
    encoded = tokenizer.batch_encode_plus(
        questions,
        truncation=True,
        padding='longest',
        max_length=max_length,
        return_tensors='pt'  # return PyTorch tensors
    )
    return encoded["input_ids"], encoded["attention_mask"]
# Use an appropriate max_length 
input_ids, attention_mask = encode_data(tokenizer, df['token'].tolist(), max_length=128)
input_ids, attention_mask

  from .autonotebook import tqdm as notebook_tqdm


(tensor([[  101, 14546,   102,  ...,     0,     0,     0],
         [  101,  2952,   102,  ...,     0,     0,     0],
         [  101,  2848,   102,  ...,     0,     0,     0],
         ...,
         [  101,  5855,   102,  ...,     0,     0,     0],
         [  101, 11485,   102,  ...,     0,     0,     0],
         [  101,   119,   102,  ...,     0,     0,     0]]),
 tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]))

In [5]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Load the pre-trained model
model = AutoModelForSequenceClassification.from_pretrained('dmis-lab/biobert-v1.1', num_labels=3)
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=5,    # number of training epochs
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=100,
    weight_decay=0.01,
    logging_dir='./logs',
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
from datasets import Dataset

train_dataset = Dataset.from_dict({
    'input_ids': input_ids,
    'attention_mask': attention_mask,
    'labels': df['label']
})

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
)
trainer.train()


Step,Training Loss
500,0.6003
1000,0.3756


TrainOutput(global_step=1100, training_loss=0.4734121357310902, metrics={'train_runtime': 107.5179, 'train_samples_per_second': 163.368, 'train_steps_per_second': 10.231, 'total_flos': 63185762259270.0, 'train_loss': 0.4734121357310902, 'epoch': 5.0})

In [7]:
evaluation_results = trainer.evaluate(eval_dataset=train_dataset)
evaluation_results

{'eval_loss': 0.2983037829399109,
 'eval_runtime': 1.7012,
 'eval_samples_per_second': 2065.058,
 'eval_steps_per_second': 32.331,
 'epoch': 5.0}

In [11]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Load model and tokenizer
model = AutoModelForTokenClassification.from_pretrained('dmis-lab/biobert-v1.1', num_labels=3)
tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-v1.1')

# Move model to GPU if available
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model.to(device)

# Example text
text = "The patient will receive intensity-modulated radiation therapy in order to deliver high-dose treatment to sensitive structures."

# Tokenize input
inputs = tokenizer(text, return_tensors="pt")

# Move inputs to GPU
inputs = {k: v.to(device) for k, v in inputs.items()}

# Predict
outputs = model(**inputs)
logits = outputs.logits

# Get predictions
predictions = logits.argmax(dim=-1)

# Print predictions
print(predictions)


Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


tensor([[2, 1, 2, 2, 2, 2, 0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2,
         2]], device='cuda:0')
