Check CUDA

In [15]:
import torch

print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))
else:
    print("CUDA not available — still CPU only")

print(torch.__version__)  # should be 2.6.0 or newer

     

CUDA Available: True
Device: NVIDIA GeForce GTX 1080
2.6.0+cu126


Predict Medical Specialty, not Description

In [16]:
import pandas as pd
df = pd.read_csv('dataset.csv')

In [17]:
# Check distribution
print(df['label'].value_counts())

# Shuffle dataset 
df = df.sample(frac=1, random_state=42)

label
Surgery                          1103
Consult - History and Phy.        516
Cardiovascular / Pulmonary        372
Orthopedic                        355
Radiology                         273
General Medicine                  259
Gastroenterology                  230
Neurology                         223
SOAP / Chart / Progress Notes     166
Obstetrics / Gynecology           160
Urology                           158
Discharge Summary                 108
ENT - Otolaryngology               98
Neurosurgery                       94
Hematology - Oncology              90
Ophthalmology                      83
Nephrology                         81
Emergency Room Reports             75
Pediatrics - Neonatal              70
Pain Management                    62
Psychiatry / Psychology            53
Office Notes                       51
Podiatry                           47
Dermatology                        29
Dentistry                          27
Cosmetic / Plastic Surgery         27
Letter

Tokenization & Input Formatting

In [18]:
# Concatenate relevant fields for input
texts = (
    df['description'].astype(str) + " | " +
    df['sample_name'].astype(str) + " | " +
    df['transcription'].astype(str) + " | " +
    df['keywords'].astype(str)
).tolist()



Tokenize 

In [19]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('dmis-lab/biobert-v1.1')

def encode_data(tokenizer, specialty, max_length):
    encoded = tokenizer.batch_encode_plus(
        specialty,
        truncation=True,
        padding='longest',
        max_length=max_length,
        return_tensors='pt'  # return PyTorch tensors
    )
    return encoded["input_ids"], encoded["attention_mask"]

input_ids, attention_mask = encode_data(tokenizer, df['label'].tolist(), max_length=128)


Fine-Tune BioBert

In [21]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

# Get number of unique specialties
unique_values = df['label'].nunique() 

print(unique_values)

# Load the pre-trained model
model = AutoModelForSequenceClassification.from_pretrained('dmis-lab/biobert-v1.1', num_labels=1)
# Define the training arguments
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,    # number of training epochs
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
)
# Create the Trainer and start training
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=(input_ids, attention_mask, df['row_id']),
)
trainer.train()

40


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: The batch received was empty, your model won't be able to train on it. Double-check that your training dataset contains keys expected by the model: input_ids,attention_mask,token_type_ids,position_ids,head_mask,inputs_embeds,labels,output_attentions,output_hidden_states,return_dict,label_ids,label,labels.