In [1]:
!pip install transformers datasets scikit-learn pandas evaluate -q

In [8]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from datasets import Dataset
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import torch




Check CUDA

In [9]:
print("CUDA Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("Device:", torch.cuda.get_device_name(0))
else:
    print("CUDA not available — still CPU only")

print(torch.__version__)  # should be 2.6.0 or newer

     

CUDA Available: True
Device: NVIDIA GeForce GTX 1080
2.6.0+cu126


Initialize Dataset

In [11]:
df = pd.read_csv('dataset.csv')

# Combine relevant columns into one input text
df['input_text'] = (
    df['description'].fillna('') + ' ' +
    df['sample_name'].fillna('') + ' ' +
    df['transcription'].fillna('') + ' ' +
    df['keywords'].fillna('')
)


# Encode the target label
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['medical_specialty'])

df[['input_text', 'label']].head()

Unnamed: 0,input_text,label
0,A 23-year-old white female presents with comp...,0
1,Consult for laparoscopic gastric bypass. Lap...,2
2,Consult for laparoscopic gastric bypass. Lap...,2
3,2-D M-Mode. Doppler. 2-D Echocardiogram - ...,3
4,2-D Echocardiogram 2-D Echocardiogram - 2 1...,3


Train-Test-Split Dataset

In [12]:
train_df, val_df = train_test_split(df, test_size=0.2, stratify=df['label'], random_state=42)

# Convert to HuggingFace Dataset
from datasets import Dataset
train_dataset = Dataset.from_pandas(train_df[['input_text', 'label']])
val_dataset = Dataset.from_pandas(val_df[['input_text', 'label']])

Tokenize With BioBERT Tokenizer

In [13]:
tokenizer = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

def tokenize_function(examples):
    return tokenizer(examples["input_text"], padding="max_length", truncation=True, max_length=512)

train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)

Map: 100%|██████████| 3999/3999 [00:04<00:00, 976.20 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 1088.22 examples/s]


Load Model For Sequence Classification

In [14]:
num_labels = df['label'].nunique()
model = AutoModelForSequenceClassification.from_pretrained("dmis-lab/biobert-base-cased-v1.1", num_labels=num_labels)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Accuracy Metrics

In [21]:
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = torch.argmax(torch.tensor(logits), axis=-1)
    return accuracy.compute(predictions=predictions, references=labels)

Downloading builder script: 100%|██████████| 4.20k/4.20k [00:00<?, ?B/s]


Initialize Training Arguments

In [19]:
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)

Train BioBERT

In [22]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy
1,2.357,1.686583,0.525
2,1.4733,1.411245,0.533
3,1.2095,1.361079,0.505


TrainOutput(global_step=1500, training_loss=1.6799140625, metrics={'train_runtime': 1305.2214, 'train_samples_per_second': 9.192, 'train_steps_per_second': 1.149, 'total_flos': 3157620301651968.0, 'train_loss': 1.6799140625, 'epoch': 3.0})

Save Model

In [24]:
model.save_pretrained("biobert_medical_specialty_classifier")
tokenizer.save_pretrained("biobert_medical_specialty_classifier")

# Save label encoder mapping
import json
# Convert NumPy integers to plain Python ints
label_map = {cls: int(label) for cls, label in zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_))}

# Save to JSON
with open("label_mapping.json", "w") as f:
    json.dump(label_map, f)

Inference Method

In [27]:
import torch

# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def predict_specialty(text):
    model.eval()
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=512).to(device)
    
    with torch.no_grad():
        outputs = model(**inputs)
    
    pred_label = torch.argmax(outputs.logits, axis=1).item()
    return label_encoder.inverse_transform([pred_label])[0]

sample_input = "Patient reports abdominal pain and nausea for two days."
predict_specialty(sample_input)

' Consult - History and Phy.'