In [5]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.cuda.amp import GradScaler, autocast  # For mixed precision training
import pickle

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load CSV
data = pd.read_csv("filtered_specialists.csv")

# Map specialists to numeric labels
specialist_mapping = {spec: idx for idx, spec in enumerate(data["Specialist"].unique())}
data["Specialist_id"] = data["Specialist"].map(specialist_mapping)

# Train-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["Patient"].astype(str),  # Convert to string explicitly
    data["Specialist_id"],
    test_size=0.2,
    random_state=42,
)

# Load BioBERT tokenizer
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token

# Pre-tokenize and cache the datasets
def preprocess_and_cache(texts, labels, tokenizer, max_length, cache_file):
    encodings = tokenizer(
        texts.tolist(),
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt",
    )
    with open(cache_file, "wb") as f:
        pickle.dump((encodings, labels.tolist()), f)

# Define maximum sequence length based on data distribution
data["Patient"] = data["Patient"].astype(str)
max_length = int(data["Patient"].apply(len).quantile(0.95))
max_length = min(max_length, 512)
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
preprocess_and_cache(train_texts, train_labels, tokenizer, max_length, "train_encodings.pkl")
preprocess_and_cache(test_texts, test_labels, tokenizer, max_length, "test_encodings.pkl")

# Load cached datasets
def load_cached_data(cache_file):
    with open(cache_file, "rb") as f:
        encodings, labels = pickle.load(f)
    return encodings, labels


In [7]:
train_encodings, train_labels = load_cached_data("train_encodings.pkl")
test_encodings, test_labels = load_cached_data("test_encodings.pkl")

# Custom Dataset Class
class SpecialistDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "label": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Create Datasets
train_dataset = SpecialistDataset(train_encodings, train_labels)
test_dataset = SpecialistDataset(test_encodings, test_labels)

def collate_fn(batch):
    input_ids = torch.nn.utils.rnn.pad_sequence(
        [item["input_ids"] for item in batch], batch_first=True, padding_value=tokenizer.pad_token_id
    )
    attention_mask = torch.nn.utils.rnn.pad_sequence(
        [item["attention_mask"] for item in batch], batch_first=True, padding_value=0
    )
    labels = torch.tensor([item["label"] for item in batch])
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)

# Load BioBERT model
num_labels = len(specialist_mapping)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model.config.pad_token_id = tokenizer.pad_token_id  # Align model's padding token with tokenizer
model.gradient_checkpointing_enable()  # Enable gradient checkpointing for memory efficiency
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Mixed Precision Training Setup
scaler = GradScaler()  # Used to scale gradients for mixed precision

# Loss Function
criterion = torch.nn.CrossEntropyLoss()

# Training Function
def train_model(model, train_loader, optimizer, criterion, device, scaler):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Mixed precision training
        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Scaler for mixed precision
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
    return total_loss / len(train_loader)

# Evaluation Function
def evaluate_model(model, test_loader, device):
    model.eval()
    preds = []
    labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels.extend(batch["labels"].tolist())

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds.extend(torch.argmax(outputs.logits, dim=1).tolist())

    report = classification_report(labels, preds, target_names=list(specialist_mapping.keys()))
    return report

# Training Loop
epochs = 5
for epoch in range(epochs):
    train_loss = train_model(model, train_loader, optimizer, criterion, device, scaler)
    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Train Loss: {train_loss:.4f}")

# Evaluate Model
report = evaluate_model(model, test_loader, device)
print("Classification Report:")
print(report)

# Save the Fine-Tuned Model
model.save_pretrained("./fine_tuned_biobert_model")
tokenizer.save_pretrained("./fine_tuned_biobert_model")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()  # Used to scale gradients for mixed precision
  with autocast():


Epoch 1/5
Train Loss: 1.6333
Epoch 2/5
Train Loss: 1.3239


KeyboardInterrupt: 

In [14]:
import optuna
import torch
from torch.utils.data import DataLoader, Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW
from sklearn.metrics import classification_report
from torch.cuda.amp import GradScaler, autocast  # For mixed precision training
import pickle
from torch import nn
import os

# Set environment variable to disable tokenizer parallelism
os.environ["TOKENIZERS_PARALLELISM"] = "false"
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id
# Set up device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load preprocessed data
def load_cached_data(cache_file):
    with open(cache_file, "rb") as f:
        encodings, labels = pickle.load(f)
    return encodings, labels

train_encodings, train_labels = load_cached_data("train_encodings.pkl")
test_encodings, test_labels = load_cached_data("test_encodings.pkl")

# Custom Dataset Class
class SpecialistDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": torch.tensor(self.encodings["input_ids"][idx], dtype=torch.long),
            "attention_mask": torch.tensor(self.encodings["attention_mask"][idx], dtype=torch.long),
            "labels": torch.tensor(self.labels[idx], dtype=torch.long)
        }

# Collate function for dynamic padding
def collate_fn(batch):
    input_ids = torch.nn.utils.rnn.pad_sequence(
        [item["input_ids"] for item in batch], batch_first=True, padding_value=tokenizer.pad_token_id
    )
    attention_mask = torch.nn.utils.rnn.pad_sequence(
        [item["attention_mask"] for item in batch], batch_first=True, padding_value=0
    )
    labels = torch.tensor([item["labels"] for item in batch])
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# Optimizing DataLoader for faster GPU transfer
train_dataset = SpecialistDataset(train_encodings, train_labels)
test_dataset = SpecialistDataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn, pin_memory=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn, pin_memory=True, num_workers=4)

# Initialize tokenizer and model
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token
num_labels = len(specialist_mapping)

# Ensure tokenizer.pad_token_id is set correctly
if tokenizer.pad_token_id is None:
    tokenizer.pad_token_id = tokenizer.eos_token_id

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model.config.pad_token_id = tokenizer.pad_token_id  # Align model's padding token with tokenizer
model.to(device)

# Mixed Precision Training Setup
scaler = torch.amp.GradScaler()  # Used for mixed precision scaling
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Loss Function
criterion = nn.CrossEntropyLoss()

# Training function with mixed precision
def train_model(model, train_loader, optimizer, criterion, device, scaler, epochs):
    model.train()
    total_loss = 0
    for epoch in range(epochs):
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device, non_blocking=True)
            attention_mask = batch["attention_mask"].to(device, non_blocking=True)
            labels = batch["labels"].to(device, non_blocking=True)

            # Mixed precision training
            with autocast():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

            # Scaler for mixed precision
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

    return total_loss / len(train_loader)

# Evaluation function
def evaluate_model(model, test_loader, device):
    model.eval()
    preds = []
    labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device, non_blocking=True)
            attention_mask = batch["attention_mask"].to(device, non_blocking=True)
            labels.extend(batch["labels"].tolist())

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds.extend(torch.argmax(outputs.logits, dim=1).tolist())

    report = classification_report(labels, preds, target_names=list(specialist_mapping.keys()))
    return report

# Hyperparameter tuning function
def objective(trial):
    # Hyperparameter search space
    batch_size = trial.suggest_int('batch_size', 8, 32, step=8)
    epochs = trial.suggest_int('epochs', 3, 6)
    lr = trial.suggest_float('lr', 1e-5, 1e-3, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-5, 1e-2, log=True)
    max_length = trial.suggest_int('max_length', 128, 512, step=128)

    # Reinitialize model and tokenizer to prevent state carryover
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
    model.config.pad_token_id = tokenizer.pad_token_id
    model.to(device)

    optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)

    # Mixed precision
    scaler = torch.amp.GradScaler()

    # DataLoader with batch size from trial
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn, pin_memory=True, num_workers=4)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn, pin_memory=True, num_workers=4)

    # Training
    train_loss = train_model(model, train_loader, optimizer, criterion, device, scaler, epochs)

    # Evaluation
    report = evaluate_model(model, test_loader, device)

    # Extract accuracy or other metrics from the classification report
    accuracy = report.get('accuracy', 0.0)  # Ensure 'accuracy' is in the report
    return accuracy

# Start the hyperparameter optimization
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=20)

# Print best hyperparameters
print(f"Best hyperparameters: {study.best_params}")

# Save the best model and tokenizer
best_params = study.best_params
best_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
best_model.to(device)
optimizer = AdamW(best_model.parameters(), lr=best_params['lr'], weight_decay=best_params['weight_decay'])

best_model.save_pretrained("./best_fine_tuned_biobert_model")
tokenizer.save_pretrained("./best_fine_tuned_biobert_model")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[I 2025-01-28 15:38:30,542] A new study created in memory with name: no-name-f357b6bd-3c38-475d-a5d4-f6657861a042
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  "input_ids": torch.tensor(self.encodings["input_ids"][idx], dtype=torch.long),
  "input_ids": torch.tensor(self.encodings["input_ids"][idx], dtype=torch.long),
  "input_ids": torch.tensor(self.encodings["input_ids"][idx], dtype=torch.long),
  "input_ids": torch.tensor

TypeError: Caught TypeError in DataLoader worker process 0.
Original Traceback (most recent call last):
  File "/home/keshav/.local/lib/python3.10/site-packages/torch/utils/data/_utils/worker.py", line 351, in _worker_loop
    data = fetcher.fetch(index)  # type: ignore[possibly-undefined]
  File "/home/keshav/.local/lib/python3.10/site-packages/torch/utils/data/_utils/fetch.py", line 55, in fetch
    return self.collate_fn(data)
  File "/tmp/ipykernel_52491/1067715111.py", line 45, in collate_fn
    input_ids = torch.nn.utils.rnn.pad_sequence(
  File "/home/keshav/.local/lib/python3.10/site-packages/torch/nn/utils/rnn.py", line 478, in pad_sequence
    return torch._C._nn.pad_sequence(
TypeError: pad_sequence(): argument 'padding_value' (position 3) must be float, not NoneType


In [13]:
# Efficient code of the bio-bert without hyper-parameter
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.metrics import classification_report
from torch.cuda.amp import GradScaler, autocast  # For mixed precision training
import pickle
import torch.nn as nn
import torch.optim as optim

# Set up device and ensure we're using CUDA
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

# Load cached data (make sure 'train_encodings.pkl' and 'test_encodings.pkl' are properly created beforehand)
def load_cached_data(cache_file):
    with open(cache_file, "rb") as f:
        encodings, labels = pickle.load(f)
    return encodings, labels

train_encodings, train_labels = load_cached_data("train_encodings.pkl")
test_encodings, test_labels = load_cached_data("test_encodings.pkl")

# Custom Dataset Class
class SpecialistDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "label": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Collate function for dynamic padding
def collate_fn(batch):
    input_ids = torch.nn.utils.rnn.pad_sequence(
        [item["input_ids"] for item in batch], batch_first=True, padding_value=tokenizer.pad_token_id
    )
    attention_mask = torch.nn.utils.rnn.pad_sequence(
        [item["attention_mask"] for item in batch], batch_first=True, padding_value=tokenizer.pad_token_id
    )
    labels = torch.tensor([item["label"] for item in batch])
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# DataLoaders with optimized settings
train_dataset = SpecialistDataset(train_encodings, train_labels)
test_dataset = SpecialistDataset(test_encodings, test_labels)

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn, pin_memory=True)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn, pin_memory=True)

# Load BioBERT model
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token
num_labels = len(specialist_mapping)

model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model.config.pad_token_id = tokenizer.pad_token_id  # Align model's padding token with tokenizer
model.to(device)

# Mixed Precision Training Setup
scaler = GradScaler()  # Used for mixed precision scaling
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Loss Function
criterion = nn.CrossEntropyLoss()

# Training function with mixed precision
def train_model(model, train_loader, optimizer, criterion, device, scaler, epochs):
    model.train()
    total_loss = 0
    for epoch in range(epochs):
        for batch in train_loader:
            optimizer.zero_grad()
            input_ids = batch["input_ids"].to(device, non_blocking=True)
            attention_mask = batch["attention_mask"].to(device, non_blocking=True)
            labels = batch["labels"].to(device, non_blocking=True)

            # Mixed precision training
            with autocast():
                outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
                loss = outputs.loss

            # Scaler for mixed precision
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

            total_loss += loss.item()

    return total_loss / len(train_loader)

# Evaluation function
def evaluate_model(model, test_loader, device):
    model.eval()
    preds = []
    labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels.extend(batch["labels"].tolist())

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds.extend(torch.argmax(outputs.logits, dim=1).tolist())

    report = classification_report(labels, preds, target_names=list(specialist_mapping.keys()))
    return report

# Training loop with multiple epochs
epochs = 5
for epoch in range(epochs):
    train_loss = train_model(model, train_loader, optimizer, criterion, device, scaler, epochs=1)
    print(f"Epoch {epoch + 1}/{epochs}")
    print(f"Train Loss: {train_loss:.4f}")

# Evaluate Model
report = evaluate_model(model, test_loader, device)
print("Classification Report:")
print(report)

# Save the fine-tuned model
model.save_pretrained("./fine_tuned_biobert_model")
tokenizer.save_pretrained("./fine_tuned_biobert_model")


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()  # Used for mixed precision scaling


TypeError: pad_sequence(): argument 'padding_value' (position 3) must be float, not NoneType

In [3]:
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cpu
