In [5]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.cuda.amp import GradScaler, autocast  # For mixed precision training

# Prefer CUDA when a GPU is visible; otherwise run everything on the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Read the labelled examples from disk
data = pd.read_csv("filtered_specialists.csv")

# Give every distinct specialist an integer id and attach it as a new column
specialist_mapping = {name: code for code, name in enumerate(pd.unique(data["Specialist"]))}
data["Specialist_id"] = data["Specialist"].map(specialist_mapping)

# Hold out 20% of the rows for testing (fixed seed keeps the split repeatable)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["Patient"].astype(str),  # force the text column to str
    data["Specialist_id"],
    random_state=42,
    test_size=0.2,
)

# Tokenizer for the pretrained checkpoint; the end-of-sequence token doubles as padding
model_name = "23tanmay/BioDistillGPT2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

# Custom Dataset Class
class SpecialistDataset(Dataset):
    """Map-style dataset that pairs a text with its integer class label.

    Args:
        texts: Sequence (list, pandas Series, ...) of input texts.
        labels: Sequence of integer class ids, same length as ``texts``.
        tokenizer: Callable invoked as
            ``tokenizer(text, padding="max_length", truncation=True,
            max_length=..., return_tensors="pt")`` that returns a mapping with
            ``"input_ids"`` and ``"attention_mask"`` tensors whose first
            dimension is a batch axis of size 1.
        max_length: Length passed to the tokenizer as ``max_length``.

    Raises:
        ValueError: If ``texts`` and ``labels`` differ in length.
    """

    def __init__(self, texts, labels, tokenizer, max_length=128):
        # Materialise both inputs as lists so __getitem__ always indexes by
        # position. A pandas Series with a shuffled index (e.g. straight out of
        # train_test_split) would otherwise be indexed by label and raise
        # KeyError for most positions.
        self.texts = list(texts)
        self.labels = list(labels)
        if len(self.texts) != len(self.labels):
            raise ValueError(
                f"texts and labels must have the same length, "
                f"got {len(self.texts)} and {len(self.labels)}"
            )
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenize example ``idx`` and return its tensors plus the label.

        Returns:
            dict with ``"input_ids"`` and ``"attention_mask"`` (the tokenizer
            outputs with the leading batch axis removed) and ``"label"`` (a
            0-d ``torch.long`` tensor).
        """
        text = str(self.texts[idx])  # Ensure each text is a string
        label = self.labels[idx]
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            # return_tensors="pt" yields a (1, ...) batch; drop that axis so the
            # DataLoader can build the real batch dimension.
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

# Wrap the train / test splits (converted to plain lists) in SpecialistDataset objects
train_dataset = SpecialistDataset(
    texts=train_texts.tolist(),
    labels=train_labels.tolist(),
    tokenizer=tokenizer,
)
test_dataset = SpecialistDataset(
    texts=test_texts.tolist(),
    labels=test_labels.tolist(),
    tokenizer=tokenizer,
)

# Data Collation Function
def collate_fn(batch, pad_token_id=None):
    """Merge a list of dataset items into one padded batch.

    Args:
        batch: List of dicts, each with 1-D ``"input_ids"`` and
            ``"attention_mask"`` tensors and a scalar ``"label"`` (an int or a
            0-d tensor).
        pad_token_id: Value used to right-pad ``input_ids`` up to the longest
            sequence in the batch. When ``None`` (the default, and what a
            ``DataLoader`` uses since it calls ``collate_fn(batch)``), the
            module-level ``tokenizer.pad_token_id`` is used.

    Returns:
        dict with ``"input_ids"`` and ``"attention_mask"`` of shape
        ``(len(batch), longest_sequence)`` and ``"label"`` as a 1-D
        ``torch.long`` tensor.
    """
    if pad_token_id is None:
        # Fall back to the notebook's global tokenizer only when the caller
        # does not supply a padding id explicitly.
        pad_token_id = tokenizer.pad_token_id

    input_ids = torch.nn.utils.rnn.pad_sequence(
        [item["input_ids"] for item in batch], batch_first=True, padding_value=pad_token_id
    )
    # Padded positions get mask value 0.
    attention_mask = torch.nn.utils.rnn.pad_sequence(
        [item["attention_mask"] for item in batch], batch_first=True, padding_value=0
    )
    # int() accepts both plain ints and 0-d tensors; the dtype is made explicit.
    labels = torch.tensor([int(item["label"]) for item in batch], dtype=torch.long)
    return {"input_ids": input_ids, "attention_mask": attention_mask, "label": labels}

# DataLoaders (only the training loader shuffles)
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)

# Load model and configure padding token
num_labels = len(specialist_mapping)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model.config.pad_token_id = tokenizer.pad_token_id  # Align model's padding token with tokenizer
model.to(device)

# Optimizer
# Uses PyTorch's AdamW rather than the deprecated transformers.AdamW.
# eps and weight_decay are pinned to the values the transformers implementation
# used by default, so the optimisation settings stay the same as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)

# Mixed Precision Training Setup
# torch.amp.GradScaler replaces the deprecated torch.cuda.amp.GradScaler()
# constructor. Scaling is only enabled when training actually runs on CUDA;
# otherwise the scaler is a pass-through.
scaler = torch.amp.GradScaler("cuda", enabled=device.type == "cuda")

# Loss Function
criterion = torch.nn.CrossEntropyLoss()

# Training Function
def train_model(model, train_loader, optimizer, criterion, device, scaler):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: Module called as ``model(input_ids=..., attention_mask=...,
            labels=...)`` whose output exposes a ``.loss`` tensor.
        train_loader: Iterable of batches; each batch is a dict with
            ``"input_ids"``, ``"attention_mask"`` and ``"label"`` tensors.
        optimizer: Optimizer over ``model``'s parameters.
        criterion: Unused. The loss is taken from ``outputs.loss``; the
            parameter is kept so existing callers keep working.
        device: ``torch.device`` or device string the batches are moved to.
        scaler: Gradient scaler providing ``scale(loss)``, ``step(optimizer)``
            and ``update()``.

    Returns:
        float: Sum of per-batch losses divided by the number of batches, or
        ``0.0`` when ``train_loader`` yields no batches.
    """
    # Autocast is only switched on when the batches live on a CUDA device;
    # with enabled=False the context manager is a no-op.
    use_amp = torch.device(device).type == "cuda"

    model.train()
    total_loss = 0.0
    num_batches = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Mixed precision forward pass (torch.autocast replaces the deprecated
        # torch.cuda.amp.autocast entry point).
        with torch.autocast(device_type="cuda", enabled=use_amp):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Scale the loss before backward, then let the scaler drive the
        # optimizer step and refresh its scale factor.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
        num_batches += 1

    # Count batches ourselves so loaders without __len__ work and an empty
    # loader does not divide by zero.
    return total_loss / num_batches if num_batches else 0.0

# Evaluation Function
def evaluate_model(model, test_loader, device):
    """Compute classification accuracy of ``model`` over ``test_loader``.

    The model is switched to eval mode and gradients are disabled. For each
    batch the predicted class is the argmax of ``outputs.logits`` along
    dimension 1; predictions and the batch's ``"label"`` values are gathered
    and scored with ``accuracy_score``.

    Returns:
        The value returned by ``accuracy_score(true_labels, predictions)``.
    """
    model.eval()
    y_true, y_pred = [], []
    with torch.no_grad():
        for batch in test_loader:
            y_true += batch["label"].tolist()
            logits = model(
                input_ids=batch["input_ids"].to(device),
                attention_mask=batch["attention_mask"].to(device),
            ).logits
            y_pred += logits.argmax(dim=1).tolist()
    return accuracy_score(y_true, y_pred)

# Training Loop: one pass over the training data, then score on the test split
epochs = 5
for epoch in range(epochs):
    train_loss = train_model(model, train_loader, optimizer, criterion, device, scaler)
    accuracy = evaluate_model(model, test_loader, device)
    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Train Loss: {train_loss:.4f} | Test Accuracy: {accuracy:.4f}")

# Store the specialist <-> id mapping in the model config. Without it the saved
# checkpoint only knows numeric class ids, and the mapping built from the CSV
# would have to be reconstructed to interpret predictions after reloading.
model.config.label2id = {str(spec): int(idx) for spec, idx in specialist_mapping.items()}
model.config.id2label = {int(idx): str(spec) for spec, idx in specialist_mapping.items()}

# Save the Fine-Tuned Model (weights + config) and the tokenizer side by side
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at 23tanmay/BioDistillGPT2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()  # Used to scale gradients for mixed precision
  with autocast():  # Automatically cast operations to half precision


Epoch 1/5
Train Loss: 1.7730 | Test Accuracy: 0.5512


  with autocast():  # Automatically cast operations to half precision


Epoch 2/5
Train Loss: 1.4423 | Test Accuracy: 0.5620


  with autocast():  # Automatically cast operations to half precision


Epoch 3/5
Train Loss: 1.3622 | Test Accuracy: 0.5719


  with autocast():  # Automatically cast operations to half precision


Epoch 4/5
Train Loss: 1.2966 | Test Accuracy: 0.5738


  with autocast():  # Automatically cast operations to half precision


Epoch 5/5
Train Loss: 1.2418 | Test Accuracy: 0.5734


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')