In [1]:
import torch
print(torch.version.cuda)

12.4


In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.cuda.amp import GradScaler, autocast  # For mixed precision training
import pandas as pd
from sklearn.model_selection import ParameterGrid

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

data = pd.read_csv("filtered_specialists.csv")

specialist_mapping = {spec: idx for idx, spec in enumerate(data["Specialist"].unique())}
data["Specialist_id"] = data["Specialist"].map(specialist_mapping)

train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["Patient"].astype(str)+"\n"+data["Description"].astype(str), 
    data["Specialist_id"],
    test_size=0.2,
    random_state=42,
)

model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

class SpecialistDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])  
        label = self.labels[idx]
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

train_dataset = SpecialistDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
test_dataset = SpecialistDataset(test_texts.tolist(), test_labels.tolist(), tokenizer)

def collate_fn(batch):
    input_ids = torch.stack([item["input_ids"] for item in batch])
    attention_mask = torch.stack([item["attention_mask"] for item in batch])
    labels = torch.tensor([item["label"] for item in batch])
    return {"input_ids": input_ids, "attention_mask": attention_mask, "label": labels}

train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)

class CNNForSpecialistClassification(nn.Module):
    def __init__(self, pretrained_model_name, num_labels, embedding_dim=768, kernel_size=3, num_filters=128):
        super(CNNForSpecialistClassification, self).__init__()
        self.bert = AutoModel.from_pretrained(pretrained_model_name)
        self.bert.requires_grad_(False) 

        self.conv1 = nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=kernel_size)
        self.pool = nn.MaxPool1d(kernel_size=2)
        self.fc = nn.Linear(num_filters, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state  # Shape: [batch_size, seq_len, embedding_dim]

        # Apply CNN layers
        embeddings = embeddings.transpose(1, 2)  # Change to [batch_size, embedding_dim, seq_len]
        x = self.conv1(embeddings)  # Apply convolutional layer
        x = torch.relu(x)
        x = self.pool(x)  # Apply pooling

        # Global Average Pooling
        x = x.mean(dim=2)  # Average over the sequence length dimension

        # Final classification layer
        x = self.fc(x)
        return x

# Optimizer Setup
def get_optimizer(model, lr=1e-5):
    return optim.Adam(model.parameters(), lr=lr)

# Training Function
def train_model(model, train_loader, optimizer, criterion, device, scaler):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        with autocast():  
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)

        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        total_loss += loss.item()
    return total_loss / len(train_loader)

# Evaluation Function
def evaluate_model(model, test_loader, device):
    model.eval()
    preds = []
    labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels.extend(batch["label"].tolist())

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds.extend(torch.argmax(outputs, dim=1).tolist())
    accuracy = accuracy_score(labels, preds)
    return accuracy

# Hyperparameter tuning using grid search
param_grid = {
    'learning_rate': [1e-5, 5e-5, 1e-4],
    'batch_size': [16, 32, 64],
    'num_filters': [32, 64, 128],
    'kernel_size': [3, 4, 5]
}

# Hyperparameter Grid Search
def hyperparameter_tuning(train_loader, test_loader, param_grid):
    best_accuracy = 0
    best_params = None
    for params in ParameterGrid(param_grid):
        print(f"Training with params: {params}")
        
        model = CNNForSpecialistClassification(
            pretrained_model_name=model_name, 
            num_labels=len(specialist_mapping),
            num_filters=params['num_filters'],
            kernel_size=params['kernel_size']
        ).to(device)
        
        optimizer = get_optimizer(model, lr=params['learning_rate'])
        criterion = nn.CrossEntropyLoss()
        scaler = GradScaler()  
        
        train_loader = DataLoader(train_dataset, batch_size=params['batch_size'], shuffle=True, collate_fn=collate_fn)
        test_loader = DataLoader(test_dataset, batch_size=params['batch_size'], collate_fn=collate_fn)
      
        for epoch in range(3):  # Limiting to 3 epochs for quicker tuning
            train_loss = train_model(model, train_loader, optimizer, criterion, device, scaler)
            accuracy = evaluate_model(model, test_loader, device)
            print(f"Epoch {epoch+1}/3 | Train Loss: {train_loss:.4f} | Test Accuracy: {accuracy:.4f}")

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_params = params

    print(f"Best Params: {best_params} with Accuracy: {best_accuracy:.4f}")
    return best_params

best_params = hyperparameter_tuning(train_loader, test_loader, param_grid)

final_model = CNNForSpecialistClassification(
    pretrained_model_name=model_name, 
    num_labels=len(specialist_mapping),
    num_filters=best_params['num_filters'],
    kernel_size=best_params['kernel_size']
).to(device)

final_optimizer = get_optimizer(final_model, lr=best_params['learning_rate'])
final_criterion = nn.CrossEntropyLoss()
final_scaler = GradScaler()

for epoch in range(5):  
    train_loss = train_model(final_model, train_loader, final_optimizer, final_criterion, device, final_scaler)
    accuracy = evaluate_model(final_model, test_loader, device)
    print(f"Epoch {epoch+1}/5 | Train Loss: {train_loss:.4f} | Test Accuracy: {accuracy:.4f}")

# Save the Fine-Tuned Model
torch.save(final_model.state_dict(), "./cnn_fine_tuned_model.pth")
tokenizer.save_pretrained("./cnn_fine_tuned_model_tokenizer")

  from .autonotebook import tqdm as notebook_tqdm


Training with params: {'batch_size': 16, 'kernel_size': 3, 'learning_rate': 1e-05, 'num_filters': 32}


  scaler = GradScaler()  # For mixed precision
  with autocast():  # Automatically cast operations to half precision


Epoch 1/3 | Train Loss: 2.9564 | Test Accuracy: 0.3325


  with autocast():  # Automatically cast operations to half precision


Epoch 2/3 | Train Loss: 2.5032 | Test Accuracy: 0.3883


  with autocast():  # Automatically cast operations to half precision


Epoch 3/3 | Train Loss: 2.3116 | Test Accuracy: 0.4165
Training with params: {'batch_size': 16, 'kernel_size': 3, 'learning_rate': 1e-05, 'num_filters': 64}


  scaler = GradScaler()  # For mixed precision
  with autocast():  # Automatically cast operations to half precision


Epoch 1/3 | Train Loss: 2.8056 | Test Accuracy: 0.3506


  with autocast():  # Automatically cast operations to half precision


Epoch 2/3 | Train Loss: 2.3658 | Test Accuracy: 0.4099


  with autocast():  # Automatically cast operations to half precision


Epoch 3/3 | Train Loss: 2.1663 | Test Accuracy: 0.4406
Training with params: {'batch_size': 16, 'kernel_size': 3, 'learning_rate': 1e-05, 'num_filters': 128}


  scaler = GradScaler()  # For mixed precision
  with autocast():  # Automatically cast operations to half precision


Epoch 1/3 | Train Loss: 2.6677 | Test Accuracy: 0.3878


  with autocast():  # Automatically cast operations to half precision


Epoch 2/3 | Train Loss: 2.2045 | Test Accuracy: 0.4361


  with autocast():  # Automatically cast operations to half precision


Epoch 3/3 | Train Loss: 2.0199 | Test Accuracy: 0.4627
Training with params: {'batch_size': 16, 'kernel_size': 3, 'learning_rate': 5e-05, 'num_filters': 32}


  scaler = GradScaler()  # For mixed precision
  with autocast():  # Automatically cast operations to half precision


Epoch 1/3 | Train Loss: 2.4240 | Test Accuracy: 0.4429


  with autocast():  # Automatically cast operations to half precision


Epoch 2/3 | Train Loss: 1.9482 | Test Accuracy: 0.4891


  with autocast():  # Automatically cast operations to half precision


Epoch 3/3 | Train Loss: 1.8038 | Test Accuracy: 0.5042
Training with params: {'batch_size': 16, 'kernel_size': 3, 'learning_rate': 5e-05, 'num_filters': 64}


  scaler = GradScaler()  # For mixed precision
  with autocast():  # Automatically cast operations to half precision


Epoch 1/3 | Train Loss: 2.2537 | Test Accuracy: 0.4678


  with autocast():  # Automatically cast operations to half precision


Epoch 2/3 | Train Loss: 1.8238 | Test Accuracy: 0.5078


  with autocast():  # Automatically cast operations to half precision


Epoch 3/3 | Train Loss: 1.7106 | Test Accuracy: 0.5224
Training with params: {'batch_size': 16, 'kernel_size': 3, 'learning_rate': 5e-05, 'num_filters': 128}


  scaler = GradScaler()  # For mixed precision
  with autocast():  # Automatically cast operations to half precision


Epoch 1/3 | Train Loss: 2.1495 | Test Accuracy: 0.4860


  with autocast():  # Automatically cast operations to half precision


Epoch 2/3 | Train Loss: 1.7493 | Test Accuracy: 0.5223


  with autocast():  # Automatically cast operations to half precision


Epoch 3/3 | Train Loss: 1.6491 | Test Accuracy: 0.5300
Training with params: {'batch_size': 16, 'kernel_size': 3, 'learning_rate': 0.0001, 'num_filters': 32}


  scaler = GradScaler()  # For mixed precision
  with autocast():  # Automatically cast operations to half precision


Epoch 1/3 | Train Loss: 2.1845 | Test Accuracy: 0.4863


  with autocast():  # Automatically cast operations to half precision


Epoch 2/3 | Train Loss: 1.7656 | Test Accuracy: 0.5199


  with autocast():  # Automatically cast operations to half precision


Epoch 3/3 | Train Loss: 1.6615 | Test Accuracy: 0.5369
Training with params: {'batch_size': 16, 'kernel_size': 3, 'learning_rate': 0.0001, 'num_filters': 64}


  scaler = GradScaler()  # For mixed precision
  with autocast():  # Automatically cast operations to half precision


Epoch 1/3 | Train Loss: 2.0561 | Test Accuracy: 0.5117


  with autocast():  # Automatically cast operations to half precision


Epoch 2/3 | Train Loss: 1.6882 | Test Accuracy: 0.5375


  with autocast():  # Automatically cast operations to half precision


Epoch 3/3 | Train Loss: 1.6013 | Test Accuracy: 0.5427
Training with params: {'batch_size': 16, 'kernel_size': 3, 'learning_rate': 0.0001, 'num_filters': 128}


  scaler = GradScaler()  # For mixed precision
  with autocast():  # Automatically cast operations to half precision


Epoch 1/3 | Train Loss: 1.9640 | Test Accuracy: 0.5253


  with autocast():  # Automatically cast operations to half precision


Epoch 2/3 | Train Loss: 1.6293 | Test Accuracy: 0.5444


  with autocast():  # Automatically cast operations to half precision


Epoch 3/3 | Train Loss: 1.5485 | Test Accuracy: 0.5656
Training with params: {'batch_size': 16, 'kernel_size': 4, 'learning_rate': 1e-05, 'num_filters': 32}


  scaler = GradScaler()  # For mixed precision
  with autocast():  # Automatically cast operations to half precision


Epoch 1/3 | Train Loss: 2.8987 | Test Accuracy: 0.3190


  with autocast():  # Automatically cast operations to half precision


Epoch 2/3 | Train Loss: 2.4173 | Test Accuracy: 0.3928


  with autocast():  # Automatically cast operations to half precision


Epoch 3/3 | Train Loss: 2.2267 | Test Accuracy: 0.4249
Training with params: {'batch_size': 16, 'kernel_size': 4, 'learning_rate': 1e-05, 'num_filters': 64}


  scaler = GradScaler()  # For mixed precision
  with autocast():  # Automatically cast operations to half precision


Epoch 1/3 | Train Loss: 2.7324 | Test Accuracy: 0.3801


  with autocast():  # Automatically cast operations to half precision


Epoch 2/3 | Train Loss: 2.2673 | Test Accuracy: 0.4315


  with autocast():  # Automatically cast operations to half precision


Epoch 3/3 | Train Loss: 2.0716 | Test Accuracy: 0.4540
Training with params: {'batch_size': 16, 'kernel_size': 4, 'learning_rate': 1e-05, 'num_filters': 128}


  scaler = GradScaler()  # For mixed precision
  with autocast():  # Automatically cast operations to half precision


Epoch 1/3 | Train Loss: 2.6071 | Test Accuracy: 0.4123


  with autocast():  # Automatically cast operations to half precision


Epoch 2/3 | Train Loss: 2.1346 | Test Accuracy: 0.4563


  with autocast():  # Automatically cast operations to half precision


Epoch 3/3 | Train Loss: 1.9581 | Test Accuracy: 0.4752
Training with params: {'batch_size': 16, 'kernel_size': 4, 'learning_rate': 5e-05, 'num_filters': 32}


  scaler = GradScaler()  # For mixed precision
  with autocast():  # Automatically cast operations to half precision


Epoch 1/3 | Train Loss: 2.3605 | Test Accuracy: 0.4624


  with autocast():  # Automatically cast operations to half precision


Epoch 2/3 | Train Loss: 1.8802 | Test Accuracy: 0.4974


  with autocast():  # Automatically cast operations to half precision


Epoch 3/3 | Train Loss: 1.7519 | Test Accuracy: 0.5249
Training with params: {'batch_size': 16, 'kernel_size': 4, 'learning_rate': 5e-05, 'num_filters': 64}


  scaler = GradScaler()  # For mixed precision
  with autocast():  # Automatically cast operations to half precision


Epoch 1/3 | Train Loss: 2.1844 | Test Accuracy: 0.4855


  with autocast():  # Automatically cast operations to half precision


Epoch 2/3 | Train Loss: 1.7824 | Test Accuracy: 0.5060


  with autocast():  # Automatically cast operations to half precision


Epoch 3/3 | Train Loss: 1.6793 | Test Accuracy: 0.5268
Training with params: {'batch_size': 16, 'kernel_size': 4, 'learning_rate': 5e-05, 'num_filters': 128}


  scaler = GradScaler()  # For mixed precision
  with autocast():  # Automatically cast operations to half precision


Epoch 1/3 | Train Loss: 2.0782 | Test Accuracy: 0.5011


  with autocast():  # Automatically cast operations to half precision


Epoch 2/3 | Train Loss: 1.7046 | Test Accuracy: 0.5281


  with autocast():  # Automatically cast operations to half precision


Epoch 3/3 | Train Loss: 1.6151 | Test Accuracy: 0.5389
Training with params: {'batch_size': 16, 'kernel_size': 4, 'learning_rate': 0.0001, 'num_filters': 32}


  scaler = GradScaler()  # For mixed precision
  with autocast():  # Automatically cast operations to half precision


Epoch 1/3 | Train Loss: 2.1280 | Test Accuracy: 0.4959


  with autocast():  # Automatically cast operations to half precision


KeyboardInterrupt: 

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.cuda.amp import GradScaler, autocast  # For mixed precision training
import pandas as pd

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load CSV
data = pd.read_csv("filtered_specialists.csv")

# Map specialists to numeric labels
specialist_mapping = {spec: idx for idx, spec in enumerate(data["Specialist"].unique())}
data["Specialist_id"] = data["Specialist"].map(specialist_mapping)

# Train-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["Patient"].astype(str) + "\n" + data["Description"].astype(str),  # Convert to string explicitly
    data["Specialist_id"],
    test_size=0.2,
    random_state=42,
)

# Load BERT tokenizer
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Custom Dataset Class
class SpecialistDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])  # Ensure each text is a string
        label = self.labels[idx]
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

# Create Datasets
train_dataset = SpecialistDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
test_dataset = SpecialistDataset(test_texts.tolist(), test_labels.tolist(), tokenizer)

# Data Collation Function
def collate_fn(batch):
    input_ids = torch.stack([item["input_ids"] for item in batch])
    attention_mask = torch.stack([item["attention_mask"] for item in batch])
    labels = torch.tensor([item["label"] for item in batch])
    return {"input_ids": input_ids, "attention_mask": attention_mask, "label": labels}

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)

# Improved CNN Model
class ImprovedCNNForSpecialistClassification(nn.Module):
    def __init__(self, pretrained_model_name, num_labels, embedding_dim=768, kernel_sizes=[3, 4, 5], num_filters=128, dropout_rate=0.3):
        super(ImprovedCNNForSpecialistClassification, self).__init__()
        self.bert = AutoModel.from_pretrained(pretrained_model_name)
        self.bert.requires_grad_(False) 
    
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim, out_channels=num_filters, kernel_size=k)
            for k in kernel_sizes
        ])
  
        self.batch_norm = nn.BatchNorm1d(num_filters * len(kernel_sizes))

        self.dropout = nn.Dropout(dropout_rate)

        self.fc = nn.Linear(num_filters * len(kernel_sizes), num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        embeddings = outputs.last_hidden_state  # Shape: [batch_size, seq_len, embedding_dim]

        embeddings = embeddings.transpose(1, 2)  # Change to [batch_size, embedding_dim, seq_len]
        conv_outputs = []
        for conv in self.convs:
            conv_output = torch.relu(conv(embeddings))  
            conv_output = torch.max(conv_output, dim=2)[0]  
            conv_outputs.append(conv_output)

        x = torch.cat(conv_outputs, dim=1)  # Shape: [batch_size, num_filters * len(kernel_sizes)]
        x = self.batch_norm(x)
        x = self.dropout(x)
        x = self.fc(x)
        return x

# Initialize the model with the best hyperparameters
model = ImprovedCNNForSpecialistClassification(
    pretrained_model_name=model_name,
    num_labels=len(specialist_mapping),
    num_filters=128,
    kernel_sizes=[3, 4, 5],
    dropout_rate=0.3
).to(device)

# Optimizer Setup
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Loss Function
criterion = nn.CrossEntropyLoss()

# Mixed Precision Training
scaler = GradScaler()

# Training Function
def train_model(model, train_loader, optimizer, criterion, device, scaler):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Mixed precision training
        with autocast():  # Automatically cast operations to half precision
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)

        # Scaler for mixed precision
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        total_loss += loss.item()
    return total_loss / len(train_loader)

# Evaluation Function
def evaluate_model(model, test_loader, device):
    model.eval()
    preds = []
    labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels.extend(batch["label"].tolist())

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds.extend(torch.argmax(outputs, dim=1).tolist())
    accuracy = accuracy_score(labels, preds)
    return accuracy

# Training Loop
for epoch in range(5):  # Train for 5 epochs
    train_loss = train_model(model, train_loader, optimizer, criterion, device, scaler)
    accuracy = evaluate_model(model, test_loader, device)
    print(f"Epoch {epoch + 1}/5 | Train Loss: {train_loss:.4f} | Test Accuracy: {accuracy:.4f}")

# Save the Fine-Tuned Model
torch.save(model.state_dict(), "./improved_cnn_fine_tuned_model.pth")
tokenizer.save_pretrained("./improved_cnn_fine_tuned_model_tokenizer")

  from .autonotebook import tqdm as notebook_tqdm
  scaler = GradScaler()
  with autocast():  # Automatically cast operations to half precision


Epoch 1/5 | Train Loss: 1.6830 | Test Accuracy: 0.6593


  with autocast():  # Automatically cast operations to half precision


Epoch 2/5 | Train Loss: 1.2565 | Test Accuracy: 0.6820


  with autocast():  # Automatically cast operations to half precision


Epoch 3/5 | Train Loss: 1.1297 | Test Accuracy: 0.6939


  with autocast():  # Automatically cast operations to half precision


Epoch 4/5 | Train Loss: 1.0433 | Test Accuracy: 0.6994


  with autocast():  # Automatically cast operations to half precision


Epoch 5/5 | Train Loss: 0.9701 | Test Accuracy: 0.7013


('./improved_cnn_fine_tuned_model_tokenizer/tokenizer_config.json',
 './improved_cnn_fine_tuned_model_tokenizer/special_tokens_map.json',
 './improved_cnn_fine_tuned_model_tokenizer/vocab.txt',
 './improved_cnn_fine_tuned_model_tokenizer/added_tokens.json',
 './improved_cnn_fine_tuned_model_tokenizer/tokenizer.json')

## Bio-Bert Code Final

In [10]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.cuda.amp import GradScaler, autocast  # For mixed precision training
import pandas as pd

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load CSV
data = pd.read_csv("filtered_specialists.csv")

speciality_counts = data['Specialist'].value_counts()

# Get specialities that occur more than 1000 times
specialities_to_keep = speciality_counts[speciality_counts > 500].index

# Filter the data to keep only rows with specialities that occur more than 1000 times
data = data[data['Specialist'].isin(specialities_to_keep)]

# Map the filtered specialists to numeric labels
specialist_mapping = {spec: idx for idx, spec in enumerate(data["Specialist"].unique())}
data["Specialist_id"] = data["Specialist"].map(specialist_mapping)

# Train-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["Patient"].astype(str) + "\n" + data["Description"].astype(str),  # Convert to string explicitly
    data["Specialist_id"],
    test_size=0.2,
    random_state=42,
)

# Load BioBERT tokenizer
model_name = "monologg/biobert_v1.1_pubmed"  # BioBERT model
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Custom Dataset Class
class SpecialistDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])  # Ensure each text is a string
        label = self.labels[idx]
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

# Create Datasets
train_dataset = SpecialistDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
test_dataset = SpecialistDataset(test_texts.tolist(), test_labels.tolist(), tokenizer)

# Data Collation Function
def collate_fn(batch):
    input_ids = torch.stack([item["input_ids"] for item in batch])
    attention_mask = torch.stack([item["attention_mask"] for item in batch])
    labels = torch.tensor([item["label"] for item in batch])
    return {"input_ids": input_ids, "attention_mask": attention_mask, "label": labels}

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=32, collate_fn=collate_fn)

# BioBERT Model with Classification Head
class BioBERTForSpecialistClassification(nn.Module):
    def __init__(self, pretrained_model_name, num_labels, dropout_rate=0.3):
        super(BioBERTForSpecialistClassification, self).__init__()
        self.biobert = AutoModel.from_pretrained(pretrained_model_name)
        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(self.biobert.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask):
        outputs = self.biobert(input_ids=input_ids, attention_mask=attention_mask)
        pooled_output = outputs.last_hidden_state[:, 0, :]  # Use the [CLS] token for classification
        pooled_output = self.dropout(pooled_output)
        logits = self.classifier(pooled_output)
        return logits

# Initialize the model
model = BioBERTForSpecialistClassification(
    pretrained_model_name=model_name,
    num_labels=len(specialist_mapping),
    dropout_rate=0.3
).to(device)

# Optimizer Setup
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Loss Function
criterion = nn.CrossEntropyLoss()

# Mixed Precision Training
scaler = GradScaler()

# Training Function
def train_model(model, train_loader, optimizer, criterion, device, scaler):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Mixed precision training
        with autocast():  # Automatically cast operations to half precision
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs, labels)

        # Scaler for mixed precision
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        total_loss += loss.item()
    return total_loss / len(train_loader)

# Evaluation Function
def evaluate_model(model, test_loader, device):
    model.eval()
    preds = []
    labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels.extend(batch["label"].tolist())

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds.extend(torch.argmax(outputs, dim=1).tolist())
    accuracy = accuracy_score(labels, preds)
    return accuracy

# Training Loop
for epoch in range(100):  # Train for 100 epochs
    train_loss = train_model(model, train_loader, optimizer, criterion, device, scaler)
    accuracy = evaluate_model(model, test_loader, device)
    print(f"Epoch {epoch + 1}/50 | Train Loss: {train_loss:.4f} | Test Accuracy: {accuracy:.4f}")

# Save the Fine-Tuned Model
torch.save(model.state_dict(), "./biobert_fine_tuned_model.pth")
tokenizer.save_pretrained("./biobert_fine_tuned_model_tokenizer")

  scaler = GradScaler()
  with autocast():  # Automatically cast operations to half precision


Epoch 1/5 | Train Loss: 1.3127 | Test Accuracy: 0.6671


  with autocast():  # Automatically cast operations to half precision


Epoch 2/5 | Train Loss: 1.2185 | Test Accuracy: 0.6184


  with autocast():  # Automatically cast operations to half precision


Epoch 3/5 | Train Loss: 1.3274 | Test Accuracy: 0.1701


  with autocast():  # Automatically cast operations to half precision


Epoch 4/5 | Train Loss: 1.1550 | Test Accuracy: 0.6626


  with autocast():  # Automatically cast operations to half precision


Epoch 5/5 | Train Loss: 1.0802 | Test Accuracy: 0.6445


  with autocast():  # Automatically cast operations to half precision


Epoch 6/5 | Train Loss: 1.1020 | Test Accuracy: 0.6504


  with autocast():  # Automatically cast operations to half precision


Epoch 7/5 | Train Loss: 1.3467 | Test Accuracy: 0.5327


  with autocast():  # Automatically cast operations to half precision


Epoch 8/5 | Train Loss: 1.2727 | Test Accuracy: 0.0641


  with autocast():  # Automatically cast operations to half precision


Epoch 9/5 | Train Loss: 1.9660 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 10/5 | Train Loss: 2.8838 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 11/5 | Train Loss: 2.8800 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 12/5 | Train Loss: 2.8780 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 13/5 | Train Loss: 2.8882 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 14/5 | Train Loss: 2.8830 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 15/5 | Train Loss: 2.8788 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 16/5 | Train Loss: 2.8765 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 17/5 | Train Loss: 2.8752 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 18/5 | Train Loss: 2.8764 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 19/5 | Train Loss: 2.8747 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 20/5 | Train Loss: 2.8738 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 21/5 | Train Loss: 2.8736 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 22/5 | Train Loss: 2.8728 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 23/5 | Train Loss: 2.8725 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 24/5 | Train Loss: 2.8724 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 25/5 | Train Loss: 2.8726 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 26/5 | Train Loss: 2.8722 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 27/5 | Train Loss: 2.8718 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 28/5 | Train Loss: 2.8719 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 29/5 | Train Loss: 2.8716 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 30/5 | Train Loss: 2.8715 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 31/5 | Train Loss: 2.8717 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 32/5 | Train Loss: 2.8714 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 33/5 | Train Loss: 2.8710 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 34/5 | Train Loss: 2.8713 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 35/5 | Train Loss: 2.8713 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 36/5 | Train Loss: 2.8709 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 37/5 | Train Loss: 2.8711 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 38/5 | Train Loss: 2.8711 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 39/5 | Train Loss: 2.8710 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 40/5 | Train Loss: 2.8709 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 41/5 | Train Loss: 2.8712 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 42/5 | Train Loss: 2.8712 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 43/5 | Train Loss: 2.8756 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 44/5 | Train Loss: 2.8726 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 45/5 | Train Loss: 2.8728 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 46/5 | Train Loss: 2.8724 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 47/5 | Train Loss: 2.8719 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 48/5 | Train Loss: 2.8716 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 49/5 | Train Loss: 2.8716 | Test Accuracy: 0.1466


  with autocast():  # Automatically cast operations to half precision


Epoch 50/5 | Train Loss: 2.8711 | Test Accuracy: 0.1466


('./biobert_fine_tuned_model_tokenizer/tokenizer_config.json',
 './biobert_fine_tuned_model_tokenizer/special_tokens_map.json',
 './biobert_fine_tuned_model_tokenizer/vocab.txt',
 './biobert_fine_tuned_model_tokenizer/added_tokens.json',
 './biobert_fine_tuned_model_tokenizer/tokenizer.json')

In [6]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from torch.cuda.amp import GradScaler, autocast  # For mixed precision training
import pandas as pd

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load CSV
data = pd.read_csv("filtered_specialists.csv")

# Calculate the frequency of each speciality
speciality_counts = data['Specialist'].value_counts()

# Get specialities that occur more than 1000 times
specialities_to_keep = speciality_counts[speciality_counts > 500].index

# Filter the data to keep only rows with specialities that occur more than 1000 times
data = data[data['Specialist'].isin(specialities_to_keep)]

# Map the filtered specialists to numeric labels
specialist_mapping = {spec: idx for idx, spec in enumerate(data["Specialist"].unique())}
data["Specialist_id"] = data["Specialist"].map(specialist_mapping)




In [7]:
data.shape

(87586, 5)

In [8]:
specialities_to_keep

Index(['family medicine', 'obstetrics and gynecology', 'dermatology',
       'psychiatry', 'neurology', 'orthopaedic surgery', 'cardiology',
       'emergency medicine', 'pediatrics', 'urology', 'pulmonology',
       'pathology', 'general surgery', 'sexologist',
       'colon and rectal surgery', 'gastroenterology', 'hospital medicine',
       'vascular surgery', 'oncology', 'nephrology', 'neurological surgery',
       'allergy and immunology', 'endocrinology', 'rheumatology',
       'physical medicine and rehabilitation'],
      dtype='object', name='Specialist')