In [1]:
import numpy as np
import pandas as pd

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "23tanmay/BioDistillGPT2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load CSV
data = pd.read_csv("filtered_specialists.csv")

# Map specialists to numeric labels
specialist_mapping = {spec: idx for idx, spec in enumerate(data["Specialist"].unique())}
data["Specialist_id"] = data["Specialist"].map(specialist_mapping)

# Train-test split (80:20)
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["Patient"].tolist(),  # Convert to list
    data["Specialist_id"].tolist(),  # Convert to list
    test_size=0.2,
    random_state=42,
)

# Load tokenizer
model_name = "23tanmay/BioDistillGPT2"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize data
def tokenize_data(texts, labels):
    tokenized = tokenizer(
        texts, 
        padding="max_length", 
        truncation=True, 
        max_length=128, 
        return_tensors="pt"
    )
    tokenized["labels"] = torch.tensor(labels)  # Ensure labels are a tensor
    return tokenized

# Tokenize train and test data
train_data = tokenize_data(train_texts, train_labels)
test_data = tokenize_data(test_texts, test_labels)

# Load the model
num_labels = len(specialist_mapping)  # Number of specialists
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)

TypeError: TextEncodeInput must be Union[TextInputSequence, Tuple[InputSequence, InputSequence]]

In [None]:
from transformers import Trainer, TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=5,
    weight_decay=0.01,
    logging_dir="./logs",
    load_best_model_at_end=True,
    save_total_limit=2
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_data,
    eval_dataset=test_data,
    tokenizer=tokenizer,
)

trainer.train()


In [None]:
eval_results = trainer.evaluate()
print("Testing Loss:", eval_results["eval_loss"])


In [None]:
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

In [5]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load CSV
data = pd.read_csv("filtered_specialists.csv")

data = data.head(100)
# data = data.head(100)

# Map specialists to numeric labels
specialist_mapping = {spec: idx for idx, spec in enumerate(data["Specialist"].unique())}
data["Specialist_id"] = data["Specialist"].map(specialist_mapping)

# Train-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["Patient"].astype(str),  # Convert to string explicitly
    data["Specialist_id"],
    test_size=0.2,
    random_state=42,
)

# Load tokenizer and set padding token
model_name = "23tanmay/BioDistillGPT2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token

# Custom Dataset Class
class SpecialistDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])  # Ensure each text is a string
        label = self.labels[idx]
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

# Create Datasets
train_dataset = SpecialistDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
test_dataset = SpecialistDataset(test_texts.tolist(), test_labels.tolist(), tokenizer)

# Data Collation Function
def collate_fn(batch):
    input_ids = torch.nn.utils.rnn.pad_sequence(
        [item["input_ids"] for item in batch], batch_first=True, padding_value=tokenizer.pad_token_id
    )
    attention_mask = torch.nn.utils.rnn.pad_sequence(
        [item["attention_mask"] for item in batch], batch_first=True, padding_value=0
    )
    labels = torch.tensor([item["label"] for item in batch])
    return {"input_ids": input_ids, "attention_mask": attention_mask, "label": labels}

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)

# Load model and configure padding token
num_labels = len(specialist_mapping)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model.config.pad_token_id = tokenizer.pad_token_id  # Align model's padding token with tokenizer
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Loss Function
criterion = torch.nn.CrossEntropyLoss()

# Training Function
def train_model(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

# Evaluation Function
def evaluate_model(model, test_loader, device):
    model.eval()
    preds = []
    labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels.extend(batch["label"].tolist())

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds.extend(torch.argmax(outputs.logits, dim=1).tolist())
    accuracy = accuracy_score(labels, preds)
    return accuracy

# Training Loop
epochs = 5
for epoch in range(epochs):
    train_loss = train_model(model, train_loader, optimizer, criterion, device)
    accuracy = evaluate_model(model, test_loader, device)
    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Train Loss: {train_loss:.4f} | Test Accuracy: {accuracy:.4f}")

# Save the Fine-Tuned Model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at 23tanmay/BioDistillGPT2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/5
Train Loss: 2.3767 | Test Accuracy: 0.4500
Epoch 2/5
Train Loss: 1.6740 | Test Accuracy: 0.8000
Epoch 3/5
Train Loss: 1.2207 | Test Accuracy: 0.8000
Epoch 4/5
Train Loss: 0.8616 | Test Accuracy: 0.8000
Epoch 5/5
Train Loss: 0.6647 | Test Accuracy: 0.8000


('./fine_tuned_model\\tokenizer_config.json',
 './fine_tuned_model\\special_tokens_map.json',
 './fine_tuned_model\\vocab.json',
 './fine_tuned_model\\merges.txt',
 './fine_tuned_model\\added_tokens.json',
 './fine_tuned_model\\tokenizer.json')

In [3]:
# from transformers import get_scheduler

# # Hyperparameter Search Space
# param_grid = {
#     "learning_rate": [1e-5, 3e-5, 5e-5],
#     "batch_size": [8, 16, 32],
#     "weight_decay": [0.0, 0.01],
# }

# best_model = None
# best_accuracy = 0
# best_params = {}

# # Tuning Loop
# for lr in param_grid["learning_rate"]:
#     for batch_size in param_grid["batch_size"]:
#         for weight_decay in param_grid["weight_decay"]:
#             print(f"\nTesting Configuration: LR={lr}, Batch Size={batch_size}, Weight Decay={weight_decay}")
            
#             # Update DataLoader with New Batch Size
#             train_loader = DataLoader(
#                 train_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
#             )
#             test_loader = DataLoader(test_dataset, batch_size=batch_size, collate_fn=collate_fn)
            
#             # Define Model, Optimizer, and Scheduler
#             model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
#             model.config.pad_token_id = tokenizer.pad_token_id
#             model.to(device)

#             optimizer = AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
#             num_training_steps = len(train_loader) * 5  # Assuming 5 epochs
#             scheduler = get_scheduler("linear", optimizer=optimizer, num_training_steps=num_training_steps)

#             # Training Loop for Current Configuration
#             for epoch in range(5):  # Use fewer epochs to speed up tuning
#                 train_loss = train_model(model, train_loader, optimizer, criterion, device)
#                 accuracy = evaluate_model(model, test_loader, device)
#                 print(f"Epoch {epoch+1}: Train Loss={train_loss:.4f}, Accuracy={accuracy:.4f}")
                
#                 # Track Best Model
#                 if accuracy > best_accuracy:
#                     best_accuracy = accuracy
#                     best_model = model
#                     best_params = {"learning_rate": lr, "batch_size": batch_size, "weight_decay": weight_decay}

# print("\nBest Configuration:")
# print(f"Learning Rate: {best_params['learning_rate']}")
# print(f"Batch Size: {best_params['batch_size']}")
# print(f"Weight Decay: {best_params['weight_decay']}")
# print(f"Best Accuracy: {best_accuracy:.4f}")

# # Save Best Model
# best_model.save_pretrained("./best_model")
# tokenizer.save_pretrained("./best_model")


Testing Configuration: LR=1e-05, Batch Size=8, Weight Decay=0.0


Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at 23tanmay/BioDistillGPT2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ValueError: SchedulerType.LINEAR requires `num_warmup_steps`, please provide that argument.

In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load CSV
data = pd.read_csv("filtered_specialists.csv")

# Limit training instances to a maximum of 40 per specialist type
max_per_specialty = 60
data = data.groupby("Specialist").head(max_per_specialty).reset_index(drop=True)
print(data.head())

  from .autonotebook import tqdm as notebook_tqdm


   id                                            Patient  \
0   0  Hi doctor,I am just wondering what is abutting...   
1   1  Hi doctor, I am a 26 year old male. I am 5 fee...   
2   2  Hello doctor, I am 48 years old. I am experien...   
3   3  Hello doctor, I have multiple small cysts in b...   
4   4  Hi doctor, During masturbation I just rub the ...   

                                         Description  \
0  hi. i have gone through your query with dilige...   
1  hello. i have gone through your information an...   
2  hi. for further doubts consult a sexologist on...   
3  hello. i just read your query. see kalarachi k...   
4  hi. for further doubts consult a sexologist on...   

                  Specialist  
0                  neurology  
1            general surgery  
2                 sexologist  
3  obstetrics and gynecology  
4                 sexologist  


In [13]:
# Count the number of instances per specialty
data1 = pd.read_csv("filtered_specialists.csv")
specialty_counts = data1["Specialist"].value_counts()

# Filter specialties with fewer than 60 instances
specialties_less_than_60 = specialty_counts[specialty_counts <= 1000]

# Print specialties and their instance counts
print("Specialties with fewer than 60 instances:")
print(specialties_less_than_60)


Specialties with fewer than 60 instances:
Specialist
oncology                                820
nephrology                              813
neurological surgery                    748
allergy and immunology                  583
endocrinology                           540
rheumatology                            514
physical medicine and rehabilitation    509
otolaryngology                          405
hospice and palliative medicine         378
thoracic surgery                        332
radiology                               282
plastic surgery                         211
anesthesiology                           61
ophthalmic surgery                       33
sleep medicine                           16
preventive medicine                      11
genetics and genomics                    11
forensic pathology                        5
Name: count, dtype: int64


In [3]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load CSV
data = pd.read_csv("filtered_specialists.csv")

# Limit training instances to a maximum of 40 per specialist type
max_per_specialty = 40
data = data.groupby("Specialist").head(max_per_specialty).reset_index(drop=True)

# Map specialists to numeric labels
specialist_mapping = {spec: idx for idx, spec in enumerate(data["Specialist"].unique())}
data["Specialist_id"] = data["Specialist"].map(specialist_mapping)

# Train-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["Patient"].astype(str),  # Convert to string explicitly
    data["Specialist_id"],
    test_size=0.2,
    random_state=42,
)

# Load tokenizer and set padding token
model_name = "23tanmay/BioDistillGPT2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token

# Custom Dataset Class
class SpecialistDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])  # Ensure each text is a string
        label = self.labels[idx]
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

# Create Datasets
train_dataset = SpecialistDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
test_dataset = SpecialistDataset(test_texts.tolist(), test_labels.tolist(), tokenizer)

# Data Collation Function
def collate_fn(batch):
    input_ids = torch.nn.utils.rnn.pad_sequence(
        [item["input_ids"] for item in batch], batch_first=True, padding_value=tokenizer.pad_token_id
    )
    attention_mask = torch.nn.utils.rnn.pad_sequence(
        [item["attention_mask"] for item in batch], batch_first=True, padding_value=0
    )
    labels = torch.tensor([item["label"] for item in batch])
    return {"input_ids": input_ids, "attention_mask": attention_mask, "label": labels}

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)

# Load model and configure padding token
num_labels = len(specialist_mapping)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model.config.pad_token_id = tokenizer.pad_token_id  # Align model's padding token with tokenizer
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=5e-5)

# Loss Function
criterion = torch.nn.CrossEntropyLoss()

# Training Function
def train_model(model, train_loader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(train_loader)

# Evaluation Function
def evaluate_model(model, test_loader, device):
    model.eval()
    preds = []
    labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels.extend(batch["label"].tolist())

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds.extend(torch.argmax(outputs.logits, dim=1).tolist())
    accuracy = accuracy_score(labels, preds)
    return accuracy

# Training Loop
epochs = 5
for epoch in range(epochs):
    train_loss = train_model(model, train_loader, optimizer, criterion, device)
    accuracy = evaluate_model(model, test_loader, device)
    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Train Loss: {train_loss:.4f} | Test Accuracy: {accuracy:.4f}")

# Save the Fine-Tuned Model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

KeyboardInterrupt: 

In [4]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW, get_scheduler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report
from torch.cuda.amp import GradScaler, autocast  # For mixed precision training

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load CSV
data = pd.read_csv("filtered_specialists.csv")

# Map specialists to numeric labels
specialist_mapping = {spec: idx for idx, spec in enumerate(data["Specialist"].unique())}
data["Specialist_id"] = data["Specialist"].map(specialist_mapping)

# Train-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["Patient"].astype(str),  # Convert to string explicitly
    data["Specialist_id"],
    test_size=0.2,
    random_state=42,
)

# Load tokenizer and set padding token
model_name = "23tanmay/BioDistillGPT2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token

# Custom Dataset Class
class SpecialistDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_length=128):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = str(self.texts[idx])  # Ensure each text is a string
        label = self.labels[idx]
        encoded = self.tokenizer(
            text,
            padding="max_length",
            truncation=True,
            max_length=self.max_length,
            return_tensors="pt",
        )
        return {
            "input_ids": encoded["input_ids"].squeeze(0),
            "attention_mask": encoded["attention_mask"].squeeze(0),
            "label": torch.tensor(label, dtype=torch.long),
        }

# Create Datasets
train_dataset = SpecialistDataset(train_texts.tolist(), train_labels.tolist(), tokenizer)
test_dataset = SpecialistDataset(test_texts.tolist(), test_labels.tolist(), tokenizer)

# Data Collation Function
def collate_fn(batch):
    input_ids = torch.nn.utils.rnn.pad_sequence(
        [item["input_ids"] for item in batch], batch_first=True, padding_value=tokenizer.pad_token_id
    )
    attention_mask = torch.nn.utils.rnn.pad_sequence(
        [item["attention_mask"] for item in batch], batch_first=True, padding_value=0
    )
    labels = torch.tensor([item["label"] for item in batch])
    return {"input_ids": input_ids, "attention_mask": attention_mask, "label": labels}

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)

# Load model and configure padding token
num_labels = len(specialist_mapping)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model.config.pad_token_id = tokenizer.pad_token_id  # Align model's padding token with tokenizer
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=1e-5)

# Mixed Precision Training Setup
scaler = GradScaler()  # Used to scale gradients for mixed precision

# Loss Function
criterion = torch.nn.CrossEntropyLoss()

# Training Function
def train_model(model, train_loader, optimizer, criterion, device, scaler):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["label"].to(device)

        # Mixed precision training
        with autocast():  # Automatically cast operations to half precision
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Scaler for mixed precision
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()
        
        total_loss += loss.item()
    return total_loss / len(train_loader)

# Evaluation Function
def evaluate_model(model, test_loader, device):
    model.eval()
    preds = []
    labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels.extend(batch["label"].tolist())

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds.extend(torch.argmax(outputs.logits, dim=1).tolist())
    accuracy = accuracy_score(labels, preds)
    return accuracy

# Training Loop
epochs = 5
for epoch in range(epochs):
    train_loss = train_model(model, train_loader, optimizer, criterion, device, scaler)
    accuracy = evaluate_model(model, test_loader, device)
    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Train Loss: {train_loss:.4f} | Test Accuracy: {accuracy:.4f}")

# Save the Fine-Tuned Model
model.save_pretrained("./fine_tuned_model")
tokenizer.save_pretrained("./fine_tuned_model")

Some weights of GPT2ForSequenceClassification were not initialized from the model checkpoint at 23tanmay/BioDistillGPT2 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = GradScaler()  # Used to scale gradients for mixed precision
  with autocast():  # Automatically cast operations to half precision


Epoch 1/5
Train Loss: 1.8167 | Test Accuracy: 0.5539


  with autocast():  # Automatically cast operations to half precision


Epoch 2/5
Train Loss: 1.4463 | Test Accuracy: 0.5665


  with autocast():  # Automatically cast operations to half precision


Epoch 3/5
Train Loss: 1.3612 | Test Accuracy: 0.5709


  with autocast():  # Automatically cast operations to half precision


Epoch 4/5
Train Loss: 1.2978 | Test Accuracy: 0.5749


  with autocast():  # Automatically cast operations to half precision


Epoch 5/5
Train Loss: 1.2416 | Test Accuracy: 0.5722


('./fine_tuned_model/tokenizer_config.json',
 './fine_tuned_model/special_tokens_map.json',
 './fine_tuned_model/vocab.json',
 './fine_tuned_model/merges.txt',
 './fine_tuned_model/added_tokens.json',
 './fine_tuned_model/tokenizer.json')

In [3]:
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [None]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from torch.cuda.amp import GradScaler, autocast  # For mixed precision training
import pickle

# Check if GPU is available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load CSV
data = pd.read_csv("filtered_specialists.csv")

# Map specialists to numeric labels
specialist_mapping = {spec: idx for idx, spec in enumerate(data["Specialist"].unique())}
data["Specialist_id"] = data["Specialist"].map(specialist_mapping)

# Train-test split
train_texts, test_texts, train_labels, test_labels = train_test_split(
    data["Patient"].astype(str),  # Convert to string explicitly
    data["Specialist_id"],
    test_size=0.2,
    random_state=42,
)

# Load BioBERT tokenizer
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
tokenizer.pad_token = tokenizer.eos_token  # Set padding token

# Pre-tokenize and cache the datasets
def preprocess_and_cache(texts, labels, tokenizer, max_length, cache_file):
    encodings = tokenizer(
        texts.tolist(),
        truncation=True,
        padding=True,
        max_length=max_length,
        return_tensors="pt",
    )
    with open(cache_file, "wb") as f:
        pickle.dump((encodings, labels.tolist()), f)

# Define maximum sequence length based on data distribution
data["Patient"] = data["Patient"].astype(str)
max_length = int(data["Patient"].apply(len).quantile(0.95))
if tokenizer.pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})
preprocess_and_cache(train_texts, train_labels, tokenizer, max_length, "train_encodings.pkl")
preprocess_and_cache(test_texts, test_labels, tokenizer, max_length, "test_encodings.pkl")

# Load cached datasets
def load_cached_data(cache_file):
    with open(cache_file, "rb") as f:
        encodings, labels = pickle.load(f)
    return encodings, labels

train_encodings, train_labels = load_cached_data("train_encodings.pkl")
test_encodings, test_labels = load_cached_data("test_encodings.pkl")

# Custom Dataset Class
class SpecialistDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.encodings["input_ids"][idx],
            "attention_mask": self.encodings["attention_mask"][idx],
            "label": torch.tensor(self.labels[idx], dtype=torch.long),
        }

# Create Datasets
train_dataset = SpecialistDataset(train_encodings, train_labels)
test_dataset = SpecialistDataset(test_encodings, test_labels)

# Data Collation Function (Dynamic Padding)
def collate_fn(batch):
    input_ids = torch.nn.utils.rnn.pad_sequence(
        [item["input_ids"] for item in batch], batch_first=True, padding_value=tokenizer.pad_token_id
    )
    attention_mask = torch.nn.utils.rnn.pad_sequence(
        [item["attention_mask"] for item in batch], batch_first=True, padding_value=0
    )
    labels = torch.tensor([item["label"] for item in batch])
    return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True, collate_fn=collate_fn)
test_loader = DataLoader(test_dataset, batch_size=16, collate_fn=collate_fn)

# Load BioBERT model
num_labels = len(specialist_mapping)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)
model.config.pad_token_id = tokenizer.pad_token_id  # Align model's padding token with tokenizer
model.gradient_checkpointing_enable()  # Enable gradient checkpointing for memory efficiency
model.to(device)

# Optimizer
optimizer = AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

# Mixed Precision Training Setup
scaler = GradScaler()  # Used to scale gradients for mixed precision

# Loss Function
criterion = torch.nn.CrossEntropyLoss()

# Training Function
def train_model(model, train_loader, optimizer, criterion, device, scaler):
    model.train()
    total_loss = 0
    for batch in train_loader:
        optimizer.zero_grad()
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = batch["labels"].to(device)

        # Mixed precision training
        with autocast():
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Scaler for mixed precision
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        total_loss += loss.item()
    return total_loss / len(train_loader)

# Evaluation Function
def evaluate_model(model, test_loader, device):
    model.eval()
    preds = []
    labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            labels.extend(batch["labels"].tolist())

            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            preds.extend(torch.argmax(outputs.logits, dim=1).tolist())

    report = classification_report(labels, preds, target_names=list(specialist_mapping.keys()))
    return report

# Training Loop
epochs = 5
for epoch in range(epochs):
    train_loss = train_model(model, train_loader, optimizer, criterion, device, scaler)
    print(f"Epoch {epoch+1}/{epochs}")
    print(f"Train Loss: {train_loss:.4f}")

# Evaluate Model
report = evaluate_model(model, test_loader, device)
print("Classification Report:")
print(report)

# Save the Fine-Tuned Model
model.save_pretrained("./fine_tuned_biobert_model")
tokenizer.save_pretrained("./fine_tuned_biobert_model")