In [None]:
# Mount Google Drive at /content/drive (Colab only); the dataset CSV paths and
# the report output path used later in this notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install Required Libraries
!pip install transformers scikit-learn pandas torch peft difflib

In [None]:
from difflib import get_close_matches
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from peft import LoraConfig, get_peft_model
from sklearn.metrics import classification_report
from transformers import AutoTokenizer, T5ForConditionalGeneration, Trainer, TrainingArguments

# Load LLM

In [None]:
# Choose the compute device up front: CUDA when available, otherwise CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the ClinicalT5 tokenizer and model. The checkpoint is read from its Flax
# weights (from_flax=True) and the resulting model is moved to the chosen device.
# NOTE(review): the load log reports the embedding / lm_head weights as "newly
# initialized" - presumably they are tied to the shared embedding; confirm.
model_name = "luqh/ClinicalT5-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = T5ForConditionalGeneration.from_pretrained(model_name, from_flax=True).to(device)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.92k [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.42M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/664 [00:00<?, ?B/s]

flax_model.msgpack:   0%|          | 0.00/892M [00:00<?, ?B/s]

  pt_model_dict[flax_key] = torch.from_numpy(flax_tensor)
All Flax model weights were used when initializing T5ForConditionalGeneration.

Some weights of T5ForConditionalGeneration were not initialized from the Flax model and are newly initialized: ['decoder.embed_tokens.weight', 'encoder.embed_tokens.weight', 'lm_head.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Load Datasets

In [None]:
# CSV locations of the train / validation / test splits on the mounted Drive
training_set = "/content/drive/MyDrive/P2/T1/Dataset/combined/LLM-c-train.csv"
validation_set = "/content/drive/MyDrive/P2/T1/Dataset/combined/LLM-c-val.csv"
testing_set = "/content/drive/MyDrive/P2/T1/Dataset/combined/LLM-c-test.csv"

# Read every split into its own DataFrame, in train / val / test order
train_data, val_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in (training_set, validation_set, testing_set)
)

In [None]:
# Predefined disease list: the closed set of label strings that generated text
# is snapped onto after decoding (it is passed to filter_predictions below).
# NOTE(review): spellings presumably mirror the `disease_label` column of the
# CSVs exactly (e.g. 'thrombocytopaenia') - confirm before "correcting" any entry.
disease_list = [
 'hypertensive disease', 'fibroid tumor', 'malignant neoplasms', 'adhesion',
 'hepatitis', 'neoplasm metastasis', 'benign prostatic hypertrophy',
 'tachycardia sinus', 'obesity morbid', 'infection', 'affect labile',
 'sickle cell anemia', 'pneumothorax', 'pancreatitis', 'melanoma', 'gastritis',
 'tricuspid valve insufficiency', 'cirrhosis', 'stenosis aortic valve',
 'delirium', 'gastroenteritis', 'kidney failure acute', 'overload fluid',
 'manic disorder', 'failure heart', 'hypertension pulmonary',
 'infection urinary tract', 'neutropenia', 'peripheral vascular disease',
 'ulcer peptic', 'degenerative polyarthritis', 'colitis', 'osteomyelitis',
 'biliary calculus', 'coronary arteriosclerosis', 'hepatitis C',
 'hyperlipidemia', 'chronic obstructive airway disease',
 'deep vein thrombosis', 'epilepsy', 'dependence', 'hyperglycemia', 'obesity',
 'paroxysmal dyspnea', 'carcinoma breast', 'thrombocytopaenia',
 'pyelonephritis', 'effusion pericardial', 'thrombus', 'adenocarcinoma',
 'gout', 'glaucoma', 'arthritis', 'hypoglycemia', 'asthma', 'neuropathy',
 'schizophrenia', 'hepatitis B', 'hyperbilirubinemia',
]

# LoRA (Low-Rank Adaptation)

In [None]:
from peft import LoraConfig, get_peft_model

# LoRA adapter settings for the ClinicalT5 seq2seq model.
# NOTE(review): every target below is a fully-qualified module path inside
# block 0, so presumably only the first encoder/decoder block receives adapters;
# confirm whether all blocks (e.g. the short names "q" / "v") were intended.
lora_config = LoraConfig(
    task_type="SEQ_2_SEQ_LM",  # sequence-to-sequence language modelling
    r=32,                      # rank of the low-rank update matrices
    lora_alpha=64,             # scaling factor
    lora_dropout=0.2,          # dropout applied for regularization
    target_modules=[
        "encoder.block.0.layer.0.SelfAttention.q",
        "encoder.block.0.layer.0.SelfAttention.v",
        "decoder.block.0.layer.0.SelfAttention.q",
        "decoder.block.0.layer.0.SelfAttention.v",
        "decoder.block.0.layer.1.EncDecAttention.q",
        "decoder.block.0.layer.1.EncDecAttention.v"
    ],
)

# Wrap the base model with the adapters; `model` is rebound to the PEFT wrapper,
# so re-running this cell wraps the already-wrapped model again.
model = get_peft_model(model, lora_config)
print("LoRA applied successfully to ClinicalT5!")


LoRA applied successfully to ClinicalT5!


# Tokenize Symptoms

In [None]:
# Tokenize the datasets
def tokenize_texts(texts):
    """Tokenize input texts with the notebook-level ``tokenizer``.

    Args:
        texts: Object exposing ``.tolist()`` (the notebook passes a DataFrame
            column) holding the raw input strings.

    Returns:
        The tokenizer output as PyTorch tensors, padded within the call and
        truncated to at most 128 tokens.
    """
    raw_texts = texts.tolist()
    tokenizer_options = dict(padding=True, truncation=True, max_length=128, return_tensors="pt")
    return tokenizer(raw_texts, **tokenizer_options)

# Encode the symptom text of the training and validation splits
train_encodings, val_encodings = (
    tokenize_texts(split['symptoms']) for split in (train_data, val_data)
)

# Tokenize the labels
def tokenize_labels(labels):
    """Tokenize target label strings with the notebook-level ``tokenizer``.

    Args:
        labels: Object exposing ``.tolist()`` (the notebook passes a DataFrame
            column) holding the target disease names.

    Returns:
        The tokenizer output as PyTorch tensors, padded within the call and
        truncated to at most 128 tokens.
    """
    raw_labels = labels.tolist()
    tokenizer_options = dict(padding=True, truncation=True, max_length=128, return_tensors="pt")
    return tokenizer(raw_labels, **tokenizer_options)

# Encode the disease labels of the training and validation splits
train_labels_encodings, val_labels_encodings = (
    tokenize_labels(split['disease_label']) for split in (train_data, val_data)
)

# Dataset class for fine-tuning
class SymptomDataset(torch.utils.data.Dataset):
    """Pairs tokenized symptom inputs with tokenized disease-label targets.

    Each item contains every field of ``encodings`` for one example plus a
    ``labels`` tensor for the seq2seq loss.

    Two deliberate choices keep training honest:

    * ``decoder_input_ids`` is NOT supplied. When only ``labels`` is given,
      T5ForConditionalGeneration derives the decoder inputs by shifting the
      labels one step to the right. Feeding the unshifted labels as decoder
      inputs would show the decoder the very token it has to predict.
    * Padding positions in ``labels`` are replaced by -100 so they are ignored
      by the cross-entropy loss instead of being learned as targets.

    Args:
        encodings: Mapping of field name -> tensor indexed by example
            (e.g. ``input_ids``, ``attention_mask``).
        labels_encodings: Mapping with tensor fields ``input_ids`` and
            ``attention_mask`` for the tokenized targets.
    """

    # Index that the loss function skips.
    IGNORE_INDEX = -100

    def __init__(self, encodings, labels_encodings):
        self.encodings = encodings
        self.labels_encodings = labels_encodings

    def __len__(self):
        """Return the number of examples (rows of the target ``input_ids``)."""
        return len(self.labels_encodings['input_ids'])

    def __getitem__(self, idx):
        """Return the model inputs and loss targets for example ``idx``."""
        item = {key: val[idx] for key, val in self.encodings.items()}
        target_ids = self.labels_encodings['input_ids'][idx]
        target_mask = self.labels_encodings['attention_mask'][idx]
        # masked_fill returns a new tensor, so the stored encodings stay intact.
        item['labels'] = target_ids.masked_fill(target_mask == 0, self.IGNORE_INDEX)
        return item

# Create PyTorch Datasets
train_dataset = SymptomDataset(train_encodings, train_labels_encodings)
val_dataset = SymptomDataset(val_encodings, val_labels_encodings)

# Fine-tuning ClinicalT5

In [None]:
# Fine-tuning configuration for ClinicalT5
training_args = TrainingArguments(
    # Optimisation schedule
    num_train_epochs=6,
    learning_rate=5e-5,
    weight_decay= 0.001,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=128,
    # Evaluate and checkpoint once per epoch; checkpoints go to ./results
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,
    output_dir="./results",
    # Log the training loss every 10 steps into ./logs
    logging_strategy="steps",
    logging_steps=10,
    logging_dir="./logs",
)

# Trainer over the training split, evaluated on the validation split
trainer = Trainer(model=model, args=training_args, train_dataset=train_dataset, eval_dataset=val_dataset)




In [None]:
# Run the fine-tuning loop configured above; per-epoch training and validation
# losses are shown in the cell output.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.6705,0.19438
2,0.3043,0.027933
3,0.2138,0.011047
4,0.1748,0.006432
5,0.1344,0.004615
6,0.1296,0.004252


TrainOutput(global_step=19104, training_loss=0.4651545347814359, metrics={'train_runtime': 1976.0676, 'train_samples_per_second': 154.683, 'train_steps_per_second': 9.668, 'total_flos': 2.402985582054605e+16, 'train_loss': 0.4651545347814359, 'epoch': 6.0})

# Diagnose disease with ClinicalT5

In [None]:
import torch
from transformers import AutoTokenizer, T5ForConditionalGeneration
from peft import LoraConfig, get_peft_model
from difflib import get_close_matches
import pandas as pd

# Function to predict diseases using ClinicalT5 in batches
def predict_disease_batched(symptoms, tokenizer, model, device, batch_size=32,
                            max_input_length=128, max_output_length=32):
    """Generate one disease string per symptom description, batch by batch.

    Args:
        symptoms: Iterable of symptom strings (list, tuple, Series, ...). A
            single string is treated as one description.
        tokenizer: Callable tokenizer returning an object with ``.to(device)``
            and ``input_ids`` / ``attention_mask`` entries, plus ``batch_decode``.
        model: Model exposing ``eval()``, ``train()`` and ``generate()``.
        device: Device the tokenized batch is moved to before generation.
        batch_size: Number of descriptions generated per forward pass.
        max_input_length: Truncation length (in tokens) for the inputs.
        max_output_length: ``max_length`` passed to ``model.generate``.

    Returns:
        List of decoded predictions, in the same order as ``symptoms``.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    # Materialise the input so slicing always yields a plain list for the tokenizer.
    symptoms = [symptoms] if isinstance(symptoms, str) else list(symptoms)

    predictions = []
    if not symptoms:
        return predictions

    # Generation must run with dropout disabled; remember the previous mode so
    # a model that is still being trained is handed back unchanged.
    was_training = getattr(model, "training", False)
    model.eval()
    try:
        with torch.no_grad():
            for start in range(0, len(symptoms), batch_size):
                batch_symptoms = symptoms[start:start + batch_size]
                inputs = tokenizer(
                    batch_symptoms, return_tensors="pt", padding=True,
                    truncation=True, max_length=max_input_length
                ).to(device)

                outputs = model.generate(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    max_length=max_output_length,
                )

                predictions.extend(tokenizer.batch_decode(outputs, skip_special_tokens=True))
    finally:
        if was_training:
            model.train()
    return predictions


# Function to restrict predictions to the disease list using soft matching
def filter_predictions(predictions, disease_list, cutoff=0.6, unknown_label="Unknown"):
    """Snap free-text predictions onto a fixed list of disease names.

    A prediction (with surrounding whitespace removed) is kept when it is
    already in ``disease_list``. Otherwise the closest entry according to
    ``difflib.get_close_matches`` is used, and ``unknown_label`` is returned
    when nothing reaches ``cutoff``. Non-string predictions map to
    ``unknown_label``.

    Args:
        predictions: Iterable of generated strings.
        disease_list: Allowed disease names.
        cutoff: Minimum similarity in [0, 1] for a fuzzy match.
        unknown_label: Value emitted when no allowed name is close enough.

    Returns:
        List with one resolved label per prediction, in input order.
    """
    known_diseases = set(disease_list)
    # Model outputs repeat heavily; resolve each distinct string only once
    # because the fuzzy comparison against the whole list is the costly step.
    resolved = {}
    filtered_predictions = []
    for pred in predictions:
        if not isinstance(pred, str):
            filtered_predictions.append(unknown_label)
            continue

        candidate = pred.strip()
        if candidate not in resolved:
            if candidate in known_diseases:
                resolved[candidate] = candidate
            else:
                closest_match = get_close_matches(candidate, disease_list, n=1, cutoff=cutoff)
                resolved[candidate] = closest_match[0] if closest_match else unknown_label
        filtered_predictions.append(resolved[candidate])
    return filtered_predictions

# Validation split: generate raw text, then map each output onto the disease list
print("Generating predictions for validation data...")
val_predictions_raw = predict_disease_batched(
    symptoms=val_data['symptoms'].tolist(), tokenizer=tokenizer, model=model, device=device
)
val_predictions = filter_predictions(predictions=val_predictions_raw, disease_list=disease_list)

# Test split: same two steps
print("Generating predictions for test data...")
test_predictions_raw = predict_disease_batched(
    symptoms=test_data['symptoms'].tolist(), tokenizer=tokenizer, model=model, device=device
)
test_predictions = filter_predictions(predictions=test_predictions_raw, disease_list=disease_list)

# Evaluation

In [None]:
# Ground-truth disease labels for both evaluation splits
val_true_labels, test_true_labels = (
    split['disease_label'].tolist() for split in (val_data, test_data)
)

# Classification report for the validation split (zero_division=0 is passed through)
val_report = classification_report(y_true=val_true_labels, y_pred=val_predictions, zero_division=0)
print("Validation Report:")
print(val_report)

In [None]:
# Ground-truth labels for the held-out test split.
# The validation report is already printed by the preceding evaluation cell,
# so this cell only reports on the test split instead of repeating it.
test_true_labels = test_data['disease_label'].tolist()

# Test Report
test_report = classification_report(
    test_true_labels,
    test_predictions,
    zero_division=0
)
print("Test Report:")
print(test_report)

In [None]:
# Save test classification report to CSV
save_path = "/content/drive/MyDrive/P2/T1/Dataset/combined/LLM/ClinicalT5-c-class.csv"

# Create the destination folder first: to_csv raises if the directory is missing
# (e.g. on a Drive where the LLM/ sub-folder has not been created yet).
Path(save_path).parent.mkdir(parents=True, exist_ok=True)

# Same report as printed above, but as a dict so it can become one row per class
test_report_dict = classification_report(test_true_labels, test_predictions, output_dict=True, zero_division=0)
test_report_df = pd.DataFrame(test_report_dict).transpose()
test_report_df.to_csv(save_path, index=True)

print(f"Classification report saved successfully to {save_path}")

Classification report saved successfully to /content/drive/MyDrive/P2/T1/Dataset/combined/LLM/ClinicalT5-c-class.csv
