In [32]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [33]:
!pip install transformers datasets peft wandb sacremoses



In [34]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
from peft import LoraConfig, get_peft_model

In [35]:
# Check GPU availability
if torch.cuda.is_available():
    print("GPU is available!")
    print(f"GPU name: {torch.cuda.get_device_name(0)}")
else:
    print("GPU not available. Please enable it in runtime settings.")

GPU is available!
GPU name: NVIDIA A100-SXM4-40GB


In [36]:
import torch
torch.cuda.empty_cache()

# Load Dataset

In [37]:
# File paths
training_set = "/content/drive/MyDrive/P2/T1/Dataset/combined/LLM-c-train.csv"
validation_set = "/content/drive/MyDrive/P2/T1/Dataset/combined/LLM-c-val.csv"
testing_set = "/content/drive/MyDrive/P2/T1/Dataset/combined/LLM-c-test.csv"

# Load datasets
train_data = pd.read_csv(training_set)
val_data = pd.read_csv(validation_set)
test_data = pd.read_csv(testing_set)

# Add prompt column to the datasets
for dataset in [train_data, val_data, test_data]:
    dataset['prompt'] = dataset.apply(
        lambda row: f"Symptoms: {row['symptoms']}\nPredict the disease:",
        axis=1
    )

# Create mappings for text-based labels
unique_labels = train_data['disease_label'].unique()
label_to_id = {label: idx for idx, label in enumerate(unique_labels)}
id_to_label = {idx: label for idx, label in enumerate(unique_labels)}

# Map text labels to numeric indices
for dataset in [train_data, val_data, test_data]:
    dataset['target'] = dataset['disease_label'].map(label_to_id)

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_data)

print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of validation samples: {len(val_dataset)}")
print(f"Number of testing samples: {len(test_dataset)}")

disease_counts = train_data['disease_label'].value_counts()
print("Disease Distribution in Training Data:")
print(disease_counts)


Number of training samples: 50944
Number of validation samples: 9646
Number of testing samples: 9718
Disease Distribution in Training Data:
disease_label
hypertensive disease                  1530
biliary calculus                      1491
hyperlipidemia                        1464
coronary arteriosclerosis             1385
colitis                               1303
paroxysmal dyspnea                    1270
failure heart                         1120
osteomyelitis                         1115
infection                             1093
pyelonephritis                        1068
malignant neoplasms                   1033
gastritis                             1007
stenosis aortic valve                  994
overload fluid                         978
delirium                               964
hepatitis                              962
pancreatitis                           958
deep vein thrombosis                   943
manic disorder                         936
dependence                   

In [38]:
disease_counts = train_data['disease_label'].value_counts()
print("Disease Distribution in Training Data:")
print(disease_counts)

Disease Distribution in Training Data:
disease_label
hypertensive disease                  1530
biliary calculus                      1491
hyperlipidemia                        1464
coronary arteriosclerosis             1385
colitis                               1303
paroxysmal dyspnea                    1270
failure heart                         1120
osteomyelitis                         1115
infection                             1093
pyelonephritis                        1068
malignant neoplasms                   1033
gastritis                             1007
stenosis aortic valve                  994
overload fluid                         978
delirium                               964
hepatitis                              962
pancreatitis                           958
deep vein thrombosis                   943
manic disorder                         936
dependence                             930
tachycardia sinus                      929
adenocarcinoma                         922
e

# Load LLM

In [39]:
# Import the required classes
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Load BioBERT
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=59)  # Placeholder for `num_labels`

# GPU Check
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# LoRA (Low-Rank Adaptation)

In [40]:

# Configure LoRA for BioBERT
lora_config = LoraConfig(
    r=16,  # Low-rank adaptation rank
    lora_alpha=32,  # Scaling factor
    target_modules=["query", "value"],  # LoRA applied to attention layers
    lora_dropout=0.1,  # Dropout rate for regularization
    task_type="SEQ_CLS"  # Sequence classification task
)

model = get_peft_model(model, lora_config)
print("LoRA applied successfully to BioBERT!")


LoRA applied successfully to BioBERT!


# Tokenization function

In [41]:
# Tokenization function for the dataset
def tokenize_function(examples):
    # Tokenize the 'prompt' column
    tokenized_inputs = tokenizer(
        examples['prompt'],  # Input text (symptoms prompt)
        truncation=True,
        padding='max_length',
        max_length=256,  # Adjust as needed
        return_tensors="pt"
    )
    # Add 'labels' field (numeric targets)
    tokenized_inputs["labels"] = examples["target"]
    return tokenized_inputs

# Apply tokenization to the datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
val_dataset = val_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Set the dataset format for PyTorch
train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
val_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
test_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])


Map:   0%|          | 0/50944 [00:00<?, ? examples/s]

Map:   0%|          | 0/9646 [00:00<?, ? examples/s]

Map:   0%|          | 0/9718 [00:00<?, ? examples/s]

# Training Arguments

In [42]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    logging_strategy="steps",
    logging_steps=10,
    learning_rate=5e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs = 6,
    weight_decay=0.001,
    logging_dir="./logs",
    save_strategy="epoch",
    save_total_limit=2,
    remove_unused_columns=False  # Disable column filtering
)

# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset= val_dataset,
    tokenizer=tokenizer
)


  trainer = Trainer(


In [43]:
# Initialize the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset= val_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [44]:
# Train the model
trainer.train()

Epoch,Training Loss,Validation Loss
1,2.1635,1.971682
2,1.4116,1.213706
3,1.0954,0.849066
4,0.9661,0.673803
5,0.8884,0.596718
6,0.8887,0.572131


TrainOutput(global_step=9552, training_loss=1.4626299418771087, metrics={'train_runtime': 2297.4417, 'train_samples_per_second': 133.045, 'train_steps_per_second': 4.158, 'total_flos': 4.053059239359283e+16, 'train_loss': 1.4626299418771087, 'epoch': 6.0})

# Training and Validation Loss Graph

In [45]:
"""
import matplotlib.pyplot as plt

# Data from the table
epochs = [1, 2, 3, 4, 5, 6]
training_loss = [2.616900, 1.970200, 1.714200, 1.640900, 1.364900, 1.114800]
validation_loss = [2.713036, 1.945025, 1.528872, 1.351326, 1.194266, 1.159532]

# Plot the graph
plt.figure(figsize=(10, 6))
plt.plot(epochs, training_loss, label="Training Loss", marker="o")
plt.plot(epochs, validation_loss, label="Validation Loss", marker="o")

# Add labels, title, and legend
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Validation Loss Over Epochs")
plt.legend()
plt.grid(True)

# Show the plot
plt.show()
"""

'\nimport matplotlib.pyplot as plt\n\n# Data from the table\nepochs = [1, 2, 3, 4, 5, 6]\ntraining_loss = [2.616900, 1.970200, 1.714200, 1.640900, 1.364900, 1.114800]\nvalidation_loss = [2.713036, 1.945025, 1.528872, 1.351326, 1.194266, 1.159532]\n\n# Plot the graph\nplt.figure(figsize=(10, 6))\nplt.plot(epochs, training_loss, label="Training Loss", marker="o")\nplt.plot(epochs, validation_loss, label="Validation Loss", marker="o")\n\n# Add labels, title, and legend\nplt.xlabel("Epoch")\nplt.ylabel("Loss")\nplt.title("Training and Validation Loss Over Epochs")\nplt.legend()\nplt.grid(True)\n\n# Show the plot\nplt.show()\n'

# Evaluate the fine-tuned model directly

In [46]:
from datasets import Dataset

# Load the test dataset
test_file_path = '/content/drive/MyDrive/P2/T1/Dataset/combined/LLM-c-test.csv'
test_data = pd.read_csv(test_file_path)

# Add prompt column (if not already present)
test_data['prompt'] = test_data.apply(
    lambda row: f"Symptoms: {row['symptoms']}\nPredict the disease:",
    axis=1
)

# Convert to Hugging Face Dataset
test_dataset = Dataset.from_pandas(test_data)

In [47]:
import re

def predict_disease(prompt):
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256).to("cuda")
    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        predicted_class_id = torch.argmax(logits, dim=-1).item()
        return id_to_label[predicted_class_id]  # Map numeric index back to text label

# Test Predictions
test_data['predicted_disease'] = test_data['prompt'].apply(predict_disease)

# Print Predictions
for index, row in test_data.iterrows():
    print(f"Prompt: {row['prompt']}")
    print(f"Predicted Disease: {row['predicted_disease']}")


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Predict the disease:
Predicted Disease: infection
Prompt: Symptoms: Patient shows symptoms as follows: have stiffness; have polydypsia; have numbness of hand.
Predict the disease:
Predicted Disease: degenerative polyarthritis
Prompt: Symptoms: Patient shows symptoms as follows: have hypothermia, natural; have unconscious state; have clammy skin; have claudication; have qt interval prolonged.
Predict the disease:
Predicted Disease: hypoglycemia
Prompt: Symptoms: Patient shows symptoms as follows: have palpitation; have nausea; have cushingoid habitus; have diarrhea.
Predict the disease:
Predicted Disease: biliary calculus
Prompt: Symptoms: Patient shows symptoms as follows: have pain chest; have nervousness; have apyrexial; have hemiplegia; have gurgle.
Predict the disease:
Predicted Disease: ulcer peptic
Prompt: Symptoms: Patient shows symptoms as follows: have yellow sputum; have inappropriate affect; have abdominal tend

# Save fine-tuned Model

In [48]:
"""
# Save the fine-tuned model
model.save_pretrained('/content/drive/MyDrive/P2/LLM/fine_tuned_biogpt')
tokenizer.save_pretrained('/content/drive/MyDrive/P2/LLM/fine_tuned_biogpt')
print("Fine-tuned BioGPT model and tokenizer saved successfully!")
"""

'\n# Save the fine-tuned model\nmodel.save_pretrained(\'/content/drive/MyDrive/P2/LLM/fine_tuned_biogpt\')\ntokenizer.save_pretrained(\'/content/drive/MyDrive/P2/LLM/fine_tuned_biogpt\')\nprint("Fine-tuned BioGPT model and tokenizer saved successfully!")\n'

# Generate Classification Report

In [49]:
from sklearn.metrics import classification_report
import pandas as pd

# Assume `test_data` is your test dataset after generating predictions
# The test_data dataframe should already have columns: 'Label' (true labels) and 'Predicted' (model predictions)

# True labels and predicted labels
true_labels = test_data['disease_label']
predicted_labels = test_data['predicted_disease']

# Generate classification report
report = classification_report(true_labels, predicted_labels, output_dict=False)
print(report)

# Save classification report to a file
report_dict = classification_report(true_labels, predicted_labels, output_dict=True)
pd.DataFrame(report_dict).transpose().to_csv('/content/drive/MyDrive/P2/T1/Dataset/combined/LLM/BioBERT-c-class.csv', index=True)
print("Classification report saved to '/content/drive/MyDrive/P2/T1/Dataset/combined/LLM/BioBERT-c-class.csv'")


                                    precision    recall  f1-score   support

                    adenocarcinoma       0.83      0.95      0.88       183
                          adhesion       0.96      0.85      0.90       176
                     affect labile       0.79      0.84      0.81        83
                         arthritis       0.96      0.82      0.88       139
                            asthma       0.87      0.80      0.84        95
      benign prostatic hypertrophy       0.98      0.71      0.83        83
                  biliary calculus       0.90      0.65      0.76       242
                  carcinoma breast       0.94      0.94      0.94       178
chronic obstructive airway disease       0.68      0.95      0.79       159
                         cirrhosis       0.93      0.84      0.88       194
                           colitis       0.63      0.88      0.74       206
         coronary arteriosclerosis       0.71      0.68      0.70       218
           

In [50]:
"""
output_path = '/content/drive/MyDrive/P2/LLM/2BioBERT_Predictions.csv'
test_data.to_csv(output_path, index=False)
print(f"Predictions saved to: {output_path}")
"""

'\noutput_path = \'/content/drive/MyDrive/P2/LLM/2BioBERT_Predictions.csv\'\ntest_data.to_csv(output_path, index=False)\nprint(f"Predictions saved to: {output_path}")\n'