In [None]:
# Mount Google Drive at /content/drive so that the dataset paths used by later
# cells (all under /content/drive/MyDrive/...) resolve.
# The `google.colab` module is provided by the Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install transformers datasets peft wandb sacremoses

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting sacremoses
  Downloading sacremoses-0.1.1-py3-none-any.whl.metadata (8.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.2.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading sacremoses-0.1.1-py3-none-any.whl (897 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
from peft import LoraConfig, get_peft_model
import pandas as pd
from sklearn.model_selection import train_test_split
import torch

In [None]:
# Check GPU availability
# Report whether PyTorch can see a CUDA device and, if so, the name of device 0.
if not torch.cuda.is_available():
    print("GPU not available. Please enable it in runtime settings.")
else:
    print("GPU is available!")
    print(f"GPU name: {torch.cuda.get_device_name(0)}")

GPU is available!
GPU name: NVIDIA A100-SXM4-40GB


In [None]:
# Release cached GPU memory held by PyTorch before the model is loaded below.
# `torch` is already imported by the imports cell; repeating the import is harmless.
import torch
torch.cuda.empty_cache()

# Load Dataset

In [None]:
# File paths (Google Drive must be mounted first)
training_set = "/content/drive/MyDrive/P2/T1/Dataset/overlapped/LLM_o_train.csv"
validation_set = "/content/drive/MyDrive/P2/T1/Dataset/overlapped/LLM_o_val.csv"
testing_set = "/content/drive/MyDrive/P2/T1/Dataset/overlapped/LLM_o_test.csv"

# Load the three splits; each CSV must provide 'symptoms' and 'disease_label' columns
train_data = pd.read_csv(training_set)
val_data = pd.read_csv(validation_set)
test_data = pd.read_csv(testing_set)

# Add the prompt column. Vectorized string concatenation replaces the row-wise
# `apply(axis=1)`: same text per row, no Python-level loop, and it also works
# on an empty frame.
for dataset in [train_data, val_data, test_data]:
    dataset['prompt'] = (
        "Symptoms: " + dataset['symptoms'].astype(str) + "\nPredict the disease:"
    )

# Class ids are assigned in order of first appearance in the TRAINING split only;
# `id_to_label` is the exact inverse of `label_to_id`.
unique_labels = train_data['disease_label'].unique()
label_to_id = {label: idx for idx, label in enumerate(unique_labels)}
id_to_label = {idx: label for label, idx in label_to_id.items()}

# Map text labels to numeric indices. A label that never occurs in the training
# split has no id and maps to NaN, so surface that instead of letting it pass
# silently into training/evaluation.
for split_name, dataset in [("train", train_data), ("validation", val_data), ("test", test_data)]:
    dataset['target'] = dataset['disease_label'].map(label_to_id)
    unseen_labels = dataset.loc[dataset['target'].isna(), 'disease_label'].unique()
    if len(unseen_labels) > 0:
        print(
            f"WARNING: {len(unseen_labels)} label(s) in the {split_name} split have no "
            f"class id because they never appear in the training split: {list(unseen_labels)}"
        )

# Convert to Hugging Face Dataset
train_dataset = Dataset.from_pandas(train_data)
val_dataset = Dataset.from_pandas(val_data)
test_dataset = Dataset.from_pandas(test_data)

print(f"Number of training samples: {len(train_dataset)}")
print(f"Number of validation samples: {len(val_dataset)}")
print(f"Number of testing samples: {len(test_dataset)}")

# Class balance of the training split, most frequent disease first
disease_counts = train_data['disease_label'].value_counts()
print("Disease Distribution in Training Data:")
print(disease_counts)


Number of training samples: 1695
Number of validation samples: 209
Number of testing samples: 218
Disease Distribution in Training Data:
disease_label
coronary arteriosclerosis             111
hypertensive disease                  108
biliary calculus                      101
hyperlipidemia                         99
pancreatitis                           84
colitis                                75
paroxysmal dyspnea                     70
pyelonephritis                         67
failure heart                          56
osteomyelitis                          48
infection                              45
malignant neoplasms                    44
chronic obstructive airway disease     43
tricuspid valve insufficiency          41
stenosis aortic valve                  39
gastritis                              33
gout                                   31
overload fluid                         29
delirium                               28
hepatitis                              27
gastroent

# Load LLM

In [None]:
# Import the required classes
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# Hub identifier of the ClinicalBERT checkpoint to fine-tune
model_name = "emilyalsentzer/Bio_ClinicalBERT"

# The tokenizer and the model are loaded from the same checkpoint name
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The classification head gets one output per distinct training label
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(unique_labels))

print("ClinicalBERT model and tokenizer loaded successfully!")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ClinicalBERT model and tokenizer loaded successfully!


# LoRA (Low-Rank Adaptation)

In [None]:
"""
# Configure LoRA for BioBERT
lora_config = LoraConfig(
    r=8,  # Low-rank adaptation rank
    lora_alpha=32,  # Scaling factor
    target_modules=["query", "value"],  # LoRA applied to attention layers
    lora_dropout=0.1,  # Dropout rate for regularization
    task_type="SEQ_CLS"  # Sequence classification task
)

model = get_peft_model(model, lora_config)
print("LoRA applied successfully to ClinicalBERT!")
"""

'\n# Configure LoRA for BioBERT\nlora_config = LoraConfig(\n    r=8,  # Low-rank adaptation rank\n    lora_alpha=32,  # Scaling factor\n    target_modules=["query", "value"],  # LoRA applied to attention layers\n    lora_dropout=0.1,  # Dropout rate for regularization\n    task_type="SEQ_CLS"  # Sequence classification task\n)\n\nmodel = get_peft_model(model, lora_config)\nprint("LoRA applied successfully to ClinicalBERT!")\n'

# Tokenization function

In [None]:
# Tokenization function for the dataset
def tokenize_function(examples, max_length=256):
    """Tokenize a batch of prompts and attach the numeric class targets.

    Args:
        examples: Batch passed by ``Dataset.map(..., batched=True)``; must
            contain a 'prompt' column (input text) and a 'target' column
            (numeric class ids).
        max_length: Every sequence is truncated/padded to this many tokens.

    Returns:
        The tokenizer output for the batch with an extra 'labels' entry
        holding the batch's 'target' values.
    """
    # No `return_tensors="pt"` here: `Dataset.map` stores the result as plain
    # columns anyway, and `set_format(type='torch')` below is what produces
    # tensors when the datasets are read.
    tokenized_inputs = tokenizer(
        examples['prompt'],  # Input text (symptoms prompt)
        truncation=True,
        padding='max_length',
        max_length=max_length,
    )
    # Add 'labels' field (numeric targets)
    tokenized_inputs["labels"] = examples["target"]
    return tokenized_inputs

# Apply tokenization to the three splits (train, validation, test, in that order)
train_dataset, val_dataset, test_dataset = [
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
]

# Expose only the model inputs, as PyTorch tensors
model_columns = ['input_ids', 'attention_mask', 'labels']
for split in (train_dataset, val_dataset, test_dataset):
    split.set_format(type='torch', columns=model_columns)


Map:   0%|          | 0/1695 [00:00<?, ? examples/s]

Map:   0%|          | 0/209 [00:00<?, ? examples/s]

Map:   0%|          | 0/218 [00:00<?, ? examples/s]

# Training Arguments

In [None]:
from transformers import TrainingArguments

# Hyper-parameters for the fine-tuning run, grouped by concern.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy`; confirm against the installed version.
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./results_clinicalbert',
    logging_dir='./logs_clinicalbert',
    logging_steps=10,

    # Optimisation
    learning_rate=5e-5,
    weight_decay=0.005,
    num_train_epochs=30,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,  # Mixed precision

    # Evaluate and checkpoint once per epoch, keeping at most two checkpoints
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,

    # After training, reload the checkpoint with the lowest evaluation loss
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',
    greater_is_better=False,
)




In [None]:
# Initialize the Trainer.
# Evaluation during training uses the VALIDATION split. `training_args` enables
# `load_best_model_at_end`, so the evaluation split decides which checkpoint is
# kept; evaluating on the test split here would leak it into model selection
# and bias the final test metrics.
# NOTE(review): recent transformers releases warn that `tokenizer=` is
# deprecated in favour of `processing_class=`; switch once the installed
# version is confirmed to support it.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [None]:
# Train the model.
# Runs the fine-tuning loop configured by `training_args`; as the last
# expression in the cell, the returned TrainOutput is displayed.
trainer.train()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

 ··········


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,3.1349,2.846174
2,2.5819,2.287
3,2.2131,2.073688
4,1.9763,1.978789
5,1.7385,1.869311
6,1.6583,1.819446
7,1.7124,1.765669
8,1.7268,1.664358
9,1.6026,1.646746
10,1.5667,1.628275


TrainOutput(global_step=3180, training_loss=1.5289451674095489, metrics={'train_runtime': 270.2047, 'train_samples_per_second': 188.191, 'train_steps_per_second': 11.769, 'total_flos': 6692361490022400.0, 'train_loss': 1.5289451674095489, 'epoch': 30.0})

# Training and Validation Loss Graph

In [None]:
"""
import matplotlib.pyplot as plt

# Data from the table
epochs = [1, 2]
training_loss = [2.1982, 2.4477, 1.8128, 1.3208, 1.1841, 1.1038]
validation_loss = [1.836972, 2.2917, 1.5439, 1.1537, 0.9580, 0.9067]

# Plot the graph
plt.figure(figsize=(10, 6))
plt.plot(epochs, training_loss, label="Training Loss", marker="o")
plt.plot(epochs, validation_loss, label="Validation Loss", marker="o")

# Add labels, title, and legend
plt.xlabel("Epoch")
plt.ylabel("Loss")
plt.title("Training and Validation Loss Over Epochs")
plt.legend()
plt.grid(True)

# Show the plot
plt.show()
"""

'\nimport matplotlib.pyplot as plt\n\n# Data from the table\nepochs = [1, 2]\ntraining_loss = [2.1982, 2.4477, 1.8128, 1.3208, 1.1841, 1.1038]\nvalidation_loss = [1.836972, 2.2917, 1.5439, 1.1537, 0.9580, 0.9067]\n\n# Plot the graph\nplt.figure(figsize=(10, 6))\nplt.plot(epochs, training_loss, label="Training Loss", marker="o")\nplt.plot(epochs, validation_loss, label="Validation Loss", marker="o")\n\n# Add labels, title, and legend\nplt.xlabel("Epoch")\nplt.ylabel("Loss")\nplt.title("Training and Validation Loss Over Epochs")\nplt.legend()\nplt.grid(True)\n\n# Show the plot\nplt.show()\n'

In [None]:
"""
model.save_pretrained('/content/drive/MyDrive/P2/LLM/fine_tuned_biobert')
tokenizer.save_pretrained('/content/drive/MyDrive/P2/LLM/fine_tuned_biobert')
print("Fine-tuned BioBERT model and tokenizer saved successfully!")
"""

'\nmodel.save_pretrained(\'/content/drive/MyDrive/P2/LLM/fine_tuned_biobert\')\ntokenizer.save_pretrained(\'/content/drive/MyDrive/P2/LLM/fine_tuned_biobert\')\nprint("Fine-tuned BioBERT model and tokenizer saved successfully!")\n'

# Evaluate the fine-tuned model directly

In [None]:
from datasets import Dataset

# Reload the raw test split from disk so that `test_data` is a fresh DataFrame
# for the prediction and reporting cells below.
test_file_path = '/content/drive/MyDrive/P2/T1/Dataset/overlapped/LLM_o_test.csv'
test_data = pd.read_csv(test_file_path)

# Rebuild the prompt column with the same template used for training.
# Vectorized concatenation replaces the row-wise `apply(axis=1)`: same text per
# row, no Python-level loop, and it also works on an empty frame.
test_data['prompt'] = (
    "Symptoms: " + test_data['symptoms'].astype(str) + "\nPredict the disease:"
)

# Convert to Hugging Face Dataset
test_dataset = Dataset.from_pandas(test_data)

In [None]:
import re

def predict_disease(prompt):
    """Return the predicted disease label (text) for a single prompt.

    Args:
        prompt: Prompt string in the same format as the 'prompt' column.

    Returns:
        The entry of ``id_to_label`` for the class with the highest logit.
    """
    # Make sure dropout is disabled, whatever mode training left the model in.
    model.eval()
    # Put the inputs on the model's own device rather than a hard-coded "cuda",
    # so the function also works when the model lives on the CPU.
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=256).to(model.device)
    with torch.no_grad():
        logits = model(**inputs).logits
    predicted_class_id = torch.argmax(logits, dim=-1).item()
    return id_to_label[predicted_class_id]  # Map numeric index back to text label

# Test Predictions (one forward pass per row)
test_data['predicted_disease'] = test_data['prompt'].apply(predict_disease)

# Print Predictions
for index, row in test_data.iterrows():
    print(f"Prompt: {row['prompt']}")
    print(f"Predicted Disease: {row['predicted_disease']}")


Prompt: Symptoms: Patient shows symptoms as follows: have abscess bacterial; have fever; have apyrexial.
Predict the disease:
Predicted Disease: osteomyelitis
Prompt: Symptoms: Patient shows symptoms as follows: have abscess bacterial; have fever; have apyrexial.
Predict the disease:
Predicted Disease: osteomyelitis
Prompt: Symptoms: Patient shows symptoms as follows: have angina pectoris; have sweating increased.
Predict the disease:
Predicted Disease: coronary arteriosclerosis
Prompt: Symptoms: Patient shows symptoms as follows: have angina pectoris; have sweating increased.
Predict the disease:
Predicted Disease: coronary arteriosclerosis
Prompt: Symptoms: Patient shows symptoms as follows: have angina pectoris; have sweating increased.
Predict the disease:
Predicted Disease: coronary arteriosclerosis
Prompt: Symptoms: Patient shows symptoms as follows: have apyrexial; have cough.
Predict the disease:
Predicted Disease: neutropenia
Prompt: Symptoms: Patient shows symptoms as follows

# Save fine-tuned Model

In [None]:
"""
# Save the fine-tuned model
model.save_pretrained('/content/drive/MyDrive/P2/LLM/fine_tuned_biogpt')
tokenizer.save_pretrained('/content/drive/MyDrive/P2/LLM/fine_tuned_biogpt')
print("Fine-tuned BioGPT model and tokenizer saved successfully!")
"""

# Generate Classification Report

In [None]:
from sklearn.metrics import classification_report
import pandas as pd

# `test_data` must already contain the ground-truth column 'disease_label' and
# the 'predicted_disease' column filled in by the prediction cell.
true_labels = test_data['disease_label']
predicted_labels = test_data['predicted_disease']

# Classes that are never predicted have undefined precision. `zero_division=0`
# scores them as 0 without emitting a warning per metric.
report = classification_report(true_labels, predicted_labels, output_dict=False, zero_division=0)
print(report)

# Save classification report to a file. The path is held in one variable so the
# confirmation message always names the file that was actually written.
# NOTE(review): the directory and file name say "BioBERT", while this notebook
# fine-tunes Bio_ClinicalBERT; confirm this is the intended destination.
report_path = '/content/drive/MyDrive/P2/T1/LLM/BioBERT/BioBERTclassification_report.csv'
report_dict = classification_report(true_labels, predicted_labels, output_dict=True, zero_division=0)
pd.DataFrame(report_dict).transpose().to_csv(report_path, index=True)
print(f"Classification report saved to '{report_path}'")


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


                                    precision    recall  f1-score   support

                    adenocarcinoma       0.00      0.00      0.00         3
                          adhesion       0.00      0.00      0.00         2
                     affect labile       0.00      0.00      0.00         1
                         arthritis       0.00      0.00      0.00         2
                            asthma       0.50      0.33      0.40         3
      benign prostatic hypertrophy       0.00      0.00      0.00         1
                  biliary calculus       0.40      0.17      0.24        12
                  carcinoma breast       0.50      0.50      0.50         2
chronic obstructive airway disease       0.44      0.80      0.57         5
                         cirrhosis       0.40      0.50      0.44         4
                           colitis       0.25      0.14      0.18         7
         coronary arteriosclerosis       0.31      0.29      0.30        14
           