In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/synthetic-medical-data/synthetic_medical_data.csv


In [3]:
file_path = "/kaggle/input/synthetic-medical-data/synthetic_medical_data.csv"  
df = pd.read_csv(file_path)

In [4]:
df = df[["Generated Description", "Predicted Disease"]]
df.rename(columns={"Generated Description": "text", "Predicted Disease": "label"}, inplace=True)

# Display a few rows
print(df.head())

                                                text                label
0  For the past few days, I have patches_in_throa...                 AIDS
1  I noticed scurring, blackheads, skin_rash, pus...                 Acne
2  I noticed yellowish_skin, abdominal_pain, dist...  Alcoholic Hepatitis
3  For the past few days, I have chills, shiverin...              Allergy
4  I have muscle_weakness, swelling_joints, painf...            Arthritis


In [5]:
from sklearn.preprocessing import LabelEncoder

# Encode disease labels
label_encoder = LabelEncoder()
df["label"] = label_encoder.fit_transform(df["label"])

import pickle

# Save the encoder
with open("label_encoder.pkl", "wb") as f:
    pickle.dump(label_encoder, f)


# Save the label mapping for later decoding
label_mapping = dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))

In [6]:
from transformers import AutoTokenizer

# Load PubMedBERT tokenizer
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract")

# Tokenize the text descriptions
def tokenize_function(examples):
    return tokenizer(examples["text"], padding="max_length", truncation=True)

# Apply tokenization
from datasets import Dataset
dataset = Dataset.from_pandas(df)
dataset = dataset.map(tokenize_function, batched=True)
dataset = dataset.train_test_split(test_size=0.2)

# Split into train and test sets
train_dataset = dataset["train"]
test_dataset = dataset["test"]


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/225k [00:00<?, ?B/s]

Map:   0%|          | 0/20000 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [7]:
from transformers import AutoModelForSequenceClassification

num_labels = len(label_mapping)  # Number of unique diseases

# Load PubMedBERT model
model = AutoModelForSequenceClassification.from_pretrained(
    "microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract",
    num_labels=num_labels
)


pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-PubMedBERT-base-uncased-abstract and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./pubmedbert_disease_prediction",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    learning_rate= 2.4432297149345087e-05,
    num_train_epochs=10,
    weight_decay=0.01,
    save_total_limit=2,
    load_best_model_at_end=True
)




In [9]:
!wandb login 2161a2b86735ea110585354d6ed05d7936b6e8ea

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


In [10]:
from transformers import Trainer, TrainingArguments

# Define the Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer
)

# Start training
trainer.train()

  trainer = Trainer(
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33mdhrupalpatidar1313[0m ([33mdhrupalpatidar1313-wappnet-systems-pvt-[0m). Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss
1,5.7618,4.450728
2,3.8484,2.776561
3,2.4705,1.623545
4,1.4651,0.868883
5,0.8207,0.447445
6,0.4588,0.241959
7,0.2715,0.146925
8,0.1801,0.101535
9,0.1344,0.080753
10,0.1157,0.074485




TrainOutput(global_step=5000, training_loss=1.5527047676086425, metrics={'train_runtime': 1685.8003, 'train_samples_per_second': 94.91, 'train_steps_per_second': 2.966, 'total_flos': 4933122841598976.0, 'train_loss': 1.5527047676086425, 'epoch': 10.0})

In [11]:
metrics = trainer.evaluate()
print(metrics)



{'eval_loss': 0.07448519021272659, 'eval_runtime': 11.0599, 'eval_samples_per_second': 361.667, 'eval_steps_per_second': 11.302, 'epoch': 10.0}


In [12]:
model.save_pretrained("fine_tuned_pubmedbert")
tokenizer.save_pretrained("fine_tuned_pubmedbert")

('fine_tuned_pubmedbert/tokenizer_config.json',
 'fine_tuned_pubmedbert/special_tokens_map.json',
 'fine_tuned_pubmedbert/vocab.txt',
 'fine_tuned_pubmedbert/added_tokens.json',
 'fine_tuned_pubmedbert/tokenizer.json')

In [13]:
from transformers import pipeline

MODEL_PATH = "/kaggle/working/fine_tuned_pubmedbert"  # Change this if your model is saved elsewhere

# Load the fine-tuned model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)

classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
text = "I have a persistent cough, fever, and chills."
result = classifier(text)
# Extract the numerical label from "LABEL_X"
predicted_label_index = int(result[0]["label"].replace("LABEL_", ""))  # Convert "LABEL_476" -> 476

# Convert back to the actual disease name using LabelEncoder
predicted_disease = label_encoder.inverse_transform([predicted_label_index])

print(predicted_disease[0])  # Output the predicted disease name


Device set to use cuda:0


pneumoconiosis


In [None]:
import optuna
import os
from transformers import TrainingArguments, Trainer

# Global variable to track the best model
best_model = None
best_tokenizer = None
best_loss = float("inf")  # Initialize best loss as infinity

def objective(trial):
    global best_model, best_tokenizer, best_loss

    # Define hyperparameter search space
    learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
    batch_size = trial.suggest_categorical("batch_size", [8, 16, 32])
    epochs = trial.suggest_int("epochs", 3, 10)

    training_args = TrainingArguments(
        output_dir="./pubmedbert_optuna",
        evaluation_strategy="epoch",
        save_strategy="epoch",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=learning_rate,
        num_train_epochs=epochs,
        weight_decay=0.01,
        save_total_limit=2,
        load_best_model_at_end=True  # ✅ Ensures best model is used
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=test_dataset,
        tokenizer=tokenizer
    )

    trainer.train()
    eval_results = trainer.evaluate()
    eval_loss = eval_results["eval_loss"]

    # ✅ Store only the best model
    if eval_loss < best_loss:
        best_loss = eval_loss
        best_model = model
        best_tokenizer = tokenizer
        print(f"✅ New best model found! Loss: {best_loss}")

    return eval_loss  # Minimize loss

# Run Optuna tuning
study = optuna.create_study(direction="minimize")
study.optimize(objective, n_trials=2)

# Get the best hyperparameters
best_params = study.best_params
print("Best Hyperparameters:", best_params)

# ✅ Save only the best model after tuning completes
best_model_path = f"best_model_lr{best_params['learning_rate']}_bs{best_params['batch_size']}_epochs{best_params['epochs']}"
os.makedirs(best_model_path, exist_ok=True)
best_model.save_pretrained(best_model_path)
best_tokenizer.save_pretrained(best_model_path)

print(f"✅ Best model saved at: {best_model_path}")


[I 2025-02-25 06:52:48,214] A new study created in memory with name: no-name-42aa48e8-83c3-4e34-85e9-86ba3b712f6c
  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  trainer = Trainer(
[34m[1mwandb[0m: Currently logged in as: [33mdhrupalpatidar1313[0m ([33mdhrupalpatidar1313-wappnet-systems-pvt-[0m). Use [1m`wandb login --relogin`[0m to force relogin




Epoch,Training Loss,Validation Loss
1,4.6329,3.649493
2,2.4294,1.699323
3,1.0761,0.654263
4,0.4373,0.247509
5,0.1989,0.114712
6,0.1162,0.070636
7,0.0901,0.059622




[I 2025-02-25 07:21:53,880] Trial 0 finished with value: 0.05962178111076355 and parameters: {'learning_rate': 2.6331868576418965e-05, 'batch_size': 8, 'epochs': 7}. Best is trial 0 with value: 0.05962178111076355.


✅ New best model found! Loss: 0.05962178111076355


  learning_rate = trial.suggest_loguniform("learning_rate", 1e-5, 5e-5)
  trainer = Trainer(


Epoch,Training Loss,Validation Loss
1,0.0662,0.026688
2,0.0303,0.015494
3,0.0194,0.011082
4,0.0145,0.009019
5,0.012,0.007954
6,0.0109,0.007615




[I 2025-02-25 07:39:18,375] Trial 1 finished with value: 0.0076147764921188354 and parameters: {'learning_rate': 2.4432297149345087e-05, 'batch_size': 16, 'epochs': 6}. Best is trial 1 with value: 0.0076147764921188354.


✅ New best model found! Loss: 0.0076147764921188354
Best Hyperparameters: {'learning_rate': 2.4432297149345087e-05, 'batch_size': 16, 'epochs': 6}
✅ Best model saved at: best_model_lr2.4432297149345087e-05_bs16_epochs6


In [14]:
!zip -r file.zip /kaggle/working

  adding: kaggle/working/ (stored 0%)
  adding: kaggle/working/logs/ (stored 0%)
  adding: kaggle/working/logs/events.out.tfevents.1740477675.a6f9d4425b2e.31.1 (deflated 25%)
  adding: kaggle/working/logs/events.out.tfevents.1740475581.a6f9d4425b2e.31.0 (deflated 75%)
  adding: kaggle/working/label_encoder.pkl (deflated 59%)
  adding: kaggle/working/.virtual_documents/ (stored 0%)
  adding: kaggle/working/wandb/ (stored 0%)
  adding: kaggle/working/wandb/debug-internal.log (deflated 75%)
  adding: kaggle/working/wandb/debug.log (deflated 71%)
  adding: kaggle/working/wandb/run-20250225_092627-yxlcqn81/ (stored 0%)
  adding: kaggle/working/wandb/run-20250225_092627-yxlcqn81/run-yxlcqn81.wandb (deflated 80%)
  adding: kaggle/working/wandb/run-20250225_092627-yxlcqn81/logs/ (stored 0%)
  adding: kaggle/working/wandb/run-20250225_092627-yxlcqn81/logs/debug-internal.log (deflated 75%)
  adding: kaggle/working/wandb/run-20250225_092627-yxlcqn81/logs/debug.log (deflated 71%)
  adding: kaggle/

In [15]:
from IPython.display import FileLink
FileLink(r'file.zip')