In [1]:
import torch
torch.cuda.is_available(), torch.cuda.get_device_name(0)


(True, 'NVIDIA GeForce RTX 4050 Laptop GPU')

In [2]:
import pandas as pd
import numpy as np
import os
import torch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

from transformers import (
    DistilBertTokenizerFast,
    DistilBertForSequenceClassification,
    Trainer,
    TrainingArguments
)
from transformers import IntervalStrategy

In [3]:
# 1. LOAD BASE DATA
# =========================
df = pd.read_csv("../dataset/symptom/training_data.csv")
print("Shape:", df.shape)
print(df.head())

# symptom columns = all except 'prognosis'
symptom_cols = [c for c in df.columns if c != "prognosis"]

Shape: (4920, 134)
   itching  skin_rash  nodal_skin_eruptions  continuous_sneezing  shivering  \
0        1          1                     1                    0          0   
1        0          1                     1                    0          0   
2        1          0                     1                    0          0   
3        1          1                     0                    0          0   
4        1          1                     1                    0          0   

   chills  joint_pain  stomach_pain  acidity  ulcers_on_tongue  ...  scurring  \
0       0           0             0        0                 0  ...         0   
1       0           0             0        0                 0  ...         0   
2       0           0             0        0                 0  ...         0   
3       0           0             0        0                 0  ...         0   
4       0           0             0        0                 0  ...         0   

   skin_peeling  si

In [4]:
# 2. CONVERT BINARY SYMPTOMS â†’ TEXT
# =========================

def row_to_text(row):
    active = [col.replace("_", " ").strip() for col in symptom_cols if row[col] == 1]
    if not active:
        return "Patient reports no significant symptoms."
    if len(active) == 1:
        return f"Patient reports {active[0]}."
    else:
        body = ", ".join(active[:-1])
        last = active[-1]
        return f"Patient reports {body} and {last}."

df["symptom_text"] = df.apply(row_to_text, axis=1)

print("\nExample generated texts:")
print(df[["symptom_text", "prognosis"]].head())


Example generated texts:
                                        symptom_text         prognosis
0  Patient reports itching, skin rash, nodal skin...  Fungal infection
1  Patient reports skin rash, nodal skin eruption...  Fungal infection
2  Patient reports itching, nodal skin eruptions ...  Fungal infection
3  Patient reports itching, skin rash and dischro...  Fungal infection
4  Patient reports itching, skin rash and nodal s...  Fungal infection


In [5]:
# 3. LABEL ENCODING
# =========================
le = LabelEncoder()
df["label_id"] = le.fit_transform(df["prognosis"])

num_classes = len(le.classes_)
print("\nNumber of diseases:", num_classes)
print("Classes:", list(le.classes_))


Number of diseases: 41
Classes: ['(vertigo) Paroymsal  Positional Vertigo', 'AIDS', 'Acne', 'Alcoholic hepatitis', 'Allergy', 'Arthritis', 'Bronchial Asthma', 'Cervical spondylosis', 'Chicken pox', 'Chronic cholestasis', 'Common Cold', 'Dengue', 'Diabetes ', 'Dimorphic hemmorhoids(piles)', 'Drug Reaction', 'Fungal infection', 'GERD', 'Gastroenteritis', 'Heart attack', 'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E', 'Hypertension ', 'Hyperthyroidism', 'Hypoglycemia', 'Hypothyroidism', 'Impetigo', 'Jaundice', 'Malaria', 'Migraine', 'Osteoarthristis', 'Paralysis (brain hemorrhage)', 'Peptic ulcer diseae', 'Pneumonia', 'Psoriasis', 'Tuberculosis', 'Typhoid', 'Urinary tract infection', 'Varicose veins', 'hepatitis A']


In [6]:
# 4. TRAIN/VALIDATION SPLIT
# =========================
train_df, val_df = train_test_split(
    df[["symptom_text", "label_id"]],
    test_size=0.2,
    random_state=42,
    stratify=df["label_id"]
)

print("Train size:", train_df.shape[0])
print("Val size:", val_df.shape[0])

Train size: 3936
Val size: 984


In [7]:
# 5. TOKENIZER & MODEL
# =========================
model_name = "distilbert-base-uncased"  # lighter than full BERT
tokenizer = DistilBertTokenizerFast.from_pretrained(model_name)

def tokenize_batch(texts):
    return tokenizer(
        texts.tolist(),
        padding=True,
        truncation=True,
        max_length=128
    )

train_encodings = tokenize_batch(train_df["symptom_text"])
val_encodings = tokenize_batch(val_df["symptom_text"])

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

In [8]:
# 6. TORCH DATASETS
# =========================
class SymptomsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

train_dataset = SymptomsDataset(train_encodings, train_df["label_id"].values)
val_dataset   = SymptomsDataset(val_encodings,   val_df["label_id"].values)

model = DistilBertForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_classes
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
# 7. TrainingArguments + Trainer
# =========================
from transformers import TrainingArguments, Trainer
from sklearn.metrics import accuracy_score, f1_score

# Metric function
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = logits.argmax(axis=-1)
    acc = accuracy_score(labels, preds)
    macro_f1 = f1_score(labels, preds, average="macro")
    return {"accuracy": acc, "macro_f1": macro_f1}

# GPU-optimized training settings
training_args = TrainingArguments(
    output_dir="./bert_symptom_results",
    num_train_epochs=4,                
    per_device_train_batch_size=32,
    per_device_eval_batch_size=64,
    learning_rate=5e-5,
    eval_strategy="epoch",        # <-- FIXED HERE
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    fp16=True,
    logging_steps=50,
    report_to="none"
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Macro F1
1,1.2698,0.300263,1.0,1.0
2,0.1214,0.031988,1.0,1.0
3,0.0304,0.01736,1.0,1.0
4,0.0235,0.014606,1.0,1.0


TrainOutput(global_step=492, training_loss=0.507337248422266, metrics={'train_runtime': 69.6928, 'train_samples_per_second': 225.906, 'train_steps_per_second': 7.06, 'total_flos': 252724745866752.0, 'train_loss': 0.507337248422266, 'epoch': 4.0})

In [12]:
# 8. Evaluate
# =========================

metrics = trainer.evaluate()
metrics


{'eval_loss': 0.3002631962299347,
 'eval_accuracy': 1.0,
 'eval_macro_f1': 1.0,
 'eval_runtime': 0.755,
 'eval_samples_per_second': 1303.266,
 'eval_steps_per_second': 21.191,
 'epoch': 4.0}

In [None]:
import joblib

save_dir = "../saved_mdl/bert_symptom_nlp"
os.makedir(save_dir, exist_ok=True)

# Save model + tokenizer in HuggingFace format
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

# Save label encoder
joblib.dump(
    {"label_encoder": le},
    os.path.join(save_dir, "label_encoder.pkl")
)

print("\nSaved DistilBERT symptom NLP model to:", save_dir)


Saved DistilBERT symptom NLP model to: ../saved_models/bert_symptom_nlp
