In [None]:
import pandas as pd
import numpy as np
import joblib
import os

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

from sklearn.ensemble import RandomForestClassifier, StackingClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [9]:
# 1. LOAD DATA
# ======================================================
# Read the symptom training and test CSVs (paths are relative to the notebook).
train_df = pd.read_csv("../dataset/symptom/training_data.csv")
test_df = pd.read_csv("../dataset/symptom/test_data.csv")

print("Raw Train shape:", train_df.shape)
print("Raw Test shape:", test_df.shape)

# ---- Remove every column whose header starts with "Unnamed" ----
# NOTE(review): these are presumably all-NaN artefacts of the CSV export — confirm
# before relying on that; the code only checks the column name, not its contents.
for frame in (train_df, test_df):
    junk_cols = [name for name in frame.columns if name.startswith("Unnamed")]
    if not junk_cols:
        continue  # nothing to strip from this frame
    # In-place so that train_df / test_df themselves are updated.
    frame.drop(columns=junk_cols, inplace=True)

print("Clean Train shape:", train_df.shape)
print("Clean Test shape:", test_df.shape)

Raw Train shape: (4920, 134)
Raw Test shape: (42, 133)
Clean Train shape: (4920, 133)
Clean Test shape: (42, 133)


In [10]:
# 2. SPLIT FEATURES & TARGET
# ======================================================
# "prognosis" is the label column; every remaining column is used as a feature.
y_train = train_df["prognosis"]
X_train = train_df.drop(columns="prognosis")

y_test = test_df["prognosis"]
X_test = test_df.drop(columns="prognosis")

# Sanity check for NaNs: total number of missing cells in each feature frame
for label, features in (("X_train", X_train), ("X_test", X_test)):
    print(f"NaNs in {label}:", features.isna().sum().sum())

NaNs in X_train: 0
NaNs in X_test: 0


In [11]:
# 3. LABEL ENCODE DISEASE NAMES
# ======================================================
# Fit the encoder on the training labels only, then map both splits to integer ids.
# A test label that was never seen in training makes `transform` raise.
le = LabelEncoder()
le.fit(y_train)

y_train_enc = le.transform(y_train)
y_test_enc = le.transform(y_test)

disease_names = list(le.classes_)
print("\nDiseases:", disease_names)
print("Total diseases:", len(disease_names))


Diseases: ['(vertigo) Paroymsal  Positional Vertigo', 'AIDS', 'Acne', 'Alcoholic hepatitis', 'Allergy', 'Arthritis', 'Bronchial Asthma', 'Cervical spondylosis', 'Chicken pox', 'Chronic cholestasis', 'Common Cold', 'Dengue', 'Diabetes ', 'Dimorphic hemmorhoids(piles)', 'Drug Reaction', 'Fungal infection', 'GERD', 'Gastroenteritis', 'Heart attack', 'Hepatitis B', 'Hepatitis C', 'Hepatitis D', 'Hepatitis E', 'Hypertension ', 'Hyperthyroidism', 'Hypoglycemia', 'Hypothyroidism', 'Impetigo', 'Jaundice', 'Malaria', 'Migraine', 'Osteoarthristis', 'Paralysis (brain hemorrhage)', 'Peptic ulcer diseae', 'Pneumonia', 'Psoriasis', 'Tuberculosis', 'Typhoid', 'Urinary tract infection', 'Varicose veins', 'hepatitis A']
Total diseases: 41


In [13]:
# 4. DEFINE BASE MODELS
# ======================================================
# Three candidate classifiers. Each one is evaluated on its own later and is also
# used as a base learner of the stacking ensemble. All of them are seeded with 42
# so that a Restart & Run All reproduces the same fitted models.

# Random forest with 500 trees.
rf_model = RandomForestClassifier(
    n_estimators=500,
    random_state=42
)

# Gradient-boosted trees (histogram tree method), scored with multi-class log loss.
xgb_model = XGBClassifier(
    n_estimators=500,
    learning_rate=0.05,
    max_depth=7,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric="mlogloss",
    tree_method="hist",
    random_state=42
)

# RBF-kernel SVM. probability=True is required because the stacking model calls
# predict_proba on its base learners. The probability estimates are produced by an
# internal, randomly shuffled cross-validation, so random_state must be fixed here
# too — without it the SVM's probabilities (and hence the stack) vary between runs.
svm_model = SVC(
    probability=True,
    kernel="rbf",
    C=2,
    gamma="scale",
    random_state=42
)

In [14]:
# 5. STACKING MODEL
# ======================================================
# Base learners of the ensemble, each tagged with a short name.
estimators = [("rf", rf_model), ("xgb", xgb_model), ("svm", svm_model)]

# Logistic regression combines the base learners' class probabilities.
meta_learner = LogisticRegression(max_iter=500)

stack_model = StackingClassifier(
    estimators=estimators,
    final_estimator=meta_learner,
    stack_method="predict_proba",  # feed probabilities, not hard labels, to the meta-learner
    passthrough=False,             # meta-learner does not see the raw symptom columns
    cv=5,
    n_jobs=-1,
)

# Every candidate that gets trained and compared, keyed by display name.
# Keyword order is the evaluation order.
models = dict(
    RandomForest=rf_model,
    XGBoost=xgb_model,
    SVM=svm_model,
    Stacking=stack_model,
)

In [15]:
# 6. TRAIN + EVALUATE ALL MODELS
# ======================================================
# Fit each candidate on the training split, score it on the test split, and
# record its accuracy in `results` (model name -> accuracy).
results = {}

# Always report against the full set of encoded class ids. Without an explicit
# `labels=` argument, classification_report raises as soon as one disease is
# missing from both y_test and the predictions (target_names would no longer
# match the number of classes found), and the confusion matrix silently shrinks.
all_labels = np.arange(len(le.classes_))

for name, clf in models.items():
    print(f"\n===== Training {name} =====")
    clf.fit(X_train, y_train_enc)
    preds = clf.predict(X_test)

    acc = accuracy_score(y_test_enc, preds)
    results[name] = acc

    print("Accuracy:", acc)
    print("Confusion matrix:\n",
          confusion_matrix(y_test_enc, preds, labels=all_labels))
    print("Classification report:\n",
          classification_report(y_test_enc, preds,
                                labels=all_labels,
                                target_names=le.classes_))


===== Training RandomForest =====
Accuracy: 0.9761904761904762
Confusion matrix:
 [[1 0 0 ... 0 0 0]
 [0 1 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]
 ...
 [0 0 0 ... 1 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 1]]
Classification report:
                                          precision    recall  f1-score   support

(vertigo) Paroymsal  Positional Vertigo       1.00      1.00      1.00         1
                                   AIDS       1.00      1.00      1.00         1
                                   Acne       1.00      1.00      1.00         1
                    Alcoholic hepatitis       1.00      1.00      1.00         1
                                Allergy       1.00      1.00      1.00         1
                              Arthritis       1.00      1.00      1.00         1
                       Bronchial Asthma       1.00      1.00      1.00         1
                   Cervical spondylosis       1.00      1.00      1.00         1
                            Chicken pox    

In [16]:
# 7. PICK FINAL MODEL
# ======================================================

# Option 1: take the entry of `results` with the highest test accuracy.
# On a tie the model that was evaluated first wins. In the recorded run this
# selected the SVM.
best_name, best_acc = max(results.items(), key=lambda item: item[1])
best_model = models[best_name]

print("\nBest model (by accuracy):", best_name)
print("Best accuracy:", best_acc)


Best model (by accuracy): SVM
Best accuracy: 1.0


In [None]:
# 8. SAVE ARTIFACTS
# ======================================================
# Bundle the selected model with the metadata needed to use it later:
#   "model"         - the selected fitted classifier
#   "symptom"       - feature column names, in training order
#   "label_encoder" - the fitted encoder, to map predicted ids back to disease names
artifacts = {
    "model": best_model,
    "symptom": list(X_train.columns),
    "label_encoder": le
}

# One source of truth for the output location, so the directory that is created,
# the file that is written and the message that is printed cannot drift apart.
save_dir = "../saved_models"
save_path = f"{save_dir}/symptom_model.pkl"

# os.makedirs (not os.makedir, which does not exist) creates the directory and,
# with exist_ok=True, is a no-op when it is already there.
os.makedirs(save_dir, exist_ok=True)
joblib.dump(artifacts, save_path)

print(f"\nSaved → {save_path}")


Saved → ../saved_models/symptom_model.pkl
