In [None]:
import time
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from collections import Counter
import xgboost as xgb


# Load the symptom/disease table: the "diseases" column is the target and
# every other column is used as a feature.
df = pd.read_csv("Final_Augmented_dataset_Diseases_and_Symptoms.csv")
y = df["diseases"]
X = df.drop(columns="diseases")

# First pass encoding is only used to count how many rows each class has.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Keep classes that appear at least twice (presumably so the stratified split
# below has enough samples per class -- confirm against the dataset).
class_counts = Counter(y_encoded)
valid_classes = [label for label in class_counts if class_counts[label] >= 2]
mask = np.isin(y_encoded, valid_classes)

X_filtered = X.loc[mask]
y_filtered = y.loc[mask]

# Second encoder is fitted on the surviving rows only; this is the encoder
# used later to translate predictions back to disease names.
le_filtered = LabelEncoder().fit(y_filtered)
y_final = le_filtered.transform(y_filtered)

# 80/20 split, stratified on the encoded label, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X_filtered,
    y_final,
    stratify=y_final,
    test_size=0.2,
    random_state=42,
)


In [None]:
from sklearn.metrics import accuracy_score

# NOTE(review): `model` is created and fitted in the training cell that appears
# further down the notebook, so on a fresh kernel (Restart & Run All) this cell
# raises NameError. Run it after the training cell, or move it below that cell.
y_pred = model.predict(X_test)

# Translate the integer class ids back into disease names for readable output.
predicted_diseases = le_filtered.inverse_transform(y_pred)
actual_diseases = le_filtered.inverse_transform(y_test)

# Show up to the first 10 test rows. Slicing (instead of indexing range(10))
# avoids an IndexError when the test set holds fewer than 10 rows.
for actual, predicted in zip(actual_diseases[:10], predicted_diseases[:10]):
    print(f"Actual: {actual}  |  Predicted: {predicted}")

accuracy = accuracy_score(y_test, y_pred)
print(f"\n✅ Accuracy: {accuracy:.4f} ({accuracy * 100:.2f}%)")

Actual: strep throat  |  Predicted: strep throat
Actual: otitis externa (swimmer's ear)  |  Predicted: otitis externa (swimmer's ear)
Actual: retinal detachment  |  Predicted: retinal detachment
Actual: otitis externa (swimmer's ear)  |  Predicted: otitis externa (swimmer's ear)
Actual: obstructive sleep apnea (osa)  |  Predicted: tietze syndrome
Actual: trichomonas infection  |  Predicted: trichomonas infection
Actual: trichomonas infection  |  Predicted: trichomonas infection
Actual: trigeminal neuralgia  |  Predicted: trigeminal neuralgia
Actual: dementia  |  Predicted: dementia
Actual: mumps  |  Predicted: mumps

✅ Accuracy: 0.8391 (83.91%)


[531 531 531 ... 506 506 506]


In [None]:
# Multi-class gradient-boosted trees. "multi:softprob" yields one probability
# per class, which the top-k metrics and predict_proba calls rely on.
model = xgb.XGBClassifier(
    objective="multi:softprob",
    num_class=len(np.unique(y_final)),  # number of distinct encoded labels
    eval_metric="mlogloss",
    learning_rate=0.1,
    max_depth=8,
    n_estimators=200,
    subsample=0.8,         # fraction of rows sampled per tree
    colsample_bytree=0.8,  # fraction of columns sampled per tree
    verbosity=1,
)

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; time.time() can jump if the system clock is adjusted mid-run.
start = time.perf_counter()
model.fit(X_train, y_train)
end = time.perf_counter()
print(f"✅ Training completed in {end - start:.2f} seconds.")

# Class probabilities for the held-out rows; the hard prediction is the
# column with the highest probability.
y_proba = model.predict_proba(X_test)
y_pred = np.argmax(y_proba, axis=1)


In [None]:
from sklearn.metrics import accuracy_score, top_k_accuracy_score

# The label set is passed explicitly so the columns of y_proba are matched to
# class ids 0..num_classes-1.
num_classes = model.n_classes_
all_labels = np.arange(num_classes)

# Exact-match accuracy plus "true label is among the k most probable" scores.
acc = accuracy_score(y_test, y_pred)
top3_acc, top5_acc = (
    top_k_accuracy_score(y_test, y_proba, k=k, labels=all_labels) for k in (3, 5)
)

report_lines = [
    f"\n🎯 Accuracy: {acc:.4f} ({acc * 100:.2f}%)",
    f"🎯 Top-3 Accuracy: {top3_acc:.4f}",
    f"🎯 Top-5 Accuracy: {top5_acc:.4f}",
]
print("\n".join(report_lines))


🎯 Accuracy: 0.8391 (83.91%)
🎯 Top-3 Accuracy: 0.9498
🎯 Top-5 Accuracy: 0.9742


In [None]:
def predict_disease_by_name(symptom_names, model, label_encoder, column_names, top_k=3):
    """Print the predicted disease and the most probable alternatives.

    A single-row feature vector is built with 1 in the position of every
    recognised symptom and 0 elsewhere, then passed to ``model``.

    Parameters
    ----------
    symptom_names : iterable of str
        Symptoms to switch on. Names missing from ``column_names`` are
        reported on stdout and otherwise ignored.
    model : object
        Fitted classifier exposing ``predict(X)`` and ``predict_proba(X)``.
    label_encoder : object
        Encoder exposing ``inverse_transform`` to map class ids to names.
    column_names : sequence of str
        Feature names in training column order. May be a pandas Index or a
        plain list (e.g. the list this notebook pickles for later use).
    top_k : int, default 3
        How many of the highest-probability diseases to list.

    Returns
    -------
    None
        All results are printed.

    Raises
    ------
    ValueError
        If ``top_k`` is smaller than 1.
    """
    if top_k < 1:
        raise ValueError("top_k must be at least 1")

    # A plain name -> position lookup works for both a pandas Index and a
    # list; the previous Index.get_loc call failed on the pickled list.
    column_index = {name: position for position, name in enumerate(column_names)}

    input_vector = np.zeros((1, len(column_names)))
    for symptom in symptom_names:
        position = column_index.get(symptom)
        if position is None:
            print(f"'{symptom}' not found in symptom list.")
        else:
            input_vector[0, position] = 1

    class_index = model.predict(input_vector)[0]
    predicted_disease = label_encoder.inverse_transform([class_index])[0]

    print(f"Predicted Disease: {predicted_disease}")

    # Rank classes by descending probability and keep the first top_k.
    proba = model.predict_proba(input_vector)[0]
    ranked = np.argsort(proba)[::-1][:top_k]
    ranked_names = label_encoder.inverse_transform(ranked)

    print(f"\n🔍 Top {top_k} Likely Diseases:")
    for idx, name in zip(ranked, ranked_names):
        print(f"• {name}: {proba[idx]:.4f}")


In [54]:
# Example query: three symptoms, scored with the trained model and the
# encoder fitted on the filtered labels.
predict_disease_by_name(
    symptom_names=["anxiety and nervousness", "dizziness", "fever"],
    model=model,
    label_encoder=le_filtered,
    column_names=X.columns,
)




✅ Predicted Disease: panic disorder

🔍 Top 3 Likely Diseases:
• panic disorder: 0.3023
• persistent vomiting of unknown cause: 0.2684
• spinocerebellar ataxia: 0.1694


In [58]:
# Second example query with a different symptom combination.
predict_disease_by_name(
    symptom_names=["fever", "dizziness", "shortness of breath"],
    model=model,
    label_encoder=le_filtered,
    column_names=X.columns,
)


✅ Predicted Disease: pneumoconiosis

🔍 Top 3 Likely Diseases:
• pneumoconiosis: 0.8862
• persistent vomiting of unknown cause: 0.0250
• pulmonary eosinophilia: 0.0205


In [None]:
import pickle

# Persist the three objects needed to score new inputs later: the fitted
# classifier, the label encoder, and the feature names in column order
# (stored as a plain list).
artifacts = {
    "disease_model.sav": model,
    "label_encoder.sav": le_filtered,
    "symptom_columns.sav": list(X.columns),
}

for filename, artifact in artifacts.items():
    with open(filename, "wb") as out_file:
        pickle.dump(artifact, out_file)

print("✅ All files saved for future prediction.")


✅ All files saved for future prediction.
✅ All files saved for future prediction.
