In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("disease.csv")

# Separate features and target
X = df.drop("diseases", axis=1)
y = df["diseases"]


In [2]:
print("Total unique diseases:", y.nunique())
print("\nTop 20 most common diseases:\n")
print(y.value_counts())


Total unique diseases: 773

Top 20 most common diseases:

diseases
cystitis                          1219
vulvodynia                        1218
nose disorder                     1218
complex regional pain syndrome    1217
spondylosis                       1216
                                  ... 
thalassemia                          1
huntington disease                   1
typhoid fever                        1
kaposi sarcoma                       1
turner syndrome                      1
Name: count, Length: 773, dtype: int64


In [3]:
min_support = 30  # minimum samples per disease
disease_counts = y.value_counts()
common_diseases = disease_counts[disease_counts >= min_support].index

df_filtered = df[df["diseases"].isin(common_diseases)]
print("Remaining samples:", len(df_filtered))
print("Remaining diseases:", df_filtered["diseases"].nunique())

Remaining samples: 244938
Remaining diseases: 582


In [4]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(df_filtered["diseases"])
X = df_filtered.drop("diseases", axis=1)

In [5]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

In [6]:
import xgboost as xgb

xgb_model = xgb.XGBClassifier(
    objective='multi:softprob',    # <-- change here
    num_class=len(label_encoder.classes_),
    tree_method='hist',
    max_leaves=64,
    n_estimators=200,             # raise n_estimators for stability
    max_depth=8,
    learning_rate=0.05,
    eval_metric='mlogloss',
    n_jobs=-1,
    random_state=100
)

In [7]:
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    verbose=True                  
)

[0]	validation_0-mlogloss:4.07808
[1]	validation_0-mlogloss:3.52072
[2]	validation_0-mlogloss:3.18825
[3]	validation_0-mlogloss:2.94265
[4]	validation_0-mlogloss:2.74451
[5]	validation_0-mlogloss:2.58007
[6]	validation_0-mlogloss:2.43846
[7]	validation_0-mlogloss:2.31504
[8]	validation_0-mlogloss:2.20657
[9]	validation_0-mlogloss:2.10917
[10]	validation_0-mlogloss:2.02111
[11]	validation_0-mlogloss:1.94080
[12]	validation_0-mlogloss:1.86699
[13]	validation_0-mlogloss:1.79850
[14]	validation_0-mlogloss:1.73510
[15]	validation_0-mlogloss:1.67600
[16]	validation_0-mlogloss:1.62079
[17]	validation_0-mlogloss:1.56920
[18]	validation_0-mlogloss:1.52065
[19]	validation_0-mlogloss:1.47486
[20]	validation_0-mlogloss:1.43185
[21]	validation_0-mlogloss:1.39108
[22]	validation_0-mlogloss:1.35243
[23]	validation_0-mlogloss:1.31588
[24]	validation_0-mlogloss:1.28117
[25]	validation_0-mlogloss:1.24815
[26]	validation_0-mlogloss:1.21680
[27]	validation_0-mlogloss:1.18685
[28]	validation_0-mlogloss:1.1

In [8]:
from sklearn.metrics import classification_report, accuracy_score, top_k_accuracy_score

y_pred = xgb_model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))


Accuracy: 0.8444312892953376

Classification Report:

                                                 precision    recall  f1-score   support

                      abdominal aortic aneurysm       1.00      1.00      1.00        28
                               abdominal hernia       0.99      0.93      0.96        81
                                abscess of nose       0.85      0.78      0.81        58
                         abscess of the pharynx       0.88      0.93      0.90        68
                           acanthosis nigricans       0.67      0.67      0.67         6
                                      acariasis       0.78      1.00      0.88         7
                                      achalasia       0.68      0.76      0.72        17
                                           acne       0.68      0.79      0.73        99
                              actinic keratosis       0.81      0.72      0.76       182
                            acute bronchiolitis       0

In [27]:
import numpy as np
import pandas as pd

# Define symptoms for the new patient
common_cold_symptoms = [
    'fever', 'cough', 'sore throat', 'runny nose',
    'nasal congestion', 'sneezing', 'fatigue',
    'headache', 'chills'
]

melanoma_symptoms = [
    "abnormal appearing skin",
    "skin swelling",
    "skin lesion",
    "skin growth",
    "skin moles",
    "swollen lymph nodes"
    "irregular appearing scalp",
    "change in skin mole size or color",
    "itchy eyelid"
]

panic_disorder_symptoms = [
    "anxiety and nervousness",
    "shortness of breath",
    "depressive or psychotic symptoms",
    "chest tightness",
    "palpitations",
    "irregular heartbeat",
    "breathing fast"
]


In [28]:
# Create a new data point for the patient
# Ensure the new data point has the same columns as the training data (X_train)
# Initialize with zeros, then set the symptom columns to 1
new_patient_data = pd.DataFrame(0, index=[0], columns=X_train.columns)
for symptom in melanoma_symptoms:
    if symptom in new_patient_data.columns:
        new_patient_data[symptom] = 1
    else:
        print(f"Warning: Symptom '{symptom}' not found in training data columns.")



In [29]:
# Get prediction probabilities for all classes
proba = xgb_model.predict_proba(new_patient_data)[0]  # shape: (num_classes,)

# Get class names from label encoder
class_names = label_encoder.inverse_transform(np.arange(len(proba)))

# Combine class names and probabilities into a DataFrame
results = pd.DataFrame({
    'Disease': class_names,
    'Probability': proba
})

In [30]:
# Sort by probability (descending) and display top 5
top_5 = results.sort_values(by='Probability', ascending=False).head(5)

print("\n🔝 Top 5 Predicted Diseases:")
print(top_5.to_string(index=False))


🔝 Top 5 Predicted Diseases:
      Disease  Probability
     melanoma     0.975407
         scar     0.006594
      rosacea     0.002173
skin disorder     0.002007
     glaucoma     0.001987


In [13]:
import joblib

joblib.dump(xgb_model, "illness_predictor_model.pkl")
joblib.dump(label_encoder, "label_encoder.pkl")
joblib.dump(X.columns.tolist(), "symptom_features.pkl")


['symptom_features.pkl']