In [4]:
# Install required libraries (run once)
!pip install xgboost scikit-learn pandas numpy



In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report

from xgboost import XGBClassifier

In [None]:
# Load the dataset from a path relative to the current working directory.
# Forward slashes are used because they work on Windows, macOS and Linux,
# whereas backslashes in a normal string literal are treated as (invalid)
# escape sequences and only resolve as path separators on Windows.
df = pd.read_csv("SymptomClassifier-XAI/Classification-Model/Data/Healthcare.csv")

In [5]:
# Encode gender numerically. Values that are not keys of the mapping become
# NaN, so re-running this cell on already-encoded data turns the column to NaN.
gender_codes = {"Male": 0, "Female": 1}
df["Gender"] = df["Gender"].map(gender_codes)

In [6]:
# Collect the distinct symptom names found in the comma-separated "Symptoms"
# column (whitespace-trimmed) and keep them as an alphabetically sorted list.
all_symptoms = sorted(
    {
        token.strip()
        for entry in df["Symptoms"]
        for token in entry.split(",")
    }
)
all_symptoms


['abdominal pain',
 'anxiety',
 'appetite loss',
 'back pain',
 'blurred vision',
 'chest pain',
 'cough',
 'depression',
 'diarrhea',
 'dizziness',
 'fatigue',
 'fever',
 'headache',
 'insomnia',
 'joint pain',
 'muscle pain',
 'nausea',
 'rash',
 'runny nose',
 'shortness of breath',
 'sneezing',
 'sore throat',
 'sweating',
 'swelling',
 'tremors',
 'vomiting',
 'weight gain',
 'weight loss']

In [7]:
# One-hot encode the symptoms: one 0/1 column per known symptom, with spaces in
# the symptom name replaced by underscores.
#
# Each row's "Symptoms" string is parsed once into a set of trimmed names, and
# membership is tested against that set. A plain `symptom in text` check would
# be a substring test and could flag a symptom whose name merely appears inside
# another symptom's name.
symptom_sets = df["Symptoms"].apply(
    lambda text: {part.strip() for part in text.split(",")}
)

for symptom in all_symptoms:
    # `name=symptom` binds the current loop value to the lambda explicitly.
    df[symptom.replace(" ", "_")] = symptom_sets.apply(
        lambda present, name=symptom: int(name in present)
    )

df.head()

Unnamed: 0,Patient_ID,Age,Gender,Symptoms,Symptom_Count,Disease,abdominal_pain,anxiety,appetite_loss,back_pain,...,runny_nose,shortness_of_breath,sneezing,sore_throat,sweating,swelling,tremors,vomiting,weight_gain,weight_loss
0,1,29,0.0,"fever, back pain, shortness of breath",3,Allergy,0,0,0,1,...,0,1,0,0,0,0,0,0,0,0
1,2,76,1.0,"insomnia, back pain, weight loss",3,Thyroid Disorder,0,0,0,1,...,0,0,0,0,0,0,0,0,0,1
2,3,78,0.0,"sore throat, vomiting, diarrhea",3,Influenza,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,4,58,,"blurred vision, depression, weight loss, muscl...",4,Stroke,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,5,55,1.0,"swelling, appetite loss, nausea",3,Heart Disease,0,0,1,0,...,0,0,0,0,0,1,0,0,0,0


In [8]:
# Remove the raw symptom text (now one-hot encoded) and the patient identifier
# so that neither is passed to the model as a feature.
df = df.drop(["Symptoms", "Patient_ID"], axis="columns")
df

Unnamed: 0,Age,Gender,Symptom_Count,Disease,abdominal_pain,anxiety,appetite_loss,back_pain,blurred_vision,chest_pain,...,runny_nose,shortness_of_breath,sneezing,sore_throat,sweating,swelling,tremors,vomiting,weight_gain,weight_loss
0,29,0.0,3,Allergy,0,0,0,1,0,0,...,0,1,0,0,0,0,0,0,0,0
1,76,1.0,3,Thyroid Disorder,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
2,78,0.0,3,Influenza,0,0,0,0,0,0,...,0,0,0,1,0,0,0,1,0,0
3,58,,4,Stroke,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,1
4,55,1.0,3,Heart Disease,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
24995,42,0.0,6,Ulcer,1,1,0,0,1,0,...,1,0,0,0,0,0,0,0,0,0
24996,36,0.0,6,Common Cold,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
24997,70,,3,Anxiety,0,1,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
24998,9,,4,Obesity,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
# Fit an encoder on the disease names, then store the resulting integer class
# ids in a new "Disease_Label" column. The fitted encoder is kept so that
# predictions can be translated back to disease names later.
label_encoder = LabelEncoder()
label_encoder.fit(df["Disease"])
df["Disease_Label"] = label_encoder.transform(df["Disease"])

# Show the name -> id pairing side by side.
df.loc[:, ["Disease", "Disease_Label"]]

Unnamed: 0,Disease,Disease_Label
0,Allergy,0
1,Thyroid Disorder,27
2,Influenza,19
3,Stroke,26
4,Heart Disease,16
...,...,...
24995,Ulcer,29
24996,Common Cold,8
24997,Anxiety,2
24998,Obesity,22


In [10]:
# Target: the encoded disease id. Features: every remaining column except the
# raw and encoded target.
y = df["Disease_Label"]
X = df.drop(columns=["Disease", "Disease_Label"])

# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [11]:
# Hyperparameters for the multi-class gradient-boosted classifier, gathered in
# one place so they are easy to inspect and tweak.
xgb_params = {
    "objective": "multi:softprob",
    "eval_metric": "mlogloss",
    "n_estimators": 200,
    "max_depth": 5,
    "learning_rate": 0.05,
}
model = XGBClassifier(**xgb_params)

# Fit on the training split; the fitted estimator is the cell's output.
model.fit(X_train, y_train)

In [13]:
# Score the held-out split: overall accuracy plus per-class precision/recall,
# with rows labelled by the original disease names.
y_pred = model.predict(X_test)

test_accuracy = accuracy_score(y_test, y_pred)
report = classification_report(
    y_test,
    y_pred,
    target_names=label_encoder.classes_,
)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n")
print(report)

Accuracy: 0.0366

Classification Report:

                        precision    recall  f1-score   support

               Allergy       0.04      0.04      0.04       161
                Anemia       0.03      0.04      0.03       171
               Anxiety       0.04      0.07      0.05       168
             Arthritis       0.05      0.06      0.05       174
                Asthma       0.04      0.05      0.05       151
            Bronchitis       0.03      0.03      0.03       170
              COVID-19       0.04      0.04      0.04       170
Chronic Kidney Disease       0.02      0.02      0.02       166
           Common Cold       0.06      0.07      0.06       165
              Dementia       0.01      0.01      0.01       166
            Depression       0.05      0.05      0.05       173
            Dermatitis       0.03      0.03      0.03       173
              Diabetes       0.03      0.02      0.02       174
              Epilepsy       0.03      0.03      0.03       1

In [14]:
def predict_disease(age, gender, symptoms, top_k=3):
    """Return the most probable diseases for a single patient.

    Builds a one-row feature frame with the same columns as ``X``, runs the
    trained ``model`` on it and decodes the highest-probability classes with
    ``label_encoder``.

    Args:
        age: Patient age; stored in the ``Age`` feature as given.
        gender: ``"Male"`` or ``"Female"`` (case-insensitive, surrounding
            whitespace ignored). Any other value is encoded as NaN, which is
            what the training-time ``.map({"Male": 0, "Female": 1})`` yields
            for values outside the mapping.
        symptoms: Sized collection of symptom names, e.g.
            ``["fever", "back pain"]``. Names are matched against the symptom
            feature columns after trimming whitespace and ignoring case.
            Names with no matching column are ignored with a warning.
        top_k: Number of predictions to return. Defaults to 3.

    Returns:
        A list of ``{"Disease": name, "Confidence": percent}`` dicts ordered
        from most to least probable. ``Confidence`` is a plain Python float
        rounded to two decimals.
    """
    import warnings

    # Start from an all-zero row so the column set and order match training.
    input_data = dict.fromkeys(X.columns, 0)

    input_data["Age"] = age

    gender_codes = {"male": 0, "female": 1}
    gender_key = gender.strip().lower() if isinstance(gender, str) else None
    input_data["Gender"] = gender_codes.get(gender_key, float("nan"))

    input_data["Symptom_Count"] = len(symptoms)

    # Only genuine symptom columns may be switched on; this prevents a symptom
    # called e.g. "Age" from overwriting a non-symptom feature.
    reserved = {"Age", "Gender", "Symptom_Count"}
    symptom_columns = {
        str(col).lower(): col for col in X.columns if col not in reserved
    }

    unrecognized = []
    for symptom in symptoms:
        key = symptom.strip().lower().replace(" ", "_")
        col = symptom_columns.get(key)
        if col is None:
            unrecognized.append(symptom)
        else:
            input_data[col] = 1

    if unrecognized:
        # Best-effort: keep predicting, but do not drop input silently.
        warnings.warn(
            f"Ignoring unrecognized symptoms: {unrecognized}",
            stacklevel=2,
        )

    input_df = pd.DataFrame([input_data])

    probabilities = model.predict_proba(input_df)[0]
    # Indices of the classes with the highest probability, best first.
    top_indices = np.argsort(probabilities)[::-1][:top_k]
    labels = label_encoder.inverse_transform(top_indices)

    return [
        {
            "Disease": label,
            # float() converts the numpy scalar to a built-in float.
            "Confidence": round(float(probabilities[idx]) * 100, 2),
        }
        for label, idx in zip(labels, top_indices)
    ]


In [15]:
# Example query: a 40-year-old male reporting three symptoms.
patient = {
    "age": 40,
    "gender": "Male",
    "symptoms": ["fever", "cough", "headache"],
}
prediction = predict_disease(**patient)

prediction


[{'Disease': 'Ulcer', 'Confidence': 5.39},
 {'Disease': 'Dermatitis', 'Confidence': 4.88},
 {'Disease': "Parkinson's", 'Confidence': 4.7}]

In [16]:
import joblib

# Persist the trained classifier to the current working directory.
# NOTE(review): only `model` is written here; `label_encoder` and the feature
# column order are not saved — confirm that whatever loads this file has them.
model_path = 'xgboost_model.joblib'
joblib.dump(model, model_path)

print(f"Model saved as '{model_path}'")

Model saved as 'xgboost_model.joblib'
