In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split, RandomizedSearchCV, cross_val_score
from sklearn.preprocessing import OneHotEncoder, KBinsDiscretizer, LabelEncoder
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# 1. Load data
df = pd.read_csv("dataset.csv")

# 2. Encode the target variable
target_col = "Outcome Variable"
label_encoder = LabelEncoder()
df[target_col] = label_encoder.fit_transform(df[target_col])  # disease names → numbers

# 3. Split features and target
X = df.drop(columns=[target_col])
y = df[target_col]

# 4. Detect feature types
categorical_cols = X.select_dtypes(include=["object"]).columns.tolist()
numerical_cols = X.select_dtypes(include=["int64", "float64"]).columns.tolist()

# Remove 'Age' from numerical_cols since we will bin it separately
if "Age" in numerical_cols:
    numerical_cols.remove("Age")

# 5. Preprocessing: bin Age + one-hot encode categoricals
preprocessor = ColumnTransformer(transformers=[
    ("age_bin", KBinsDiscretizer(n_bins=3, encode="onehot-dense", strategy="uniform"), ["Age"]),
    ("cat", OneHotEncoder(handle_unknown="ignore", sparse_output=False), categorical_cols),
], remainder="passthrough")  # passthrough numeric features

# 6. Build the full pipeline
model_pipeline = ImbPipeline(steps=[
    ("smote", SMOTE(random_state=42)),
    ("preprocess", preprocessor),
    ("classifier", XGBClassifier(
        use_label_encoder=False,
        eval_metric="mlogloss",
        random_state=42
    )),
])

# 7. Train-test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# 8. Hyperparameter tuning
param_dist = {
    "classifier__n_estimators": [100, 200],
    "classifier__max_depth": [5, 7],
    "classifier__learning_rate": [0.05, 0.1],
    "classifier__subsample": [0.8, 1.0],
}

search = RandomizedSearchCV(
    model_pipeline,
    param_distributions=param_dist,
    n_iter=5,
    cv=3,
    scoring="accuracy",
    random_state=42,
    n_jobs=-1
)

# 9. Train the model
search.fit(X_train, y_train)
best_model = search.best_estimator_

# 10. Evaluate
y_pred = best_model.predict(X_test)

print("\n✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\n📄 Classification Report:\n", classification_report(y_test, y_pred, target_names=label_encoder.classes_))
print("\n🔲 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

# 11. Optional: cross-validation
cv_scores = cross_val_score(best_model, X, y, cv=5, scoring="accuracy", n_jobs=-1)
print(f"\n🔄 5-Fold CV Accuracy: {cv_scores.mean()*100:.2f}%")


ValueError: 
All the 15 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
5 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\imblearn\pipeline.py", line 329, in fit
    Xt, yt = self._fit(X, y, routed_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\imblearn\pipeline.py", line 265, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
                               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\imblearn\pipeline.py", line 1057, in _fit_resample_one
    X_res, y_res = sampler.fit_resample(X, y, **params.get("fit_resample", {}))
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\imblearn\base.py", line 208, in fit_resample
    return super().fit_resample(X, y)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\imblearn\base.py", line 106, in fit_resample
    X, y, binarize_y = self._check_X_y(X, y)
                       ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\imblearn\base.py", line 161, in _check_X_y
    X, y = self._validate_data(X, y, reset=True, accept_sparse=accept_sparse)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1301, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1012, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\sklearn\utils\_array_api.py", line 751, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'Myocardial Infarction (Heart...'

--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\imblearn\pipeline.py", line 329, in fit
    Xt, yt = self._fit(X, y, routed_params)
             ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\imblearn\pipeline.py", line 265, in _fit
    X, y, fitted_transformer = fit_resample_one_cached(
                               ^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\joblib\memory.py", line 312, in __call__
    return self.func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\imblearn\pipeline.py", line 1057, in _fit_resample_one
    X_res, y_res = sampler.fit_resample(X, y, **params.get("fit_resample", {}))
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\imblearn\base.py", line 208, in fit_resample
    return super().fit_resample(X, y)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\imblearn\base.py", line 106, in fit_resample
    X, y, binarize_y = self._check_X_y(X, y)
                       ^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\imblearn\base.py", line 161, in _check_X_y
    X, y = self._validate_data(X, y, reset=True, accept_sparse=accept_sparse)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\sklearn\base.py", line 650, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1301, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\sklearn\utils\validation.py", line 1012, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\sklearn\utils\_array_api.py", line 751, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\Ansh Thakkar\anaconda3\Lib\site-packages\pandas\core\generic.py", line 2153, in __array__
    arr = np.asarray(values, dtype=dtype)
          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: 'Prader-Willi Syndrome'


In [7]:
df['Outcome Variable'].value_counts()
df.isnull().sum()


Disease                 0
Fever                   0
Cough                   0
Fatigue                 0
Difficulty Breathing    0
Age                     0
Gender                  0
Blood Pressure          0
Cholesterol Level       0
Outcome Variable        0
dtype: int64

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from imblearn.over_sampling import SMOTE


df = pd.read_csv("dataset.csv")


label_encoders = {}
for column in df.columns:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

X = df.drop(columns=["Outcome Variable"])
y = df["Outcome Variable"]

smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X, y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)


rf_params = {
    'n_estimators': [200, 300],
    'max_depth': [10, 20],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
grid_search = GridSearchCV(RandomForestClassifier(random_state=42), rf_params, cv=5, scoring='accuracy')
grid_search.fit(X_train, y_train)


best_rf = grid_search.best_estimator_

xgb_model = xgb.XGBClassifier(n_estimators=300, max_depth=10, learning_rate=0.05, random_state=42)
xgb_model.fit(X_train, y_train)


y_pred_rf = best_rf.predict(X_test)
y_pred_xgb = xgb_model.predict(X_test)


accuracy_rf = accuracy_score(y_test, y_pred_rf)
accuracy_xgb = accuracy_score(y_test, y_pred_xgb)

print(f"Optimized Random Forest Accuracy: {accuracy_rf * 100:.2f}%")
print(f"XGBoost Accuracy: {accuracy_xgb * 100:.2f}%")

Optimized Random Forest Accuracy: 51.11%
XGBoost Accuracy: 50.10%


In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier

# Load and preprocess dataset
df = pd.read_csv("Disease_symptom_and_patient_profile_dataset.csv").dropna()

label_encoders = {}
for column in df.columns:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

# Define feature columns
feature_columns = [
    'Fever', 'Cough', 'Fatigue', 'Difficulty Breathing',
    'Age', 'Gender', 'Blood Pressure', 'Cholesterol Level'
]

X = df[feature_columns]
y = df["Disease"]

# Train the model
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X, y)

# Input options
options = {
    'Fever': ['Yes', 'No'],
    'Cough': ['Yes', 'No'],
    'Fatigue': ['Yes', 'No'],
    'Difficulty Breathing': ['Yes', 'No'],
    'Gender': ['Male', 'Female'],
    'Blood Pressure': ['Low', 'Normal', 'High'],
    'Cholesterol Level': ['Low', 'Normal', 'High']
}

# Take user input
print("\n🔍 Enter patient details to predict disease:")
manual_input = {}

for key, choices in options.items():
    print(f"\n{key} options: {choices}")
    value = input(f"Enter {key}: ").strip().title()
    while value not in choices:
        print(f"Invalid input. Please choose from {choices}")
        value = input(f"Enter {key}: ").strip().title()
    manual_input[key] = value

while True:
    try:
        manual_input["Age"] = int(input("\nEnter Age: "))
        break
    except ValueError:
        print("Please enter a valid number.")

# Convert input to DataFrame
input_df = pd.DataFrame([manual_input])

# Encode input using saved encoders
for col in input_df.columns:
    if col in label_encoders:
        input_df[col] = label_encoders[col].transform(input_df[col])

# ✅ Ensure correct feature order
input_df = input_df[feature_columns]

# Predict
prediction = model.predict(input_df)
predicted_disease = label_encoders['Disease'].inverse_transform(prediction)

print(f"\n🩺 Predicted Disease: {predicted_disease[0]}")



🔍 Enter patient details to predict disease:

Fever options: ['Yes', 'No']

Cough options: ['Yes', 'No']

Fatigue options: ['Yes', 'No']

Difficulty Breathing options: ['Yes', 'No']

Gender options: ['Male', 'Female']
Invalid input. Please choose from ['Male', 'Female']
Invalid input. Please choose from ['Male', 'Female']
Invalid input. Please choose from ['Male', 'Female']

Blood Pressure options: ['Low', 'Normal', 'High']
Invalid input. Please choose from ['Low', 'Normal', 'High']
Invalid input. Please choose from ['Low', 'Normal', 'High']

Cholesterol Level options: ['Low', 'Normal', 'High']
Please enter a valid number.

🩺 Predicted Disease: Chronic Obstructive Pulmonary...


In [7]:
import joblib
joblib.dump(xgb_model, 'models/xgb_model.pkl')
joblib.dump(label_encoders, 'models/label_encoders.pkl')


['models/label_encoders.pkl']