In [28]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import catboost as cb
import xgboost as xgb

In [2]:
df_train = pd.read_csv('../data/train.csv')
df_train.head()

Unnamed: 0,itching,skin_rash,nodal_skin_eruptions,continuous_sneezing,shivering,chills,joint_pain,stomach_pain,acidity,ulcers_on_tongue,...,blackheads,scurring,skin_peeling,silver_like_dusting,small_dents_in_nails,inflammatory_nails,blister,red_sore_around_nose,yellow_crust_ooze,prognosis
0,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
1,0,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
2,1,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
3,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection
4,1,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Fungal Infection


In [9]:
x_train_total = df_train.drop(['prognosis'], axis=1)
y_train_total = df_train['prognosis']

label_encoder = LabelEncoder()
y_train_total = label_encoder.fit_transform(y_train_total)

print(f"\nEncoded y sample: {y_train_total[:5]}")
print(f"Mapping: {dict(zip(label_encoder.classes_, label_encoder.transform(label_encoder.classes_)))}")


Encoded y sample: [14 14 14 14 14]
Mapping: {'AIDS': np.int64(0), 'Acne': np.int64(1), 'Alcoholic Hepatitis': np.int64(2), 'Allergy': np.int64(3), 'Arthritis': np.int64(4), 'Bronchial Asthma': np.int64(5), 'Cervical Spondylosis': np.int64(6), 'Chickenpox': np.int64(7), 'Chronic Cholestasis': np.int64(8), 'Common Cold': np.int64(9), 'Dengue': np.int64(10), 'Diabetes ': np.int64(11), 'Dimorphic Hemmorhoids (piles)': np.int64(12), 'Drug Reaction': np.int64(13), 'Fungal Infection': np.int64(14), 'GERD': np.int64(15), 'Gastroenteritis': np.int64(16), 'Heart Attack': np.int64(17), 'Hepatitis A': np.int64(18), 'Hepatitis B': np.int64(19), 'Hepatitis C': np.int64(20), 'Hepatitis D': np.int64(21), 'Hepatitis E': np.int64(22), 'Hypertension ': np.int64(23), 'Hyperthyroidism': np.int64(24), 'Hypoglycemia': np.int64(25), 'Hypothyroidism': np.int64(26), 'Impetigo': np.int64(27), 'Jaundice': np.int64(28), 'Malaria': np.int64(29), 'Migraine': np.int64(30), 'Osteoarthritis': np.int64(31), 'Paralysis 

In [10]:
x_train_val, x_test, y_train_val, y_test = train_test_split(x_train_total, y_train_total, test_size=0.2, random_state=42, stratify=y_train_total)

In [11]:
x_train_fit, x_val, y_train_fit, y_val = train_test_split(x_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val)

In [15]:
cb_test = cb.CatBoostClassifier(iterations=1, task_type='GPU', devices='0', verbose=0)
cb_test.fit(x_train_fit.iloc[:2], y_train_fit[:2])
catboost_task_type = 'GPU'
print("\nCatBoost GPU is available and will be used.")


CatBoost GPU is available and will be used.


Catboost

In [16]:
base_catboost = cb.CatBoostClassifier(
    iterations=700,
    learning_rate=0.05,
    depth=6,
    l2_leaf_reg=3,
    loss_function='MultiClass',
    eval_metric='Accuracy',
    random_seed=42,
    task_type=catboost_task_type,
    devices='0' if catboost_task_type == 'GPU' else None,
)

print("\nTraining CatBoost with base parameters...")
base_catboost.fit(
    x_train_fit, y_train_fit,
    eval_set=[(x_val, y_val)],
    early_stopping_rounds=50, 
    verbose=100
)


Training CatBoost with base parameters...
0:	learn: 0.1670027	test: 0.1663306	best: 0.1663306 (0)	total: 61.5ms	remaining: 43s
bestTest = 1
bestIteration = 27
Shrink model to first 28 iterations.


<catboost.core.CatBoostClassifier at 0x7f67d7d5aed0>

In [17]:
y_pred_cat_base = base_catboost.predict(x_test)
if y_pred_cat_base.ndim > 1 and y_pred_cat_base.shape[1] == 1:
    y_pred_cat_base = y_pred_cat_base.flatten()
accuracy_cat_base = accuracy_score(y_test, y_pred_cat_base)
print(f"\nBase CatBoost Test Accuracy: {accuracy_cat_base:.4f}")
print(classification_report(y_test, y_pred_cat_base, target_names=label_encoder.classes_, zero_division=0))


Base CatBoost Test Accuracy: 1.0000
                               precision    recall  f1-score   support

                         AIDS       1.00      1.00      1.00        24
                         Acne       1.00      1.00      1.00        24
          Alcoholic Hepatitis       1.00      1.00      1.00        24
                      Allergy       1.00      1.00      1.00        24
                    Arthritis       1.00      1.00      1.00        24
             Bronchial Asthma       1.00      1.00      1.00        24
         Cervical Spondylosis       1.00      1.00      1.00        24
                   Chickenpox       1.00      1.00      1.00        25
          Chronic Cholestasis       1.00      1.00      1.00        24
                  Common Cold       1.00      1.00      1.00        24
                       Dengue       1.00      1.00      1.00        25
                    Diabetes        1.00      1.00      1.00        24
Dimorphic Hemmorhoids (piles)       1.0

In [21]:
xgboost_device = 'cpu' # Default to CPU
try:
    if x_train_fit.shape[0] >= 2:
        test_x_slice = x_train_fit.iloc[:min(2, x_train_fit.shape[0])]
        test_y_slice_original = y_train_fit[:min(2, x_train_fit.shape[0])]

        num_unique_in_slice = len(np.unique(test_y_slice_original))

        if num_unique_in_slice > 0:
            temp_encoder = LabelEncoder()
            test_y_slice_reencoded = temp_encoder.fit_transform(test_y_slice_original)
            num_class_test_slice = len(temp_encoder.classes_)

            xgb_test_params = {
                'tree_method': 'hist',
                'device': 'cuda',
                'objective': 'multi:softmax',
                'num_class': num_class_test_slice
            }
            xgb_test = xgb.XGBClassifier(n_estimators=1, **xgb_test_params, random_state=42, use_label_encoder=False)
            xgb_test.fit(test_x_slice, test_y_slice_reencoded)
            xgboost_device = 'cuda'
            print("\nXGBoost GPU is available and will be used.")
        else:
            print("\nXGBoost GPU test slice has no classes or is problematic. Defaulting to CPU.")
    else:
        print("\nNot enough samples in x_train_fit for XGBoost GPU test. Defaulting to CPU.")
except Exception as e:
    print(f"\nXGBoost GPU not available or error: {e}. Falling back to CPU.")

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)



XGBoost GPU is available and will be used.


In [33]:
base_xgboost = xgb.XGBClassifier(
    n_estimators=700,
    learning_rate=0.05,
    max_depth=5,
    gamma=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    objective='multi:softmax',
    num_class=len(label_encoder.classes_),
    eval_metric='mlogloss',
    use_label_encoder=False, 
    random_state=42,
    tree_method='hist',
    device=xgboost_device,
    early_stopping_rounds=None
)

print("\nTraining XGBoost with base parameters...")
base_xgboost.fit(
    x_train_fit, y_train_fit,
    eval_set=[(x_val, y_val)],
    verbose=100
)


Training XGBoost with base parameters...


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


[0]	validation_0-mlogloss:3.01669
[100]	validation_0-mlogloss:0.03214
[200]	validation_0-mlogloss:0.01016
[300]	validation_0-mlogloss:0.00994
[400]	validation_0-mlogloss:0.00987
[500]	validation_0-mlogloss:0.00983
[600]	validation_0-mlogloss:0.00981
[699]	validation_0-mlogloss:0.00979


In [25]:
y_pred_xgb_base = base_xgboost.predict(x_test)
accuracy_xgb_base = accuracy_score(y_test, y_pred_xgb_base)
print(f"\nBase XGBoost Test Accuracy: {accuracy_xgb_base:.4f}")
print(classification_report(y_test, y_pred_xgb_base, target_names=label_encoder.classes_, zero_division=0))

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)



Base XGBoost Test Accuracy: 1.0000
                               precision    recall  f1-score   support

                         AIDS       1.00      1.00      1.00        24
                         Acne       1.00      1.00      1.00        24
          Alcoholic Hepatitis       1.00      1.00      1.00        24
                      Allergy       1.00      1.00      1.00        24
                    Arthritis       1.00      1.00      1.00        24
             Bronchial Asthma       1.00      1.00      1.00        24
         Cervical Spondylosis       1.00      1.00      1.00        24
                   Chickenpox       1.00      1.00      1.00        25
          Chronic Cholestasis       1.00      1.00      1.00        24
                  Common Cold       1.00      1.00      1.00        24
                       Dengue       1.00      1.00      1.00        25
                    Diabetes        1.00      1.00      1.00        24
Dimorphic Hemmorhoids (piles)       1.00

In [34]:
base_random_forest = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    min_samples_split=5,
    min_samples_leaf=2,
    random_state=42,
    n_jobs=-1
)

print("\nTraining RandomForest with base parameters...")
base_random_forest.fit(x_train_fit, y_train_fit)


Training RandomForest with base parameters...


In [30]:
y_pred_rf_base = base_random_forest.predict(x_test)
accuracy_rf_base = accuracy_score(y_test, y_pred_rf_base)
print(f"\nBase RandomForest Test Accuracy: {accuracy_rf_base:.4f}")
print(classification_report(y_test, y_pred_rf_base, target_names=label_encoder.classes_, zero_division=0))


Base RandomForest Test Accuracy: 1.0000
                               precision    recall  f1-score   support

                         AIDS       1.00      1.00      1.00        24
                         Acne       1.00      1.00      1.00        24
          Alcoholic Hepatitis       1.00      1.00      1.00        24
                      Allergy       1.00      1.00      1.00        24
                    Arthritis       1.00      1.00      1.00        24
             Bronchial Asthma       1.00      1.00      1.00        24
         Cervical Spondylosis       1.00      1.00      1.00        24
                   Chickenpox       1.00      1.00      1.00        25
          Chronic Cholestasis       1.00      1.00      1.00        24
                  Common Cold       1.00      1.00      1.00        24
                       Dengue       1.00      1.00      1.00        25
                    Diabetes        1.00      1.00      1.00        24
Dimorphic Hemmorhoids (piles)      

In [35]:
print("\nCreating Ensemble Classifier with base models...")

ensemble_clf_base = VotingClassifier(
    estimators=[
        ('catboost', base_catboost),
        ('xgboost', base_xgboost),
        ('randomforest', base_random_forest)
    ],
    voting='soft',
    weights=None,
    n_jobs=-1
)


Creating Ensemble Classifier with base models...


In [36]:
ensemble_clf_base.fit(x_train_val, y_train_val)

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


0:	learn: 0.1683468	total: 94.8ms	remaining: 1m 6s
1:	learn: 0.3046875	total: 134ms	remaining: 46.8s
2:	learn: 0.4226310	total: 168ms	remaining: 39s
3:	learn: 0.5393145	total: 204ms	remaining: 35.5s
4:	learn: 0.6338206	total: 248ms	remaining: 34.5s
5:	learn: 0.7046371	total: 285ms	remaining: 33s
6:	learn: 0.7530242	total: 332ms	remaining: 32.9s
7:	learn: 0.7782258	total: 376ms	remaining: 32.5s
8:	learn: 0.7804940	total: 422ms	remaining: 32.4s
9:	learn: 0.8986895	total: 473ms	remaining: 32.6s
10:	learn: 0.9690020	total: 525ms	remaining: 32.9s
11:	learn: 0.9697581	total: 568ms	remaining: 32.6s
12:	learn: 0.9745464	total: 621ms	remaining: 32.8s
13:	learn: 0.9780746	total: 666ms	remaining: 32.6s
14:	learn: 0.9780746	total: 703ms	remaining: 32.1s
15:	learn: 0.9780746	total: 761ms	remaining: 32.5s
16:	learn: 0.9818548	total: 816ms	remaining: 32.8s
17:	learn: 0.9818548	total: 864ms	remaining: 32.7s
18:	learn: 0.9818548	total: 911ms	remaining: 32.6s
19:	learn: 0.9848790	total: 956ms	remaining:

In [37]:
y_pred_ensemble = ensemble_clf_base.predict(x_test)
accuracy_ensemble = accuracy_score(y_test, y_pred_ensemble)
print("\nEnsemble Classifier (Base Models):")
print(f"Accuracy: {accuracy_ensemble:.4f}")
print(classification_report(y_test, y_pred_ensemble, target_names=label_encoder.classes_, zero_division=0))


Ensemble Classifier (Base Models):
Accuracy: 1.0000
                               precision    recall  f1-score   support

                         AIDS       1.00      1.00      1.00        24
                         Acne       1.00      1.00      1.00        24
          Alcoholic Hepatitis       1.00      1.00      1.00        24
                      Allergy       1.00      1.00      1.00        24
                    Arthritis       1.00      1.00      1.00        24
             Bronchial Asthma       1.00      1.00      1.00        24
         Cervical Spondylosis       1.00      1.00      1.00        24
                   Chickenpox       1.00      1.00      1.00        25
          Chronic Cholestasis       1.00      1.00      1.00        24
                  Common Cold       1.00      1.00      1.00        24
                       Dengue       1.00      1.00      1.00        25
                    Diabetes        1.00      1.00      1.00        24
Dimorphic Hemmorhoids (

In [39]:
import joblib

In [40]:
model_filename = 'ensemble_symptom_classifier.joblib'
joblib.dump(ensemble_clf_base, model_filename)
print(f"Ensemble model saved to {model_filename}")

Ensemble model saved to ensemble_symptom_classifier.joblib


In [41]:
label_encoder_filename = 'label_encoder.joblib'
joblib.dump(label_encoder, label_encoder_filename)
print(f"Label encoder saved to {label_encoder_filename}")

Label encoder saved to label_encoder.joblib
