# XGBoost for Ordinal Dataset with Imbalance Handling, Cross-Validation, and Optuna

In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from imblearn.over_sampling import SMOTE
from xgboost import XGBClassifier
import optuna
import numpy as np
import kagglehub
    

## Load and Prepare Data

In [2]:
path = kagglehub.dataset_download("jainaru/thyroid-disease-data")
path = f"{path}/Thyroid_Diff.csv"
data = pd.read_csv(path) #uses pandas to read the CSV file into dataframe named 'data'
data = data.sample(frac=1, random_state=1) #randomly shuffles the rows in the 'data'

# Encode target column (Ordinal Mapping)
tumor_mapping = {'T1a': 0, 'T1b': 1, 'T2': 2, 'T3a': 3, 'T3b': 4, 'T4a': 5, 'T4b': 6}
data['T_numeric'] = data['T'].map(tumor_mapping)

# Convert categorical columns to 'category' dtype
categorical_columns = ['Gender', 'Smoking', 'Hx Smoking', 'Hx Radiothreapy', 'Thyroid Function', 
                       'Physical Examination', 'Adenopathy', 'Pathology', 'Focality', 'Risk', 
                       'N', 'M', 'Stage', 'Response', 'Recurred']
for col in categorical_columns:
    data[col] = data[col].astype('category')

# Define features and target
X = data.drop(columns=['T', 'T_numeric'])  # Drop target columns from features
#X = data.drop(columns=['T', 'T_numeric', 'N', 'M', 'Stage', 'Response', 'Recurred'])  # Drop past T as well
y = data['T_numeric']




## Calculate Class Weights

In [3]:

# Compute class weights
class_weights = compute_class_weight('balanced', classes=np.unique(y), y=y)
class_weights_dict = {i: w for i, w in enumerate(class_weights)}

# Add sample weights to XGBoost
sample_weights = y.map(class_weights_dict)
    

## Define and Optimize XGBoost Using Optuna with Cross-Validation

In [4]:
# Define Optuna objective function
def objective(trial):
    params = {
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3),
        'n_estimators': trial.suggest_int('n_estimators', 50, 500),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'reg_alpha': trial.suggest_float('reg_alpha', 0, 10),
        'reg_lambda': trial.suggest_float('reg_lambda', 0, 10)
    }

    # Cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    accuracy_scores = []

    for train_idx, test_idx in skf.split(X, y):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        model = XGBClassifier(**params, random_state=42, enable_categorical=True)
        model.fit(X_train, y_train, sample_weight=sample_weights.iloc[train_idx])
        y_pred = model.predict(X_test)
        accuracy_scores.append(accuracy_score(y_test, y_pred))

    return np.mean(accuracy_scores)

# Run Optuna
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)
print("Best Params:", study.best_params)
    

[I 2024-11-17 00:38:10,256] A new study created in memory with name: no-name-f70f17aa-d115-40c5-b4db-3b9b03d1c828
[I 2024-11-17 00:38:12,313] Trial 0 finished with value: 0.34982911825017093 and parameters: {'max_depth': 4, 'learning_rate': 0.20307275271873215, 'n_estimators': 242, 'subsample': 0.6583797523136187, 'colsample_bytree': 0.5758823835728761, 'gamma': 4.653855161700843, 'reg_alpha': 4.521255739673554, 'reg_lambda': 0.38457293853952246}. Best is trial 0 with value: 0.34982911825017093.
[I 2024-11-17 00:38:17,139] Trial 1 finished with value: 0.5562884483937116 and parameters: {'max_depth': 13, 'learning_rate': 0.046498939018230184, 'n_estimators': 364, 'subsample': 0.8137861931072348, 'colsample_bytree': 0.9533671614043828, 'gamma': 0.21833178533176956, 'reg_alpha': 0.7240836518015326, 'reg_lambda': 9.574201392491169}. Best is trial 1 with value: 0.5562884483937116.
[I 2024-11-17 00:38:20,060] Trial 2 finished with value: 0.4178058783321942 and parameters: {'max_depth': 3, 'l

Best Params: {'max_depth': 10, 'learning_rate': 0.16448093530100713, 'n_estimators': 495, 'subsample': 0.8641545932371121, 'colsample_bytree': 0.7755086140866356, 'gamma': 0.02186238870915474, 'reg_alpha': 1.2526083263937413, 'reg_lambda': 3.4393688635155324}


## Train Final Model and Evaluate

In [5]:
# Train final model on the best parameters
best_params = study.best_params
final_model = XGBClassifier(**best_params, random_state=42, enable_categorical=True)
final_model.fit(X, y)

# Evaluate on the original test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)
y_pred = final_model.predict(X_test)

print("Model Performance:")
print('Accuracy: ', accuracy_score(y_test, y_pred))

print("\nTumor Classification Report:")
print(classification_report(y_test, y_pred, zero_division=0))
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Model Performance:
Accuracy:  0.935064935064935

Tumor Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      0.67      0.80         9
           2       0.88      1.00      0.94        30
           3       1.00      0.95      0.97        19
           4       1.00      1.00      1.00         3
           5       0.80      1.00      0.89         4
           6       1.00      0.50      0.67         2

    accuracy                           0.94        77
   macro avg       0.95      0.87      0.90        77
weighted avg       0.94      0.94      0.93        77

Confusion Matrix:
[[10  0  0  0  0  0  0]
 [ 0  6  3  0  0  0  0]
 [ 0  0 30  0  0  0  0]
 [ 0  0  1 18  0  0  0]
 [ 0  0  0  0  3  0  0]
 [ 0  0  0  0  0  4  0]
 [ 0  0  0  0  0  1  1]]
