In [101]:
!pip install -U scipy
!pip install scikit-plot
!pip install kagglehub
!pip install xgboost



In [102]:
from collections import Counter

import kagglehub
import numpy as np
import pandas as pd
from imblearn.combine import SMOTEENN
from imblearn.over_sampling import SMOTE, ADASYN
from scipy.stats import uniform, randint
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.utils.class_weight import compute_class_weight
from xgboost import XGBClassifier

In [103]:
# Fetch the dataset via kagglehub; the returned location is used as the folder
# that contains the CSV file.
path = kagglehub.dataset_download("jainaru/thyroid-disease-data")
path = "{}/Thyroid_Diff.csv".format(path)

# Load the CSV into a DataFrame and shuffle all rows (frac=1) with a fixed seed
# so the row order is reproducible.
data = pd.read_csv(path).sample(frac=1, random_state=1)

# Preview the first few rows of 'data'
data.head()

Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
189,42,F,No,No,No,Euthyroid,Multinodular goiter,No,Papillary,Uni-Focal,Low,T2,N0,M0,I,Excellent,No
381,61,M,Yes,Yes,Yes,Clinical Hyperthyroidism,Multinodular goiter,Extensive,Hurthel cell,Multi-Focal,High,T4b,N1b,M0,IVA,Structural Incomplete,Yes
120,37,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Papillary,Uni-Focal,Low,T2,N0,M0,I,Excellent,No
207,17,F,No,No,No,Euthyroid,Multinodular goiter,Right,Papillary,Uni-Focal,Intermediate,T2,N1b,M0,I,Indeterminate,No
321,62,F,No,No,No,Euthyroid,Multinodular goiter,Right,Papillary,Multi-Focal,Intermediate,T3a,N1b,M0,II,Structural Incomplete,Yes


- **Age**: The age of the patient at the time of diagnosis or treatment.
- **Gender**: The gender of the patient (male or female).
- **Smoking**: Whether the patient is a smoker or not.
- **Hx Smoking**: Smoking history of the patient (e.g., whether they have ever smoked).
- **Hx Radiotherapy**: History of radiotherapy treatment for any condition.
- **Thyroid Function**: The status of thyroid function, possibly indicating if there are any abnormalities.
- **Physical Examination**: Findings from a physical examination of the patient, which may include palpation of the thyroid gland and surrounding structures.
- **Adenopathy**: Presence or absence of enlarged lymph nodes (adenopathy) in the neck region.
- **Pathology**: Specific types of thyroid cancer as determined by pathology examination of biopsy samples.
- **Focality**: Whether the cancer is unifocal (limited to one location) or multifocal (present in multiple locations).
- **Risk**: The risk category of the cancer based on various factors, such as tumor size, extent of spread, and histological type.
- **T**: Target Variable; Tumor classification based on its size and extent of invasion into nearby structures.
- **N**: Nodal classification indicating the involvement of lymph nodes.
- **M**: Metastasis classification indicating the presence or absence of distant metastases.
- **Stage**: The overall stage of the cancer, typically determined by combining T, N, and M classifications.
- **Response**: Response to treatment, indicating whether the cancer responded positively, negatively, or remained stable after treatment.
- **Recurred**: Indicates whether the cancer has recurred after initial treatment.

Step 1: Prepare your data

In [104]:
# Rename the terse clinical column names to human-readable labels.
# The result is assigned back instead of using inplace=True, so the cell builds a
# new frame rather than mutating the loaded one.
# NOTE: 'Hx Radiothreapy' is misspelled in the dataset's own header, so the key
# has to keep that exact spelling.
data = data.rename(columns={
    'Hx Smoking': 'Smoking History',
    'Hx Radiothreapy': 'Radiotherapy History',
    'Pathology': 'Types of Thyroid Cancer (Pathology)',
    'T': 'Tumor',
    'N': 'Lymph Nodes',
    'M': 'Cancer Metastasis',
    'Response': 'Treatment Response',
})

In [105]:
# Separate the target from the predictors.
# 'Tumor' (the renamed T column) is the label to predict, so it is removed from X.
y = data['Tumor']
X = data.drop(columns='Tumor')

In [106]:
# One-hot encode the categorical predictors listed below.
# drop_first=True omits the first level of each feature, so a feature with k
# levels becomes k-1 indicator columns.
categorical_columns = [
    'Gender',
    'Smoking',
    'Smoking History',
    'Radiotherapy History',
    'Thyroid Function',
    'Physical Examination',
    'Adenopathy',
    'Types of Thyroid Cancer (Pathology)',
    'Focality',
    'Risk',
    'Lymph Nodes',
    'Cancer Metastasis',
    'Stage',
    'Treatment Response',
    'Recurred',
]
X = pd.get_dummies(X, columns=categorical_columns, drop_first=True)

In [107]:
# Turn the tumor class strings into integer ids.
# The fitted encoder is kept so `encoder.classes_` can map ids back to names later.
encoder = LabelEncoder()
encoder.fit(data['Tumor'])
y = encoder.transform(data['Tumor'])

In [108]:
# Hold out 20% of the rows as the test set.
# stratify=y keeps the class proportions of the target (the tumor class) the same
# in the training and test splits; random_state fixes the split for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

<h2>Resampling</h2>

In [46]:
# Baseline without resampling: the "*_resampled" names simply point at the
# untouched training split so the downstream cells can run unchanged.
X_train_resampled, y_train_resampled = X_train, y_train
print("Class distribution without resampling:", Counter(y_train_resampled))

Class distribution without resampling: Counter({2: 121, 3: 77, 0: 39, 1: 34, 5: 16, 4: 13, 6: 6})


In [65]:
# SMOTE configured with a single nearest neighbour (k_neighbors=1); it is handed
# to SMOTEENN below as its over-sampling step.
smote = SMOTE(
    k_neighbors=1,
    sampling_strategy='auto',
    random_state=42,
)

# SMOTEENN built around the customized SMOTE instance
smote_enn = SMOTEENN(random_state=42, smote=smote)

# Resample the training split only; the test split is left as-is
X_train_resampled, y_train_resampled = smote_enn.fit_resample(X_train, y_train)

# Show the class distribution produced by the resampling
print("Class distribution after SMOTEENN:", Counter(y_train_resampled))

Class distribution after SMOTEENN: Counter({6: 120, 0: 111, 4: 110, 5: 108, 1: 72, 3: 43, 2: 23})


In [47]:
# Balance the classes with SMOTE, using k_neighbors=1.
smote = SMOTE(
    k_neighbors=1,
    sampling_strategy='auto',
    random_state=42,
)

# Resample the training split ONLY. Resampling before the split would let
# synthetic samples end up in both the training and test sets and artificially
# inflate the performance metrics (data leakage).
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Optional: check the class distribution after SMOTE
print("Class distribution after SMOTE:", Counter(y_train_resampled))

Class distribution after SMOTE: Counter({4: 121, 2: 121, 3: 121, 0: 121, 1: 121, 6: 121, 5: 121})


In [109]:
# Oversample with ADASYN.
# ADASYN and Counter come from the notebook's import cell; they are no longer
# re-imported here, so all dependencies are declared in one place.

# n_neighbors is set to a small value (1) — presumably because the rarest
# classes have very few training samples; confirm before changing it.
adasyn = ADASYN(sampling_strategy='auto', random_state=42, n_neighbors=1)

# Apply ADASYN on the training set only; the test set is left untouched
X_train_resampled, y_train_resampled = adasyn.fit_resample(X_train, y_train)

# Check the new class distribution after resampling
print("Class distribution after ADASYN:", Counter(y_train_resampled))


Class distribution after ADASYN: Counter({3: 131, 1: 130, 5: 128, 4: 121, 2: 121, 6: 120, 0: 116})


<h2>Class Weights</h2>

Class weights computed after resampling: because the resampled classes are roughly balanced, the weights come out close to 1 and the model does not over-prioritize the minority classes.

In [110]:
# 'balanced' class weights computed from the labels of the resampled training set
unique_classes_resampled = np.unique(y_train_resampled)
class_weights_resampled = compute_class_weight(
    class_weight='balanced',
    classes=unique_classes_resampled,
    y=y_train_resampled,
)

# Lookup table: class id -> weight
class_weight_dict_resampled = {
    label: weight
    for label, weight in zip(unique_classes_resampled, class_weights_resampled)
}

# Print the updated class weights based on the resampled data
print("Class weights based on resampled data:", class_weight_dict_resampled)

Class weights based on resampled data: {0: 1.0677339901477831, 1: 0.9527472527472527, 2: 1.0236127508854782, 3: 0.945474372955289, 4: 1.0236127508854782, 5: 0.9676339285714286, 6: 1.0321428571428573}


In [111]:
# One weight per training row: each row receives the weight of its own class,
# looked up in the resampled-data class-weight table.
sample_weights = np.fromiter(
    (class_weight_dict_resampled[class_label] for class_label in y_train_resampled),
    dtype=float,
)

Class weights computed from the original (pre-resampling) training labels, to emphasize learning on the minority classes

In [100]:
# 'balanced' class weights computed from the ORIGINAL (pre-resampling) y_train
unique_classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=unique_classes,
    y=y_train,
)
class_weight_dict = {label: weight for label, weight in zip(unique_classes, class_weights)}

print("Class weights based on original data:", class_weight_dict)

# Per-row weights for the (resampled) training labels, using the original-data table.
# NOTE(review): this rebinds `sample_weights`; when the notebook is run top to
# bottom, this assignment replaces the weights derived from the resampled data.
# Confirm which of the two weighting schemes the model is meant to use.
sample_weights = np.fromiter(
    (class_weight_dict[class_label] for class_label in y_train_resampled),
    dtype=float,
)

Class weights based on original data: {0: 1.120879120879121, 1: 1.2857142857142858, 2: 0.3612750885478158, 3: 0.5677179962894249, 4: 3.3626373626373627, 5: 2.732142857142857, 6: 7.285714285714286}


In [112]:
# Min-max scale the features.
# The scaler learns its per-column ranges from the training data only and the
# same ranges are then applied to the test set.
# NOTE(review): scaling happens after resampling, so the neighbour-based
# resamplers above worked on unscaled features — confirm this order is intended.
scaler = MinMaxScaler().fit(X_train_resampled)
X_train_resampled = scaler.transform(X_train_resampled)
X_test = scaler.transform(X_test)

Step 2: Define the parameter distribution for tuning

In [113]:
# Search space for RandomizedSearchCV.
# Every entry is a frozen scipy.stats distribution that is sampled once per candidate:
#   uniform(loc, scale) -> floats in [loc, loc + scale]
#   randint(low, high)  -> integers in [low, high - 1]
param_dist = dict(
    max_depth=randint(2, 15),
    learning_rate=uniform(0.01, 0.3),
    n_estimators=randint(100, 300),
    gamma=uniform(0.01, 1),
    min_child_weight=randint(1, 10),
    subsample=uniform(0.5, 0.5),         # values in [0.5, 1.0]
    colsample_bytree=uniform(0.5, 0.5),  # values in [0.5, 1.0]
    reg_alpha=uniform(0.01, 1.0),
    reg_lambda=uniform(0.01, 1.0),
)

Step 3: Initialize XGBoost classifier and RandomizedSearchCV

In [114]:
# Base learner and the cross-validation splitter (5 stratified, shuffled folds;
# both are seeded for reproducibility).
xgb_clf = XGBClassifier(random_state=42)
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Randomized hyper-parameter search: 100 candidates drawn from param_dist, each
# evaluated with the stratified folds and scored by macro-averaged F1.
search_options = dict(
    param_distributions=param_dist,
    n_iter=100,
    scoring='f1_macro',
    cv=stratified_kfold,
    n_jobs=-1,
    random_state=42,
    verbose=1,
    # error_score='raise',  # enable to stop and show the error for problematic parameters
)
random_search = RandomizedSearchCV(xgb_clf, **search_options)

Step 4: Fit RandomizedSearchCV to the training data

In [115]:
# Fit the model with sample weights
# The search runs on the resampled training features/labels, with one weight per row.
# NOTE(review): the cross-validation folds are drawn from data that has already been
# resampled, so the CV score may be optimistic compared with the untouched test set —
# confirm whether resampling should instead happen inside each fold.
random_search.fit(X_train_resampled, y_train_resampled, sample_weight=sample_weights)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


Step 5: Get the best model and parameters

In [116]:
# Pull the winning candidate out of the finished search: the refit estimator,
# its hyper-parameters, and its score under the search's scoring metric.
best_xgb_clf = random_search.best_estimator_
best_params, best_score = random_search.best_params_, random_search.best_score_

print("Best Parameters:", best_params)
print("Best F1 Score:", best_score)

Best Parameters: {'colsample_bytree': 0.6886295416139879, 'gamma': 0.04896313231023363, 'learning_rate': 0.19547616530136785, 'max_depth': 8, 'min_child_weight': 1, 'n_estimators': 189, 'reg_alpha': 0.3289756302937613, 'reg_lambda': 0.8548753109694546, 'subsample': 0.511635967867913}
Best F1 Score: 0.8779467590328854


Step 6: Evaluate the best model on the test set

In [117]:
# Predict the (integer-encoded) tumor class for every row of the test features
y_pred = best_xgb_clf.predict(X_test)

In [118]:
# Print performance metrics
print('\nModel Performance:')
print('Accuracy: ', accuracy_score(y_test, y_pred))

# Integer ids of every class known to the encoder, in the same order as
# encoder.classes_. Passing them explicitly as `labels` keeps each report row and
# each confusion-matrix row/column tied to its class name even when a rare class
# is missing from both y_test and y_pred. Without `labels`, classification_report
# raises if the number of observed classes differs from len(target_names), and
# the confusion matrix silently loses that class's row and column.
class_ids = encoder.transform(encoder.classes_)

# Print the classification report and confusion matrix with class names
print("\nTumor Classification Report:")
print(classification_report(y_test, y_pred, labels=class_ids, target_names=encoder.classes_, zero_division=0))

# Rows are the true classes, columns the predicted classes (both in class_ids order)
print("Tumor Confusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=class_ids))

# Explanation of each metric in the classification report
# Precision: The proportion of true positives among all predictions for a given class. Higher precision means fewer false positives.
# Recall: The proportion of true positives out of all actual instances of a given class. Higher recall means fewer false negatives.
# F1-Score: The harmonic mean of precision and recall. It balances the two metrics.
# Support: The number of true instances for each class in the dataset.


Model Performance:
Accuracy:  0.7012987012987013

Tumor Classification Report:
              precision    recall  f1-score   support

         T1a       1.00      1.00      1.00        10
         T1b       0.67      0.22      0.33         9
          T2       0.76      0.87      0.81        30
         T3a       0.61      0.58      0.59        19
         T3b       0.20      0.33      0.25         3
         T4a       0.60      0.75      0.67         4
         T4b       0.50      0.50      0.50         2

    accuracy                           0.70        77
   macro avg       0.62      0.61      0.59        77
weighted avg       0.71      0.70      0.69        77

Tumor Confusion Matrix:
 [[10  0  0  0  0  0  0]
 [ 0  2  4  3  0  0  0]
 [ 0  0 26  3  1  0  0]
 [ 0  1  3 11  3  0  1]
 [ 0  0  1  0  1  1  0]
 [ 0  0  0  1  0  3  0]
 [ 0  0  0  0  0  1  1]]


In [98]:
# Display the count of each unique class in the 'Tumor' column (the original 'T'
# column after renaming), computed over the full dataset before the train/test split
print(data['Tumor'].value_counts())

Tumor
T2     151
T3a     96
T1a     49
T1b     43
T4a     20
T3b     16
T4b      8
Name: count, dtype: int64


In [23]:
# Show every row of the dataset that belongs to one tumor class
class_to_filter = "T4b"
filtered_data = data.loc[data['Tumor'].eq(class_to_filter)]
print(filtered_data)

     Age Gender Smoking Smoking History Radiotherapy History  \
381   61      M     Yes             Yes                  Yes   
380   72      M     Yes             Yes                   No   
379   81      M     Yes              No                  Yes   
376   40      M     Yes              No                   No   
382   67      M     Yes              No                   No   
377   46      M     Yes              No                   No   
375   59      F      No              No                   No   
378   72      M     Yes             Yes                  Yes   

             Thyroid Function         Physical Examination Adenopathy  \
381  Clinical Hyperthyroidism          Multinodular goiter  Extensive   
380                 Euthyroid          Multinodular goiter  Bilateral   
379                 Euthyroid          Multinodular goiter  Extensive   
376                 Euthyroid          Multinodular goiter  Bilateral   
382                 Euthyroid          Multinodular goiter