In [214]:
!pip install -U scipy
!pip install scikit-plot
!pip install kagglehub
!pip install xgboost



In [2]:
import pandas as pd
import numpy as np
import kagglehub
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTEENN
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter
from scipy.stats import uniform, randint
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

In [3]:
# Download the thyroid dataset through kagglehub, then point `path` at the CSV inside
# the returned download location.
path = kagglehub.dataset_download("jainaru/thyroid-disease-data")
path = f"{path}/Thyroid_Diff.csv"

# Read the CSV into a DataFrame and shuffle all rows (frac=1); the fixed random_state
# makes the shuffled order reproducible.
data = pd.read_csv(path).sample(frac=1, random_state=1)

# Preview the first few rows of the shuffled frame.
data.head()



Unnamed: 0,Age,Gender,Smoking,Hx Smoking,Hx Radiothreapy,Thyroid Function,Physical Examination,Adenopathy,Pathology,Focality,Risk,T,N,M,Stage,Response,Recurred
189,42,F,No,No,No,Euthyroid,Multinodular goiter,No,Papillary,Uni-Focal,Low,T2,N0,M0,I,Excellent,No
381,61,M,Yes,Yes,Yes,Clinical Hyperthyroidism,Multinodular goiter,Extensive,Hurthel cell,Multi-Focal,High,T4b,N1b,M0,IVA,Structural Incomplete,Yes
120,37,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Papillary,Uni-Focal,Low,T2,N0,M0,I,Excellent,No
207,17,F,No,No,No,Euthyroid,Multinodular goiter,Right,Papillary,Uni-Focal,Intermediate,T2,N1b,M0,I,Indeterminate,No
321,62,F,No,No,No,Euthyroid,Multinodular goiter,Right,Papillary,Multi-Focal,Intermediate,T3a,N1b,M0,II,Structural Incomplete,Yes


- **Age**: The age of the patient at the time of diagnosis or treatment.
- **Gender**: The gender of the patient (male or female).
- **Smoking**: Whether the patient is a smoker or not.
- **Hx Smoking**: Smoking history of the patient (e.g., whether they have ever smoked).
- **Hx Radiotherapy** (spelled `Hx Radiothreapy` in the dataset's column header): History of radiotherapy treatment for any condition.
- **Thyroid Function**: The status of thyroid function, possibly indicating if there are any abnormalities.
- **Physical Examination**: Findings from a physical examination of the patient, which may include palpation of the thyroid gland and surrounding structures.
- **Adenopathy**: Presence or absence of enlarged lymph nodes (adenopathy) in the neck region.
- **Pathology**: Specific types of thyroid cancer as determined by pathology examination of biopsy samples.
- **Focality**: Whether the cancer is unifocal (limited to one location) or multifocal (present in multiple locations).
- **Risk**: The risk category of the cancer based on various factors, such as tumor size, extent of spread, and histological type.
- **T**: Target variable; tumor classification (T1a–T4b) based on the tumor's size and extent of invasion into nearby structures.
- **N**: Nodal classification indicating the involvement of lymph nodes.
- **M**: Metastasis classification indicating the presence or absence of distant metastases.
- **Stage**: The overall stage of the cancer, typically determined by combining T, N, and M classifications.
- **Response**: Response to treatment, indicating whether the cancer responded positively, negatively, or remained stable after treatment.
- **Recurred**: Indicates whether the cancer has recurred after initial treatment.

Step 1: Prepare your data

In [4]:
# Rename the terse clinical column codes to descriptive names.
# The result is reassigned instead of using inplace=True, so the cell does not mutate a
# frame that an earlier cell already displayed.
# NOTE: 'Hx Radiothreapy' is spelled exactly as in the dataset header; "correcting" the key
# would make the rename skip that column, because keys absent from the frame are ignored.
data = data.rename(columns={
    'Hx Smoking': 'Smoking History',
    'Hx Radiothreapy': 'Radiotherapy History',
    'Pathology': 'Types of Thyroid Cancer (Pathology)',
    'T': 'Tumor',
    'N': 'Lymph Nodes',
    'M': 'Cancer Metastasis',
    'Response': 'Treatment Response',
})

In [5]:
# Target: the tumor (T) classification column.
y = data['Tumor']

# Features: every column except the target and the remaining staging / outcome columns.
X = data.drop(columns=['Tumor', 'Lymph Nodes', 'Cancer Metastasis', 'Treatment Response', 'Stage', 'Recurred'])

In [219]:
# Preview the feature frame. The target ('Tumor') and the columns that follow it in the
# dataset (lymph nodes, metastasis, stage, treatment response, recurrence) were dropped
# because they are not used as predictors.
X.head()

Unnamed: 0,Age,Gender,Smoking,Smoking History,Radiotherapy History,Thyroid Function,Physical Examination,Adenopathy,Types of Thyroid Cancer (Pathology),Focality,Risk
189,42,F,No,No,No,Euthyroid,Multinodular goiter,No,Papillary,Uni-Focal,Low
381,61,M,Yes,Yes,Yes,Clinical Hyperthyroidism,Multinodular goiter,Extensive,Hurthel cell,Multi-Focal,High
120,37,F,No,No,No,Euthyroid,Single nodular goiter-right,No,Papillary,Uni-Focal,Low
207,17,F,No,No,No,Euthyroid,Multinodular goiter,Right,Papillary,Uni-Focal,Intermediate
321,62,F,No,No,No,Euthyroid,Multinodular goiter,Right,Papillary,Multi-Focal,Intermediate


In [6]:
# One-hot encode every categorical feature. drop_first=True omits the first level of each
# column, so a column with k levels yields k-1 dummy columns.
X = pd.get_dummies(
    X,
    columns=[
        'Gender',
        'Smoking',
        'Smoking History',
        'Radiotherapy History',
        'Thyroid Function',
        'Physical Examination',
        'Adenopathy',
        'Types of Thyroid Cancer (Pathology)',
        'Focality',
        'Risk',
    ],
    drop_first=True,
)

In [221]:
# Print the one-hot encoded feature frame to check the generated dummy columns.
print(X)

     Age  Gender_M  Smoking_Yes  Smoking History_Yes  \
189   42     False        False                False   
381   61      True         True                 True   
120   37     False        False                False   
207   17     False        False                False   
321   62     False        False                False   
..   ...       ...          ...                  ...   
203   33      True        False                False   
255   37     False        False                False   
72    31     False        False                False   
235   60     False        False                False   
37    43      True        False                False   

     Radiotherapy History_Yes  Thyroid Function_Clinical Hypothyroidism  \
189                     False                                     False   
381                      True                                     False   
120                     False                                     False   
207                     Fal

In [7]:
# Map the tumor labels to integer class ids. The fitted encoder is kept so that
# encoder.classes_ can translate the ids back to label names in the evaluation report.
encoder = LabelEncoder()
y = encoder.fit_transform(data["Tumor"])

# Show the encoded target vector.
print(y)

[2 6 2 2 3 4 0 2 3 2 2 2 4 2 2 1 6 3 5 1 3 1 2 0 3 5 2 2 2 2 0 2 3 2 3 2 3
 3 4 2 2 3 3 0 1 2 2 3 1 2 3 1 2 0 3 0 3 1 2 2 2 3 3 0 2 3 5 5 0 3 0 6 3 6
 2 5 2 4 2 2 3 3 2 1 5 2 0 3 1 5 3 3 2 2 2 3 2 2 2 0 3 2 2 3 1 0 3 2 1 3 6
 3 0 2 3 1 1 4 2 2 2 3 2 4 5 3 2 1 1 2 2 0 3 2 2 1 2 0 2 0 4 0 2 2 2 1 0 2
 2 5 2 0 3 2 2 2 6 3 2 2 3 2 0 2 3 2 5 2 0 3 2 3 0 3 3 2 2 3 2 1 5 3 0 2 2
 3 3 3 0 0 1 4 0 3 0 5 3 0 5 0 3 1 2 2 0 1 5 3 2 5 6 3 0 2 2 2 0 2 2 2 3 0
 0 2 2 1 1 2 3 2 0 3 2 2 2 2 3 2 2 4 2 5 3 2 3 2 3 2 1 1 3 4 3 1 2 2 2 2 3
 2 3 1 2 2 2 2 3 2 2 2 3 1 2 2 2 2 2 2 0 3 2 2 2 1 3 0 4 0 3 1 2 1 3 6 2 3
 0 1 3 3 3 2 2 4 3 2 1 3 5 1 2 2 3 3 3 2 2 1 4 2 3 2 2 3 4 2 2 3 2 0 2 1 0
 3 2 3 0 1 0 3 0 4 1 5 0 2 2 5 2 3 2 3 0 3 0 3 3 0 2 1 4 2 2 1 1 2 3 5 3 3
 2 3 2 1 2 2 3 2 2 3 1 2 0]


In [16]:
# Hold out 10% of the rows as the test set.
# Stratifying on y keeps the class distribution of the target (the tumor 'T' classes)
# the same in the training and test sets; the fixed seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    stratify=y,
)

In [224]:
# Balance the tumor classes with SMOTE. k_neighbors is lowered to 1 so that synthetic
# samples are built from a single nearest neighbour.
smote = SMOTE(k_neighbors=1, sampling_strategy='auto', random_state=42)

# Resample the training split only: synthetic samples leaking into the test set would
# artificially inflate the performance metrics (data leakage).
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# Optional sanity check: class counts after resampling.
print("Class distribution after SMOTE:", Counter(y_train_resampled))

Class distribution after SMOTE: Counter({2: 136, 3: 136, 1: 136, 0: 136, 6: 136, 5: 136, 4: 136})


In [225]:
# 'balanced' class weights, computed from the ORIGINAL (pre-SMOTE) training labels.
unique_classes = np.unique(y_train)
class_weights = compute_class_weight(class_weight='balanced', classes=unique_classes, y=y_train)
class_weight_dict = {label: weight for label, weight in zip(unique_classes, class_weights)}

print("Class weights based on original data:", class_weight_dict)

# One weight per row of the resampled training set, looked up by that row's class label.
sample_weights = np.fromiter(
    (class_weight_dict[label] for label in y_train_resampled),
    dtype=float,
)

Class weights based on original data: {0: 1.1168831168831168, 1: 1.26007326007326, 2: 0.36134453781512604, 3: 0.5714285714285714, 4: 3.510204081632653, 5: 2.7301587301587302, 6: 7.020408163265306}


In [226]:
# Min-max scale the features. The scaler is fitted on the resampled training data only,
# and that same fitted scaler is then applied to both splits, so no statistic from the
# test set enters the fit.
scaler = MinMaxScaler().fit(X_train_resampled)
X_train_resampled = scaler.transform(X_train_resampled)
X_test = scaler.transform(X_test)

In [18]:
# Sampling distributions for RandomizedSearchCV (one entry per XGBoost hyper-parameter).
# scipy conventions: uniform(loc, scale) covers [loc, loc + scale];
#                    randint(low, high) covers the integers low .. high - 1.
param_dist = dict(
    max_depth=randint(2, 15),
    learning_rate=uniform(loc=0.01, scale=0.3),
    n_estimators=randint(100, 300),
    gamma=uniform(loc=0.01, scale=1),
    min_child_weight=randint(1, 10),
    subsample=uniform(loc=0.5, scale=0.5),          # values in [0.5, 1.0]
    colsample_bytree=uniform(loc=0.5, scale=0.5),   # values in [0.5, 1.0]
    reg_alpha=uniform(loc=0.01, scale=1.0),
    reg_lambda=uniform(loc=0.01, scale=1.0),
)

In [19]:
# Cross-validation splitter and base estimator used by the hyper-parameter search.
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
xgb_clf = XGBClassifier(random_state=42)

# Randomized search: 100 sampled candidates, each evaluated with the stratified 5-fold
# splitter and scored by macro-averaged F1; n_jobs=-1 runs the fits in parallel.
random_search = RandomizedSearchCV(
    xgb_clf,
    param_dist,
    n_iter=100,
    scoring='f1_macro',
    cv=stratified_kfold,
    n_jobs=-1,
    random_state=42,
    verbose=1,
    # error_score='raise'  # enable to stop and show the error for problematic parameters
)

In [20]:
# Fit the search, passing one weight per resampled training row via `sample_weight`.
# NOTE(review): `sample_weights` is created in the class-weight cell above; the NameError
# recorded under this cell means that cell had not been executed in the kernel session that
# produced this output. Run the notebook top to bottom before relying on this cell.
random_search.fit(X_train_resampled, y_train_resampled, sample_weight=sample_weights)

NameError: name 'sample_weights' is not defined

In [21]:
# Fit the search on the SMOTE-resampled training data WITHOUT sample weights
# (no `sample_weight` argument is passed in this call).
random_search.fit(X_train_resampled, y_train_resampled)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [22]:
# Pull the best estimator, its cross-validated score and its hyper-parameters
# out of the finished search.
best_xgb_clf = random_search.best_estimator_
best_score = random_search.best_score_
best_params = random_search.best_params_

# The score is the search's scoring metric (macro F1, as configured on the search).
print("Best Parameters:", best_params)
print("Best F1 Score:", best_score)

Best Parameters: {'colsample_bytree': 0.9747603118288211, 'gamma': 0.15707348092903795, 'learning_rate': 0.28797628754844834, 'max_depth': 11, 'min_child_weight': 1, 'n_estimators': 183, 'reg_alpha': 0.4691357562382613, 'reg_lambda': 0.9900325752854771, 'subsample': 0.7463090469964349}
Best F1 Score: 0.5073790500170008


In [23]:
# Predict tumor classes for the held-out test features with the best estimator from the search.
y_pred = best_xgb_clf.predict(X_test)

In [24]:
# Evaluate the tuned model on the held-out test set.
#
# The label ids are passed explicitly to the report and the confusion matrix. Without
# `labels`, classification_report infers the classes from y_test / y_pred and raises when
# that count differs from len(target_names) -- which happens as soon as a rare tumor class
# is missing from the small test split. LabelEncoder assigns ids 0..n-1 in the order of
# encoder.classes_, so the ids and the names line up.
class_ids = np.arange(len(encoder.classes_))

# Print performance metrics
print('Model Performance:')
print('Accuracy: ', accuracy_score(y_test, y_pred))

# Print the classification report and confusion matrix with class names
print("\nTumor Classification Report:")
print(classification_report(y_test, y_pred, labels=class_ids,
                            target_names=encoder.classes_, zero_division=0))

# Rows are true classes, columns are predicted classes, both in encoder.classes_ order.
print("Tumor Confusion Matrix:\n", confusion_matrix(y_test, y_pred, labels=class_ids))

# Explanation of each metric in the classification report
# Precision: The proportion of true positives among all predictions for a given class. Higher precision means fewer false positives.
# Recall: The proportion of true positives out of all actual instances of a given class. Higher recall means fewer false negatives.
# F1-Score: The harmonic mean of precision and recall. It balances the two metrics.
# Support: The number of true instances for each class in the dataset.


Model Performance:
Accuracy:  0.5897435897435898

Tumor Classification Report:
              precision    recall  f1-score   support

         T1a       1.00      1.00      1.00         5
         T1b       0.00      0.00      0.00         4
          T2       0.67      0.80      0.73        15
         T3a       0.50      0.50      0.50        10
         T3b       0.00      0.00      0.00         2
         T4a       0.00      0.00      0.00         2
         T4b       0.50      1.00      0.67         1

    accuracy                           0.59        39
   macro avg       0.38      0.47      0.41        39
weighted avg       0.53      0.59      0.55        39

Tumor Confusion Matrix:
 [[ 5  0  0  0  0  0  0]
 [ 0  0  2  2  0  0  0]
 [ 0  0 12  2  1  0  0]
 [ 0  0  3  5  1  1  0]
 [ 0  0  1  0  0  1  0]
 [ 0  0  0  1  0  0  1]
 [ 0  0  0  0  0  0  1]]
