In [2]:
!pip install ray\[tune\] optuna scikit-learn xgboost category-encoders numpy pandas
!pip install -U ipywidgets
!pip install -U scipy
!pip install scikit-plot
!pip install kagglehub



In [3]:
from ray import tune 
from sklearn.preprocessing import QuantileTransformer, OneHotEncoder
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np
from category_encoders import CatBoostEncoder
import pandas as pd 

from ray import tune
from ray.tune.search.optuna import OptunaSearch
from ray.tune import trainable, with_parameters

import kagglehub
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [4]:
#### UTILS FOR TESTING SCRIPT, Can Ignore####
def make_rand_data(n_samples, n_num, n_cat, seed=None):
    """
    Make random data for testing purposes.

    Parameters
    ----------
    n_samples : int
        Number of rows to generate.
    n_num : int
        Number of numerical feature columns (uniform floats in [0, 1)).
    n_cat : int
        Number of categorical feature columns (integers in 0..9). Must be at
        least 4, because the target is derived from categorical columns 1, 2 and 3.
    seed : int or None, optional
        When given, a private ``np.random.RandomState(seed)`` is used so the
        output is reproducible. When ``None`` (default) the global NumPy random
        state is used, exactly as before.

    Returns
    -------
    (X, X_cat, y) : tuple of np.ndarray
        ``X`` has shape (n_samples, n_num), ``X_cat`` has shape (n_samples, n_cat)
        and ``y`` has shape (n_samples,) holding integer bucket labels in 0..5.
    """
    # `np.random` and `RandomState` expose the same `rand` / `randint` API.
    rng = np.random if seed is None else np.random.RandomState(seed)

    X = rng.rand(n_samples, n_num)
    X_cat = rng.randint(0, 10, (n_samples, n_cat))

    # Make y a function of both the numerical and the categorical features.
    y = np.sum(X, axis=1)
    y = y + X_cat[:, 1]
    y = y - (X_cat[:, 2] * X_cat[:, 3])

    # Bucket the raw score over the edges [0, 5, 10, 15, 20]: values below 0 map
    # to label 0 and values of 20 or more map to label 5.
    y = np.digitize(y, bins=np.linspace(0, 20, 5))
    return X, X_cat, y

def make_rand_df(n_samples, n_num, n_cat):
    """
    Build a random test DataFrame from `make_rand_data`.

    Columns are laid out as ``num_0..num_{n_num-1}``, then ``target``, then
    ``cat_0..cat_{n_cat-1}``.
    """
    num_values, cat_values, labels = make_rand_data(n_samples, n_num, n_cat)

    num_names = [f"num_{idx}" for idx in range(n_num)]
    cat_names = [f"cat_{idx}" for idx in range(n_cat)]

    numeric_frame = pd.DataFrame(num_values, columns=num_names)
    # The target is appended to the numeric frame so it sits before the cat_* columns.
    numeric_frame["target"] = labels
    categorical_frame = pd.DataFrame(cat_values, columns=cat_names)

    return pd.concat([numeric_frame, categorical_frame], axis=1)

In [5]:
def downcast_dtypes(arr: np.ndarray) -> np.ndarray:
    """
    Downcast the datatype of a numpy array to save memory.

    A NumPy array has a single dtype for all of its elements, so writing a
    downcast column back into the array (as the previous per-column version did)
    never shrinks it; for floats it only rounded values to float32 precision
    while still storing them as float64. The whole array is therefore converted
    at once to the smallest dtype that can represent every element.

    Parameters
    ----------
    arr : np.ndarray
        Input array of any dimensionality. It is never modified.

    Returns
    -------
    np.ndarray
        A new array. ``int64`` input becomes ``int8``/``int16``/``int32`` when
        all values fit; ``float64`` input becomes ``float32`` when every value
        survives the conversion within the tolerance used below. Any other
        dtype (or an empty array) is returned as an unchanged copy.
    """
    arr = np.asarray(arr)

    # Nothing to inspect: min()/max() are undefined on empty arrays.
    if arr.size == 0:
        return arr.copy()

    if arr.dtype == np.int64:
        lowest, highest = arr.min(), arr.max()
        for candidate in (np.int8, np.int16, np.int32):
            limits = np.iinfo(candidate)
            if limits.min <= lowest and highest <= limits.max:
                return arr.astype(candidate)
        return arr.copy()

    if arr.dtype == np.float64:
        # Values beyond float32 range overflow to inf; silence that warning and
        # let the allclose comparison reject the downcast instead.
        with np.errstate(over="ignore"):
            as_float32 = arr.astype(np.float32)
        if np.allclose(arr, as_float32, atol=1e-5, equal_nan=True):
            return as_float32
        return arr.copy()

    return arr.copy()


def preprocess_df(df, numerical_columns, categorical_columns, target_column, train_pct=0.8):
    """
    Split a DataFrame into train/test sets and turn it into model-ready arrays.

    Steps:
      1. Label-encode the target column into integer class ids.
      2. Split features and target with ``train_test_split`` (``random_state=42``).
      3. Fit a ``CatBoostEncoder`` on the training categorical columns only and
         apply it to both the training and the test categorical columns.
      4. Concatenate the numerical columns with the encoded categorical columns.

    The input ``df`` is not modified: the encoded target is kept in a separate
    Series instead of being written back into the caller's frame.

    Parameters
    ----------
    df : pd.DataFrame
        Source data containing the feature columns and the target column.
    numerical_columns : list of str
        Columns passed through unchanged.
    categorical_columns : list of str
        Columns encoded with ``CatBoostEncoder``.
    target_column : str
        Name of the column to predict.
    train_pct : float, default 0.8
        Forwarded to ``train_test_split`` as ``train_size``.

    Returns
    -------
    (X_train, X_test, y_train, y_test) : tuple of np.ndarray
        Feature matrices hold the numerical columns first, followed by the
        encoded categorical columns.

    Raises
    ------
    Exception
        Any error raised by the categorical encoder is printed and re-raised.
    """
    # Encode the target into a standalone Series so the caller's df stays untouched.
    label_encoder = LabelEncoder()
    y = pd.Series(
        label_encoder.fit_transform(df[target_column]),  # Converts 'T2', 'T3a', etc., into integers
        index=df.index,
        name=target_column,
    )
    X = df.drop(columns=[target_column])

    # Train-test split (a Series in gives Series out, so no type fix-ups are needed)
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=train_pct, random_state=42)

    # Split numerical and categorical columns
    x_num_train = X_train[numerical_columns].values
    x_num_test = X_test[numerical_columns].values
    x_cat_train = X_train[categorical_columns]
    x_cat_test = X_test[categorical_columns]

    # Debugging output for inputs
    print("Debugging Input Types:")
    print(f"Type of x_cat_train: {type(x_cat_train)}")
    print(f"Type of y_train: {type(y_train)}")
    print(f"First few values of y_train:\n{y_train.head()}")

    # Categorical encoding: fit on the training rows only, then reuse the fitted
    # encoder for the test rows.
    try:
        cbe = CatBoostEncoder(cols=categorical_columns)
        x_cat_train_encoded = cbe.fit_transform(x_cat_train, y_train)
        x_cat_test_encoded = cbe.transform(x_cat_test)
    except Exception as e:
        print(f"Error during CatBoostEncoder transformation: {e}")
        raise

    # Combine numerical and encoded categorical features
    X_train_processed = np.concatenate([x_num_train, x_cat_train_encoded.values], axis=1)
    X_test_processed = np.concatenate([x_num_test, x_cat_test_encoded.values], axis=1)

    return X_train_processed, X_test_processed, y_train.values, y_test.values


# Registry of candidate classifiers.
# Each key is a label for one estimator. Its value holds:
#   "name"   - the estimator class itself (instantiated by train_model)
#   "params" - keyword arguments for that class's constructor, given either as
#              Ray Tune sampling domains (tune.*) or as fixed literal values.
# run_tune_trials iterates over this dict in insertion order, one tuning run per entry.
search_space = {
    "AdaBoostClassifier": {
        "name": AdaBoostClassifier,
        "params": {
            "n_estimators": tune.qrandint(10, 200, 10),
            "learning_rate": tune.qloguniform(0.01, 2.0, 0.01),
        }
    },
    "RandomForestClassifier": {
        "name": RandomForestClassifier,
        "params": {
            "n_estimators": tune.qrandint(50, 500, 50),
            "max_depth": tune.qrandint(2, 20, 2),
            # Floats here: presumably interpreted by scikit-learn as fractions of
            # the sample count rather than absolute counts - confirm against its docs.
            "min_samples_split": tune.quniform(0.01, 0.5, 0.01),
            "min_samples_leaf": tune.quniform(0.01, 0.5, 0.01)
        }
    },
    "GaussianProcessClassifier": {
        "name": GaussianProcessClassifier,
        "params": {
            # Candidate kernels are pre-built RBF kernel objects with different length scales.
            "kernel": tune.choice([1.0 * RBF(length_scale) for length_scale in [1.0, 2.0, 3.0, 5.0]]),
            "max_iter_predict": tune.qrandint(50, 500, 50)
        }
    },
    "GaussianNB": {
        "name": GaussianNB,
        "params": {
            "var_smoothing": tune.qloguniform(1e-9, 1e-3, 1e-9)
        }
    },
    "KNeighborsClassifier": {
        "name": KNeighborsClassifier,
        "params": {
            "n_neighbors": tune.randint(3, 20),
            "weights": tune.choice(["uniform", "distance"]),
            "metric": tune.choice(["euclidean", "manhattan", "minkowski"])
        }
    },
    "MLPClassifier": {
        "name": MLPClassifier,
        "params": {
            "hidden_layer_sizes": tune.choice([(50,), (100,), (50, 50), (100, 50)]),
            "activation": tune.choice(["logistic", "tanh", "relu"]),
            # Fixed value (not searched).
            "solver": "adam",
            "alpha": tune.qloguniform(1e-5, 1e-1, 1e-5),
            "learning_rate": tune.choice(["constant", "invscaling", "adaptive"])
        }
    },
    "SVC": {
        "name": SVC,
        "params": {
            "C": tune.qloguniform(1e-5, 1e2, 1e-5),
            "kernel": tune.choice(["linear", "poly", "rbf", "sigmoid"]),
            "gamma": tune.choice(["scale", "auto"]),
            # NOTE(review): tune.randint's upper bound is exclusive per the Ray Tune
            # docs, so this would sample 2..4 - confirm that excluding 5 is intended.
            "degree": tune.randint(2, 5)  # for poly kernel
        }
    },
    "DecisionTreeClassifier": {
        "name": DecisionTreeClassifier,
        "params": {
            "criterion": tune.choice(["gini", "entropy"]),
            "max_depth": tune.qrandint(2, 20, 2),
            "min_samples_split": tune.quniform(0.01, 0.5, 0.01),
            "min_samples_leaf": tune.quniform(0.01, 0.5, 0.01)
        }
    },
    "LogisticRegression": {
        "name": LogisticRegression,
        "params": {
            "C": tune.qloguniform(1e-5, 1e2, 1e-5),
            "penalty": tune.choice(["l1", "l2"]),
            "max_iter": tune.qrandint(50, 500, 50),
            # Fixed value (not searched).
            "solver": "liblinear", 
        }
    },
    "XGBClassifier": {
        "name": XGBClassifier,
        "params": {
            "n_estimators": tune.qrandint(50, 500, 50),
            "max_depth": tune.qrandint(2, 20, 2),
            "learning_rate": tune.qloguniform(0.01, 2.0, 0.01),
            "gamma": tune.quniform(0.01, 0.5, 0.01),
            "min_child_weight": tune.quniform(0.01, 0.5, 0.01),
            "subsample": tune.quniform(0.1, 1.0, 0.1),
            "colsample_bytree": tune.quniform(0.5, 1.0, 0.1),
            "reg_alpha": tune.qloguniform(1e-5, 1e2, 1e-5),
            "reg_lambda": tune.qloguniform(1e-5, 1e2, 1e-5),
        }
    }
}


# Trainable function to use with Ray Tune
def train_model(config, data, classifier_class=None):
    """
    Ray Tune trainable: fit one classifier configuration and score it.

    Parameters
    ----------
    config : dict
        Trial configuration. ``config["params"]`` holds the sampled keyword
        arguments for the classifier constructor. ``config["classifier_name"]``
        is only read when ``classifier_class`` is not supplied.
    data : tuple
        ``(X_train, X_test, y_train, y_test)``.
    classifier_class : type or None, optional
        Estimator class to instantiate. When ``None`` (default, the previous
        behaviour) the class is looked up in the module-level ``search_space``
        under ``config["classifier_name"]``.

    Returns
    -------
    dict
        ``accuracy``, ``f1``, ``precision`` and ``recall`` on the held-out split;
        the last three use weighted averaging.
    """
    X_train, X_test, y_train, y_test = data

    if classifier_class is None:
        # Backward-compatible fallback for callers that only pass (config, data).
        classifier_class = search_space[config["classifier_name"]]["name"]
    params = config["params"]

    # Initialize and train the classifier
    model = classifier_class(**params)
    model.fit(X_train, y_train)

    # Make predictions and evaluate
    predictions = model.predict(X_test)

    # zero_division=0 yields the same 0.0 score as the default for classes that
    # are never predicted, but without an UndefinedMetricWarning per trial.
    weighted = {"average": "weighted", "zero_division": 0}

    # Report the results to Tune
    return {
        "accuracy": accuracy_score(y_test, predictions),
        "f1": f1_score(y_test, predictions, **weighted),
        "precision": precision_score(y_test, predictions, **weighted),
        "recall": recall_score(y_test, predictions, **weighted),
    }

# Main function to run trials for each classifier
def run_tune_trials(df, num_cols, cat_cols, target_col, search_space, n_trials=50, time_budget_s=None):
    """
    Tune every classifier in ``search_space`` and collect all trial results.

    Parameters
    ----------
    df : pd.DataFrame
        Raw data; it is preprocessed once with ``preprocess_df``.
    num_cols, cat_cols : list of str
        Numerical and categorical feature column names.
    target_col : str
        Name of the column to predict.
    search_space : dict
        Maps a classifier label to ``{"name": <estimator class>, "params": {...}}``.
        The class is taken from this argument, so a custom registry works even
        when its labels are missing from the module-level ``search_space``.
    n_trials : int, default 50
        Number of sampled configurations per classifier.
    time_budget_s : float or None, default None
        Time budget in seconds for each classifier's tuning run.

    Returns
    -------
    pd.DataFrame
        One row per trial with an added ``classifier_name`` column. An empty
        DataFrame is returned when ``search_space`` has no entries.
    """
    split_data = preprocess_df(df, num_cols, cat_cols, target_col)

    all_results = []
    for classifier_name, classifier_info in search_space.items():
        config = {
            "classifier_name": classifier_name,
            "params": classifier_info["params"]
        }

        # Bind the data and the estimator class from *this* registry entry, so
        # train_model does not depend on the module-level search_space.
        trainable_fn = tune.with_parameters(
            train_model,
            data=split_data,
            classifier_class=classifier_info["name"],
        )

        # Run Tune for the classifier
        tuner = tune.Tuner(
            trainable=trainable_fn,
            param_space=config,
            tune_config=tune.TuneConfig(
                metric="accuracy",
                mode="max",
                num_samples=n_trials,
                search_alg=OptunaSearch(),
                time_budget_s=time_budget_s
            )
        )

        # fit() returns the result grid for this run; no second lookup is needed.
        results = tuner.fit().get_dataframe()
        results["classifier_name"] = classifier_name
        all_results.append(results)

    # pd.concat raises on an empty list, so handle an empty registry explicitly.
    if not all_results:
        return pd.DataFrame()

    # Combine all results into a single DataFrame
    return pd.concat(all_results, ignore_index=True)



In [6]:
# Download the Kaggle dataset and get the directory it was stored in.
dataset_dir = kagglehub.dataset_download("jainaru/thyroid-disease-data")
path = f"{dataset_dir}/Thyroid_Diff.csv"

# Read the CSV into `data` and shuffle its rows with a fixed seed.
data = pd.read_csv(path).sample(frac=1, random_state=1)



In [7]:
# Give the abbreviated columns descriptive names.
# Keys must match the existing column names exactly (including 'Hx Radiothreapy').
column_renames = {
    'Hx Smoking': 'Smoking History',
    'Hx Radiothreapy': 'Radiotherapy History',
    'Pathology': 'Types of Thyroid Cancer (Pathology)',
    'T': 'Tumor',
    'N': 'Lymph Nodes',
    'M': 'Cancer Metastasis',
    'Response': 'Treatment Response',
}
data.rename(columns=column_renames, inplace=True)

- **Age**: The age of the patient at the time of diagnosis or treatment.
- **Gender**: The gender of the patient (male or female).
- **Smoking**: Whether the patient is a smoker or not.
- **Hx Smoking**: Smoking history of the patient (e.g., whether they have ever smoked).
- **Hx Radiotherapy** (the rename cell above refers to this column as `Hx Radiothreapy`): History of radiotherapy treatment for any condition.
- **Thyroid Function**: The status of thyroid function, possibly indicating if there are any abnormalities.
- **Physical Examination**: Findings from a physical examination of the patient, which may include palpation of the thyroid gland and surrounding structures.
- **Adenopathy**: Presence or absence of enlarged lymph nodes (adenopathy) in the neck region.
- **Pathology**: Specific types of thyroid cancer as determined by pathology examination of biopsy samples.
- **Focality**: Whether the cancer is unifocal (limited to one location) or multifocal (present in multiple locations).
- **Risk**: The risk category of the cancer based on various factors, such as tumor size, extent of spread, and histological type.
- **T**: Target Variable; Tumor classification based on its size and extent of invasion into nearby structures.
- **N**: Nodal classification indicating the involvement of lymph nodes.
- **M**: Metastasis classification indicating the presence or absence of distant metastases.
- **Stage**: The overall stage of the cancer, typically determined by combining T, N, and M classifications.
- **Response**: Response to treatment, indicating whether the cancer responded positively, negatively, or remained stable after treatment.
- **Recurred**: Indicates whether the cancer has recurred after initial treatment.

# How To Use: 
1. Replace "df = make_rand_df()" with your own dataframe.
2. Replace num_cols with the names of your numerical columns e.g. P/E ratio etc.
3. Replace cat_cols with the names of your categorical columns, e.g. Sector, Industry etc. (but obviously not the target column itself, since that's what you're trying to predict).
4. Replace target_col with the name of the column you're trying to predict e.g. Sector 
5. n_trials is the number of trials to run per estimator, and time_budget_s is the time budget in seconds for **all trials for that model**. 

I'd be pretty shocked if XGBoost doesn't win, as it's practically the only one of these models used in practice. If you're getting 99-100% accuracy, you're probably overfitting and should switch to cross-validation (if you're familiar with it).

In [8]:
# Map T column to numeric values
tumor_mapping = {
    'T1a': 0, 'T1b': 1, 'T2': 2, 
    'T3a': 3, 'T3b': 4, 'T4a': 5, 'T4b': 6
}

# Apply the mapping only while the column still holds the raw stage labels.
# Mapping an already-numeric column would turn every value into NaN, so this
# guard keeps the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(data['Tumor']):
    # Fail loudly instead of silently producing NaN targets for unknown labels.
    unknown_stages = set(data['Tumor'].dropna().unique()) - set(tumor_mapping)
    if unknown_stages:
        raise ValueError(f"Unmapped tumor stages: {sorted(unknown_stages)}")
    data['Tumor'] = data['Tumor'].map(tumor_mapping)

# Prepare the dataset for training: drop the columns that are not used as
# features here.
df = data.drop(columns=['Lymph Nodes', 'Cancer Metastasis', 'Treatment Response', 'Stage', 'Recurred'])
num_cols = ['Age']
cat_cols = [
    'Gender', 'Smoking', 'Smoking History', 'Radiotherapy History', 'Thyroid Function',
    'Physical Examination', 'Adenopathy', 'Types of Thyroid Cancer (Pathology)', 
    'Focality', 'Risk'
]
target_col = "Tumor"

# Pass the updated dataset into your model pipeline
final_results = run_tune_trials(df, num_cols, cat_cols, target_col, search_space, n_trials=3, time_budget_s=100)

# Show only the classifier label and its metrics, best accuracy first.
summary_columns = ["classifier_name", "accuracy", "f1", "precision", "recall"]
final_results[[col for col in final_results.columns if col in summary_columns]].sort_values("accuracy", ascending=False)

0,1
Current time:,2024-11-17 00:00:04
Running for:,00:00:05.46
Memory:,13.5/16.0 GiB

Trial name,status,loc,params/colsample_byt ree,params/gamma,params/learning_rate,params/max_depth,params/min_child_wei ght,params/n_estimators,params/reg_alpha,params/reg_lambda,params/subsample,iter,total time (s),accuracy,f1,precision
train_model_023fec78,TERMINATED,127.0.0.1:33536,0.7,0.25,0.0114691,12,0.36,50,7.65356e-05,0.0960428,0.2,1,0.100197,0.571429,0.482468,0.433766
train_model_9582621b,TERMINATED,127.0.0.1:33553,0.6,0.34,0.164654,14,0.32,350,1.99357e-05,0.0115786,0.7,1,0.162921,0.636364,0.549996,0.491862
train_model_1f72791c,TERMINATED,127.0.0.1:33567,0.5,0.12,0.151839,14,0.41,400,1.87639,0.128636,0.1,1,0.133907,0.558442,0.463575,0.4


[36m(train_model pid=33553)[0m   _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))[32m [repeated 4x across cluster][0m
2024-11-17 00:00:04,778	INFO tune.py:1009 -- Wrote the latest version of all result files and experiment state to '/Users/freddieliang/ray_results/train_model_2024-11-16_23-59-59' in 0.0057s.
2024-11-17 00:00:04,782	INFO tune.py:1041 -- Total run time: 5.48 seconds (5.46 seconds for the tuning loop).


Unnamed: 0,accuracy,f1,precision,recall,classifier_name
28,0.636364,0.549996,0.491862,0.636364,XGBClassifier
9,0.636364,0.601808,0.610189,0.636364,GaussianNB
10,0.623377,0.571869,0.578355,0.623377,GaussianNB
17,0.61039,0.538597,0.489121,0.61039,MLPClassifier
11,0.61039,0.561507,0.566731,0.61039,GaussianNB
16,0.584416,0.520784,0.471503,0.584416,MLPClassifier
25,0.584416,0.52479,0.523282,0.584416,LogisticRegression
27,0.571429,0.482468,0.433766,0.571429,XGBClassifier
22,0.571429,0.502411,0.455185,0.571429,DecisionTreeClassifier
6,0.571429,0.480821,0.43899,0.571429,GaussianProcessClassifier


[36m(train_model pid=33567)[0m   _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
