First Model - Deep And Wide Neural Network Model 

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder

In [None]:
def data_preprocess(df):
    """Encode the clinical columns of ``df`` and split off the target.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset. Must contain ``cancer_type``, ``patient_id`` and the
        clinical columns referenced below.

    Returns
    -------
    tuple
        ``(features, y)`` where ``features`` is the encoded frame (target and
        ``patient_id`` removed) and ``y`` is a ``pandas.Series`` holding the
        integer-encoded ``cancer_type``.
    """
    # Integer-encode the target and remove it from the feature frame.
    y = pd.Series(LabelEncoder().fit_transform(df["cancer_type"]))
    df = df.drop('cancer_type', axis=1)

    # Ordinal encoding for cellularity; missing/unmapped values become 0.
    cellularity_mapping = {
        'Low': 1,
        'Moderate': 2,
        'High': 3,
    }
    df['cellularity'] = (
        df['cellularity'].str.strip().map(cellularity_mapping).fillna(0)
    )

    # patient_id is an identifier, not a feature.
    df = df.drop('patient_id', axis=1)

    # Plain integer encoding; a fresh encoder is fitted for every column.
    for column in (
        'pam50_+_claudin-low_subtype',
        'er_status',
        'er_status_measured_by_ihc',
        'her2_status',
        'inferred_menopausal_state',
        'pr_status',
    ):
        df[column] = LabelEncoder().fit_transform(df[column])

    # NOTE(review): 'GAIN' -> 3 (not 2) and 'UNDEF' -> 1 (same as 'NEUTRAL')
    # are kept exactly as in the original; values outside this mapping become
    # NaN and are not imputed here -- confirm both are intended.
    her2_mapping = {
        'LOSS': 0,
        'NEUTRAL': 1,
        'GAIN': 3,
        'UNDEF': 1,
    }
    df['her2_status_measured_by_snp6'] = (
        df['her2_status_measured_by_snp6'].str.strip().map(her2_mapping)
    )

    # Signed encoding for laterality; missing/unmapped values become 0.
    map_laterality = {
        'Right': 1,
        'Left': -1,
    }
    df['primary_tumor_laterality'] = (
        df['primary_tumor_laterality'].str.strip().map(map_laterality).fillna(0)
    )

    # One-hot encode; get_dummies appends the new indicator columns at the end.
    df = pd.get_dummies(df, columns=['3-gene_classifier_subtype'])
    df = pd.get_dummies(df, columns=['death_from_cancer'])

    # Impute the numeric clinical columns.
    df['tumor_size'] = df['tumor_size'].fillna(df['tumor_size'].mean())
    df['mutation_count'] = df['mutation_count'].fillna(df['mutation_count'].mean())
    df['neoplasm_histologic_grade'] = df['neoplasm_histologic_grade'].fillna(3)

    # Assign the result back instead of using a chained `inplace=True` call:
    # under pandas copy-on-write the in-place form acts on a temporary and
    # leaves `df` unchanged.
    majority_value = df['tumor_stage'].mode()[0]
    df['tumor_stage'] = df['tumor_stage'].fillna(majority_value)
    df['tumor_stage'] = LabelEncoder().fit_transform(df['tumor_stage'])

    # Integer-encode every column that is still of object dtype.
    for column in df.columns:
        if df[column].dtype == 'object':
            df[column] = LabelEncoder().fit_transform(df[column].astype(str))

    # Move the last seven columns so they sit right after the first two.
    # NOTE(review): presumably these are the indicator columns created by the
    # two get_dummies calls above -- confirm the count of seven against the
    # data. Assumes the frame has at least nine columns.
    n_columns = df.shape[1]
    column_order = [
        0,
        1,
        *range(n_columns - 7, n_columns),
        *range(2, n_columns - 7),
    ]
    df = df.iloc[:, column_order]

    return df, y

In [None]:
# Load the raw dataset and run the preprocessing defined above:
# X holds the encoded features, y the integer-encoded cancer_type labels.
df = pd.read_csv('data.csv')
X, y = data_preprocess(df)

In [None]:

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Hold out 10% of the rows as the test set (shuffled, fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.1,
    random_state=42,
    shuffle=True,
)

# 'balanced' class weights computed from the training labels only, keyed by
# the position of each class in the sorted unique labels.
train_classes = np.unique(y_train)
class_weights = compute_class_weight(
    class_weight='balanced', classes=train_classes, y=y_train
)
class_weight_dict = {
    position: weight for position, weight in enumerate(class_weights)
}

In [None]:
def deep_and_wide(input, index_tumor_stage, num_classes=4):
  """Build and compile a two-branch ("deep" + "wide") dense classifier.

  Parameters
  ----------
  input : array-like with a ``shape`` attribute
      Only ``input.shape[1]`` (the number of feature columns) is read.
  index_tumor_stage : int
      Column position of the split: columns ``0..index_tumor_stage``
      (inclusive) feed the deep branch, the remaining columns feed the
      wide branch.
  num_classes : int, default 4
      Number of units in the softmax output layer.

  Returns
  -------
  tf.keras.Model
      Model compiled with sparse categorical cross-entropy, Adam and
      accuracy as metric.
  """
  # Use a separate name for the symbolic input so the data argument is not
  # shadowed by the Keras tensor.
  inputs = tf.keras.layers.Input(shape=(input.shape[1],))

  split_at = index_tumor_stage + 1
  deep_in = inputs[:, :split_at]
  wide_in = inputs[:, split_at:]

  # Deep branch: only the first layer uses leaky_relu, the rest use relu.
  deep = tf.keras.layers.Dense(2048, activation='leaky_relu')(deep_in)
  for units in (1024, 512, 256, 256, 128, 64, 32, 16, 8):
    deep = tf.keras.layers.Dense(units, activation='relu')(deep)

  # Wide branch: a much shallower stack ending in the same width (8).
  wide = wide_in
  for units in (256, 128, 8):
    wide = tf.keras.layers.Dense(units, activation='relu')(wide)

  # Merge both branches and classify.
  merged = tf.keras.layers.concatenate([wide, deep])
  merged = tf.keras.layers.Dense(10, activation='relu')(merged)
  outputs = tf.keras.layers.Dense(num_classes, activation='softmax')(merged)

  model_deep_wide = tf.keras.Model(inputs=inputs, outputs=outputs)
  model_deep_wide.compile(
      loss='sparse_categorical_crossentropy',
      optimizer='adam',
      metrics=['accuracy'],
  )

  return model_deep_wide

In [None]:
# Write the preprocessed feature matrix to clean.csv (without the index).
excel = pd.DataFrame(X)
excel.to_csv('clean.csv', index=False)

In [None]:
# Position of 'tumor_stage' in X; deep_and_wide uses it as the boundary
# between the columns fed to the deep branch and those fed to the wide branch.
frontier = X.columns.get_loc('tumor_stage')
d_w_model = deep_and_wide(X_train, frontier)

In [None]:
# Train for 15 epochs. The balanced class weights computed after the
# train/test split are passed to the loss so that rare classes are not
# drowned out by the majority class.
# NOTE(review): validation_data is the test split, so the validation curves
# are not an independent estimate -- consider a separate validation set.
history = d_w_model.fit(
    X_train,
    y_train,
    epochs=15,
    validation_data=(X_test, y_test),
    class_weight=class_weight_dict,
)

In [None]:
import matplotlib.pyplot as plt

# One side-by-side panel per metric: (subplot slot, training key,
# validation key, panel title, y-axis label).
panel_specs = [
    (1, 'accuracy', 'val_accuracy', 'Model accuracy', 'Accuracy'),
    (2, 'loss', 'val_loss', 'Model Loss', 'Loss'),
]

plt.figure(figsize=(12, 6))

for slot, train_key, val_key, panel_title, axis_label in panel_specs:
    # Training curve first, then the validation curve, on the same axes.
    plt.subplot(1, 2, slot)
    plt.plot(history.history[train_key])
    plt.plot(history.history[val_key])
    plt.title(panel_title)
    plt.ylabel(axis_label)
    plt.xlabel('Epoch')
    plt.legend(['Train', 'Test'], loc='upper left')

plt.show()

In [None]:
# Per-class scores for the test rows; the predicted class is the index of
# the largest score in each row.
predictions = d_w_model.predict(X_test)
predicted_classes = np.argmax(predictions, axis=1)

In [None]:
# Fix the label set explicitly so the matrix is always 4x4 and lines up with
# the tick labels, even when a class is absent from the test split or from
# the predictions.
class_labels = [0, 1, 2, 3]
conf_matrix = confusion_matrix(y_test, predicted_classes, labels=class_labels)
plt.figure(figsize=(10, 8))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='g',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted labels')
plt.ylabel('True labels')
plt.title('Confusion Matrix')
plt.show()

In [None]:
# Loss and accuracy (the metrics the model was compiled with) on the test split.
evl=d_w_model.evaluate(X_test,y_test)

Second Model - Hierarchical XGBoost Approach 

In [None]:
# Loading the necessary libraries 
import pandas as pd
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, RandomizedSearchCV, KFold
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from imblearn.over_sampling import RandomOverSampler
from xgboost import XGBClassifier
from xgboost.callback import TrainingCallback

In [None]:
# Section 1 - Data Preprocessing
# Load the data.
# NOTE(review): this cell reads '/content/data.csv' while the other sections
# read 'data.csv' -- confirm the path for the target environment.
data = pd.read_csv('/content/data.csv', low_memory=False)

# Function to drop rows whose value in a '*_mut' column is too rare
def remove_infrequent_categories(data, threshold=0.05):
    """Return a copy of ``data`` without rows carrying rare ``*_mut`` values.

    For every object-dtype column whose name ends in ``_mut``, only the rows
    whose value accounts for more than ``threshold`` of the rows still
    present are kept. Columns are processed one after another, so each share
    is measured against the frame as already filtered by earlier columns.
    Rows with a missing value in such a column are dropped as well, because
    missing values never appear among the counted categories.
    """
    kept = data.copy()
    mutation_columns = [
        name
        for name in kept.select_dtypes(include='object').columns
        if name.endswith('_mut')
    ]

    for name in mutation_columns:
        # Share of each category relative to the current number of rows.
        shares = kept[name].value_counts() / len(kept)
        frequent_values = shares.index[shares > threshold]
        kept = kept.loc[kept[name].isin(frequent_values)]

    return kept

def drop_single_class_columns(df):
    """Return ``df`` without the columns that hold exactly one distinct value.

    Distinct values are counted with ``DataFrame.nunique`` (missing values
    are not counted), so a column that is entirely missing has zero distinct
    values and is kept.
    """
    constant_columns = [
        name for name, distinct in df.nunique().items() if distinct == 1
    ]
    return df.drop(columns=constant_columns)

# Function to one-hot encode specified categorical columns
def one_hot_encode_columns(df, columns, encoder, isTrain):
    """Replace ``columns`` of ``df`` by their one-hot encoding.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``columns``.
    columns : list of str
        Categorical columns to encode.
    encoder : fitted encoder or None
        Used as-is when ``isTrain`` is false; ignored (a new
        ``OneHotEncoder`` is fitted) when ``isTrain`` is true.
    isTrain : bool
        Whether to fit a new encoder on ``df``.

    Returns
    -------
    tuple
        ``(encoded_frame, encoder)``.
    """
    if isTrain:
        encoder = OneHotEncoder(sparse_output=False)
        encoded_data = encoder.fit_transform(df[columns])
    else:
        encoded_data = encoder.transform(df[columns])

    # Give the encoded block the same index as `df`: `join` aligns on the
    # index, so a default RangeIndex here would produce NaN rows whenever
    # `df` has been filtered, shuffled or otherwise re-indexed.
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.get_feature_names_out(columns),
        index=df.index,
    )
    return df.drop(columns, axis=1).join(encoded_df), encoder

# Main preprocessing function
def data_preprocess(df, encoder=None, isTrain=True, label_encoder=None):
    """Impute, encode and split a raw frame into features and targets.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw dataset including the ``cancer_type`` target column.
    encoder : fitted OneHotEncoder, optional
        Required when ``isTrain`` is false; ignored otherwise.
    isTrain : bool, default True
        When true, the one-hot encoder and the label encoder are fitted on
        ``df``; when false, the supplied fitted encoders are applied.
    label_encoder : fitted LabelEncoder, optional
        Required when ``isTrain`` is false; ignored otherwise.

    Returns
    -------
    tuple
        ``(features, y_encoded, y_binary, label_encoder, encoder)`` where
        ``y_binary`` is 1 for rows whose encoded class is 0 and 0 otherwise.

    Raises
    ------
    ValueError
        If ``isTrain`` is false and a fitted encoder is missing.
    """
    df = drop_single_class_columns(df)

    # Fill missing values for numerical columns
    numerical_columns = ['neoplasm_histologic_grade', 'mutation_count', 'tumor_size', 'tumor_stage']
    df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

    # Identify and fill missing values for all other numerical columns just in case
    other_numerical_columns = df.select_dtypes(include=[np.number]).columns.difference(numerical_columns)
    df[other_numerical_columns] = df[other_numerical_columns].fillna(df[other_numerical_columns].median())

    # Handle missing values and encode categorical variables
    categorical_columns = ['pr_status', 'pam50_+_claudin-low_subtype', 'primary_tumor_laterality', 'inferred_menopausal_state', 'her2_status', 'er_status', 'er_status_measured_by_ihc', '3-gene_classifier_subtype', 'death_from_cancer']
    for column in categorical_columns:
        df[column] = df[column].fillna(df[column].mode()[0])

    # Ensure all other categorical columns are also filled with the most frequent value
    other_categorical_columns = df.select_dtypes(include=['object', 'category']).columns.difference(categorical_columns)
    for column in other_categorical_columns:
        df[column] = df[column].fillna(df[column].mode()[0])

    # One-hot encode categorical variables
    if isTrain:
        encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
        encoded_data = encoder.fit_transform(df[categorical_columns])
    else:
        if encoder is None:
            raise ValueError("a fitted `encoder` is required when isTrain=False")
        encoded_data = encoder.transform(df[categorical_columns])
    # Keep the index of `df` on the encoded block so that `join` (which
    # aligns on the index) cannot introduce NaN rows.
    encoded_df = pd.DataFrame(
        encoded_data,
        columns=encoder.get_feature_names_out(categorical_columns),
        index=df.index,
    )
    df = df.drop(categorical_columns, axis=1).join(encoded_df)

    # Handling 'cellularity' with predefined mapping and filling NaNs
    if 'cellularity' in df.columns:
        mapping = {
            'Low': 1,
            'Moderate': 2,
            'High': 3,
        }
        # Unmapped values fall back to 2 ('Moderate').
        df['cellularity'] = df['cellularity'].str.strip().map(mapping).fillna(2)

    # Handling 'her2_status_measured_by_snp6' with predefined mapping and dropping rows with 'UNDEF'
    if 'her2_status_measured_by_snp6' in df.columns:
        her2_mapping = {
            'LOSS': -1,
            'NEUTRAL': 0,
            'GAIN': 1,
        }
        # Strip first so that whitespace-padded 'UNDEF' values are dropped
        # too, and copy the filtered frame so the assignment below does not
        # write into a slice of the previous frame.
        her2 = df['her2_status_measured_by_snp6'].str.strip()
        keep = her2 != 'UNDEF'
        df = df.loc[keep].copy()
        df['her2_status_measured_by_snp6'] = her2.loc[keep].map(her2_mapping)

    # Label encoding for the target variable. A new encoder is only created
    # when fitting; otherwise the caller's fitted encoder is used (calling
    # `transform` on a freshly constructed encoder would fail).
    y = df.pop("cancer_type")
    if isTrain:
        label_encoder = LabelEncoder()
        y_encoded = label_encoder.fit_transform(y)
    else:
        if label_encoder is None:
            raise ValueError("a fitted `label_encoder` is required when isTrain=False")
        y_encoded = label_encoder.transform(y)
    y_binary = (y_encoded == 0).astype(int)

    return df, y_encoded, y_binary, label_encoder, encoder

# Preprocess a copy of the raw data, fitting the encoders on it.
X, y, y_binary, label_encoder, encoder = data_preprocess(data.copy(), isTrain=True)

# First split: 60% training, 40% held out (X_temp). The multi-class labels
# and the binary labels are split together so they stay row-aligned.
# The held-out part is divided into test and validation sets in Section 2.
X_train, X_temp, y_train, y_temp, y_train_binary, y_temp_binary = train_test_split(X, y, y_binary, test_size=0.4, random_state=42)

In [None]:
# Section 2 - Train binary classification model
# Keep 80% of the training rows for the binary model; the other 20% is
# assigned to `_` and not used.
X_train_binary, _, y_train_binary, _ = train_test_split(X_train, y_train_binary, test_size=0.2, random_state=42)
# Split the held-out rows in half: one half for testing, one for validation.
X_test_binary, X_val_binary, y_test_binary, y_val_binary = train_test_split(X_temp, y_temp_binary, test_size=0.5, random_state=42)

# Apply Random Over Sampler to the training data
ros = RandomOverSampler(random_state=23)
X_train_ros, y_train_ros = ros.fit_resample(X_train_binary, y_train_binary)

# Ratio of negative to positive labels, computed on the resampled labels.
# NOTE(review): because this is computed after oversampling, the ratio is
# presumably close to 1 -- confirm whether the pre-resampling ratio was meant.
scale_pos_weight = sum(y_train_ros == 0) / sum(y_train_ros == 1)

# Convert all columns of type 'object' to 'category'
categorical_columns = X_train_ros.select_dtypes(include=['object']).columns
X_train_ros[categorical_columns] = X_train_ros[categorical_columns].astype('category')
X_test_binary[categorical_columns] = X_test_binary[categorical_columns].astype('category')
X_val_binary[categorical_columns] = X_val_binary[categorical_columns].astype('category')

# Create DMatrix for train, test and validation sets
dtrain_binary = xgb.DMatrix(X_train_ros, label=y_train_ros, enable_categorical=True)
dtest_binary = xgb.DMatrix(X_test_binary, label=y_test_binary, enable_categorical=True)
dval_binary = xgb.DMatrix(X_val_binary, label=y_val_binary, enable_categorical=True)

# Specify parameters for XGBoost
# NOTE(review): 'rmse' is used as the evaluation metric for a
# 'binary:logistic' objective, and 'num_class' is set although the objective
# is binary -- confirm both are intended. The learning-curve code in this
# cell reads the 'rmse' entries, so the metric cannot be changed here alone.
params = {
    'objective': 'binary:logistic',
    'eval_metric': 'rmse',
    'num_class': 1,
    'device':'cuda',
    'learning_rate': 0.1,
    'max_depth': 6,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'lambda': 1,
    'alpha': 0.01,
    'scale_pos_weight': scale_pos_weight,  # Applying class weight
    'num_parallel_tree':5,
}

# Training callback that keeps a per-metric history of the evaluation log
class EvaluationHistoryCallback(TrainingCallback):
    """Record the latest value of every evaluation metric after each round.

    ``evaluation_results`` maps ``"<dataset>-<metric>"`` to the list of
    values collected so far, one entry per boosting round.
    """

    def __init__(self):
        super().__init__()
        self.evaluation_results = {}

    def after_iteration(self, model, epoch, evals_log):
        """Append the newest metric values; always let training continue."""
        # An empty or missing log simply contributes nothing.
        for dataset_name, metric_dict in (evals_log or {}).items():
            for metric_name, log in metric_dict.items():
                history_key = f"{dataset_name}-{metric_name}"
                self.evaluation_results.setdefault(history_key, []).append(log[-1])
        # Returning False tells the training loop not to stop.
        return False

# Initialize callback
eval_history = EvaluationHistoryCallback()

# Train for at most 100 rounds, evaluating on the training and validation
# matrices each round; training stops early after 10 rounds without
# improvement, and the callback records the metric history.
eval_set = [(dtrain_binary, 'train'), (dval_binary, 'eval')]
binary_model = xgb.train(params, dtrain_binary, num_boost_round=100, evals=eval_set, early_stopping_rounds=10, verbose_eval=True, callbacks=[eval_history])

# Access the stored evaluation results (empty lists if a key is missing)
results = eval_history.evaluation_results
train_rmse = results.get('train-rmse', [])
eval_rmse = results.get('eval-rmse', [])
epochs = len(train_rmse)
x_axis = range(0, epochs)

# Plot learning curves: one point per boosting round
fig, ax = plt.subplots()
ax.plot(x_axis, train_rmse, label='Train RMSE')
ax.plot(x_axis, eval_rmse, label='Validation RMSE')
ax.legend()
plt.ylabel('RMSE')
plt.title('Learning Curve for Binary Model')
plt.show()

# Make predictions on the test matrix
y_pred_binary = binary_model.predict(dtest_binary)
y_pred_binary = np.round(y_pred_binary)  # Convert probabilities to binary output

# Evaluate model on the test labels
print("Binary Model Accuracy:", accuracy_score(y_test_binary, y_pred_binary))
print("Binary Model Confusion Matrix:\n", confusion_matrix(y_test_binary, y_pred_binary))
print("Binary Model Classification Report:\n", classification_report(y_test_binary, y_pred_binary))

In [None]:
# Section 3 - Prepare and train multi-class model
# Run the preprocessing again on a fresh copy of the raw data.
processed_data, y_encoded, y_binary, label_encoder, encoder = data_preprocess(data.copy(), isTrain=True)

# Reset indices of processed_data to ensure alignment
processed_data.reset_index(drop=True, inplace=True)
y_encoded = pd.Series(y_encoded)

# Confirming the filtered targets
print("Unique values in y_encoded:", np.unique(y_encoded))
print("Classes found by LabelEncoder:", label_encoder.classes_)

# Assuming 'cellularity' should map directly for filtering but not as the target
# NOTE(review): filtered_data and filtered_y_encoded are only assigned inside
# this `if`; the statements after it assume the branch was taken.
if 'cellularity' in processed_data.columns:
    # Print unique values for diagnostics
    print("Unique values in cellularity before filtering:", processed_data['cellularity'].unique())

    # Keep rows whose cellularity is 1, 2 or 3 and whose encoded class is not 0
    filtered_data = processed_data[(processed_data['cellularity'].isin([1, 2, 3])) & (y_encoded != 0)]
    
    # Ensure y_encoded aligns with the filtered data
    filtered_y_encoded = y_encoded[filtered_data.index].reset_index(drop=True)
    
    # Re-encode the remaining labels so they are consecutive again (0..k-1)
    label_encoder = LabelEncoder()
    filtered_y_encoded = label_encoder.fit_transform(filtered_y_encoded)
    print("Re-encoded classes after filtering and exclusion:", label_encoder.classes_)

    # check the filtered results
    print("Filtered Data Shape:", filtered_data.shape)
    print("Filtered Target Shape:", filtered_y_encoded.shape)

unique_classes_filtered = np.unique(filtered_y_encoded)
print("Unique classes after filtering:", unique_classes_filtered)

# 80% training, then the remaining 20% split in half into test and validation
X_train_multi, X_temp_multi, y_train_multi, y_temp_multi = train_test_split(filtered_data, filtered_y_encoded, test_size=0.2, random_state=42)
X_test_multi, X_val_multi, y_test_multi, y_val_multi = train_test_split(X_temp_multi, y_temp_multi, test_size=0.5, random_state=42)

# Resample the training data
ros_multi = RandomOverSampler(random_state=42)
X_train_multi_ros, y_train_multi_ros = ros_multi.fit_resample(X_train_multi, y_train_multi)

# Recheck the unique values in y_train_multi_ros
print("Unique y values in training data after resample:", np.unique(y_train_multi_ros))

# Per-class ratio of "rows in the class" to "rows in all other classes",
# computed on the resampled labels.
# NOTE(review): this dictionary is not referenced again in the visible code.
scale_pos_weight_multi = {}
for class_label in np.unique(y_train_multi_ros):
    scale_pos_weight_multi[class_label] = sum(y_train_multi_ros == class_label) / sum(y_train_multi_ros != class_label)

# Convert object columns to category type
object_columns = X_train_multi_ros.select_dtypes(include=['object']).columns
if not object_columns.empty:
    X_train_multi_ros[object_columns] = X_train_multi_ros[object_columns].astype('category')
    X_test_multi[object_columns] = X_test_multi[object_columns].astype('category')
    X_val_multi[object_columns] = X_val_multi[object_columns].astype('category')

print(np.unique(y_train_multi_ros))

# Specify parameters; the number of classes is taken from the re-encoded labels
params_multi = {
    'objective': 'multi:softmax',
    'num_class': len(np.unique(filtered_y_encoded)),
    'eval_metric': 'mlogloss',  # Changed from rmse to mlogloss for multi-class
    'device': 'cuda',
    'learning_rate': 0.1,
    'max_depth': 15,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'lambda': 1.5,
    'alpha': 0.01,
    'num_parallel_tree': 5
}

# Prepare DMatrix for training, validation and testing
dtrain_multi = xgb.DMatrix(X_train_multi_ros, label=y_train_multi_ros, enable_categorical=True)
dval_multi = xgb.DMatrix(X_val_multi, label=y_val_multi, enable_categorical=True)
dtest_multi = xgb.DMatrix(X_test_multi, label=y_test_multi, enable_categorical=True)

# Check the sizes right before training
print("Training size:", dtrain_multi.num_row(), "Labels:", len(y_train_multi_ros))
print("Validation size:", dval_multi.num_row(), "Labels:", len(y_val_multi))

# Initialize callback
eval_history_multi = EvaluationHistoryCallback()

# Train for at most 100 rounds with early stopping after 10 rounds without
# improvement; the callback records the metric history.
eval_set_multi = [(dtrain_multi, 'train'), (dval_multi, 'eval')]
multi_class_model = xgb.train(params_multi, dtrain_multi, num_boost_round=100, evals=eval_set_multi, early_stopping_rounds=10, verbose_eval=True, callbacks=[eval_history_multi])

# Access the stored evaluation results (empty lists if a key is missing)
results_multi = eval_history_multi.evaluation_results
train_mlogloss = results_multi.get('train-mlogloss', [])
eval_mlogloss = results_multi.get('eval-mlogloss', [])
epochs_multi = len(train_mlogloss)
x_axis_multi = range(0, epochs_multi)

# Plot learning curves: one point per boosting round
fig, ax = plt.subplots()
ax.plot(x_axis_multi, train_mlogloss, label='Train MLogLoss')
ax.plot(x_axis_multi, eval_mlogloss, label='Validation MLogLoss')
ax.legend()
plt.ylabel('MLogLoss')
plt.title('Learning Curve for Multi-Class Model')
plt.show()

# Make predictions on the test matrix
y_pred_multi = multi_class_model.predict(dtest_multi)

# Evaluate model on the test labels
print("Multi-Class Model Accuracy:", accuracy_score(y_test_multi, y_pred_multi))
print("Multi-Class Model Confusion Matrix:\n", confusion_matrix(y_test_multi, y_pred_multi))
print("Multi-Class Model Classification Report:\n", classification_report(y_test_multi, y_pred_multi))

In [None]:
# Section 4 - Tuning the first Model
import numpy as np
from sklearn.model_selection import KFold
from sklearn.metrics import roc_auc_score

# Possible ranges of hyperparameters.
# The full Cartesian product of these lists has 4*4*3*3*3*3*3*3 = 11664
# combinations, each of which is cross-validated below.
param_grid = {
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 5, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2],
    'lambda': [1, 1.5, 2],
    'alpha': [0, 0.1, 0.2],
}

# Function to perform manual cross-validation
def manual_cv(params, n_splits=3):
    """Return the mean ROC AUC of ``params`` over a shuffled K-fold split.

    The folds are drawn from the module-level ``X_train_ros`` /
    ``y_train_ros`` training data.

    Parameters
    ----------
    params : dict
        XGBoost booster parameters. An optional ``'num_boost_round'`` entry
        sets the number of boosting rounds (default 100); it is not forwarded
        to the booster because it is not a booster parameter.
    n_splits : int, default 3
        Number of folds.

    Returns
    -------
    float
        Mean ROC AUC across the folds.
    """
    # Separate the training-loop setting from the booster parameters; the
    # original lookup `params['num_boost_round']` raised KeyError whenever
    # the caller did not provide it.
    num_boost_round = params.get('num_boost_round', 100)
    booster_params = {
        key: value for key, value in params.items() if key != 'num_boost_round'
    }

    # Positional indexing below requires an array, not a labelled Series.
    labels = np.asarray(y_train_ros)

    kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)
    auc_scores = []

    for train_index, test_index in kf.split(X_train_ros):
        X_train_kf, X_test_kf = X_train_ros.iloc[train_index], X_train_ros.iloc[test_index]
        y_train_kf, y_test_kf = labels[train_index], labels[test_index]

        dtrain = xgb.DMatrix(X_train_kf, label=y_train_kf, enable_categorical=True)
        dtest = xgb.DMatrix(X_test_kf, label=y_test_kf, enable_categorical=True)

        model = xgb.train(booster_params, dtrain, num_boost_round=num_boost_round)
        # Score the predicted probabilities directly: rounding them to 0/1
        # first throws away the ranking information ROC AUC is based on.
        y_pred = model.predict(dtest)
        auc_scores.append(roc_auc_score(y_test_kf, y_pred))

    return np.mean(auc_scores)

# Iterate over combinations of parameters
from itertools import product

# Number of boosting rounds used for every candidate and for the final model.
# It is stored in each parameter dictionary because manual_cv and the final
# training both read it from there.
NUM_BOOST_ROUND = 100

best_score = float('-inf')
best_params = None

for combination in product(*param_grid.values()):
    params = dict(zip(param_grid.keys(), combination))
    params['objective'] = 'binary:logistic'
    params['eval_metric'] = 'logloss'
    params['scale_pos_weight'] = scale_pos_weight  # Keep the scale_pos_weight from your original setup
    # Same GPU configuration as the models trained above ('device': 'cuda').
    params['device'] = 'cuda'
    params['tree_method'] = 'hist'
    params['num_boost_round'] = NUM_BOOST_ROUND

    current_score = manual_cv(params)
    if current_score > best_score:
        best_score = current_score
        best_params = params

# Print best parameters and score
print("Best Score:", best_score)
print("Best Parameters:", best_params)

# Train the model with the best parameters. 'num_boost_round' is an argument
# of xgb.train, not a booster parameter, so it is passed separately.
final_booster_params = {
    key: value for key, value in best_params.items() if key != 'num_boost_round'
}
final_dtrain = xgb.DMatrix(X_train_ros, label=y_train_ros, enable_categorical=True)
final_model = xgb.train(final_booster_params, final_dtrain, num_boost_round=best_params['num_boost_round'])

# Evaluate the final model; the test matrix is built with the same
# categorical handling as the training matrix.
final_dtest = xgb.DMatrix(X_test_binary, label=y_test_binary, enable_categorical=True)
y_pred_final = final_model.predict(final_dtest)
y_pred_final = np.round(y_pred_final)  # Convert probabilities to binary output

print("Final Model Accuracy:", accuracy_score(y_test_binary, y_pred_final))
print("Final Model Confusion Matrix:\n", confusion_matrix(y_test_binary, y_pred_final))
print("Final Model Classification Report:\n", classification_report(y_test_binary, y_pred_final))

Third Model - Another Neural Network Approach 

In [None]:
%pip install optuna
%pip install scikeras

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import numpy as np
from sklearn.preprocessing import MinMaxScaler

def data_preprocess(df):
    """Split ``df`` into min-max scaled features and integer class labels.

    The ``cancer_type`` column is one-hot encoded and then collapsed back to
    one integer per row (the position of the active indicator column). The
    remaining ``float64``/``int64`` columns are rescaled with a
    ``MinMaxScaler`` fitted on ``df`` itself; other columns are returned
    untouched.

    Returns
    -------
    tuple
        ``(features, labels)``.
    """
    # Integer labels derived from the one-hot representation of the target.
    target_indicators = pd.get_dummies(df['cancer_type'], prefix='cancer_type')
    labels = np.argmax(target_indicators.values, axis=1)

    # Assume all other categorical data has been appropriately handled and included in df
    features = df.drop('cancer_type', axis=1)

    # Rescale every float64/int64 column.
    numeric_cols = features.select_dtypes(include=['float64', 'int64']).columns
    features[numeric_cols] = MinMaxScaler().fit_transform(features[numeric_cols])

    return features, labels

# Load data
df = pd.read_csv('dataset.csv')

# Preprocess data: X are the scaled features, y the integer class labels
X, y = data_preprocess(df)

# Calculate class weights (for one-dimensional integer-encoded y).
# NOTE(review): the weights are computed on the full label vector, before the
# train/test split below -- confirm this is intended.
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(y), y=y)
class_weights_dict = dict(enumerate(class_weights))

# Splitting data: 80% train / 20% test, stratified on the labels
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

print("Class weights:", class_weights_dict)

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.regularizers import l2
import numpy as np  

# Requires X_train and y_train from the split above.
# The size of the output layer is the number of distinct training labels.
num_classes = len(np.unique(y_train))

# Two regularised 32-unit hidden layers with light dropout, built layer by layer.
model = Sequential()
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.02), input_shape=(X_train.shape[1],)))
model.add(Dropout(0.13))
model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.02)))
model.add(Dropout(0.13))
model.add(Dense(num_classes, activation='softmax', kernel_regularizer=l2(0.02)))

# Sparse categorical cross-entropy matches the integer-encoded labels.
model.compile(
    optimizer='adam',
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

# Show the architecture as a sanity check.
model.summary()

In [None]:
import numpy as np

# Check for any remaining NaN values in the dataset and, if any are found,
# replace them via np.nan_to_num (which also turns the inputs into arrays).
# NOTE(review): np.isnan presumably requires every column of X_train to be
# numeric -- confirm that no categorical columns remain at this point.
if np.any(np.isnan(X_train)) or np.any(np.isnan(y_train)):
    X_train = np.nan_to_num(X_train)
    y_train = np.nan_to_num(y_train)

# Fitting the model: 50 epochs, 20% of the training data held out for
# validation, loss weighted by the balanced class weights.
history = model.fit(
    X_train, y_train,
    epochs=50,
    batch_size=32,
    validation_split=0.2,
    class_weight=class_weights_dict,
    verbose=1
)

# Plot accuracy and loss, each in its own figure
import matplotlib.pyplot as plt

# (training key, validation key, training label, validation label,
#  figure title, y-axis label, legend position)
curve_specs = [
    ('accuracy', 'val_accuracy', 'Training accuracy', 'Validation accuracy',
     'Model Accuracy', 'Accuracy', 'upper left'),
    ('loss', 'val_loss', 'Training Loss', 'Validation Loss',
     'Model Loss', 'Loss', 'upper right'),
]

for (train_key, val_key, train_label, val_label,
     figure_title, axis_label, legend_position) in curve_specs:
    plt.plot(history.history[train_key], label=train_label)
    plt.plot(history.history[val_key], label=val_label)
    plt.title(figure_title)
    plt.ylabel(axis_label)
    plt.xlabel('Epoch')
    plt.legend(loc=legend_position)
    plt.show()

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, classification_report

# Evaluate the model on the test split. A ValueError raised anywhere in the
# evaluation is reported with a message instead of stopping the notebook.
try:
    test_loss, test_acc = model.evaluate(X_test, y_test, verbose=1)
    print(f"Test Accuracy: {test_acc * 100:.2f}%")

    # Predict the test set results; the class is the index of the largest score
    y_pred = model.predict(X_test)
    y_pred_classes = np.argmax(y_pred, axis=1)

    # Generate the confusion matrix
    cm = confusion_matrix(y_test, y_pred_classes)
    plt.figure(figsize=(10, 7))
    sns.heatmap(cm, annot=True, fmt='g', cmap='Blues')
    plt.xlabel('Predicted labels')
    plt.ylabel('True labels')
    plt.title('Confusion Matrix')
    plt.show()

    # Print classification report
    print("Classification Report:\n", classification_report(y_test, y_pred_classes))

except ValueError as e:
    print("ValueError during model evaluation:", e)

In [None]:
import numpy as np
import optuna
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.optimizers import Adam, SGD
from tensorflow.keras.callbacks import EarlyStopping

def create_model(trial):
    """Build and compile a dense classifier from an Optuna trial.

    The trial chooses the dropout rate, the width and number of hidden
    layers, the learning rate and the optimizer. The input width comes from
    the module-level ``X_train`` and the output width from ``num_classes``.
    """
    # Hyperparameters to tune
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    neurons = trial.suggest_categorical('neurons', [32, 64, 128])
    lr = trial.suggest_float('learning_rate', 1e-5, 1e-1, log=True)
    n_layers = trial.suggest_int('n_layers', 1, 5)  # Number of hidden layers to add

    # Input, then n_layers x (Dense + Dropout), then the softmax output.
    stack = [Input(shape=(X_train.shape[1],))]
    for _ in range(n_layers):
        stack.extend([
            Dense(neurons, activation='relu'),
            Dropout(dropout_rate),
        ])
    stack.append(Dense(num_classes, activation='softmax'))
    model = Sequential(stack)

    # Look up the optimizer class by the name the trial picked.
    optimizer_classes = {'adam': Adam, 'sgd': SGD}
    optimizer_name = trial.suggest_categorical('optimizer', ['adam', 'sgd'])
    opt = optimizer_classes[optimizer_name](learning_rate=lr)

    model.compile(optimizer=opt, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

    return model

def objective(trial):
    """Optuna objective: validation accuracy of the model built for ``trial``.

    The model is trained on ``X_train``/``y_train`` with 10% of the rows held
    out for validation and early stopping on the validation loss. The score
    returned is the validation accuracy at the epoch with the lowest
    validation loss (the epoch whose weights early stopping restores).

    The test split is deliberately not used here: selecting hyperparameters
    on test accuracy would leak the test set into model selection and make
    the final test score optimistic.
    """
    model = create_model(trial)
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    history = model.fit(
        X_train, y_train,
        epochs=100,
        batch_size=trial.suggest_categorical('batch_size', [32, 64, 128]),
        verbose=0,
        validation_split=0.1,
        callbacks=[early_stopping]
    )

    best_epoch = int(np.argmin(history.history['val_loss']))
    return float(history.history['val_accuracy'][best_epoch])

# Run 100 trials, maximising the value returned by the objective
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=100)

# Report the best trial: its score followed by one line per hyperparameter
print("Best trial:")
trial = study.best_trial
print("  Value: ", trial.value)
print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import accuracy_score
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.regularizers import l2
from scikeras.wrappers import KerasClassifier

# Model factory passed to KerasClassifier
def create_model():
    """Build and compile the fixed two-hidden-layer classifier.

    Input and output widths are read from the module-level ``X_train`` and
    ``y_train``; the model is compiled with SGD and sparse categorical
    cross-entropy.
    """
    n_features = X_train.shape[1]
    n_outputs = len(np.unique(y_train))

    model = Sequential()
    model.add(Input(shape=(n_features,)))  # Using Input to specify input shape
    # Two identical hidden blocks: regularised Dense followed by Dropout.
    for _ in range(2):
        model.add(Dense(32, activation='relu', kernel_regularizer=l2(0.02)))
        model.add(Dropout(0.13))
    model.add(Dense(n_outputs, activation='softmax', kernel_regularizer=l2(0.02)))

    model.compile(optimizer='sgd', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

# Wrap the model factory with KerasClassifier (100 epochs, batch size 32, silent)
model = KerasClassifier(model=create_model, epochs=100, batch_size=32, verbose=0)

# Train the model on the training split
model.fit(X_train, y_train)

# Evaluate the model on the test set to report its accuracy
y_pred = model.predict(X_test)
NN_score = accuracy_score(y_test, y_pred)
print('Accuracy score of my best Neural Network: ', NN_score)


Final Model - XGBoost 

IMPORTING NECESSARY LIBRARIES 

In [None]:
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, accuracy_score,classification_report
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
import itertools as itr
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTEN
import seaborn as sns

LOADING THE DATASET

In [None]:
# Raw dataset for the final XGBoost model.
data=pd.read_csv('data.csv')

PREPROCESSING 

In [None]:
# Categorical columns that are one-hot encoded during preprocessing.
cat_columns=['pr_status',  'pam50_+_claudin-low_subtype', 'primary_tumor_laterality',   'inferred_menopausal_state',   'her2_status',   'er_status',   'er_status_measured_by_ihc',   '3-gene_classifier_subtype', 'death_from_cancer']

def one_hot_encode_columns(df, columns,isTrain):
  enc=OneHotEncoder(sparse_output=False)
  encoded_data=enc.fit_transform(df[columns])
  encoded_df = pd.DataFrame(encoded_data, columns=enc.get_feature_names_out(columns))
  df_encoded = pd.concat([df.reset_index(drop=True),encoded_df.reset_index(drop=True)],axis=1)
  return  df_encoded.drop(columns,axis=1)

def data_preprocess(df,isTrain):
  """Clean the raw frame and split it into model features and an encoded target.

  Steps, in order: drop identifier columns, impute missing values, ordinal-encode
  ``cellularity``, drop rows whose SNP6 HER2 status is ``UNDEF`` and ordinal-encode
  the rest, label-encode the ``cancer_type`` target, one-hot encode ``cat_columns``,
  label-encode any remaining string columns, and move the last seven columns so
  they sit directly after the first two.

  Args:
      df: Raw DataFrame holding the columns referenced below. The caller's
          object is not modified.
      isTrain: Forwarded to ``one_hot_encode_columns``; not otherwise used here.

  Returns:
      (X, y): the feature DataFrame and a Series named ``cancer_type`` with the
      integer class codes, aligned row for row.
  """
  df = df.drop(['patient_id', 'cohort'], axis=1)

  # Numeric gaps -> column mean
  numerical_columns = ['neoplasm_histologic_grade', 'mutation_count', 'tumor_size', 'tumor_stage']
  df[numerical_columns] = df[numerical_columns].fillna(df[numerical_columns].mean())

  # Categorical gaps -> most frequent value
  categorical_columns = ['er_status_measured_by_ihc', 'primary_tumor_laterality', '3-gene_classifier_subtype', 'death_from_cancer']
  for column in categorical_columns:
    df[column] = df[column].fillna(df[column].mode()[0])

  # Ordinal encoding for cellularity; missing/unmapped values become 3 ('High')
  mapping = {
      'Low': 1,
      'Moderate': 2,
      'High': 3,
  }
  df['cellularity'] = df['cellularity'].str.strip().map(mapping).fillna(3)

  her2_mapping={
  'LOSS' : -1,
  'NEUTRAL' : 0,
  'GAIN' : 1,
  }

  # Strip BEFORE filtering so whitespace-padded 'UNDEF' values are removed too,
  # and copy the filtered frame so later assignments do not write into a slice.
  her2 = df['her2_status_measured_by_snp6'].str.strip()
  keep = her2 != 'UNDEF'
  df = df[keep].copy()
  df['her2_status_measured_by_snp6'] = her2[keep].map(her2_mapping)

  # Target is taken after the row filter so it stays aligned with the features
  y = pd.Series(LabelEncoder().fit_transform(df["cancer_type"]), name='cancer_type')
  df = df.drop('cancer_type', axis = 1)

  df=one_hot_encode_columns(df,cat_columns,isTrain=isTrain)

  # Any string column still present gets integer codes (one encoder per column)
  for column in df.columns:
      if df[column].dtype == 'object' and column not in cat_columns:
          df[column] = LabelEncoder().fit_transform(df[column].astype(str))

  # Move the last seven columns to sit right after the first two
  # (assumes the frame has at least nine columns).
  n_cols = df.shape[1]
  order = [0, 1] + list(range(n_cols - 7, n_cols)) + list(range(2, n_cols - 7))
  df = df.iloc[:, order]

  return df,y

In [None]:
# Build features X and encoded target y from a copy, so `data` itself is left untouched
X, y=data_preprocess(data.copy(),isTrain=True)

SPLITTING

In [None]:
def split_data_by_samples(X, y, desired_samples, random_state=None):
    """Draw a fixed number of rows per class for training; the rest become the test set.

    Rows are addressed by position throughout, so ``X`` and ``y`` only need to be
    aligned row for row; their index labels do not matter.

    Args:
        X: Feature DataFrame.
        y: Class labels aligned with the rows of ``X`` (Series, list or array).
        desired_samples: Mapping ``class label -> number of training rows`` to draw
            without replacement from that class.
        random_state: Optional seed for a reproducible draw. When ``None`` the
            global NumPy random state is used, as before.

    Returns:
        ``(X_train, y_train, X_test, y_test)``. Training rows are grouped in the
        iteration order of ``desired_samples``; test rows keep their original
        order. Both ``y`` outputs are Series with a fresh 0..n-1 index.

    Raises:
        KeyError: if a requested class label does not occur in ``y``.
        ValueError: if more rows are requested than a class contains.
    """
    labels = np.asarray(y)
    positions_by_class = {
        label: np.flatnonzero(labels == label) for label in np.unique(labels)
    }

    # np.random exposes the same .choice API as a RandomState instance
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    drawn = [
        rng.choice(positions_by_class[class_label], size=num_samples, replace=False)
        for class_label, num_samples in desired_samples.items()
    ]
    if drawn:
        train_positions = np.concatenate(drawn).astype(int)
    else:
        train_positions = np.array([], dtype=int)

    # Everything not drawn for training, in ascending row order
    test_positions = np.setdiff1d(np.arange(len(X)), train_positions)

    return (
        X.iloc[train_positions],
        pd.Series(labels[train_positions]),
        X.iloc[test_positions],
        pd.Series(labels[test_positions]),
    )



# Training rows to draw per class (class code -> count); all remaining rows are held out
desired_samples = {0: 700, 1: 100,2:12,3:150}
X_train, y_train,X_test, y_test=split_data_by_samples(X,y,desired_samples=desired_samples)
# Split the held-out rows 50/50 into validation and test sets (shuffled, fixed seed)
X_validate, X_test, y_validate, y_test =train_test_split(X_test,y_test,shuffle=True,test_size=0.5,random_state=42)

OVERSAMPLING 

In [None]:
import imblearn
# Resample the training split only, using RandomOverSampler with a fixed seed;
# the validation and test splits are left as they are.
ROS=imblearn.over_sampling.RandomOverSampler(random_state=0)
X_train, y_train=ROS.fit_resample(X_train,y_train)

TRAINING 

In [None]:
from xgboost import XGBClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.model_selection import learning_curve
import matplotlib.pyplot as plt

# Baseline XGBoost classifier for the four target classes
xgb_classifier = XGBClassifier(num_class=4,n_estimators=100, random_state=42)

# Train the classifier
xgb_classifier.fit(X_train, y_train)

# Predict on validation set
y_pred = xgb_classifier.predict(X_validate)

# Macro-averaged F1: the target is multiclass, and f1_score's default
# average='binary' raises ValueError for anything but two classes.
f1 = f1_score(y_validate, y_pred, average='macro')

print("F1 Score (macro) on Validation Set:", f1)

LEARNING CURVE PLOT 

In [None]:
from xgboost.callback import TrainingCallback
class EvaluationHistoryCallback(TrainingCallback):
    """Training callback that records the latest value of every metric after each round.

    Values are collected in ``evaluation_results`` under keys of the form
    ``"<dataset name>-<metric name>"``, one list entry per boosting round.
    """

    def __init__(self):
        super().__init__()
        self.evaluation_results = {}

    def after_iteration(self, model, epoch, evals_log):
        """Append the newest entry of each metric log; never requests a stop."""
        # A missing or empty log simply contributes nothing this round
        for dataset_name, metrics in (evals_log or {}).items():
            for metric_name, history in metrics.items():
                series = self.evaluation_results.setdefault(
                    f"{dataset_name}-{metric_name}", []
                )
                series.append(history[-1])
        return False  # False tells the trainer to keep going

# Initialize callback that will collect the per-round metric values
eval_history_multi = EvaluationHistoryCallback()
# Booster parameters for the 4-class model; 'device': 'cuda' requires a GPU-enabled XGBoost
params_multi = {
    'objective': 'multi:softmax',
    'num_class': 4,
    'eval_metric': 'mlogloss',  
    'device': 'cuda',
    'learning_rate': 0.1,
    'max_depth': 15,
    'min_child_weight': 1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'gamma': 0.1,
    'lambda': 1.5,
    'alpha': 0.01,
    'num_parallel_tree': 5
}

# Wrap the training and validation splits in DMatrix objects for the native API
dtrain=xgb.DMatrix(X_train,label=y_train)
dval=xgb.DMatrix(X_validate,label=y_validate)
# Both sets are evaluated every round; the names become the key prefixes read back below
eval_set_multi = [(dtrain, 'train'), (dval, 'eval')]
# Up to 100 rounds, with early stopping after 10 rounds without improvement
multi_class_model = xgb.train(params_multi, dtrain, num_boost_round=100, evals=eval_set_multi, early_stopping_rounds=10, verbose_eval=True, callbacks=[eval_history_multi])

# Access the stored evaluation results
results_multi = eval_history_multi.evaluation_results
train_mlogloss = results_multi.get('train-mlogloss', [])
eval_mlogloss = results_multi.get('eval-mlogloss', [])
# One x-axis tick per recorded boosting round
epochs_multi = len(train_mlogloss)
x_axis_multi = range(0, epochs_multi)

# Plot learning curves
fig, ax = plt.subplots()
ax.plot(x_axis_multi, train_mlogloss, label='Train MLogLoss')
ax.plot(x_axis_multi, eval_mlogloss, label='Validation MLogLoss')
ax.legend()
plt.ylabel('MLogLoss')
plt.title('Learning Curve for Multi-Class Model')
plt.show()

CONFUSION MATRIX

In [None]:
from sklearn.metrics import f1_score
import seaborn as sns
#predicted_classes = np.argmax(predictions, axis=1)

# Compute macro-averaged F1 for the predictions currently held in y_pred
f1 = f1_score(y_validate, y_pred, average='macro')

# Use every class seen in either the truth or the predictions, so the matrix
# dimensions and the tick labels always agree (a class may be absent from
# y_validate yet still be predicted).
class_labels = np.unique(np.concatenate([np.asarray(y_validate), np.asarray(y_pred)]))

# Visualize confusion matrix
cm = confusion_matrix(y_validate, y_pred, labels=class_labels)
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()

print(f'F1 Score: {f1:.4f}')

HYPERPARAMETER TUNING 

In [None]:
import cupy as cp
# Search space for the grid search. Every key contributes one slot to each
# combination tuple, in alphabetical key order (see allParams below).
hyper_params={
    'objective':['multi:softmax'],
    # Kept as a single-value axis so the tuple layout is unchanged. The search
    # function hard-codes its metric and never reads this slot, so extra values
    # here only multiplied the grid with identical configurations ('mse' is
    # also not an XGBoost metric name).
    'eval_metric':['mlogloss'],
    'eta':[0.01,0.1,0.3,0.9],
    'gamma':[0.05,0.1,0.2],
    'max_depth':[3,6,12],
    'n_estimators':[100,200,300],
    'num_parallel_tree':[3,10,30],
    'subsample':[0.3,0.5,0.8],
    'sampling_method':['uniform','gradient_based'],
    'lambda':[0.1, 0.5 , 1, 5, 7,10],
    'alpha':[0.01,0.05,0.1,0.3,1],
    'tree_method':['hist','approx'],
    'grow_policy':['depthwise','lossguide'],
}
# Alphabetical key order fixes the position of each parameter inside a tuple:
# alpha, eta, eval_metric, gamma, grow_policy, lambda, max_depth, n_estimators,
# num_parallel_tree, objective, sampling_method, subsample, tree_method
allParams=sorted(hyper_params)

# Cartesian product of all value lists, materialized as a list of tuples
combinations=list(itr.product(*(hyper_params[param] for param in allParams)))


def grid_search_xgb(parameters, X_train, y_train, cv=2):
  """
  Exhaustive search over XGBoost parameter tuples, scored with KFold cross-validation.

  Each combination is trained once per fold. A fold is scored by the F1 of
  classes 1, 2 and 3 on its validation part (the classes the original selection
  rule looked at; presumably the minority classes). The combination with the
  highest F1 averaged over those classes and all folds wins.

  Args:
      parameters: Iterable of hashable tuples laid out in the alphabetical key
          order of the search space: (alpha, eta, eval_metric, gamma,
          grow_policy, lambda, max_depth, n_estimators, num_parallel_tree,
          objective, sampling_method, subsample, tree_method). Slots 2 and 9
          are ignored; the metric and objective are fixed below.
      X_train: Feature DataFrame; rows are selected by position.
      y_train: Integer class codes aligned with the rows of X_train.
      cv: Number of folds for KFold cross-validation (default=2).

  Returns:
      best_params: The winning combination tuple, or None if ``parameters`` is empty.
      best_model: The Booster from the best-scoring fold of the winning
          combination, or None if ``parameters`` is empty.
      cv_results: Dictionary mapping each combination to a list with one
          ``[F1 class 1, F1 class 2, F1 class 3]`` entry per fold.
  """
  tracked_classes = (1, 2, 3)
  # Plain integer array: positional fold indices work whatever index y_train
  # carries, and the report keys come out as '0', '1', ... rather than '1.0'.
  labels = np.asarray(y_train).astype(int)

  best_params = None
  best_model = None
  best_mean_f1 = float('-inf')
  cv_results = {}

  kfold = KFold(n_splits=cv, shuffle=True)

  for combination in parameters:
    params = {
        'num_class': 4,
        'alpha': combination[0],
        'device': 'cuda',
        'eta': combination[1],
        'eval_metric': 'mlogloss',  # multiclass log loss matches the objective
        'gamma': combination[3],
        'grow_policy': combination[4],
        'lambda': combination[5],
        'max_depth': combination[6],
        'num_parallel_tree': combination[8],
        'objective': 'multi:softmax',
        'sampling_method': combination[10],
        'subsample': combination[11],
        'tree_method': combination[12],
    }

    fold_scores = []
    top_fold_model = None
    top_fold_f1 = float('-inf')

    for train_idx, val_idx in kfold.split(X_train, labels):
      train_dmatrix = xgb.DMatrix(X_train.iloc[train_idx], label=labels[train_idx])
      val_dmatrix = xgb.DMatrix(X_train.iloc[val_idx], label=labels[val_idx])

      # n_estimators from the grid sets the number of boosting rounds
      model = xgb.train(params, train_dmatrix,
                  num_boost_round=combination[7])

      # multi:softmax predicts class ids as floats; cast so report keys are '1', '2', ...
      predictions = model.predict(val_dmatrix).astype(int)
      report = classification_report(labels[val_idx], predictions,
                                     output_dict=True, zero_division=0)
      print(f'{report} for parameters {params}')

      # A class absent from both truth and predictions of this fold scores 0.0
      scores = [report.get(str(label), {}).get('f1-score', 0.0)
                for label in tracked_classes]
      fold_scores.append(scores)

      fold_f1 = sum(scores) / len(scores)
      if fold_f1 > top_fold_f1:
        top_fold_f1 = fold_f1
        top_fold_model = model

    # A fresh list per fold, so stored results are not aliases of one shared list
    cv_results.setdefault(combination, []).extend(fold_scores)

    mean_f1 = float(np.mean(fold_scores))
    if mean_f1 > best_mean_f1:
      best_mean_f1 = mean_f1
      best_params = combination
      best_model = top_fold_model

  return best_params, best_model, cv_results


# Run the search over every combination on the (oversampled) training split
best_params, best_model, cv_results = grid_search_xgb(combinations, X_train, y_train)

# Access results
print(f"Best Hyperparameters: {best_params}")
# NOTE: this prints the entire per-combination results dictionary, not a single score
print(f"Best Model Score: {cv_results}")