In [1]:
from models import define_model
from prepare_data import preprocess_data, split_patient_data_vert_level
from sklearn.model_selection import KFold
from constants import DATA_DIR, MODEL_TYPES, MSCC_LABELS
from utils import extract_patient_id
from evaluate_models import bootstrap_confidence_interval
from sklearn.metrics import mean_squared_error
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.callbacks import EarlyStopping

2023-11-14 14:25:13.815525: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-14 14:25:13.837241: E tensorflow/compiler/xla/stream_executor/cuda/cuda_dnn.cc:9342] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2023-11-14 14:25:13.837274: E tensorflow/compiler/xla/stream_executor/cuda/cuda_fft.cc:609] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2023-11-14 14:25:13.837288: E tensorflow/compiler/xla/stream_executor/cuda/cuda_blas.cc:1518] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-11-14 14:25:13.841501: I tensorflow/tsl/cuda/cudart_stub.cc:28] Could not find cuda drivers on your machine, GPU will not be used.
2023-11-14 14:25:13.841948: I tensorflow/core/platform/cpu_feature_guard.cc:182] This Tens

In [2]:
# Report how many GPUs TensorFlow can see so the reader knows whether the
# training below runs on CPU or GPU. Uses the stable `tf.config` API rather
# than the `tf.config.experimental` alias.
gpu_devices = tf.config.list_physical_devices('GPU')
print("Num GPUs Available: ", len(gpu_devices))

Num GPUs Available:  0


2023-11-14 14:25:15.388519: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:880] could not open file to read NUMA node: /sys/bus/pci/devices/0000:2b:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-11-14 14:25:15.388697: W tensorflow/core/common_runtime/gpu/gpu_device.cc:2211] Cannot dlopen some GPU libraries. Please make sure the missing libraries mentioned above are installed properly if you would like to use GPU. Follow the guide at https://www.tensorflow.org/install/gpu for how to download and setup the required libraries for your platform.
Skipping registering GPU devices...


In [3]:
# Load the MSCC label table from the CSV referenced by MSCC_LABELS.
ap_ratio_df = pd.read_csv(MSCC_LABELS, delimiter=',')

# Patient id of every label row, derived once from its filename. The same
# series is reused below to select a patient's rows by exact equality.
label_patient_ids = ap_ratio_df['filename'].apply(extract_patient_id)
patients = label_patient_ids.unique()

# One (split_df, labels) tuple per vertebral-level split of every patient.
data_splits = []

for patient in patients:
    # Load this patient's metrics file
    metrics_file = f"{DATA_DIR}/{patient}.csv"
    metrics_df = pd.read_csv(metrics_file, delimiter=',')

    # Split the patient's metrics into per-vertebral-level chunks
    split_data_vert_level = split_patient_data_vert_level(patient, ap_ratio_df, metrics_df)

    # Label rows that belong to exactly this patient. An exact comparison is
    # used instead of `filename.str.contains(patient)`: `contains` is an
    # unanchored regex match, so an id that is a prefix of another id (or that
    # holds regex metacharacters) would also select other patients' labels.
    patient_labels = ap_ratio_df[label_patient_ids == patient]

    for split_df in split_data_vert_level:
        # VertLevel range covered by this split
        min_vert_level = split_df['VertLevel'].min()
        max_vert_level = split_df['VertLevel'].max()

        # Labels whose compression level lies inside the split's range
        # (`between` is inclusive on both ends, matching `>=` / `<=`).
        in_range = patient_labels['compression_level'].between(min_vert_level, max_vert_level)
        label = patient_labels.loc[in_range, 'diameter_AP_ratio_PAM50_normalized']

        # NOTE(review): `label` may hold zero or several values for a split;
        # presumably preprocess_data copes with that -- confirm.
        data_splits.append((split_df, label.tolist()))

In [8]:
# Define K-Fold Cross Validator (fixed seed so the fold assignment is reproducible)
kfold = KFold(n_splits=5, shuffle=True, random_state=42)

# Model families handled by this cell: Keras networks need compiling and an
# epoch-based fit; scikit-learn estimators are fitted directly.
KERAS_MODEL_TYPES = ('cnn', 'lstm', 'cnn_lstm')
SKLEARN_MODEL_TYPES = ('random_forest', 'gradient_boosting')

# Loop over model types
for model_type in ['gradient_boosting']:
    is_keras_model = model_type in KERAS_MODEL_TYPES
    is_sklearn_model = model_type in SKLEARN_MODEL_TYPES
    if not (is_keras_model or is_sklearn_model):
        # Fail loudly instead of reaching the evaluation code with an
        # unfitted model and an undefined (or stale) `y_pred`.
        raise ValueError(f"Unsupported model type: {model_type}")

    print(f"Currently training {model_type}...")
    X_train, X_test, y_train, y_test = preprocess_data(data_splits, model_type=model_type)

    # Keras models are built from the two trailing dimensions of X_train;
    # sklearn models take no input shape.
    input_shape = (X_train.shape[1], X_train.shape[2]) if is_keras_model else None

    # K-fold Cross Validation model evaluation
    mse_scores = []  # MSE of every fold that produced valid predictions
    confidence_intervals = []  # (lower, upper) bootstrap bounds of those folds

    # `enumerate` keeps the fold number in step with the split even when a
    # fold is skipped below; a manual counter bumped at the end of the loop
    # body would be bypassed by `continue`.
    for fold_no, (train, val) in enumerate(kfold.split(X_train, y_train), start=1):
        X_fold_train, y_fold_train = X_train[train], y_train[train]
        X_fold_val, y_fold_val = X_train[val], y_train[val]

        # Define (and, for Keras, compile) a fresh model for this fold
        model = define_model(model_type, input_shape=input_shape)

        # Generate a print
        print(f'Training for fold {fold_no} ...')

        if is_keras_model:
            optimizer = tf.keras.optimizers.Adam(learning_rate=0.0001, clipvalue=1.0)
            model.compile(optimizer=optimizer, loss='mean_squared_error')

            early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
            model.fit(X_fold_train, y_fold_train, epochs=10,
                      validation_data=(X_fold_val, y_fold_val), callbacks=[early_stopping])

            # Convert to a float32 tensor only for the Keras models that
            # consume it; sklearn estimators predict on the array directly.
            X_val_tensor = tf.convert_to_tensor(X_fold_val, dtype=tf.float32)
            y_pred = model.predict(X_val_tensor)
        else:
            # No specific compilation needed for sklearn models
            model.fit(X_fold_train, y_fold_train)
            y_pred = model.predict(X_fold_val)

        # if y_pred contains NaN values, skip this fold and show error message
        if np.isnan(y_pred).any():
            print(f"Fold {fold_no} skipped due to NaN values in y_pred")
            continue

        mse = mean_squared_error(y_fold_val, y_pred)
        mse_scores.append(mse)

        print(f"Fold {fold_no} MSE: {mse}")

        # Bootstrap confidence interval for this fold, kept for the summary below
        lower_bound, upper_bound = bootstrap_confidence_interval(model, X_fold_val, y_fold_val)
        print(f"95% Confidence Interval for fold {fold_no}: ({lower_bound:.4f}, {upper_bound:.4f})")
        confidence_intervals.append((lower_bound, upper_bound))
        print("===================================")

    if not mse_scores:
        # Every fold was skipped: averaging an empty list would yield NaN
        # plus a RuntimeWarning, so report it explicitly and move on.
        print(f"No valid folds for {model_type}; skipping summary")
        print("===================================")
        continue

    # Calculate and print the average MSE over all folds
    average_mse = np.mean(mse_scores)
    print(f"Average MSE for {model_type}: {average_mse}")
    print("===================================")

    # Summarise the per-fold intervals by averaging their bounds.
    # NOTE(review): the mean of per-fold bounds is not itself a 95% confidence
    # interval of the average MSE; treat it as a rough summary only.
    lower_bound = np.mean([interval[0] for interval in confidence_intervals])
    upper_bound = np.mean([interval[1] for interval in confidence_intervals])
    print(f"95% Confidence Interval for {model_type}: ({lower_bound:.4f}, {upper_bound:.4f})")
    print("===================================")

Currently training gradient_boosting...
X_train shape: (76, 4518)
X_test shape: (20, 4518)
y_train shape: 76
y_test shape: 20
Training for fold 1 ...
Fold 1 MSE: 34.569775910352035
95% Confidence Interval for fold 1: (15.5396, 60.2733)
Training for fold 2 ...
Fold 2 MSE: 391.9664736010367
95% Confidence Interval for fold 2: (27.6575, 1037.9814)
Training for fold 3 ...
Fold 3 MSE: 464.45886957518
95% Confidence Interval for fold 3: (50.8445, 1207.1641)
Training for fold 4 ...
Fold 4 MSE: 28.477393944719935
95% Confidence Interval for fold 4: (16.1616, 44.2536)
Training for fold 5 ...
Fold 5 MSE: 44.900843320851195
95% Confidence Interval for fold 5: (24.3515, 63.9560)
Average MSE for gradient_boosting: 192.874671270428
95% Confidence Interval for gradient_boosting: (26.9109, 482.7257)
