# Setup

In [None]:
%pip install dill
%pip install pyreadr
%pip install scikit-learn
%pip install pandas
%pip install numpy
%pip install xgboost
%pip install -U matplotlib
%pip install -U seaborn
%pip install typeguard
%pip install PyQt6
%pip install smogn
%pip install seaborn
%pip install -U imbalanced-learn

In [None]:
# Library to check function types of imported modules
from typeguard import install_import_hook

# Data import and export
import pyreadr
import dill

# Data management libraries
import pandas as pd
import numpy as np
from scipy.stats import pearsonr, percentileofscore

# Plotting libraries
from matplotlib import pyplot as plt
import seaborn as sns
sns.set_theme(style='whitegrid')

# Miscellaneous
import os

# Preprocessing
from smogn import smoter
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler

# K-fold cross-validation
from sklearn.model_selection import KFold

# Custom functions for plotting, data operations, and model training
with install_import_hook('custom_ml_plots'):
    import custom_ml_plots as cmp
with install_import_hook('custom_dataset_tools'):
    import custom_dataset_tools as cdt
with install_import_hook('basic_ml_operations'):
    import basic_ml_operations as bmo
with install_import_hook('ml_data_objects'):
    import ml_data_objects as mdo

# Global parameters
# Seed handed to the cross-validation calls below as kfold_random_state
RANDOM_STATE = 42
# Quantile of GY used as the cutoff for the "Top Line" classification
TOP_THRESHOLD_QUANTILE = 0.8  # Values to test: 0.5, 0.6, 0.7, 0.8, 0.9
# Passed to outer_CV_R as smogn_preprocess (augment each outer training fold with SMOGN)
SMOGN_PREPROCESS = True
# Passed to outer_CV_R as undersample, which forwards it to SMOGN's under_samp option
UNDERSAMPLE = False


## Setup Plotting Functions

In [None]:
def plot_shaded_scatter_grids(y_preds_grid: np.ndarray, y_test_grid: np.ndarray, axis1_params: mdo.AxisParams, axis2_params: mdo.AxisParams, pearson_grid: np.ndarray, plot_title: str) -> plt.Figure:
    """
    Build a grid of predicted-vs-actual scatter plots, shade it by Pearson
    coefficient, and overlay best-fit lines.
    Created: 2024/11/30

    Args:
        y_preds_grid (np.ndarray): 2D array of predicted values, one cell per model.
        y_test_grid (np.ndarray): 2D array of actual values matching y_preds_grid.
        axis1_params (mdo.AxisParams): Hyperparameters for the first axis.
        axis2_params (mdo.AxisParams): Hyperparameters for the second axis.
        pearson_grid (np.ndarray): 2D array of Pearson coefficients, one per model.
        plot_title (str): Title of the plot.

    Returns:
        plt.Figure: The figure produced by cmp.create_scatter_grid after shading.
    """
    # Step 1: lay out one scatter plot per hyperparameter combination
    figure, axes_grid = cmp.create_scatter_grid(
        y_preds_grid, y_test_grid, axis1_params, axis2_params, plot_title
    )

    # Step 2: shade each subplot according to its Pearson coefficient
    cmp.color_spectrum(figure, axes_grid, pearson_grid, label="Pearson Coefficient")

    # Step 3: draw a best-fit line on every subplot
    cmp.add_best_fit(axes_grid)

    return figure

In [None]:
def plot_shaded_roc_grids(y_preds_grid: np.ndarray, y_test_grid: np.ndarray, axis1_params: mdo.AxisParams, axis2_params: mdo.AxisParams, f1_grid: np.ndarray, plot_title: str) -> plt.Figure:
    """
    Build a grid of ROC plots (one per model) and shade it by F1 score.
    Created: 2024/12/22

    Args:
        y_preds_grid (np.ndarray): 2D array of predicted probabilities, one cell per model.
        y_test_grid (np.ndarray): 2D array of actual binary values matching y_preds_grid.
        axis1_params (mdo.AxisParams): Hyperparameters for the first axis.
        axis2_params (mdo.AxisParams): Hyperparameters for the second axis.
        f1_grid (np.ndarray): 2D array of F1 scores, one per model.
        plot_title (str): Title of the plot.

    Returns:
        plt.Figure: The figure produced by cmp.create_roc_grid after shading.
    """
    # Lay out one ROC plot per hyperparameter combination
    figure, axes_grid = cmp.create_roc_grid(
        y_preds_grid, y_test_grid, axis1_params, axis2_params, plot_title
    )

    # Shade each subplot according to its F1 score
    cmp.color_spectrum(figure, axes_grid, f1_grid, label="F1 Score")

    return figure

In [None]:
# Summary tables holding one row of averaged metrics per model, used for the final comparison.
# See Montesinos-Lopez research paper and README for details on metrics and the different models.
# B tracks the four classification metrics; R and RO track the same four plus a leading Pearson column.
B_average_metrics = pd.DataFrame(columns=['F1 Score', 'Sensitivity', 'Specificity', 'Kappa'])
R_average_metrics = pd.DataFrame(columns=['Pearson', *B_average_metrics.columns])
RO_average_metrics = pd.DataFrame(columns=['Pearson', *B_average_metrics.columns])

# Seed each table with the GBLUP baseline figures from the Montesinos-Lopez paper
# (no Pearson value is recorded for GBLUP in the R and RO tables, hence None)
B_average_metrics.loc['GBLUP'] = [0.411, 0.696, 0.577, 0.180]
R_average_metrics.loc['GBLUP'] = [None, 0.215, 0.128, 0.987, 0.164]
RO_average_metrics.loc['GBLUP'] = [None, 0.487, 0.711, 0.699, 0.304]

In [None]:
def plot_GY_hist(GY_df: pd.DataFrame, title: str, x_ax_label: str = 'Grain Yield (GY)', y_ax_label: str = 'Frequency') -> None:
    """
    Create, save, and display a histogram of grain yield values.
    Created: 2024/01/12

    The figure is written to '<storage_dir>/<title>.svg', where storage_dir is the
    module-level run directory, so `title` doubles as the output file name.

    Args:
        GY_df (pd.DataFrame): DataFrame containing grain yield values
        title (str): Title of the plot (also used as the SVG file name)
        x_ax_label (str): Label for the x-axis (default is 'Grain Yield (GY)')
        y_ax_label (str): Label for the y-axis (default is 'Frequency')
    """
    # Create histogram with 60 bins and black edges for each bin
    plt.hist(GY_df, bins=60, edgecolor='black')

    # Add labels for x-axis and y-axis
    plt.xlabel(x_ax_label)
    plt.ylabel(y_ax_label)

    # Add title to the plot
    plt.title(title)

    # Save the plot as an SVG file in the run's storage directory.
    # os.path.join picks the right separator for the host OS; a hard-coded backslash
    # would end up inside the file name on POSIX instead of separating directories.
    plt.savefig(os.path.join(storage_dir, f'{title}.svg'), format="svg")

    # Display the plot
    plt.show()

    # Close the plot to free up memory
    plt.close()


## Import Data

In [None]:
def create_numbered_subdir() -> str:
    """
    Creates a new subdirectory within the 'saved_data_and_plots' directory,
    with a name that is the next available number in sequence, formatted as a
    three-digit number (e.g., '000', '001', etc.).
    Created: 2024/01/01

    Only existing subdirectories whose names are plain decimal integers take part
    in the numbering; any other entry in the parent directory is ignored.

    Returns:
        str: The path to the newly created numbered subdirectory.

    Raises:
        FileExistsError: If the computed numbered directory already exists
            (e.g. another process created it in the meantime).
    """
    # Define the parent directory where subdirectories will be created
    parent_dir = "saved_data_and_plots"

    # Create parent directory if it doesn't exist (no separate exists() check, so no race)
    os.makedirs(parent_dir, exist_ok=True)

    # Collect the numbers already used by existing subdirectories.
    # isdecimal() (not isdigit()) is used because isdigit() also accepts characters
    # such as superscript digits that int() cannot parse.
    existing_nums = [
        int(name)
        for name in os.listdir(parent_dir)
        if name.isdecimal() and os.path.isdir(os.path.join(parent_dir, name))
    ]

    # Next number is one past the largest in use; start at 0 when none exist
    next_num = max(existing_nums, default=-1) + 1

    # Create the new numbered directory with the next available number
    new_dir = os.path.join(parent_dir, f"{next_num:03d}")
    os.makedirs(new_dir)

    # Return the path to the newly created directory
    return new_dir

# Create a new numbered subdirectory for this notebook run and store its path in 'storage_dir'.
# The plotting functions and the session dump in this notebook write their output files there.
storage_dir = create_numbered_subdir()

In [None]:
# Import dataset; the result is indexed by R object name below
eyt1 = pyreadr.read_r('./data/eyt1.RData')

# Extract training example labels: the 'GY' (grain yield) column, kept as a one-column DataFrame
y = eyt1['Pheno_Disc_Env1'][['GY']]

# Index the labels by the 'GID' column of the same table and sort by that index
y = y.set_index(eyt1['Pheno_Disc_Env1']['GID'])
y = y.sort_index()

# Display the first few rows of the dataset (display is the notebook's rich-output helper)
display(y.head())

# Check for missing values
# NOTE(review): exact checks are defined in custom_dataset_tools.assert_no_bad_values - confirm there
cdt.assert_no_bad_values(y)

# Each seed was planted in 4 different environments, but we don't care about environmental differences
# So we take the average of every group of four rows to reduce the dataset to 1/4 its original size
# NOTE(review): this presumably relies on the sort above placing each GID's four rows together - confirm
y = cdt.avg_rows(y, 4)

# Plot histogram of grain yield values (also saved to storage_dir as 'GY, Unscaled.svg')
plot_GY_hist(y, 'GY, Unscaled')

# Display summary statistics of the dataset (last expression in the cell, so the notebook renders it)
y.describe()


In [None]:
# Extract the feature matrix and sort it by its index
# NOTE(review): rows of X are paired with rows of y purely by position after the resets below;
# this presumably relies on both being sorted by the same IDs - confirm against the data
X = eyt1['Geno_Env1'].sort_index()

# Display the feature matrix
display(X)

# Replace the index of both X and y with a fresh 0..n-1 range so they can be sliced positionally
X.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

In [None]:
def scale_features_and_target(X: pd.DataFrame, y: pd.DataFrame) -> (pd.DataFrame, pd.DataFrame, StandardScaler, StandardScaler):
    """
    Standardize the feature matrix and the target values with separate StandardScaler instances.

    Created: 2024/01/01

    Either argument may be None, in which case its scaled frame and scaler are
    both returned as None.

    Args:
        X (pd.DataFrame): Feature matrix.
        y (pd.DataFrame): Target values.

    Returns:
        X_sc (pd.DataFrame): Scaled feature matrix (same index and columns as X).
        y_sc (pd.DataFrame): Scaled target values (same index and columns as y).
        X_scaler (StandardScaler): Scaler fitted on X.
        y_scaler (StandardScaler): Scaler fitted on y.

    """
    def _fit_and_scale(frame):
        # Fit a fresh scaler on `frame` and return (scaled copy, fitted scaler)
        if frame is None:
            return None, None
        scaler = StandardScaler()
        scaled_values = scaler.fit_transform(frame)
        # Re-wrap the scaled array so the original labels are preserved
        scaled_frame = pd.DataFrame(scaled_values, index=frame.index, columns=frame.columns)
        return scaled_frame, scaler

    # Features first, then target, each with its own scaler
    X_sc, X_scaler = _fit_and_scale(X)
    y_sc, y_scaler = _fit_and_scale(y)

    return X_sc, y_sc, X_scaler, y_scaler

In [None]:
def smogn_prep(X: pd.DataFrame, y: pd.DataFrame, top_threshold_quantile: float, undersample: bool = True) -> tuple:
    """
    Preprocesses the dataset using the SMOGN algorithm
    Created: 2024/02/05

    X and y are combined into one frame (as the SMOGN library expects), passed
    through smogn's `smoter` with a manually defined relevance function that steps
    from 0 to 1 at the requested quantile of 'GY', and split back apart.

    Args:
        X (pd.DataFrame): Feature matrix
        y (pd.DataFrame): DataFrame containing grain yield values in a 'GY' column
        top_threshold_quantile (float): The quantile value to use as the threshold for the top class
        undersample (bool): Whether to undersample the majority class (default is True)

    Returns:
        tuple: (X_aug, y_aug) - the feature DataFrame and the one-column 'GY'
        DataFrame extracted from the frame returned by `smoter`.

    Raises:
        ValueError: If `smoter` keeps raising ValueError after all retry attempts.
    """

    # Temporarily combine X and y for compatibility with the SMOGN library
    X = X.reset_index(drop=True)
    y = y.reset_index(drop=True)
    smogn_X_y = pd.concat([X, y], axis=1)

    # Quantiles just either side of the threshold, clamped to [0, 1] so that a threshold
    # at (or within 0.0001 of) an end of the range does not make Series.quantile raise
    lower_q = max(top_threshold_quantile - 0.0001, 0.0)
    upper_q = min(top_threshold_quantile + 0.0001, 1.0)

    # Get GY distribution points
    gy_min = y['GY'].min()
    gy_max = y['GY'].max()
    gy_just_under_threshold = y['GY'].quantile(lower_q)
    gy_just_over_threshold = y['GY'].quantile(upper_q)

    # Define control points for the SMOGN augmentation relevance function:
    # relevance 0 up to just under the threshold, relevance 1 from just over it.
    # NOTE(review): the third entry of each row is presumably the slope at that point - confirm in smogn docs
    ctrl_points = [
        [gy_min, 0, 0],
        [gy_just_under_threshold, 0, 0],
        [gy_just_over_threshold, 1, 0],
        [gy_max, 1, 0]
    ]

    # Display the combined DataFrame
    display(smogn_X_y)

    # Unfortunately the library has a bug where it randomly throws exceptions occasionally regardless of input
    # So we have to try multiple times until it works... this is hacky but it will have to do until the library is fixed
    max_attempts = 6  # one initial try plus five retries
    for attempt in range(1, max_attempts + 1):
        try:
            # Apply the SMOGN algorithm to balance the dataset
            augmented = smoter(
                data=smogn_X_y,
                y='GY',
                k=5,
                under_samp=undersample,
                samp_method='balance',
                rel_thres=top_threshold_quantile,
                rel_method='manual',
                rel_ctrl_pts_rg=ctrl_points,
                rel_xtrm_type='high',
                rel_coef=1.50
            )
            break
        except ValueError:
            # Give up (and surface the library's error) once the retries are exhausted
            if attempt == max_attempts:
                raise

    # Split X and y back into separate DataFrames
    y_aug = augmented[['GY']]
    X_aug = augmented.drop(columns=['GY'])

    return X_aug, y_aug

In [None]:
# Identify the cutoff value for the "Top Line" classification of grain yield values.
# It is the TOP_THRESHOLD_QUANTILE quantile of GY over the whole (row-averaged) label set,
# computed once here and handed to the cross-validation cells as top_boundary_val.
top_boundary_val = y["GY"].quantile(TOP_THRESHOLD_QUANTILE)

# Model R

In [None]:
def inner_CV_R(n_splits: int, X : pd.DataFrame, y : pd.DataFrame, axis1_params: mdo.AxisParams, axis2_params: mdo.AxisParams, 
               train_model_callback, kfold_random_state: int, plot_title: str = "", **kwargs):
    """Perform inner cross-validation with grid search to find the best model parameters.
    Created: 2024/12/03

    For every inner fold a grid of models (one per hyperparameter combination) is
    trained, scored by Pearson coefficient on the held-out part, and the best
    combination is recorded. A scatter-grid figure for each fold is saved as an SVG
    in the module-level `storage_dir` and shown.

    Parameters:
    -----------
    n_splits : int
        Number of splits for KFold cross-validation.
    X : pd.DataFrame
        Feature data.
    y : pd.DataFrame
        Target data.
    axis1_params : mdo.AxisParams
        Parameter grid for the first axis.
    axis2_params : mdo.AxisParams
        Parameter grid for the second axis.
    train_model_callback : callable
        Callback function to train the model.
    kfold_random_state : int
        Random state for KFold shuffling.
    plot_title : str, optional
        Title for the plot (default is ""). Also embedded in the saved file name.
    **kwargs : dict
        Additional keyword arguments for the model training.
    Returns:
    --------
    avg_best_param1 : float
        Average best parameter value for the first axis over all folds.
    avg_best_param2 : float
        Average best parameter value for the second axis over all folds."""

    # Create KFold object for inner-fold cross-validation
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=kfold_random_state)

    # Store best parameters (param1, param2) for each fold
    best_params = pd.DataFrame(columns=['param1', 'param2'], index=range(n_splits))

    # Iterate through each train-test split
    for i, (train_index, test_index) in enumerate(kfold.split(X)):

        # Split the data into train and test sets
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Train a grid of models with every combination of parameters
        model_grid = bmo.train_model_grid(X_train, y_train, axis1_params, axis2_params, train_model_callback, **kwargs)

        # Use trained models to predict test set labels, and store in 2D array with each cell corresponding to a model with a specific combination of parameters
        y_preds_grid = bmo.grid_predict(X_test, model_grid)

        # Create 2D array of identical dataframes containing actual labels to compare against predictions
        y_test_grid = cdt.np_array_of_dfs(y_test, y_preds_grid.shape)

        # Evaluate predictions by comparing to actuals, calculating 2D array of Pearson coefficients
        pearson_grid = bmo.calculate_pearson_coefficients(y_preds_grid, y_test_grid)

        # Find index of best Pearson coefficient in the 2D array of Pearson coefficients.
        # A Pearson coefficient is NaN when a model's predictions are constant, and np.argmax
        # would pick such a NaN cell as the "maximum"; rank NaN below every real score instead.
        # (If every cell is NaN this still falls back to the first cell, as plain argmax would.)
        pearson_scores = np.asarray(pearson_grid, dtype=float)
        pearson_scores = np.where(np.isnan(pearson_scores), -np.inf, pearson_scores)
        best_row, best_col = np.unravel_index(np.argmax(pearson_scores), pearson_scores.shape)

        # Store hyperparameters of the most accurate model for this inner fold
        # NOTE(review): assumes the grid's first dimension follows axis1_params and the second
        # follows axis2_params - confirm against bmo.train_model_grid
        best_params.loc[i] = [axis1_params.values[best_row], axis2_params.values[best_col]]

        # Create grid of scatter plots with predictions vs actuals, colored by Pearson coefficient for each model
        scatter_grid = plot_shaded_scatter_grids(y_preds_grid, y_test_grid, axis1_params, axis2_params, pearson_grid, f'{plot_title} | Inner Fold {i}')

        # Save through the figure object (not the implicit "current figure") and build the path
        # with os.path.join so the file lands inside storage_dir on every OS
        scatter_grid.savefig(
            os.path.join(storage_dir, f'Model R {train_model_callback.__name__}, ({plot_title}, Inner Fold {i}).svg'),
            format="svg")

        # plt.show() takes no figure argument; it displays all open figures
        plt.show()
        plt.close(scatter_grid)

    # Calculate average best parameters over all inner folds to return to outer CV
    avg_best_param1 = best_params['param1'].mean()
    avg_best_param2 = best_params['param2'].mean()

    return avg_best_param1, avg_best_param2


In [None]:
def outer_CV_R(n_outer_splits: int, n_inner_splits: int, X : pd.DataFrame, y : pd.DataFrame, 
               axis1_params: mdo.AxisParams, axis2_params: mdo.AxisParams, 
               train_model_callback : callable, kfold_random_state: int, top_boundary_val : float,
               smogn_preprocess = False, undersample = True, **kwargs) -> pd.DataFrame:
    """
    Perform nested cross-validation with an outer and inner loop to evaluate model performance.
    Created: 2024/12/03

    For each outer fold: optionally augment the training part with SMOGN, scale features
    and target, pick hyperparameters with inner_CV_R, retrain on the whole outer training
    part, then score the held-out part both as a regression (Pearson) and, after
    thresholding at top_boundary_val, as a top / not-top classification. Histograms and a
    predicted-vs-actual plot are written to the module-level `storage_dir` along the way.

    Parameters:
    -----------
    n_outer_splits : int
        Number of splits for the outer cross-validation loop.
    n_inner_splits : int
        Number of splits for the inner cross-validation loop.
    X : pd.DataFrame
        Feature data.
    y : pd.DataFrame
        Target data. Must contain a 'GY' column when smogn_preprocess is True.
    axis1_params : mdo.AxisParams
        Object containing parameter list for the first hyperparameter axis (horizontal).
    axis2_params : mdo.AxisParams
        Object containing parameter list for the second hyperparameter axis (vertical).
    train_model_callback : callable
        Function to train the model. Should accept X, y, and hyperparameters as arguments.
    kfold_random_state : int
        Random state for reproducibility in KFold splitting.
    top_boundary_val : float
        Unscaled target value used as the threshold to classify values as top or not top.
    smogn_preprocess : bool
        Whether to augment each outer training fold with SMOGN (default is False).
    undersample : bool
        Forwarded to smogn_prep. When False, the original training rows below
        top_boundary_val are appended back to the SMOGN output (default is True).
    **kwargs
        Additional arguments to pass to the train_model_callback function.
    Returns:
    --------
    pd.DataFrame
        DataFrame containing 5 metrics (Pearson, F1 Score, Sensitivity, Specificity, Kappa) for each outer fold.
    """

    # Create KFold object for outer loop to split data into train and test sets
    kfold = KFold(n_splits=n_outer_splits, shuffle=True, random_state=kfold_random_state)

    # Expected metric columns, in output order; one single-row frame is collected per fold
    metric_columns = ['Pearson', 'F1 Score', 'Sensitivity', 'Specificity', 'Kappa']
    fold_rows = []

    # Iterate through each train-test split
    for i, (train_index, test_index) in enumerate(kfold.split(X)):

        # Split the data into train and test sets
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        if smogn_preprocess:
            # Express the global cutoff as a quantile of this fold's training targets
            top_boundary_quantile_in_train_set = percentileofscore(y_train.to_numpy().flatten(), top_boundary_val, kind='mean') / 100

            # Store pre-SMOGN data for later use
            X_train_pre_smogn = X_train.copy()
            y_train_pre_smogn = y_train.copy()
            X_train, y_train = smogn_prep(X_train, y_train, top_boundary_quantile_in_train_set, undersample)

            if not undersample:
                # Manually concatenate the original data below the augmentation threshold with the augmented data.
                # The mask must be built from the 'GY' column (a Series): masking the DataFrame with a
                # DataFrame only blanks cells and keeps every row, which would re-append ALL original rows.
                below_threshold = (y_train_pre_smogn['GY'] < top_boundary_val).to_numpy()
                X_train_non_augmented = X_train_pre_smogn.loc[below_threshold]
                y_train_non_augmented = y_train_pre_smogn.loc[below_threshold]
                X_train = pd.concat([X_train, X_train_non_augmented], axis=0)
                y_train = pd.concat([y_train, y_train_non_augmented], axis=0)

            plot_GY_hist(y_train, f'Model R SMOGN-Augmented GY Histogram, Outer Fold {i}')
        else:
            plot_GY_hist(y_train, f'Model R Histogram, Outer Fold {i}')

        # Display summary statistics of y_train
        display(y_train.describe())

        # Scale features and target; the scalers are fitted on the training part only and
        # then applied to the threshold and to the held-out part
        X_train, y_train, X_scaler, y_scaler = scale_features_and_target(X_train, y_train)
        top_boundary_val_scaled = y_scaler.transform([[top_boundary_val]])[0, 0]
        X_test = pd.DataFrame(X_scaler.transform(X_test))
        y_test = pd.DataFrame(y_scaler.transform(y_test))

        # Find mean best hyperparameter values based on prediction accuracy using inner-fold CV
        # NOTE(review): these are arithmetic means over folds and may be non-integer floats;
        # confirm the training callbacks accept that for integer-valued hyperparameters
        best_param1, best_param2 = inner_CV_R(n_inner_splits, X_train, y_train, axis1_params, axis2_params, train_model_callback, kfold_random_state, plot_title=f"Outer Fold {i}", **kwargs)

        # Train model with all training and CV data of outer fold using mean best hyperparameters
        super_model = train_model_callback(X_train, np.ravel(y_train), **dict(zip([axis1_params.name, axis2_params.name], [best_param1, best_param2])), **kwargs)

        # Use trained "super-model" to predict test set
        X_test.columns = X.columns
        y_pred = pd.DataFrame(super_model.predict(X_test), index=y_test.index, columns=y_test.columns)
        plot_GY_hist(y_pred, f'Model R Predicted GY Histogram, {train_model_callback.__name__}, Outer Fold {i}')

        # Calculate Pearson coefficient of continuous predictions
        pearson, _ = pearsonr(np.ravel(y_pred), np.ravel(y_test))

        # Classify predictions and actuals of super_model as top or not top (boolean)
        y_pred_top = bmo.continuous_to_binary_absolute(y_pred, top_boundary_val_scaled)
        y_test_top = bmo.continuous_to_binary_absolute(y_test, top_boundary_val_scaled)

        # Plot super model predictions vs actuals scatterplot (path built with os.path.join so it
        # resolves inside storage_dir on every OS)
        cmp.plot_classification_results(y_pred, y_test, y_pred_top, y_test_top, 
                                        [f"Predicted vs Actual GY, {train_model_callback.__name__}, Outer Fold {i}"],
                                        save_path=os.path.join(storage_dir, f'Super Model Predicted vs Actual GY, {train_model_callback.__name__}, Outer Fold {i}.svg'))

        # Calculate classification metrics and build this fold's single-row result
        classification_metrics = cdt.classification_metrics(y_pred_top, y_test_top)
        pearson_df = pd.DataFrame([pearson], columns=['Pearson'])
        fold_rows.append(pd.concat([pearson_df, classification_metrics], axis=1))

    # Stack the per-fold rows once (instead of growing a frame inside the loop), keeping the
    # expected metric columns first and any extra columns the metrics helper produced after them
    kfold_metrics = pd.concat(fold_rows, axis=0)
    extra_columns = [col for col in kfold_metrics.columns if col not in metric_columns]
    kfold_metrics = kfold_metrics.reindex(columns=metric_columns + extra_columns)

    # Label each row of kfold_metrics with the fold number 
    kfold_metrics.index = range(n_outer_splits)

    return kfold_metrics


## Support Vector Machine

In [None]:

# Define hyperparameter grids for SVM regression model and conduct 2-dimensional cross-validation
x_params_SVM_R = mdo.AxisParams('gamma', bmo.power_list(2, -14, -6))
y_params_SVM_R = mdo.AxisParams('C', bmo.power_list(2, -2, 6))
metrics_SVM_R = outer_CV_R(n_outer_splits=5, 
                           n_inner_splits=10, 
                           X=X, 
                           y=y,
                           axis1_params=x_params_SVM_R, 
                           axis2_params=y_params_SVM_R, 
                           train_model_callback=bmo.train_SVM_regressor, 
                           smogn_preprocess=SMOGN_PREPROCESS,
                           undersample=UNDERSAMPLE,
                           kfold_random_state=RANDOM_STATE, 
                           top_boundary_val=top_boundary_val, 
                           kernel='rbf')

# Dummy values - Uncomment for quick debugging tests
# (kept as '#' comments rather than a bare triple-quoted string, which the notebook
# would evaluate as the cell's last expression and echo as its output)
# x_params_SVM_R = mdo.AxisParams('gamma', bmo.power_list(2, -1, 0))
# y_params_SVM_R = mdo.AxisParams('C', bmo.power_list(2, 1, 2))
# metrics_SVM_R = outer_CV_R(n_outer_splits=2,
#                            n_inner_splits=2,
#                            X=X,
#                            y=y,
#                            axis1_params=x_params_SVM_R,
#                            axis2_params=y_params_SVM_R,
#                            train_model_callback=bmo.train_SVM_regressor,
#                            smogn_preprocess=SMOGN_PREPROCESS,
#                            undersample=UNDERSAMPLE,
#                            kfold_random_state=RANDOM_STATE,
#                            top_boundary_val=top_boundary_val,
#                            kernel='rbf')

In [None]:
# Display the metrics table returned by outer_CV_R for the SVM regressor
# (one row per outer fold, scored on that fold's final retrained model)
display(metrics_SVM_R)

In [None]:
# Average each metric over the outer folds, reshaped into a one-row DataFrame
metrics_SVM_R_mean = metrics_SVM_R.mean().to_frame().T
# Record the averages as the 'SVM' row of the Model R comparison table
R_average_metrics.loc['SVM'] = metrics_SVM_R_mean.iloc[0]
display(metrics_SVM_R_mean)

## XGBoost

In [None]:

# Define hyperparameter grids for the XGBoost regression model and conduct 2-dimensional cross-validation
x_params_XGB_R = mdo.AxisParams('n_estimators', [13, 25, 50, 100, 200])
y_params_XGB_R = mdo.AxisParams('max_depth', [1, 2, 3, 4, 6, 10, 16])
metrics_XGB_R = outer_CV_R(n_outer_splits=5, 
                           n_inner_splits=10, 
                           X=X, 
                           y=y,  
                           axis1_params=x_params_XGB_R, 
                           axis2_params=y_params_XGB_R, 
                           train_model_callback=bmo.train_XGB_regressor, 
                           smogn_preprocess=SMOGN_PREPROCESS,
                           undersample=UNDERSAMPLE,
                           kfold_random_state=RANDOM_STATE, 
                           top_boundary_val=top_boundary_val, 
                           objective="reg:squarederror", eval_metric="rmse")

# Dummy values for quick debugging tests
# (kept as '#' comments rather than a bare triple-quoted string, which the notebook
# would evaluate as the cell's last expression and echo as its output)
# x_params_XGB_R = mdo.AxisParams('n_estimators', [1, 2])
# y_params_XGB_R = mdo.AxisParams('max_depth', [1, 2])
# metrics_XGB_R = outer_CV_R(2, 2,
#                            X=X,
#                            y=y,
#                            axis1_params=x_params_XGB_R,
#                            axis2_params=y_params_XGB_R,
#                            train_model_callback=bmo.train_XGB_regressor,
#                            smogn_preprocess=SMOGN_PREPROCESS,
#                            undersample=UNDERSAMPLE,
#                            kfold_random_state=RANDOM_STATE,
#                            top_boundary_val=top_boundary_val,
#                            objective="reg:squarederror", eval_metric="rmse")

In [None]:
# Display the metrics table returned by outer_CV_R for the XGBoost regressor
# (one row per outer fold, scored on that fold's final retrained model)
display(metrics_XGB_R)

In [None]:
# Average each metric over the outer folds, reshaped into a one-row DataFrame, and display it
metrics_XGB_R_mean = metrics_XGB_R.mean().to_frame().T
display(metrics_XGB_R_mean)
# Record the averages as the 'XGB' row of the Model R comparison table
R_average_metrics.loc['XGB'] = metrics_XGB_R_mean.iloc[0]

In [None]:
# Save serialized session variables and models to disk for later use.
# The path is built with os.path.join so the file lands inside storage_dir on every OS
# (a hard-coded backslash would become part of the file name on POSIX).
dill.dump_session(os.path.join(storage_dir, 'project_ipynb_env_R.db'))

# Model B

In [None]:
def inner_CV_B(n_splits: int, X : pd.DataFrame, y_bin : pd.DataFrame, axis1_params: mdo.AxisParams, axis2_params: mdo.AxisParams, 
               train_model_callback, kfold_random_state: int, classification_col : int, plot_title: str = "", 
               **kwargs):
    """
    Perform inner cross-validation to tune model hyperparameters and find the optimal classification threshold.
    Created: 2024/12/04

    For every inner fold a grid of classifiers is trained, scored by F1 (at a 0.5
    probability cutoff) on the held-out part, and the best combination plus a tuned
    probability threshold for that best model are recorded. A ROC-grid figure for
    each fold is saved as an SVG in the module-level `storage_dir` and shown.

    Parameters:
    n_splits (int): Number of splits for KFold cross-validation.
    X (pd.DataFrame): Feature data.
    y_bin (pd.DataFrame): Binary target data (top / not top).
    axis1_params (mdo.AxisParams): Object containing hyperparameter values for the first (horizontal) axis.
    axis2_params (mdo.AxisParams): Object containing hyperparameter values for the second (vertical) axis.
    train_model_callback (function): Callback function to train the model.
    kfold_random_state (int): Random state for KFold shuffling.
    classification_col (int): Column index to pull classification probabilities from - 0 for not top, 1 for top.
    plot_title (str, optional): Title for the ROC plot. Defaults to "". Also embedded in the saved file name.
    **kwargs: Additional arguments for the train_model_callback function.
    Returns:
    tuple: Average best parameters (param1, param2) and the average best classification threshold.
    """

    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=kfold_random_state)

    # Arrays to store parameters and binary classification thresholds of the most accurate model for each inner fold
    best_params = pd.DataFrame(columns=['param1', 'param2'], index=range(n_splits))
    best_thresholds = pd.DataFrame(columns=['threshold'], index=range(n_splits))

    # Iterate through each train-test split
    for i, (train_index, test_index) in enumerate(kfold.split(X)):
        # Split the data into train and test sets
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_bin_train, y_bin_test = y_bin.iloc[train_index], y_bin.iloc[test_index]

        # Train a grid of models with every combination of parameters
        model_grid = bmo.train_model_grid(X_train, y_bin_train, axis1_params, axis2_params, train_model_callback, **kwargs)

        # Use trained models to predict test set probabilities, and store in a 2D array with each cell corresponding to a model with a specific combination of parameters
        y_proba_preds_grid = bmo.grid_predict_proba(X_test, model_grid, classification_col)

        # Use probabilities to classify predictions as top or not top. This isn't the final classification, 
        # but a step towards finding the optimal threshold, so we just use the default 0.5 threshold for now.
        y_binary_preds_grid = bmo.continuous_to_binary_absolute_grid(y_proba_preds_grid, 0.5)

        # Create a 2D array of identical dataframes containing actual labels to compare against predictions
        y_bin_test_grid = cdt.np_array_of_dfs(y_bin_test, y_proba_preds_grid.shape)

        # Evaluate predictions by comparing to actuals, calculating a 2D array of F1 scores.
        f1_grid = bmo.calculate_f1_scores(y_binary_preds_grid, y_bin_test_grid)

        # Find the index of the best F1 score in the 2D array of F1 scores
        best_row, best_col = np.unravel_index(np.argmax(f1_grid), f1_grid.shape)

        # Store hyperparameters of the most accurate model for this inner fold
        # NOTE(review): assumes the grid's first dimension follows axis1_params and the second
        # follows axis2_params - confirm against bmo.train_model_grid
        best_params.loc[i] = [axis1_params.values[best_row], axis2_params.values[best_col]]

        # Find the classification probability threshold between zero and one that yields the lowest squared difference between sensitivity and 
        # specificity using this optimal model. To do this, we feed find_optimal_threshold() the probabilities predicted by the model, not the binary predictions.
        best_model_proba_preds = y_proba_preds_grid[best_row, best_col]
        best_thresholds.iloc[i, 0] = bmo.find_optimal_threshold_absolute(y_bin_test, best_model_proba_preds)

        # Create a grid of ROC plots with predictions vs actuals, colored by F1 score for each model
        roc_grid = plot_shaded_roc_grids(y_proba_preds_grid, y_bin_test_grid, axis1_params, axis2_params, f1_grid, f'{plot_title} | Inner Fold {i}')

        # Save through the figure object (not the implicit "current figure") and build the path
        # with os.path.join so the file lands inside storage_dir on every OS
        roc_grid.savefig(
            os.path.join(storage_dir, f'model_B, {train_model_callback.__name__}, ({plot_title}, Inner Fold {i}).svg'),
            format="svg")

        # plt.show() takes no figure argument; it displays all open figures
        plt.show()
        plt.close(roc_grid)

    # Calculate average best parameters over all inner folds to return to outer CV
    avg_best_param1 = best_params['param1'].mean()
    avg_best_param2 = best_params['param2'].mean()

    # Calculate average best threshold over all folds
    best_threshold = best_thresholds['threshold'].mean()

    return avg_best_param1, avg_best_param2, best_threshold


In [None]:
def outer_CV_B(n_outer_splits: int, n_inner_splits: int, X : pd.DataFrame, y : pd.DataFrame, 
               axis1_params: mdo.AxisParams, 
               axis2_params: mdo.AxisParams, train_model_callback : callable, random_state: int, 
               classification_col : int, top_boundary_val : float, smote_preprocess : bool = False, **kwargs) -> pd.DataFrame:
    """
    Perform nested cross-validation with an outer and inner loop to evaluate model B performance.
    Created: 2024/12/29
    Parameters:
    -----------
    n_outer_splits : int
        Number of splits for the outer cross-validation.
    n_inner_splits : int
        Number of splits for the inner cross-validation.
    X : pd.DataFrame
        Feature data.
    y : pd.DataFrame
        Continuous target data; it is binarized inside each fold using top_boundary_val.
    axis1_params : mdo.AxisParams
        Object representing hyperparameter search space for the first axis.
    axis2_params : mdo.AxisParams
        Object representing hyperparameter search space for the second axis.
    train_model_callback : callable
        Function to train the model. It is called with the training features, the raveled
        binary labels, the two selected hyperparameters (by name) and **kwargs.
    random_state : int
        Seed used for the outer KFold shuffle and for SMOTE, and passed on to inner_CV_B.
    classification_col : int
        Column index to pull classification probabilities from - 0 for not top, 1 for top.
    top_boundary_val : float
        Boundary passed to bmo.continuous_to_binary_absolute to turn the continuous target
        into top / not-top labels.
    smote_preprocess : bool, optional
        If True, resample the binarized training data of each outer fold with SMOTE before
        scaling. The test data of the fold is never resampled. Defaults to False.
    **kwargs : dict
        Additional parameters for the model training function.
    Returns:
    --------
    pd.DataFrame
        DataFrame containing the metrics for each outer fold, including F1 Score, Sensitivity, Specificity, and Kappa.
    """
    
    kfold = KFold(n_splits=n_outer_splits, shuffle=True, random_state=random_state)

    # Column order of the returned metrics table
    metric_columns = ['F1 Score', 'Sensitivity', 'Specificity', 'Kappa']

    # Collect one metrics frame per outer fold and concatenate once after the loop.
    # Growing a DataFrame with pd.concat onto an initially empty frame is quadratic and
    # relies on pandas' deprecated handling of empty entries in concat.
    fold_metrics = []

    # Iterate through each train-test split
    for i, (train_index, test_index) in enumerate(kfold.split(X)):

        # Split the data into train and test sets
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        # Binarize the continuous target using the absolute boundary value
        y_train_bin = bmo.continuous_to_binary_absolute(y_train, top_boundary_val)
        y_test_bin = bmo.continuous_to_binary_absolute(y_test, top_boundary_val)

        # Resample the training portion only, so the test fold stays untouched
        if smote_preprocess:
            sm = SMOTE(random_state=random_state)
            X_train, y_train_bin = sm.fit_resample(X_train, y_train_bin)
            
        # Fit the feature scaler on the training data and reuse it for the test data
        X_train, _, X_scaler, _ = scale_features_and_target(X_train, None)
        X_test = X_scaler.transform(X_test)

        # Find average best parameters and threshold based on F1 score using inner-fold CV
        best_param1, best_param2, best_threshold = inner_CV_B(n_inner_splits, X_train, y_train_bin, axis1_params, axis2_params, train_model_callback, random_state, classification_col, plot_title=f"Outer Fold {i}", **kwargs)

        # Train model with all training and CV data of outer fold using mean best hyperparameters
        super_model = train_model_callback(X_train, np.ravel(y_train_bin), **dict(zip([axis1_params.name, axis2_params.name], [best_param1, best_param2])), **kwargs)

        # Use trained "super-model" to predict test set probabilities
        y_pred_proba = pd.DataFrame(super_model.predict_proba(X_test)[:, classification_col], index=y_test_bin.index, columns=y_test_bin.columns)
        plot_GY_hist(y_pred_proba, f'Top Line Probability Histogram, {train_model_callback.__name__}, Outer Fold {i}', x_ax_label='Top Line Probability')

        # Classify the predicted probabilities as top or not top using the threshold determined by inner CV
        y_pred_bin = bmo.continuous_to_binary_absolute(y_pred_proba, best_threshold)

        # Plot ROC and PR curves using seaborn
        cmp.sns_plot_roc_curve(pd.DataFrame(y_test_bin), pd.DataFrame(y_pred_proba), f'Model B ROC Curve, {train_model_callback.__name__}, Outer Fold {i}')
        plt.savefig(f'{storage_dir}\\Model B ROC Curve, {train_model_callback.__name__}, Outer Fold {i}.svg', format="svg")
        plt.show()
        plt.close()

        cmp.sns_plot_pr_curve(pd.DataFrame(y_test_bin), pd.DataFrame(y_pred_proba), f'Model B PR Curve, {train_model_callback.__name__}, Outer Fold {i}')
        plt.savefig(f'{storage_dir}\\Model B PR Curve, {train_model_callback.__name__}, Outer Fold {i}.svg', format="svg")
        plt.show()
        plt.close()
    
        # Calculate classification metrics for this outer fold
        fold_metrics.append(cdt.classification_metrics(y_pred_bin, y_test_bin))

    # Build the metrics table in one pass, keeping the declared columns first
    kfold_metrics = pd.concat(fold_metrics, axis=0)
    extra_columns = [col for col in kfold_metrics.columns if col not in metric_columns]
    kfold_metrics = kfold_metrics.reindex(columns=metric_columns + extra_columns)

    # Label each row of kfold_metrics with the fold number
    kfold_metrics.index = range(n_outer_splits)
    return kfold_metrics


## SVM

In [None]:

# Define hyperparameter grids for SVM classification model and conduct 2-dimensional cross-validation.
# probability=True and kernel='rbf' are not parameters of outer_CV_B; they travel through **kwargs
# to the training callback.
x_params_SVM_B = mdo.AxisParams('gamma', bmo.power_list(2, -18, -8))
y_params_SVM_B = mdo.AxisParams('C', bmo.power_list(2, 2, 16))
metrics_SVM_B = outer_CV_B(5, 10, X, y, x_params_SVM_B, y_params_SVM_B, bmo.train_SVM_classifier, 
                           random_state=RANDOM_STATE, classification_col=1, top_boundary_val=top_boundary_val, 
                           smote_preprocess=SMOGN_PREPROCESS, probability=True, kernel='rbf')

# The string below is disabled quick-test code. Its seed keyword must be `random_state`
# (the name outer_CV_B declares) so that it runs if it is re-enabled.
"""
# Dummy values for tests
x_params_SVM_B = mdo.AxisParams('gamma', bmo.power_list(2, -10, -9))
y_params_SVM_B = mdo.AxisParams('C', bmo.power_list(2, 0, 1))
metrics_SVM_B = outer_CV_B(2, 2, X, y, x_params_SVM_B, y_params_SVM_B, bmo.train_SVM_classifier, random_state=RANDOM_STATE, kernel='rbf', classification_col=1, top_boundary_val=top_boundary_val, probability=True)
"""

In [None]:
# Display the per-outer-fold classification metrics returned by outer_CV_B for the SVM classifier
# (one row per outer fold, produced by the super-model trained on that fold's training data)
display(metrics_SVM_B)

In [None]:
# Average each metric over the outer folds, store the result as the 'SVM' row of
# B_average_metrics (defined earlier in the notebook), and display the one-row summary
metrics_SVM_B_mean = metrics_SVM_B.mean().to_frame().T
B_average_metrics.loc['SVM'] = metrics_SVM_B_mean.iloc[0]
display(metrics_SVM_B_mean)

## XGBoost

In [None]:
# Define hyperparameter grids for XGB classification model and conduct 2-dimensional cross-validation.
# objective and eval_metric are not parameters of outer_CV_B; they travel through **kwargs to the
# training callback.
# NOTE(review): random_state is a named parameter of outer_CV_B here, so it is not part of **kwargs
# and is not forwarded to bmo.train_XGB_classifier by this call - confirm the classifier is seeded elsewhere.
x_params_XGB_B = mdo.AxisParams('n_estimators', [3, 7, 13, 25, 50, 100, 200, 400])
y_params_XGB_B = mdo.AxisParams('max_depth', [1, 2, 3, 4, 6, 10, 16, 32, 64])
metrics_XGB_B = outer_CV_B(5, 10, X, y, x_params_XGB_B, y_params_XGB_B, bmo.train_XGB_classifier, random_state=RANDOM_STATE, 
                           classification_col=1, top_boundary_val=top_boundary_val, smote_preprocess=SMOGN_PREPROCESS, 
                           objective="binary:logistic", eval_metric="logloss")

# The string below is disabled quick-test code (tiny grids, 2x2 folds); it is never executed.
"""
# Dummy values for quick tests
x_params_XGB_B = mdo.AxisParams('n_estimators', [1, 2])
y_params_XGB_B = mdo.AxisParams('max_depth', [1, 2])
metrics_XGB_B = outer_CV_B(2, 2, X, y, x_params_XGB_B, y_params_XGB_B, bmo.train_XGB_classifier, random_state=RANDOM_STATE, 
                           classification_col=1, top_boundary_val=top_boundary_val, smote_preprocess=SMOGN_PREPROCESS, 
                           objective="binary:logistic", eval_metric="logloss")"""

In [None]:
# Display the per-outer-fold classification metrics returned by outer_CV_B for the XGB classifier
display(metrics_XGB_B)

In [None]:
# Average each metric over the outer folds, store the result as the 'XGB' row of
# B_average_metrics (defined earlier in the notebook), and display the one-row summary
metrics_XGB_B_mean = metrics_XGB_B.mean().to_frame().T
B_average_metrics.loc['XGB'] = metrics_XGB_B_mean.iloc[0]
display(metrics_XGB_B_mean)

In [None]:
# Save serialized session variables and models to disk for later use.
# The file is written inside storage_dir; the path is built with a literal backslash separator.
dill.dump_session(f'{storage_dir}\\project_ipynb_env_B.db')

# Model RO

In [None]:
def inner_CV_RO(n_splits: int, X : pd.DataFrame, y : pd.DataFrame, y_top_bound : float, axis1_params: mdo.AxisParams, axis2_params: mdo.AxisParams, train_model_callback, 
                kfold_random_state: int, plot_title: str = "", **kwargs):
    """
    Perform inner cross-validation (RO) to find the best model parameters and classification threshold.
    Created: 2024/12/21
    Parameters:
    n_splits (int): Number of splits for K-Fold cross-validation.
    X (pd.DataFrame): Feature data.
    y (pd.DataFrame): Target data.
    y_top_bound (float): Boundary used to label the actual targets of each inner test fold as top / not top
        before the classification threshold is searched. Must be on the same scale as y.
    axis1_params (mdo.AxisParams): Hyperparameters to explore for the horizontal axis.
    axis2_params (mdo.AxisParams): Hyperparameters to explore for the vertical axis.
    train_model_callback (callable): Callback function to train the model.
    kfold_random_state (int): Random state for K-Fold shuffling.
    plot_title (str, optional): Title for the plot. Defaults to "".
    **kwargs: Additional keyword arguments for the model training callback.
    Returns:
    tuple: Mean over the inner folds of the best axis1 value, of the best axis2 value, and of the
        best classification threshold.
    """

    # Create KFold object for inner-fold cross-validation
    kfold = KFold(n_splits=n_splits, shuffle=True, random_state=kfold_random_state)

    # Store best parameters (param1, param2) for each fold
    best_params = pd.DataFrame(columns=['param1', 'param2'], index=range(n_splits))
    best_thresholds = pd.DataFrame(columns=['threshold'], index=range(n_splits))

    # Iterate through each train-test split
    for i, (train_index, test_index) in enumerate(kfold.split(X)):
        # Split the data into train and test sets
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        
        # Train a grid of models with every combination of parameters
        model_grid = bmo.train_model_grid(X_train, y_train, axis1_params, axis2_params, train_model_callback, **kwargs)

        # Use trained models to predict test set labels, and store in a 2D array with each cell corresponding to a model with a specific combination of parameters
        y_preds_grid = bmo.grid_predict(X_test, model_grid)

        # Create a 2D array of identical dataframes containing actual labels to compare against predictions
        y_test_grid = cdt.np_array_of_dfs(y_test, y_preds_grid.shape)

        # Evaluate predictions by comparing to actuals, calculating a 2D array of Pearson coefficients
        pearson_grid = bmo.calculate_pearson_coefficients(y_preds_grid, y_test_grid)

        # Find the index of the best Pearson coefficient in the 2D array of Pearson coefficients.
        # A model with constant predictions has an undefined (NaN) correlation, and np.argmax
        # would report the first NaN cell as the maximum. Ignore NaN cells unless every cell is
        # NaN, in which case fall back to the first cell (what np.argmax returns for all-NaN).
        pearson_scores = np.asarray(pearson_grid, dtype=float)
        if np.isnan(pearson_scores).all():
            best_flat_index = 0
        else:
            best_flat_index = np.nanargmax(pearson_scores)
        best_row, best_col = np.unravel_index(best_flat_index, pearson_scores.shape)
        
        # Store hyperparameters of the most accurate model for this inner fold
        best_params.loc[i] = [axis1_params.values[best_row], axis2_params.values[best_col]]

        # Extract best model's continuous predictions
        best_model_y_preds = y_preds_grid[best_row, best_col]

        # Classify labels as top or not top (boolean)
        y_test_binary = bmo.continuous_to_binary_absolute(y_test, y_top_bound)
        
        # Find the classification threshold for this optimal model (selection criterion is
        # implemented in bmo.find_optimal_threshold_absolute)
        best_absolute_thresh = bmo.find_optimal_threshold_absolute(y_test_binary, best_model_y_preds)
        best_thresholds.iloc[i, 0] = best_absolute_thresh

        # Create a grid of scatter plots with predictions vs actuals, colored by Pearson coefficient for each model
        scatter_grid = plot_shaded_scatter_grids(y_preds_grid, y_test_grid, axis1_params, axis2_params, pearson_grid, f'{plot_title} | Inner Fold {i}')        
        plt.savefig(f'{storage_dir}\\model_RO, {train_model_callback.__name__}, ({plot_title}, Inner Fold {i}).svg', format="svg")
        # plt.show() only accepts the keyword `block`; a Figure must not be passed to it
        plt.show()
        plt.close(scatter_grid)

    # Calculate average best parameters over all folds
    avg_best_param1 = best_params['param1'].mean()
    avg_best_param2 = best_params['param2'].mean()

    # Calculate average best threshold over all folds
    best_threshold = best_thresholds['threshold'].mean()

    return avg_best_param1, avg_best_param2, best_threshold


In [None]:
def outer_CV_RO(n_outer_splits: int, n_inner_splits: int, X : pd.DataFrame, y : pd.DataFrame, 
                axis1_params: mdo.AxisParams, axis2_params: mdo.AxisParams, train_model_callback : callable, 
                kfold_random_state: int, top_boundary_val : float, smogn_preprocess : bool = False, undersample : bool = True,
                **kwargs) -> pd.DataFrame:
    """
    Perform outer cross-validation with nested inner cross-validation for model RO selection and evaluation.
    Parameters:
    -----------
    n_outer_splits : int
        Number of splits for the outer cross-validation.
    n_inner_splits : int
        Number of splits for the inner cross-validation.
    X : pd.DataFrame
        Feature data.
    y : pd.DataFrame
        Target data.
    axis1_params : mdo.AxisParams
        Object representing hyperparameter search space for the first axis.
    axis2_params : mdo.AxisParams
        Object representing hyperparameter search space for the second axis.
    train_model_callback : callable
        Callback function to train the model.
    kfold_random_state : int
        Random state for reproducibility in KFold splitting (outer loop, and passed to inner_CV_RO).
    top_boundary_val : float
        Boundary on the original (unscaled) target scale above which a sample counts as top.
        It is scaled with the fitted target scaler before being compared with scaled values.
    smogn_preprocess : bool, optional
        If True, augment the training data of each outer fold with smogn_prep before scaling.
        Defaults to False.
    undersample : bool, optional
        Forwarded to smogn_prep. When False, the original training rows below top_boundary_val
        are appended to the smogn_prep output. Only used when smogn_preprocess is True.
        Defaults to True.
    **kwargs : dict
        Additional parameters for the model training callback.
    Returns:
    --------
    pd.DataFrame
        DataFrame containing the evaluation metrics for each outer fold, including Pearson correlation, F1 Score, Sensitivity, Specificity, and Kappa.
    """

    # Create KFold object for outer loop to split data into train and test sets
    kfold = KFold(n_splits=n_outer_splits, shuffle=True, random_state=kfold_random_state)

    # Column order of the returned metrics table
    metric_columns = ['Pearson', 'F1 Score', 'Sensitivity', 'Specificity', 'Kappa']

    # Collect one metrics row per outer fold and concatenate once after the loop, instead of
    # repeatedly concatenating onto an initially empty DataFrame (deprecated in pandas and quadratic).
    fold_metrics = []

    # Iterate through each train-test split
    for i, (train_index, test_index) in enumerate(kfold.split(X)):

        # Split the data into train and test sets
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        if smogn_preprocess:
            # Calculate the quantile of the top boundary value in the training set
            top_boundary_quantile_in_train_set = percentileofscore(y_train.to_numpy().flatten(), top_boundary_val, kind='mean') / 100

            # Make copies of the original training data before SMOGN preprocessing
            X_train_pre_smogn = X_train.copy()
            y_train_pre_smogn = y_train.copy()
            # Apply SMOGN preprocessing to the training data
            X_train, y_train = smogn_prep(X_train, y_train, top_boundary_quantile_in_train_set, undersample)

            if not undersample:
                # Manually concatenate the original data below the augmentation threshold with the augmented data.
                # The selection must be a row-wise boolean mask: indexing a DataFrame with a boolean
                # DataFrame (y[y < v]) only NaN-masks values and keeps every row, so its .index
                # would select all original rows rather than just those below the boundary.
                below_boundary = (y_train_pre_smogn < top_boundary_val).to_numpy().reshape(len(y_train_pre_smogn), -1).all(axis=1)
                X_train_non_augmented = X_train_pre_smogn.loc[below_boundary]
                y_train_non_augmented = y_train_pre_smogn.loc[below_boundary]
                X_train = pd.concat([X_train, X_train_non_augmented], axis=0)
                y_train = pd.concat([y_train, y_train_non_augmented], axis=0)
            # Plot histogram of the augmented training data
            plot_GY_hist(y_train, f'Model RO SMOGN-Augmented GY Histogram, Outer Fold {i}')

        # Scale features and target for the training data
        X_train, y_train, X_scaler, y_scaler = scale_features_and_target(X_train, y_train)
        # Scale the top boundary value
        top_boundary_val_scaled = y_scaler.transform([[top_boundary_val]])[0, 0]
        # Scale the test data using the same scaler as the training data
        X_test = pd.DataFrame(X_scaler.transform(X_test))
        X_test.columns = X.columns
        y_test = pd.DataFrame(y_scaler.transform(y_test))

        # Find average best parameters and threshold based on Pearson score using inner-fold CV
        best_param1, best_param2, best_threshold_fixed = inner_CV_RO(n_inner_splits, X_train, y_train, top_boundary_val_scaled, axis1_params, axis2_params, train_model_callback, kfold_random_state, plot_title=f"Outer Fold {i}", **kwargs)

        # Train model with all training and CV data of outer fold using mean best hyperparameters
        super_model = train_model_callback(X_train, np.ravel(y_train), **dict(zip([axis1_params.name, axis2_params.name], [best_param1, best_param2])), **kwargs)

        # Use trained "super-model" to predict test set
        y_pred = pd.DataFrame(super_model.predict(X_test).reshape(-1, 1), index=y_test.index, columns=y_test.columns)
        # Plot histogram of the predicted values
        plot_GY_hist(y_pred, f'Predicted GY Histogram, {train_model_callback.__name__}, Outer Fold {i}')

        # Calculate Pearson coefficient of continuous predictions
        pearson, _ = pearsonr(np.ravel(y_pred), np.ravel(y_test))

        # Classify predictions and actuals of super_model as top or not top (boolean).
        # Predictions use the threshold learned by inner CV; actuals use the scaled boundary.
        y_pred_top = bmo.continuous_to_binary_absolute(y_pred, best_threshold_fixed)
        y_test_top = bmo.continuous_to_binary_absolute(y_test, top_boundary_val_scaled)

        # Plot classification results
        cmp.plot_classification_results(y_pred, y_test, y_pred_top, y_test_top, 
                                [f"Model RO Predicted vs Actual GY, {train_model_callback.__name__}, Outer Fold {i}"],
                                save_path=f'{storage_dir}\\Model RO Super Model Predicted vs Actual GY, {train_model_callback.__name__}, Outer Fold {i}.svg')

        # Calculate classification metrics and build this fold's metrics row (Pearson first)
        classification_metrics = cdt.classification_metrics(y_pred_top, y_test_top)
        pearson_df = pd.DataFrame([pearson], columns=['Pearson'])
        fold_metrics.append(pd.concat([pearson_df, classification_metrics], axis=1))
    
    # Build the metrics table in one pass, keeping the declared columns first
    kfold_metrics = pd.concat(fold_metrics, axis=0)
    extra_columns = [col for col in kfold_metrics.columns if col not in metric_columns]
    kfold_metrics = kfold_metrics.reindex(columns=metric_columns + extra_columns)

    # Reset index of the metrics DataFrame
    kfold_metrics.index = range(n_outer_splits)
    return kfold_metrics

## SVM

In [None]:
# Set hyperparameter grids for SVM regression model and conduct 2-dimensional cross-validation.
# kernel='rbf' is not a parameter of outer_CV_RO; it travels through **kwargs to the training callback.
x_params_SVM_RO = mdo.AxisParams('gamma', bmo.power_list(2, -14, -6))
y_params_SVM_RO = mdo.AxisParams('C', bmo.power_list(2, -2, 6))
metrics_SVM_RO = outer_CV_RO(5, 10, X, y, x_params_SVM_RO, y_params_SVM_RO, bmo.train_SVM_regressor, 
                             kfold_random_state=RANDOM_STATE, top_boundary_val=top_boundary_val, smogn_preprocess=SMOGN_PREPROCESS, 
                            undersample=UNDERSAMPLE, kernel='rbf')
# The string below is disabled quick-test code (tiny grids, 2x2 folds); it is never executed.
"""
# Quick test values
x_params_SVM_RO = mdo.AxisParams('gamma', bmo.power_list(2, -8, -7))
y_params_SVM_RO = mdo.AxisParams('C', bmo.power_list(2, 0, 1))
metrics_SVM_RO = outer_CV_RO(2, 2, X, y, x_params_SVM_RO, y_params_SVM_RO, bmo.train_SVM_regressor, 
                             kfold_random_state=RANDOM_STATE, top_boundary_val=top_boundary_val, smogn_preprocess=SMOGN_PREPROCESS, 
                             undersample=UNDERSAMPLE, kernel='rbf')
"""

In [None]:
# Display the per-outer-fold metrics returned by outer_CV_RO for the SVM regressor
display(metrics_SVM_RO)

In [None]:
# Average each metric over the outer folds, store the result as the 'SVM' row of
# RO_average_metrics (defined earlier in the notebook), and display the one-row summary
metrics_SVM_RO_mean = metrics_SVM_RO.mean().to_frame().T
RO_average_metrics.loc['SVM'] = metrics_SVM_RO_mean.iloc[0]
display(metrics_SVM_RO_mean)

## XGBoost

In [None]:
# Define hyperparameter grids for XGB regression model and conduct 2-dimensional cross-validation.
# kfold_random_state seeds the KFold splits. random_state, objective and eval_metric are not
# parameters of outer_CV_RO, so they travel through **kwargs to the training callback.
x_params_XGB_RO = mdo.AxisParams('n_estimators', [3, 7, 13, 25, 50, 100, 200])
y_params_XGB_RO = mdo.AxisParams('max_depth', [1, 2, 3, 4, 6, 10, 16, 32, 64])
metrics_XGB_RO = outer_CV_RO(5, 10, X, y, x_params_XGB_RO, y_params_XGB_RO, bmo.train_XGB_regressor, 
                             kfold_random_state=RANDOM_STATE, random_state=RANDOM_STATE, top_boundary_val=top_boundary_val, 
                             smogn_preprocess=SMOGN_PREPROCESS, undersample=UNDERSAMPLE, objective="reg:squarederror", 
                             eval_metric="rmse")
# The string below is disabled quick-test code (tiny grids, 2x2 folds); it is never executed.
"""
# Quick test values
x_params_XGB_RO = mdo.AxisParams('n_estimators', [1, 2])
y_params_XGB_RO = mdo.AxisParams('max_depth', [1, 2])
metrics_XGB_RO = outer_CV_RO(2, 2, X, y, x_params_XGB_RO, y_params_XGB_RO, bmo.train_XGB_regressor, 
                                kfold_random_state=RANDOM_STATE, random_state=RANDOM_STATE, top_boundary_val=top_boundary_val, 
                                smogn_preprocess=SMOGN_PREPROCESS, undersample=UNDERSAMPLE, objective="reg:squarederror", 
                                eval_metric="rmse")
"""

In [None]:
# Display the per-outer-fold metrics returned by outer_CV_RO for the XGB regressor
display(metrics_XGB_RO)

In [None]:
# Average each metric over the outer folds, store the result as the 'XGB' row of
# RO_average_metrics (defined earlier in the notebook), and display the one-row summary
metrics_XGB_RO_mean = metrics_XGB_RO.mean().to_frame().T
RO_average_metrics.loc['XGB'] = metrics_XGB_RO_mean.iloc[0]
display(metrics_XGB_RO_mean)

In [None]:
# Save serialized session variables and models to disk for later use.
# The file is written inside storage_dir; the path is built with a literal backslash separator.
dill.dump_session(f'{storage_dir}\\project_ipynb_env_RO.db')

# Conclusion

In [None]:
# Plot average metrics for each model to compare GBLUP, SVM, and XGB.
# One figure per model family (R, B, RO): build it, save it as SVG in storage_dir, show it,
# then close it to release the figure.
# plt.show() only accepts the keyword `block`, so the figure is not passed to it; the figure
# object is still handed to savefig/plt.close, which do operate on a specific figure.
R_avg_metrics_plot = cmp.plot_model_metrics(R_average_metrics, "R")
R_avg_metrics_plot.savefig(f'{storage_dir}\\R_avg_metrics_plot.svg', format='svg')
plt.show()
plt.close(R_avg_metrics_plot)

B_avg_metrics_plot = cmp.plot_model_metrics(B_average_metrics, "B")
B_avg_metrics_plot.savefig(f'{storage_dir}\\B_avg_metrics_plot.svg', format='svg')
plt.show()
plt.close(B_avg_metrics_plot)

RO_avg_metrics_plot = cmp.plot_model_metrics(RO_average_metrics, "RO")
RO_avg_metrics_plot.savefig(f'{storage_dir}\\RO_avg_metrics_plot.svg', format='svg')
plt.show()
plt.close(RO_avg_metrics_plot)