In [52]:
# Jupyter notebook in Jupytext format

## Initialisation

In [53]:
import os

# Locate the project root: the current working directory truncated right after
# the first occurrence of the marker string (e.g. ".../ode-biomarker-project").
project_marker = "project"
path = os.getcwd()
index_project = path.find(project_marker)
if index_project == -1:
    # str.find returns -1 when the marker is absent; slicing with that value
    # would silently change into an arbitrary prefix of the current path.
    raise RuntimeError(
        f"Cannot locate project root: '{project_marker}' not found in {path!r}"
    )
# keep everything from the start of the path up to and including the marker
project_path = path[: index_project + len(project_marker)]
# set the working directory
os.chdir(project_path)
print(f"Project path set to: {os.getcwd()}")

Project path set to: c:\Github\ode-biomarker-project


In [54]:
from PathLoader import PathLoader #noqa: E402

# Build the PathLoader from the two env files; its get_data_path() is used
# further down to locate the results folder and the benchmark pickle.
path_loader = PathLoader("data_config.env", "current_user.env")

In [55]:
from DataLink import DataLink #noqa: E402

# Construct the DataLink from the path loader and "data_codes.csv"; datasets
# are fetched from it by loading code further down (get_data_using_code).
data_link = DataLink(path_loader, "data_codes.csv")

In [56]:
# Identify this experiment and make sure its results folder exists.
folder_name = "ThesisResult4-FeatureSelectionBenchmark"
exp_id = "v5_network_integration_rep10"

# Build the results directory once instead of repeating the same f-string.
results_dir = f"{path_loader.get_data_path()}data/results/{folder_name}/{exp_id}"
# exist_ok=True replaces the separate exists() check and is safe on re-runs.
os.makedirs(results_dir, exist_ok=True)

# Prefix (with trailing slash) used for every file saved by this notebook.
file_save_path = f"{results_dir}/"

In [57]:
# Fetch the Palbociclib proteomics features and labels via their loading code
loading_code = "goncalves-gdsc-2-Palbociclib-LN_IC50-sin"
proteomic_feature_data, proteomic_label_data = data_link.get_data_using_code(loading_code)

# Report the dimensions of what was loaded
print("Proteomic feature data shape: " + str(proteomic_feature_data.shape))
print("Proteomic label data shape: " + str(proteomic_label_data.shape))

Proteomic feature data shape: (737, 6692)
Proteomic label data shape: (737,)


In [58]:
# Data preparation and alignment
import numpy as np #noqa: E402

# Keep numeric feature columns only
proteomic_feature_data = proteomic_feature_data.select_dtypes(include=[np.number])

# Restrict both tables to the samples they share, in sorted index order
common_indices = sorted(
    set(proteomic_feature_data.index).intersection(proteomic_label_data.index)
)
feature_data, label_data = (
    proteomic_feature_data.loc[common_indices],
    proteomic_label_data.loc[common_indices],
)

print("Final aligned dataset shape: " + str(feature_data.shape))
print("Final aligned label shape: " + str(label_data.shape))

Final aligned dataset shape: (737, 6692)
Final aligned label shape: (737,)


## Functions

In [59]:
from typing import Dict, List, Literal #noqa: E402
import numpy as np #noqa: E402
import pandas as pd #noqa: E402
from scipy.stats import pearsonr, spearmanr #noqa: E402
from sklearn.metrics import r2_score #noqa: E402
from sklearn.dummy import DummyRegressor #noqa: E402
from sklearn.preprocessing import StandardScaler #noqa: E402
from toolkit import FirstQuantileImputer, f_regression_select, get_model_from_string #noqa: E402
from toolkit import (
    mrmr_select_fcq, 
    mrmr_select_fcq_fast,
    mutual_information_select,
    select_random_features,
) #noqa: E402
import time #noqa: E402

In [60]:
def random_select_wrapper(X: pd.DataFrame, y: pd.Series, k: int) -> tuple:
    """Adapt ``select_random_features`` to the (features, scores) selector shape.

    The scores returned by the underlying selector are discarded and replaced
    with an all-zero array of matching length, because a random pick carries
    no meaningful per-feature score.
    """
    chosen, _ignored_scores = select_random_features(X, y, k)
    placeholder_scores = np.zeros(len(chosen))
    return chosen, placeholder_scores

In [61]:
def _drop_correlated_columns(X: pd.DataFrame, threshold: float = 0.95) -> List[str]:
    """Drop highly correlated columns to reduce redundancy"""
    corr = X.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = set()
    for col in sorted(upper.columns):
        if col in to_drop:
            continue
        high_corr = upper.index[upper[col] > threshold].tolist()
        to_drop.update(high_corr)
    return [c for c in X.columns if c not in to_drop]

In [62]:
def create_feature_selection_pipeline(
    selection_method: callable, k: int, method_name: str, model_name: str
):
    """Create a training function for one feature-selection / model combination.

    Args:
        selection_method: Callable invoked as ``selection_method(X, y, k)`` that
            returns ``(selected_features, scores)``.
        k: Requested number of features; capped at the number of columns left
            after correlation filtering.
        method_name: Label of the selection method (not used inside the pipeline).
        model_name: "LinearRegression", "KNeighborsRegressor" or "SVR".

    Returns:
        ``pipeline_function(X_train, y_train, rng)``, which returns a dict with
        the fitted imputer, the columns kept by the correlation filter, the
        selected features and their scores, the model (plus type and params),
        the fitted scaler (``None`` when no features were selected), the
        ``no_features`` flag and the ``rng`` value passed in.

    Raises:
        ValueError: from ``pipeline_function`` when features were selected and
            ``model_name`` is not one of the supported names.
    """

    def pipeline_function(X_train: pd.DataFrame, y_train: pd.Series, rng: int):
        # 1) Sanitize inputs: infinities become NaN, rows without a label are dropped
        X_train = X_train.replace([np.inf, -np.inf], np.nan)
        y_train = pd.Series(y_train).replace([np.inf, -np.inf], np.nan)
        mask = ~y_train.isna()
        X_train, y_train = X_train.loc[mask], y_train.loc[mask]

        # 2) Imputation (any value still missing afterwards is set to 0.0)
        imputer = FirstQuantileImputer().fit(X_train)
        Xtr = imputer.transform(X_train, return_df=True).astype(float)
        Xtr = Xtr.fillna(0.0)

        # 3) Correlation filtering; the kept columns are returned so the same
        #    filter can be applied to the test data
        corr_keep_cols = _drop_correlated_columns(Xtr, threshold=0.95)
        Xtr_filtered = Xtr[corr_keep_cols]

        # 4) Feature selection
        k_sel = min(k, Xtr_filtered.shape[1]) if Xtr_filtered.shape[1] > 0 else 0
        if k_sel == 0:
            selected_features, selector_scores = [], np.array([])
        else:
            selected_features, selector_scores = selection_method(
                Xtr_filtered, y_train, k_sel
            )

        # A selector may also return nothing even though columns were available.
        # Treat both cases the same way, so `scaler` is always bound below.
        no_features = len(selected_features) == 0

        # 5) Standardization and model training
        scaler = None
        if no_features:
            # Placeholder model; it is left unfitted, and feature_selection_eval
            # does not call predict() when no features were selected.
            model = DummyRegressor(strategy="mean")
            model_type = "DummyRegressor(mean)"
            model_params = {"strategy": "mean"}
        else:
            sel_train = Xtr_filtered[selected_features]
            scaler = StandardScaler()
            sel_train_scaled = scaler.fit_transform(sel_train)
            sel_train_scaled = pd.DataFrame(
                sel_train_scaled, index=sel_train.index, columns=selected_features
            )

            # Train model
            if model_name == "LinearRegression":
                model = get_model_from_string("LinearRegression")
            elif model_name == "KNeighborsRegressor":
                model = get_model_from_string(
                    "KNeighborsRegressor", n_neighbors=5, weights="distance", p=2
                )
            elif model_name == "SVR":
                model = get_model_from_string("SVR", kernel="linear", C=1.0)
            else:
                raise ValueError(f"Unsupported model: {model_name}")

            model.fit(sel_train_scaled, y_train)
            model_type = model_name
            model_params = (
                model.get_params(deep=False) if hasattr(model, "get_params") else {}
            )

        return {
            "imputer": imputer,
            "corr_keep_cols": corr_keep_cols,
            "selected_features": list(selected_features),
            "selector_scores": np.array(selector_scores),
            "model": model,
            "model_type": model_type,
            "model_params": model_params,
            "scaler": scaler,
            "no_features": no_features,
            "rng": rng,
        }

    return pipeline_function

In [63]:
def feature_selection_eval(
    X_test: pd.DataFrame,
    y_test: pd.Series,
    *,
    pipeline_components: Dict,
    metric_primary: Literal["r2", "pearson_r", "spearman_r"] = "r2",
) -> Dict:
    """Evaluate a trained feature-selection pipeline on held-out data.

    Args:
        X_test: Test features; must contain the columns the imputer was fitted on.
        y_test: Test labels; rows with NaN/inf labels are dropped.
        pipeline_components: Dict returned by the training pipeline function
            (keys read here: "imputer", "corr_keep_cols", "selected_features",
            "selector_scores", "model", "model_type", and optionally "scaler",
            "no_features", "rng").
        metric_primary: Metric reported as ``model_performance``. "spearman_r"
            selects the Spearman correlation (stored in ``metrics`` under
            "spearman_rho"); unknown names fall back to R².

    Returns:
        Dict with feature importances, the primary metric, the full ``metrics``
        dict, the selected features and scores, predictions (``y_pred``) and
        the index of the test samples those predictions belong to.
    """

    # Unpack the fitted components produced by the training pipeline
    imputer = pipeline_components["imputer"]
    corr_keep = set(pipeline_components["corr_keep_cols"])
    selected = list(pipeline_components["selected_features"])
    selector_scores = pipeline_components["selector_scores"]
    model = pipeline_components["model"]
    model_name = pipeline_components["model_type"]
    scaler = pipeline_components.get("scaler", None)
    no_features = pipeline_components.get("no_features", False)

    # Apply identical transforms as training
    X_test = X_test.replace([np.inf, -np.inf], np.nan)
    y_test = pd.Series(y_test).replace([np.inf, -np.inf], np.nan)
    mask_y = ~y_test.isna()
    X_test, y_test = X_test.loc[mask_y], y_test.loc[mask_y]

    Xti = imputer.transform(X_test, return_df=True).astype(float).fillna(0.0)

    # Apply same correlation filtering as training
    cols_after_corr = [c for c in Xti.columns if c in corr_keep]
    Xti = Xti[cols_after_corr]

    # Select features (an empty selection yields a zero-column frame)
    Xsel = Xti[selected] if len(selected) > 0 else Xti.iloc[:, :0]

    # Standardize if scaler exists (i.e., features were selected)
    if scaler is not None and len(selected) > 0:
        Xsel_scaled = scaler.transform(Xsel)
        Xsel_scaled = pd.DataFrame(Xsel_scaled, index=Xsel.index, columns=selected)
    else:
        Xsel_scaled = Xsel

    # Predict
    if no_features or Xsel.shape[1] == 0:
        # NOTE(review): this constant baseline is the mean of the *test* labels,
        # not the training mean; behavior kept as-is.
        y_pred = np.full_like(
            y_test.values, fill_value=float(y_test.mean()), dtype=float
        )
    else:
        y_pred = np.asarray(model.predict(Xsel_scaled), dtype=float)

    # Calculate metrics on the samples where both truth and prediction are finite
    mask_fin = np.isfinite(y_test.values) & np.isfinite(y_pred)
    y_t = y_test.values[mask_fin]
    y_p = y_pred[mask_fin]

    if len(y_t) < 2:
        # Correlations and R² need at least two samples
        r2 = np.nan
        pearson_r = pearson_p = np.nan
        spearman_rho = spearman_p = np.nan
    else:
        r2 = r2_score(y_t, y_p)
        pearson_r, pearson_p = pearsonr(y_t, y_p)
        spearman_rho, spearman_p = spearmanr(y_t, y_p)

    metrics = {
        "r2": float(r2) if np.isfinite(r2) else np.nan,
        "pearson_r": float(pearson_r) if np.isfinite(pearson_r) else np.nan,
        "pearson_p": float(pearson_p) if np.isfinite(pearson_p) else np.nan,
        "spearman_rho": float(spearman_rho) if np.isfinite(spearman_rho) else np.nan,
        "spearman_p": float(spearman_p) if np.isfinite(spearman_p) else np.nan,
        "n_test_samples_used": len(y_t),
    }

    # Feature importance: native importances, |coef| for LinearRegression, else zeros
    if not no_features and hasattr(model, "feature_importances_") and len(selected) > 0:
        fi = (np.array(selected), model.feature_importances_)
    elif not no_features and model_name in ("LinearRegression",) and len(selected) > 0:
        coef = getattr(model, "coef_", np.zeros(len(selected)))
        fi = (np.array(selected), np.abs(coef))
    else:
        fi = (np.array(selected), np.zeros(len(selected)))

    # The signature advertises "spearman_r" but the metric is stored under
    # "spearman_rho"; without this mapping the request silently returned R².
    metric_key = "spearman_rho" if metric_primary == "spearman_r" else metric_primary
    primary = metrics.get(metric_key, metrics["r2"])

    return {
        "feature_importance": fi,
        "feature_importance_from": "model",
        "model_performance": float(primary) if primary is not None else np.nan,
        "metrics": metrics,
        "selected_features": selected,
        "model_name": model_name,
        "selected_scores": selector_scores,
        "k": len(selected),
        "rng": pipeline_components.get("rng", None),
        "y_pred": y_p,
        "y_true_index": y_test.index[mask_fin],
    }

## Results and Visualisation

### Load data

In [64]:
def save_and_print(message, report_file=None, level="info"):
    """
    Echo a message to the console and, optionally, append it to a report file.

    Args:
        message: Text to print and record
        report_file: Writable file object; nothing is written when it is falsy
        level: "header", "section" or "subsection" prepend "# ", "## " or
            "### " in the report; any other value writes the message as-is

    Returns:
        The message, unchanged.
    """
    print(message)

    if not report_file:
        return message

    # Markdown heading marker per level; plain text for everything else
    heading_marker = {"header": "# ", "section": "## ", "subsection": "### "}
    report_file.write(f"{heading_marker.get(level, '')}{message}\n\n")
    return message

In [65]:
# Load the saved feature selection benchmark (feature_selection_benchmark_{exp_id}.pkl)
import os
import pandas as pd
import time #noqa: E402

# Fail fast on a missing benchmark pickle *before* the report file is created,
# so an aborted run does not leave a stub report with an open handle behind.
pkl_path = f"{path_loader.get_data_path()}data/results/{folder_name}/feature_selection_benchmark_{exp_id}.pkl"
if not os.path.exists(pkl_path):
    raise FileNotFoundError(f"Pickle not found: {pkl_path}")

# Create a new report file for capturing print statements.
# The handle deliberately stays open across the following cells; it is closed
# at the end of the "Additional Key Insights" cell.
print_report_path = f"{file_save_path}data_report_{exp_id}.md"
print_report_file = open(print_report_path, 'w', encoding='utf-8')

try:
    # Write header to the print report
    print_report_file.write(f"# Feature Selection Print Report - {exp_id}\n\n")
    print_report_file.write(f"**Generated**: {time.strftime('%Y-%m-%d %H:%M:%S')}\n\n")
    print_report_file.write("This report captures all print statements from the Results section with proper formatting.\n\n")

    # NOTE: unpickling can execute arbitrary code; only load pickles produced
    # by this project.
    df_benchmark = pd.read_pickle(pkl_path)
except Exception:
    # Do not leak the handle if the header write or the load fails
    print_report_file.close()
    raise

save_and_print(f"Loaded df_benchmark with shape: {df_benchmark.shape}", print_report_file, level="section")

Loaded df_benchmark with shape: (1440, 15)


'Loaded df_benchmark with shape: (1440, 15)'

### Fix Data Structure Issues

In [66]:
def _split_condition(condition):
    """Split a '{method}_k{value}_{model}' string into (method, k_value, model_name).

    ``k_value`` is None when no 'k<digits>' token exists; method and model then
    fall back to "all tokens but the last" and "the last token".
    """
    parts = condition.split('_')
    for pos, part in enumerate(parts):
        digits = part[1:]
        # Only 'k' followed purely by ASCII digits is the k-value token. A
        # method token that merely starts with 'k' must not be passed to int().
        if part.startswith('k') and digits.isascii() and digits.isdigit():
            return '_'.join(parts[:pos]), int(digits), '_'.join(parts[pos + 1:])

    method = '_'.join(parts[:-1]) if len(parts) > 1 else parts[0]
    return method, None, parts[-1]


def parse_condition_column(df_benchmark):
    """Parse the condition column to extract method, k_value, and model_name correctly.

    Each 'condition' string is expected to look like
    '{method}_k{value}_{model}', e.g. "mrmr_anova_prefilter_k5_KNeighborsRegressor".
    The parsed values are compared with the existing 'method', 'k_value' and
    'model_name' columns (up to five mismatches per column are reported) and
    then written over those columns.

    Progress is reported through ``save_and_print`` into the module-level
    ``print_report_file``.

    Args:
        df_benchmark: Benchmark results with 'condition', 'method', 'k_value'
            and 'model_name' columns. It is modified in place.

    Returns:
        The same ``df_benchmark`` object with the three columns replaced.
    """
    save_and_print("## Fixing Data Structure Issues", print_report_file, level="section")
    save_and_print("Parsing condition column to extract correct method, k_value, and model_name", print_report_file, level="info")

    parsed_data = []
    for condition in df_benchmark['condition']:
        method, k_value, model_name = _split_condition(condition)

        # If we didn't find a k-value (shouldn't happen with valid data)
        if k_value is None:
            save_and_print(f"Warning: Could not parse k-value from condition: {condition}", print_report_file, level="info")
            k_value = 0

        parsed_data.append({
            'condition': condition,
            'parsed_method': method,
            'parsed_k_value': k_value,
            'parsed_model_name': model_name
        })

    # Reuse the benchmark's own index so comparisons and assignments below line
    # up row by row even when it is not a default RangeIndex. Explicit columns
    # keep the frame usable when there are no rows at all.
    parsed_df = pd.DataFrame(
        parsed_data,
        index=df_benchmark.index,
        columns=['condition', 'parsed_method', 'parsed_k_value', 'parsed_model_name'],
    )

    # Compare with existing columns
    save_and_print("### Comparison of Original vs Parsed Values", print_report_file, level="subsection")

    # (report label, column name, quote character used around the values)
    checks = (
        ("Method", 'method', "'"),
        ("K-value", 'k_value', ""),
        ("Model name", 'model_name', "'"),
    )
    for label, column, quote in checks:
        parsed_column = f"parsed_{column}"
        mismatch = df_benchmark[column] != parsed_df[parsed_column]
        if mismatch.any():
            save_and_print(f"{label} mismatches found: {mismatch.sum()}/{len(df_benchmark)}", print_report_file, level="info")
            for idx in df_benchmark[mismatch].index[:5]:  # Show first 5 mismatches
                save_and_print(
                    f"  Row {idx}: Original={quote}{df_benchmark.loc[idx, column]}{quote}, Parsed={quote}{parsed_df.loc[idx, parsed_column]}{quote}",
                    print_report_file, level="info")

    # Update the dataframe with parsed values
    df_benchmark['method'] = parsed_df['parsed_method']
    df_benchmark['k_value'] = parsed_df['parsed_k_value']
    df_benchmark['model_name'] = parsed_df['parsed_model_name']

    save_and_print("Dataframe columns updated with correctly parsed values", print_report_file, level="info")

    # Show unique values after parsing
    save_and_print("### Unique Values After Parsing", print_report_file, level="subsection")
    save_and_print(f"Methods: {df_benchmark['method'].unique()}", print_report_file, level="info")
    save_and_print(f"K-values: {sorted(df_benchmark['k_value'].unique())}", print_report_file, level="info")
    save_and_print(f"Models: {df_benchmark['model_name'].unique()}", print_report_file, level="info")

    return df_benchmark

# Apply the parsing fix (parse_condition_column overwrites the method, k_value
# and model_name columns of the frame it is given and returns that frame)
df_benchmark = parse_condition_column(df_benchmark)

# Display first rows (works in notebook)
# Outside IPython the import or display call fails, and the head is written as
# plain text through save_and_print instead.
try:
    from IPython.display import display

    display(df_benchmark.head())
except Exception:
    save_and_print(df_benchmark.head().to_string(), print_report_file, level="info")

# Re-define variables that might be needed in the loaded section
# Use actual k-values present in the data instead of predefined list
feature_set_sizes = sorted(df_benchmark['k_value'].unique())
print(f"Actual k-values present in data: {feature_set_sizes}")

## Fixing Data Structure Issues
Parsing condition column to extract correct method, k_value, and model_name
### Comparison of Original vs Parsed Values
Method mismatches found: 1200/1440
  Row 0: Original='mrmr', Parsed='mrmr_network_d2'
  Row 1: Original='mrmr', Parsed='mrmr_network_d2'
  Row 2: Original='mrmr', Parsed='mrmr_network_d2'
  Row 3: Original='mrmr', Parsed='mrmr_network_d2'
  Row 4: Original='mrmr', Parsed='mrmr_network_d2'
Model name mismatches found: 1200/1440
  Row 0: Original='d2', Parsed='KNeighborsRegressor'
  Row 1: Original='d2', Parsed='LinearRegression'
  Row 2: Original='d2', Parsed='SVR'
  Row 3: Original='d2', Parsed='KNeighborsRegressor'
  Row 4: Original='d2', Parsed='LinearRegression'
Dataframe columns updated with correctly parsed values
### Unique Values After Parsing
Methods: ['mrmr_network_d2' 'mrmr_network_d3' 'mrmr_network_d4' 'mrmr'
 'anova_filter' 'random_select']
K-values: [np.int64(5), np.int64(10), np.int64(20), np.int64(40), np.int64(50), np.in

Unnamed: 0,rng,condition,feature_importance,feature_importance_from,model_performance,metrics,selected_features,model_name,selected_scores,k,y_pred,y_true_index,feature_selection_time,k_value,method
0,15795,mrmr_network_d2_k5_KNeighborsRegressor,"([P06400;RB_HUMAN, P84022;SMAD3_HUMAN, P08238;...",model,0.227172,"{'r2': 0.22717155944145095, 'pearson_r': 0.481...","[P06400;RB_HUMAN, P84022;SMAD3_HUMAN, P08238;H...",KNeighborsRegressor,"[131.3165785071838, 327.1223954013686, 565.674...",5,"[4.145812627205843, 3.565269312907551, 4.00127...","Index(['SIDM00879', 'SIDM00213', 'SIDM00542', ...",0.631287,5,mrmr_network_d2
1,15795,mrmr_network_d2_k5_LinearRegression,"([P06400;RB_HUMAN, P84022;SMAD3_HUMAN, P08238;...",model,0.28611,"{'r2': 0.2861099002767664, 'pearson_r': 0.5542...","[P06400;RB_HUMAN, P84022;SMAD3_HUMAN, P08238;H...",LinearRegression,"[131.3165785071838, 327.1223954013686, 565.674...",5,"[4.336589260450065, 3.911674454641713, 4.34378...","Index(['SIDM00879', 'SIDM00213', 'SIDM00542', ...",0.538203,5,mrmr_network_d2
2,15795,mrmr_network_d2_k5_SVR,"([P06400;RB_HUMAN, P84022;SMAD3_HUMAN, P08238;...",model,0.297102,"{'r2': 0.297102349943117, 'pearson_r': 0.57271...","[P06400;RB_HUMAN, P84022;SMAD3_HUMAN, P08238;H...",SVR,"[131.3165785071838, 327.1223954013686, 565.674...",5,"[4.355579707685028, 3.9226434809363093, 4.3620...","Index(['SIDM00879', 'SIDM00213', 'SIDM00542', ...",0.440071,5,mrmr_network_d2
3,15795,mrmr_network_d2_k10_KNeighborsRegressor,"([P06400;RB_HUMAN, P84022;SMAD3_HUMAN, P08238;...",model,0.234771,"{'r2': 0.23477148708124618, 'pearson_r': 0.512...","[P06400;RB_HUMAN, P84022;SMAD3_HUMAN, P08238;H...",KNeighborsRegressor,"[131.3165785071838, 327.1223954013686, 565.674...",10,"[3.3206764321843623, 3.8438008415826626, 4.694...","Index(['SIDM00879', 'SIDM00213', 'SIDM00542', ...",0.43639,10,mrmr_network_d2
4,15795,mrmr_network_d2_k10_LinearRegression,"([P06400;RB_HUMAN, P84022;SMAD3_HUMAN, P08238;...",model,0.331693,"{'r2': 0.331692676057021, 'pearson_r': 0.59089...","[P06400;RB_HUMAN, P84022;SMAD3_HUMAN, P08238;H...",LinearRegression,"[131.3165785071838, 327.1223954013686, 565.674...",10,"[4.446959121790704, 4.260871526923133, 4.11757...","Index(['SIDM00879', 'SIDM00213', 'SIDM00542', ...",0.436604,10,mrmr_network_d2


Actual k-values present in data: [np.int64(5), np.int64(10), np.int64(20), np.int64(40), np.int64(50), np.int64(60), np.int64(80), np.int64(100)]


## Dataset Exploration - df_benchmark

In [67]:
# Basic Dataset Overview
save_and_print("## Basic Dataset Overview", print_report_file, level="section")

# Size of the benchmark table, in rows/columns and in memory
save_and_print(f"Dataset shape: {df_benchmark.shape} (rows × columns)", print_report_file, level="info")
save_and_print(f"Memory usage: {df_benchmark.memory_usage(deep=True).sum() / 1024**2:.2f} MB", print_report_file, level="info")

# One row per column: its name and its dtype
save_and_print("### Column Information", print_report_file, level="subsection")
column_info = pd.DataFrame({
    'Column': df_benchmark.dtypes.index,
    'Data Type': df_benchmark.dtypes.values
})
save_and_print(column_info.to_string(index=False), print_report_file, level="info")

# Count and share of missing entries per column
save_and_print("### Missing Values Analysis", print_report_file, level="subsection")
missing_values = df_benchmark.isna().sum()
missing_percentage = missing_values / len(df_benchmark) * 100
missing_df = pd.DataFrame({
    'Column': missing_values.index,
    'Missing Count': missing_values.values,
    'Missing Percentage': missing_percentage.values
})
save_and_print(missing_df.to_string(index=False), print_report_file, level="info")

## Basic Dataset Overview
Dataset shape: (1440, 15) (rows × columns)
Memory usage: 8.69 MB
### Column Information
                 Column Data Type
                    rng     int64
              condition    object
     feature_importance    object
feature_importance_from    object
      model_performance   float64
                metrics    object
      selected_features    object
             model_name    object
        selected_scores    object
                      k     int64
                 y_pred    object
           y_true_index    object
 feature_selection_time   float64
                k_value     int64
                 method    object
### Missing Values Analysis
                 Column  Missing Count  Missing Percentage
                    rng              0                 0.0
              condition              0                 0.0
     feature_importance              0                 0.0
feature_importance_from              0                 0.0
      model_perform

'                 Column  Missing Count  Missing Percentage\n                    rng              0                 0.0\n              condition              0                 0.0\n     feature_importance              0                 0.0\nfeature_importance_from              0                 0.0\n      model_performance              0                 0.0\n                metrics              0                 0.0\n      selected_features              0                 0.0\n             model_name              0                 0.0\n        selected_scores              0                 0.0\n                      k              0                 0.0\n                 y_pred              0                 0.0\n           y_true_index              0                 0.0\n feature_selection_time              0                 0.0\n                k_value              0                 0.0\n                 method              0                 0.0'

In [68]:
# Additional Key Insights
save_and_print("## Additional Key Insights", print_report_file, level="section")

# Distinct (method, model, k) settings present in the benchmark
save_and_print("### Unique Combinations of Key Parameters", print_report_file, level="subsection")
key_params = ['method', 'model_name', 'k_value']
available_params = [name for name in key_params if name in df_benchmark.columns]

if not available_params:
    save_and_print("Key parameter columns not found in dataset", print_report_file, level="info")
else:
    unique_combinations = df_benchmark[available_params].drop_duplicates()
    save_and_print(f"Number of unique combinations: {len(unique_combinations)}", print_report_file, level="info")
    save_and_print("Unique combinations:", print_report_file, level="info")
    save_and_print(unique_combinations.to_string(index=False), print_report_file, level="info")

# Aggregate the primary performance metric per (method, model) pair
save_and_print("### Performance Metrics Summary by Method and Model", print_report_file, level="subsection")
if all(name in df_benchmark.columns for name in ('method', 'model_name', 'model_performance')):
    performance_summary = (
        df_benchmark
        .groupby(['method', 'model_name'])['model_performance']
        .agg(['mean', 'std', 'min', 'max', 'count'])
    )
    save_and_print("Performance summary by method and model:", print_report_file, level="info")
    save_and_print(performance_summary.to_string(), print_report_file, level="info")
else:
    save_and_print("Required columns for performance summary not found", print_report_file, level="info")

# The report is complete: close the file, then announce its location on the console only
print_report_file.close()
save_and_print(f"Data report saved to: {print_report_path}", level="info")

## Additional Key Insights
### Unique Combinations of Key Parameters
Number of unique combinations: 144
Unique combinations:
         method          model_name  k_value
mrmr_network_d2 KNeighborsRegressor        5
mrmr_network_d2    LinearRegression        5
mrmr_network_d2                 SVR        5
mrmr_network_d2 KNeighborsRegressor       10
mrmr_network_d2    LinearRegression       10
mrmr_network_d2                 SVR       10
mrmr_network_d2 KNeighborsRegressor       20
mrmr_network_d2    LinearRegression       20
mrmr_network_d2                 SVR       20
mrmr_network_d2 KNeighborsRegressor       40
mrmr_network_d2    LinearRegression       40
mrmr_network_d2                 SVR       40
mrmr_network_d2 KNeighborsRegressor       50
mrmr_network_d2    LinearRegression       50
mrmr_network_d2                 SVR       50
mrmr_network_d2 KNeighborsRegressor       60
mrmr_network_d2    LinearRegression       60
mrmr_network_d2                 SVR       60
mrmr_network_d2 KNei

'Data report saved to: I:\\My Drive\\DAWSON PHD PROJECT\\Biomarker Data Repository\\data/results/ThesisResult4-FeatureSelectionBenchmark/v5_network_integration_rep10/data_report_v5_network_integration_rep10.md'