## Initialisation

In [1]:
import os

path = os.getcwd()
# Locate the first occurrence of the marker 'project' in the current working directory.
project_marker = 'project'
index_project = path.find(project_marker)
# str.find returns -1 when the marker is absent; without this guard the slice below
# would silently become path[:6] and the notebook would chdir to an unrelated prefix.
if index_project == -1:
    raise RuntimeError(
        f"Cannot locate the project root: '{project_marker}' does not appear in {path!r}. "
        "Start the notebook from inside the project directory."
    )
# Keep the path from its start up to and including the marker, i.e. strip any
# sub-directories below the project root.
# NOTE(review): only the FIRST occurrence is used, so a parent folder whose name
# contains 'project' would win over the real project folder.
project_path = path[:index_project + len(project_marker)]
# set the working directory
os.chdir(project_path)
print(f'Project path set to: {os.getcwd()}')


Project path set to: c:\Github\ode-biomarker-project


In [2]:
from PathLoader import PathLoader
# Project path helper built from two env files; its get_data_path() is used further
# down to locate the results folder.
# NOTE(review): the env file names are presumably resolved relative to the working
# directory set by the initialisation cell — confirm against PathLoader.
path_loader = PathLoader('data_config.env', 'current_user.env')

In [3]:
from DataLink import DataLink
# Data accessor used below through get_data_using_code(); built from the path loader
# and 'data_codes.csv'.
# NOTE(review): 'data_codes.csv' is presumably the registry of valid loading codes — confirm.
data_link = DataLink(path_loader, 'data_codes.csv')

In [4]:
folder_name = "ThesisResult3-BenchmarkingExpressionData"
exp_id = "v1"

# Build the results directory path once. The trailing slash is kept on purpose:
# later cells append file names directly onto file_save_path.
file_save_path = f'{path_loader.get_data_path()}data/results/{folder_name}/'

# exist_ok=True replaces the separate os.path.exists() check, which could race with
# another process creating the folder between the check and the makedirs call.
os.makedirs(file_save_path, exist_ok=True)

### Load in Palbociclib datasets

In [5]:
# Load two Palbociclib (LN_IC50) datasets through data_link: first the one addressed by
# the "goncalves-..." code (bound to the proteomic_* names), then the "ccle-..." one.
# Each call returns a (feature_data, label_data) pair.
loading_code = "goncalves-gdsc-2-Palbociclib-LN_IC50-sin"
# Loading-code template:
# generic-gdsc-{number}-{drug_name}-{target_label}-{dataset_name}-{replace_index}-{row_index}
proteomic_feature_data, proteomic_label_data = data_link.get_data_using_code(loading_code)

print(f'Proteomic feature data shape: {proteomic_feature_data.shape}', f'Proteomic label data shape: {proteomic_label_data.shape}')

# NOTE: loading_code is rebound here; after this cell it holds the CCLE code.
loading_code = "ccle-gdsc-2-Palbociclib-LN_IC50"
ccle_feature_data, ccle_label_data = data_link.get_data_using_code(loading_code)

print(f'CCLE feature data shape: {ccle_feature_data.shape}', f'CCLE label data shape: {ccle_label_data.shape}')

Proteomic feature data shape: (737, 6692) Proteomic label data shape: (737,)
CCLE feature data shape: (584, 19221) CCLE label data shape: (584,)


## Functions 

### Random Forest F-Regression

In [10]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from toolkit import Powerkit, FirstQuantileImputer, select_stat_features


def pipeline_rf_freg(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    rng: int,
    k: int = 50,
    rf_kwargs: dict | None = None,
):
    """Fit imputation, statistical feature selection and a random forest on training data.

    Parameters
    ----------
    X_train : training feature table.
    y_train : training target values.
    rng : seed forwarded to ``RandomForestRegressor(random_state=...)``.
    k : number of features requested from ``select_stat_features``
        (passed as ``selection_size``).
    rf_kwargs : optional extra keyword arguments for ``RandomForestRegressor``;
        ``None`` (or an empty dict) means library defaults.

    Returns
    -------
    dict with keys ``"imputer"`` (fitted imputer), ``"selected_features"``
    (list of selected feature names) and ``"model"`` (fitted forest) — the
    components consumed by ``eval_regression``.
    """
    # Imputation statistics are learned from the training split only.
    fitted_imputer = FirstQuantileImputer().fit(X_train)
    train_imputed = fitted_imputer.transform(X_train, return_df=True)

    # Feature selection also sees training data only.
    feature_names, train_selected = select_stat_features(
        train_imputed, y_train, selection_size=k
    )

    # Train the forest on the reduced feature table.
    extra_params = rf_kwargs or {}
    forest = RandomForestRegressor(random_state=rng, **extra_params)
    forest.fit(train_selected, y_train)

    return {
        "imputer": fitted_imputer,
        "selected_features": list(feature_names),
        "model": forest,
    }


def eval_regression(
    X_test: pd.DataFrame,
    y_test: pd.Series,
    *,
    pipeline_components: dict,
    metric: str = "r2",
):
    """Score a fitted pipeline (as returned by ``pipeline_rf_freg``) on held-out data.

    Parameters
    ----------
    X_test : test feature table.
    y_test : test target values.
    pipeline_components : dict with keys ``"imputer"``, ``"selected_features"``
        and ``"model"``.
    metric : ``"r2"`` selects the R^2 score; ANY other value selects the
        Pearson correlation coefficient.

    Returns
    -------
    dict with ``"feature_importance"`` (tuple ``(names_array, scores_array)``),
    ``"model_performance"`` (the chosen metric) and ``"y_pred"``.
    """
    fitted_imputer = pipeline_components["imputer"]
    feature_names = pipeline_components["selected_features"]
    fitted_model = pipeline_components["model"]

    # Apply the train-fitted imputer, then keep only the selected columns.
    test_selected = fitted_imputer.transform(X_test, return_df=True)[feature_names]
    predictions = fitted_model.predict(test_selected)

    if metric != "r2":
        # Every non-"r2" metric name currently maps to Pearson's r.
        from scipy.stats import pearsonr

        score = pearsonr(y_test, predictions)[0]
    else:
        score = r2_score(y_test, predictions)

    # Feature importance tuple: (feature_names, scores); zeros when the model
    # exposes no built-in importances.
    name_array = np.array(feature_names)
    if not hasattr(fitted_model, "feature_importances_"):
        importance_scores = np.zeros(len(feature_names))
    else:
        importance_scores = fitted_model.feature_importances_

    return {
        "feature_importance": (name_array, importance_scores),
        "model_performance": score,
        "y_pred": predictions,
    }

### Expression Data Benchmarking Pipe

In [11]:
from typing import Dict, List, Literal
import numpy as np # noqa: F811
import pandas as pd

from scipy.stats import pearsonr, spearmanr
from sklearn.feature_selection import VarianceThreshold
from sklearn.metrics import r2_score # noqa: F811
from sklearn.dummy import DummyRegressor

from toolkit import FirstQuantileImputer, f_regression_select, get_model_from_string  # noqa: F811


def _drop_correlated_columns(X: pd.DataFrame, threshold: float = 0.95) -> List[str]:
    corr = X.corr().abs()
    upper = corr.where(np.triu(np.ones(corr.shape), k=1).astype(bool))
    to_drop = set()
    for col in sorted(upper.columns):
        if col in to_drop:
            continue
        high_corr = upper.index[upper[col] > threshold].tolist()
        to_drop.update(high_corr)
    return [c for c in X.columns if c not in to_drop]

def _drop_correlated_columns_memory_efficient_optimized(X: pd.DataFrame, threshold: float = 0.95) -> List[str]:
    n_features = X.shape[1]
    to_drop = set()
    kept_columns = set()  # Track columns we're keeping
    
    batch_size = 1000
    
    for i in range(0, n_features, batch_size):
        end_idx = min(i + batch_size, n_features)
        batch_cols = X.columns[i:end_idx]
        
        # Calculate correlation for current batch against ALL kept columns
        if kept_columns:
            corr_batch = X[batch_cols].corrwith(X[list(kept_columns)], axis=0).abs()
        else:
            corr_batch = pd.DataFrame(index=batch_cols, columns=[])
        
        for j, col in enumerate(batch_cols):
            if col in to_drop:
                continue
            
            # Check if correlated with any kept columns
            should_drop = False
            if kept_columns:
                max_corr = corr_batch.loc[col].max() if col in corr_batch.index else 0
                if max_corr > threshold:
                    should_drop = True
            
            if should_drop:
                to_drop.add(col)
            else:
                kept_columns.add(col)
    
    return list(kept_columns)

def adaptive_drop_correlated(X, threshold=0.95):
    """Pick a correlation filter based on the width of ``X``.

    Tables with fewer than 5000 columns go through the full-matrix filter
    ``_drop_correlated_columns``; wider tables go through the batch-wise
    ``_drop_correlated_columns_memory_efficient_optimized``. Returns whatever
    the chosen filter returns (a list of kept column names).
    """
    use_full_matrix = X.shape[1] < 5000
    chosen_filter = (
        _drop_correlated_columns
        if use_full_matrix
        else _drop_correlated_columns_memory_efficient_optimized
    )
    return chosen_filter(X, threshold)


def baseline_pipeline(
    X_train: pd.DataFrame,
    y_train: pd.Series,
    rng: int,
    *,
    k: int = 500,
    var_threshold: float = 0.0,
    corr_threshold: float = 0.95,
    model_name: Literal[
        "LinearRegression",
        "RandomForestRegressor",
        "SVR",
        "KNeighborsRegressor",
        "XGBRegressor",
    ] = "LinearRegression",
) -> Dict:
    """Fit the fixed baseline pipeline on a training split.

    Stages, all fitted on training data only: replace +/-inf by NaN and drop
    rows with a missing label -> ``FirstQuantileImputer`` (remaining NaN set to
    0) -> ``VarianceThreshold`` -> ``adaptive_drop_correlated`` ->
    ``f_regression_select`` top-k -> one model with fixed hyper-parameters.
    When no feature survives (effective k of 0) a mean ``DummyRegressor`` is
    fitted on a single all-zero placeholder column instead.

    Parameters
    ----------
    X_train, y_train : training features and target.
    rng : seed passed as ``random_state`` to the models that accept one here
        (RandomForestRegressor, XGBRegressor); also echoed in the result.
    k : number of features requested from the univariate selector; capped at
        the number of columns left after filtering.
    var_threshold : threshold given to ``VarianceThreshold``.
    corr_threshold : threshold given to ``adaptive_drop_correlated``.
    model_name : which estimator ``get_model_from_string`` should build.

    Returns
    -------
    Dict holding the fitted components (imputer, kept-column lists, selected
    features and their selector scores, model) plus bookkeeping such as feature
    counts per stage, thresholds, ``rng`` and the number of training rows used.

    Raises
    ------
    ValueError : if features are available and ``model_name`` is not one of
        the supported names.
    """
    # 0) Sanitize inputs: infinities become NaN, unlabeled rows are removed
    infinities = [np.inf, -np.inf]
    X_train = X_train.replace(infinities, np.nan)
    y_train = pd.Series(y_train).replace(infinities, np.nan)
    labelled = ~y_train.isna()
    X_train = X_train.loc[labelled]
    y_train = y_train.loc[labelled]

    # 1) Impute on train; fillna(0.0) guarantees no residual NaN reaches the filters
    imputer = FirstQuantileImputer().fit(X_train)
    features = imputer.transform(X_train, return_df=True).astype(float).fillna(0.0)
    n_features_initial = features.shape[1]

    # 2) Variance filter
    variance_filter = VarianceThreshold(threshold=var_threshold).fit(features)
    vt_keep_cols = features.columns[variance_filter.get_support()].tolist()
    features = features[vt_keep_cols]
    n_features_post_variance = features.shape[1]

    # 3) Correlation filter
    corr_keep_cols = adaptive_drop_correlated(features, threshold=corr_threshold)
    features = features[corr_keep_cols]
    n_features_post_correlation = features.shape[1]

    # 4) + 5) Univariate top-k selection followed by the fixed model
    n_available = features.shape[1]
    k_sel = min(k, n_available) if n_available > 0 else 0
    no_features = k_sel == 0

    if no_features:
        # Nothing to select: keep an empty frame and fall back to predicting the mean
        selected_features, selector_scores = [], np.array([])
        sel_train = features.iloc[:, :0]

        model = DummyRegressor(strategy="mean")
        model.fit(np.zeros((len(y_train), 1)), y_train)
        model_type = "DummyRegressor(mean)"
        model_params = {"strategy": "mean"}
    else:
        selected_features, selector_scores = f_regression_select(
            features, y_train, k=k_sel
        )
        sel_train = features[selected_features]

        # Fixed hyper-parameters per supported estimator
        estimator_kwargs = {
            "LinearRegression": {},
            "RandomForestRegressor": {"n_estimators": 100, "random_state": rng},
            "SVR": {"kernel": "linear", "C": 1.0},
            "KNeighborsRegressor": {"n_neighbors": 5, "weights": "distance", "p": 2},
            "XGBRegressor": {
                "n_estimators": 200,
                "learning_rate": 0.1,
                "max_depth": 6,
                "subsample": 1.0,
                "colsample_bytree": 1.0,
                "random_state": rng,
            },
        }
        if model_name not in estimator_kwargs:
            raise ValueError("Unsupported model_name for baseline benchmarking.")
        model = get_model_from_string(model_name, **estimator_kwargs[model_name])

        model.fit(sel_train, y_train)
        model_type = model_name
        # Best effort: parameters are informational only
        try:
            model_params = model.get_params(deep=False)
        except Exception:
            model_params = {}

    return {
        "imputer": imputer,
        "vt_keep_cols": vt_keep_cols,
        "corr_keep_cols": corr_keep_cols,
        "selected_features": list(selected_features),
        "selector_scores": np.array(selector_scores),
        "k_requested": int(k),
        "k_effective": int(len(selected_features)),
        "n_features_initial": int(n_features_initial),
        "n_features_post_variance": int(n_features_post_variance),
        "n_features_post_correlation": int(n_features_post_correlation),
        "var_threshold": float(var_threshold),
        "corr_threshold": float(corr_threshold),
        "model": model,
        "model_type": model_type,
        "model_params": model_params,
        "train_data": sel_train,  # may be empty if no_features
        "rng": int(rng),
        "no_features": bool(no_features),
        "n_train_samples_used": int(len(y_train)),
    }


def baseline_eval(
    X_test: pd.DataFrame,
    y_test: pd.Series,
    *,
    pipeline_components: Dict,
    metric_primary: Literal["r2", "pearson_r", "spearman_r"] = "r2",
    importance_from: Literal["selector", "model"] = "selector",
) -> Dict:
    """Evaluate the components returned by ``baseline_pipeline`` on a test split.

    The test data goes through the same train-fitted transforms (imputer,
    variance / correlation column lists, selected features), predictions are
    made, and R^2, Pearson and Spearman statistics are computed on the rows
    where both label and prediction are finite.

    Parameters
    ----------
    X_test, y_test : held-out features and target.
    pipeline_components : dict produced by ``baseline_pipeline``.
    metric_primary : which metric is reported as ``"model_performance"``.
        ``"spearman_r"`` is mapped onto the stored ``"spearman_rho"`` value;
        unknown names fall back to R^2.
    importance_from : ``"selector"`` reports the univariate selector scores;
        ``"model"`` reports ``feature_importances_`` when the model has them,
        ``|coef_|`` for LinearRegression, and zeros otherwise.

    Returns
    -------
    Dict with the primary score, the full ``"metrics"`` dict, feature
    importance tuple ``(names, scores)``, pipeline bookkeeping echoed from
    ``pipeline_components``, the finite predictions (``"y_pred"``) and the
    matching label index (``"y_true_index"``).

    Notes
    -----
    * In the no-feature case the prediction comes from the fitted fallback
      model (trained on a single all-zero placeholder column), i.e. the
      TRAINING mean. Using the test-set mean here would leak test labels into
      the prediction.
    * Fewer than two usable test rows yields NaN for every metric.
    """
    # Unpack
    imputer = pipeline_components["imputer"]
    vt_keep = set(pipeline_components["vt_keep_cols"])
    corr_keep = set(pipeline_components["corr_keep_cols"])
    selected = list(pipeline_components["selected_features"])
    selector_scores = pipeline_components["selector_scores"]
    model = pipeline_components["model"]
    model_name = pipeline_components["model_type"]
    model_params = pipeline_components.get("model_params", {})
    rng = pipeline_components.get("rng", None)
    no_features = pipeline_components.get("no_features", False)

    k_requested = pipeline_components.get("k_requested", len(selected))
    k_effective = pipeline_components.get("k_effective", len(selected))
    n_features_initial = pipeline_components.get("n_features_initial", None)
    n_features_post_variance = pipeline_components.get("n_features_post_variance", None)
    n_features_post_correlation = pipeline_components.get(
        "n_features_post_correlation", None
    )
    var_threshold = pipeline_components.get("var_threshold", None)
    corr_threshold = pipeline_components.get("corr_threshold", None)

    # 0) Sanitize test inputs: infinities become NaN, unlabeled rows are removed
    X_test = X_test.replace([np.inf, -np.inf], np.nan)
    y_test = pd.Series(y_test).replace([np.inf, -np.inf], np.nan)
    mask_y = ~y_test.isna()
    X_test, y_test = X_test.loc[mask_y], y_test.loc[mask_y]

    # Apply identical transforms (column lists come from the training fit)
    Xti = imputer.transform(X_test, return_df=True).astype(float).fillna(0.0)
    cols_after_vt = [c for c in Xti.columns if c in vt_keep]
    Xti = Xti[cols_after_vt]
    cols_after_corr = [c for c in Xti.columns if c in corr_keep]
    Xti = Xti[cols_after_corr]
    Xsel = Xti[selected] if len(selected) > 0 else Xti.iloc[:, :0]

    # Predict robustly
    n_rows = len(y_test)
    if n_rows == 0:
        # Nothing to predict; estimators generally reject zero-row input.
        y_pred = np.empty(0, dtype=float)
    elif no_features or Xsel.shape[1] == 0:
        # Same placeholder shape the fallback model was fitted on -> training mean.
        y_pred = np.asarray(model.predict(np.zeros((n_rows, 1))), dtype=float)
    else:
        y_pred = np.asarray(model.predict(Xsel), dtype=float)

    # Filter any non-finite values before metrics
    mask_fin = np.isfinite(y_test.values) & np.isfinite(y_pred)
    y_t = y_test.values[mask_fin]
    y_p = y_pred[mask_fin]
    n_test_used = int(y_t.shape[0])

    if n_test_used < 2:
        r2 = np.nan
        pearson_r = pearson_p = np.nan
        spearman_rho = spearman_p = np.nan
    else:
        r2 = r2_score(y_t, y_p)
        pearson_r, pearson_p = pearsonr(y_t, y_p)
        spearman_rho, spearman_p = spearmanr(y_t, y_p)

    # Non-finite statistics (e.g. correlation of a constant prediction) are stored as NaN
    metrics = {
        "r2": float(r2) if np.isfinite(r2) else np.nan,
        "pearson_r": float(pearson_r) if np.isfinite(pearson_r) else np.nan,
        "pearson_p": float(pearson_p) if np.isfinite(pearson_p) else np.nan,
        "spearman_rho": float(spearman_rho) if np.isfinite(spearman_rho) else np.nan,
        "spearman_p": float(spearman_p) if np.isfinite(spearman_p) else np.nan,
        "n_test_samples_used": n_test_used,
    }

    # Importance
    if importance_from == "selector":
        fi = (np.array(selected), np.array(selector_scores))
    else:
        if hasattr(model, "feature_importances_") and len(selected) > 0:
            fi = (np.array(selected), model.feature_importances_)
        elif model_name in ("LinearRegression",) and len(selected) > 0:
            coef = getattr(model, "coef_", np.zeros(len(selected)))
            fi = (np.array(selected), np.abs(coef))
        else:
            fi = (np.array(selected), np.zeros(len(selected)))

    # The public option is spelled "spearman_r" but the value is stored under
    # "spearman_rho"; without this alias the lookup silently returned R^2.
    metric_aliases = {"spearman_r": "spearman_rho"}
    metric_key = metric_aliases.get(metric_primary, metric_primary)
    primary = metrics.get(metric_key, metrics["r2"])

    return {
        "feature_importance": fi,
        "feature_importance_from": importance_from,
        "model_performance": float(primary) if primary is not None else np.nan,
        "metrics": metrics,
        "k_requested": int(k_requested),
        "k_effective": int(k_effective),
        "n_features_initial": int(n_features_initial)
        if n_features_initial is not None
        else None,
        "n_features_post_variance": int(n_features_post_variance)
        if n_features_post_variance is not None
        else None,
        "n_features_post_correlation": int(n_features_post_correlation)
        if n_features_post_correlation is not None
        else None,
        "var_threshold": float(var_threshold) if var_threshold is not None else None,
        "corr_threshold": float(corr_threshold) if corr_threshold is not None else None,
        "model_name": model_name,
        "model_params": model_params,
        "rng": rng,
        "selected_features": selected,
        "selector_scores": np.array(selector_scores),
        "y_pred": y_p,  # filtered to finite entries
        "y_true_index": y_test.index[mask_fin],
        "n_train_samples_used": pipeline_components.get("n_train_samples_used", None),
    }


## Execution

### Benchmarking Expression Datasets (k=500)

In [None]:
import numpy as np  # noqa: F811
import pandas as pd
from toolkit import Powerkit  # noqa: F811

# 1) Align indices and order (critical for identical splits across modalities)
# Keep only the sample ids present in all four objects, in sorted order.
common = sorted(
    set(ccle_label_data.index)
    & set(ccle_feature_data.index)
    & set(proteomic_label_data.index)
    & set(proteomic_feature_data.index)
)
# NOTE: the loaded frames are rebound here; from this cell on they hold only the
# shared samples (re-run the loading cell to get the full datasets back).
ccle_feature_data = ccle_feature_data.loc[common]
ccle_label_data = ccle_label_data.loc[common]
proteomic_feature_data = proteomic_feature_data.loc[common]
proteomic_label_data = proteomic_label_data.loc[common]

# Optional: ensure numeric only
ccle_feature_data = ccle_feature_data.select_dtypes(include=[np.number])
proteomic_feature_data = proteomic_feature_data.select_dtypes(include=[np.number])

# Pipeline settings shared by every condition in this section (read by add_baselines)
fixed_args = {"k": 500, "var_threshold": 0.0, "corr_threshold": 0.95}
models = [
    "LinearRegression",
    "RandomForestRegressor",
    "SVR",
]  # extend with "KNeighborsRegressor","XGBRegressor"


def add_baselines(pk: Powerkit, model_list):
    """Register one baseline condition per model name on ``pk``.

    Each condition is named ``baseline_<model name, lower-cased, without
    'regressor'>`` and pairs ``baseline_pipeline`` (with the module-level
    ``fixed_args`` plus the model name) with ``baseline_eval`` (primary metric
    R^2, importance taken from the selector).
    """
    for estimator_name in model_list:
        short_name = estimator_name.lower().replace('regressor', '')
        pk.add_condition(
            condition=f"baseline_{short_name}",
            # 3.3.2 focuses on performance; set True if you need FI aggregation
            get_importance=False,
            pipeline_function=baseline_pipeline,
            pipeline_args={**fixed_args, "model_name": estimator_name},
            eval_function=baseline_eval,
            eval_args=dict(metric_primary="r2", importance_from="selector"),
        )


# 2) Build Powerkit instances and register conditions
pk_rna = Powerkit(ccle_feature_data, ccle_label_data)
add_baselines(pk_rna, models)

pk_prot = Powerkit(proteomic_feature_data, proteomic_label_data)
add_baselines(pk_prot, models)

# 3) Identical RNGs for fair repeated holdouts
# 50 seeds drawn from a fixed-seed generator, shared by both modalities.
rngs = np.random.RandomState(42).randint(0, 100000, size=50)

# 4) Run all conditions and tag each result table with its modality
# NOTE(review): run_all_conditions presumably performs one evaluation per seed in
# rng_list — confirm against Powerkit.
df_rna = pk_rna.run_all_conditions(rng_list=rngs, n_jobs=-1, verbose=True)
df_rna["modality"] = "RNASeq"
df_prot = pk_prot.run_all_conditions(rng_list=rngs, n_jobs=-1, verbose=True)
df_prot["modality"] = "Proteomic"
# 5) Concatenate and summarize
df_all = pd.concat([df_rna, df_prot], ignore_index=True)

# model_performance is primary metric (R2); full metrics dict per row in 'metrics'
summary = (
    df_all.groupby(["modality", "condition"])["model_performance"]
    .agg(["mean", "std", "count"])
    .reset_index()
)
print(summary)


In [None]:
from scipy.stats import pearsonr, spearmanr  # noqa: F811

# Expand stored 'metrics' dict (if present) and show pearson/spearman by modality+condition,
# plus an optional recompute from y_true_index + y_pred for verification.

# 1) If df_all contains a 'metrics' column with dicts, unpack it
# The "pearson_r" check skips the unpacking when the columns already exist.
if "metrics" in df_all.columns and "pearson_r" not in df_all.columns:
    # json_normalize yields a fresh 0..n-1 index; assignment below aligns on index,
    # which matches df_all when it was built with ignore_index=True.
    metrics_df = pd.json_normalize(df_all["metrics"])
    for col in ("r2", "pearson_r", "spearman_rho"):
        if col in metrics_df.columns:
            df_all[col] = metrics_df[col]

# 2) Quick grouped summary from stored metrics (if available)
if {"pearson_r", "spearman_rho"}.issubset(df_all.columns):
    print("Stored metrics (mean ± std) by modality and condition:")
    print(
        df_all.groupby(["modality", "condition"])[["pearson_r", "spearman_rho"]]
        .agg(["mean", "std"])
        .round(4)
    )
else:
    print("Stored pearson/spearman not found in df_all; you can recompute them from y_true_index + y_pred (see step 3).")

# 3) Recompute metrics from y_true_index and y_pred to verify (uses ccle_label_data / proteomic_label_data)

def recompute_metrics(row):
    """Recompute R^2, Pearson r and Spearman rho for one result row.

    The true labels are looked up by ``row["y_true_index"]`` in
    ``ccle_label_data`` when ``row["modality"]`` is ``"RNASeq"`` and in
    ``proteomic_label_data`` otherwise, then compared with ``row["y_pred"]``
    on the entries where both are finite. Returns a Series with keys
    ``r2_re``, ``pearson_r_re`` and ``spearman_rho_re`` (all NaN when fewer
    than two finite pairs remain).
    """
    # pick correct label series based on modality
    if row["modality"] == "RNASeq":
        label_source = ccle_label_data
    else:
        label_source = proteomic_label_data

    observed = label_source.loc[row["y_true_index"]].values
    predicted = np.asarray(row["y_pred"], dtype=float)

    finite = np.isfinite(observed) & np.isfinite(predicted)
    if finite.sum() < 2:
        return pd.Series({"r2_re": np.nan, "pearson_r_re": np.nan, "spearman_rho_re": np.nan})

    observed, predicted = observed[finite], predicted[finite]
    r2_value = r2_score(observed, predicted)
    pearson_value = pearsonr(observed, predicted)[0]
    spearman_value = spearmanr(observed, predicted)[0]
    return pd.Series(
        {"r2_re": r2_value, "pearson_r_re": pearson_value, "spearman_rho_re": spearman_value}
    )

recomputed = df_all.apply(recompute_metrics, axis=1)
# Drop any recomputed columns left over from a previous run of this cell before
# attaching the fresh ones; a plain concat would append duplicate 'r2_re',
# 'pearson_r_re' and 'spearman_rho_re' columns every time the cell is re-run.
recomputed_cols = ["r2_re", "pearson_r_re", "spearman_rho_re"]
df_all = pd.concat(
    [df_all.drop(columns=recomputed_cols, errors="ignore"), recomputed], axis=1
)

# print("\nRecomputed metrics (mean ± std) by modality and condition:")
# print(
#     df_all.groupby(["modality", "condition"])[["pearson_r_re", "spearman_rho_re"]]
#     .agg(["mean", "std"])
#     .round(4)
# )

# Optionally inspect row-level values
# Columns to show when inspecting per-row values (currently not printed).
display_cols = ["modality", "condition", "pearson_r_re", "spearman_rho_re", "r2_re"]
# print("\nPer-row recomputed metrics:")


Stored metrics (mean ± std) by modality and condition:
                                    pearson_r     spearman_rho    
                                         mean std         mean std
modality  condition                                               
Proteomic baseline_linearregression    0.0644 NaN       0.0487 NaN
          baseline_randomforest        0.7172 NaN       0.6623 NaN
          baseline_svr                 0.3184 NaN       0.3376 NaN
RNASeq    baseline_linearregression    0.1578 NaN       0.1762 NaN
          baseline_randomforest        0.7368 NaN       0.7110 NaN
          baseline_svr                 0.2996 NaN       0.2886 NaN


In [None]:
# Persist the k=500 benchmarking results under file_save_path, tagged with exp_id.
# NOTE(review): the "three k values" section further down writes to these exact same
# three file names (same file_save_path and exp_id), so running that section
# overwrites the files saved here. Consider a distinct exp_id or file prefix.
df_all.to_pickle(f"{file_save_path}benchmarking_results_{exp_id}.pkl")

# Also save the individual modality results
df_rna.to_pickle(f"{file_save_path}rna_results_{exp_id}.pkl")
df_prot.to_pickle(f"{file_save_path}proteomic_results_{exp_id}.pkl")


### Benchmarking Expression Datasets (three k values)

In [None]:
import numpy as np  # noqa: F811
import pandas as pd
from toolkit import Powerkit  # noqa: F811

# 1) Align indices and order (critical for identical splits across modalities)
# Same alignment as in the k=500 section; repeated so this section can be run on
# its own after the loading cells.
common = sorted(
    set(ccle_label_data.index)
    & set(ccle_feature_data.index)
    & set(proteomic_label_data.index)
    & set(proteomic_feature_data.index)
)
ccle_feature_data = ccle_feature_data.loc[common]
ccle_label_data = ccle_label_data.loc[common]
proteomic_feature_data = proteomic_feature_data.loc[common]
proteomic_label_data = proteomic_label_data.loc[common]

# Optional: ensure numeric only
ccle_feature_data = ccle_feature_data.select_dtypes(include=[np.number])
proteomic_feature_data = proteomic_feature_data.select_dtypes(include=[np.number])

# Define k values to test
k_values = [20, 100, 500]
# Settings shared by all conditions; k is supplied per condition by add_baselines
fixed_args_base = {"var_threshold": 0.0, "corr_threshold": 0.95}
models = [
    "LinearRegression",
    "RandomForestRegressor",
    "SVR",
]  # extend with "KNeighborsRegressor","XGBRegressor"

def add_baselines(pk: Powerkit, model_list, k_values):
    """Register one baseline condition per (model, k) pair on ``pk``.

    Conditions are named ``baseline_<model name, lower-cased, without
    'regressor'>_k<k>``; each pairs ``baseline_pipeline`` (module-level
    ``fixed_args_base`` plus the model name and ``k``) with ``baseline_eval``
    (primary metric R^2, importance taken from the selector). Pairs are
    registered model by model, iterating over ``k_values`` for each.
    """
    model_k_pairs = (
        (estimator_name, top_k) for estimator_name in model_list for top_k in k_values
    )
    for estimator_name, top_k in model_k_pairs:
        short_name = estimator_name.lower().replace('regressor', '')
        pk.add_condition(
            condition=f"baseline_{short_name}_k{top_k}",
            # 3.3.2 focuses on performance; set True if you need FI aggregation
            get_importance=False,
            pipeline_function=baseline_pipeline,
            pipeline_args={**fixed_args_base, "model_name": estimator_name, "k": top_k},
            eval_function=baseline_eval,
            eval_args=dict(metric_primary="r2", importance_from="selector"),
        )

# 2) Build Powerkit instances and register conditions
# One condition per (model, k) combination is registered on each instance.
pk_rna = Powerkit(ccle_feature_data, ccle_label_data)
add_baselines(pk_rna, models, k_values)
pk_prot = Powerkit(proteomic_feature_data, proteomic_label_data)
add_baselines(pk_prot, models, k_values)

# 3) Identical RNGs for fair repeated holdouts
# 50 seeds drawn from a fixed-seed generator, shared by both modalities.
rngs = np.random.RandomState(42).randint(0, 100000, size=50)


In [None]:

# 4) Run all conditions
# RNA-seq modality; kept in its own cell so it can be re-run independently of the
# proteomic run.
df_rna = pk_rna.run_all_conditions(rng_list=rngs, n_jobs=-1, verbose=True)
df_rna["modality"] = "RNASeq"


In [None]:

# Proteomic modality, evaluated with the same seed list as the RNA-seq run.
df_prot = pk_prot.run_all_conditions(rng_list=rngs, n_jobs=-1, verbose=True)
df_prot["modality"] = "Proteomic"


In [26]:

# 5) Concatenate and summarize
df_all = pd.concat([df_rna, df_prot], ignore_index=True)

# 6) Extract k value from condition name for grouping
# The pattern captures the digits following the first 'k' that is directly followed
# by digits (the "_k<k>" suffix added by add_baselines).
# NOTE(review): astype(int) raises if a condition name has no such suffix.
df_all["k_value"] = df_all["condition"].str.extract(r'k(\d+)').astype(int)

# model_performance is primary metric (R2); full metrics dict per row in 'metrics'
summary = (
    df_all.groupby(["modality", "condition", "k_value"])["model_performance"]
    .agg(["mean", "std", "count"])
    .reset_index()
)
print(summary)

     modality                       condition  k_value        mean  std  count
0   Proteomic  baseline_linearregression_k100      100    0.400253  NaN      1
1   Proteomic   baseline_linearregression_k20       20    0.382698  NaN      1
2   Proteomic  baseline_linearregression_k500      500 -200.721790  NaN      1
3   Proteomic      baseline_randomforest_k100      100    0.434101  NaN      1
4   Proteomic       baseline_randomforest_k20       20    0.272204  NaN      1
5   Proteomic      baseline_randomforest_k500      500    0.453106  NaN      1
6   Proteomic               baseline_svr_k100      100    0.279440  NaN      1
7   Proteomic                baseline_svr_k20       20    0.397100  NaN      1
8   Proteomic               baseline_svr_k500      500   -2.178528  NaN      1
9      RNASeq  baseline_linearregression_k100      100    0.291513  NaN      1
10     RNASeq   baseline_linearregression_k20       20    0.474755  NaN      1
11     RNASeq  baseline_linearregression_k500      5

In [None]:
# Persist the multi-k benchmarking results under file_save_path, tagged with exp_id.
# NOTE(review): these are the same three file names used by the save cell of the
# k=500 section above, so this cell overwrites those results unless exp_id (or the
# file prefix) is changed first.
df_all.to_pickle(f"{file_save_path}benchmarking_results_{exp_id}.pkl")

# Also save the individual modality results
df_rna.to_pickle(f"{file_save_path}rna_results_{exp_id}.pkl")
df_prot.to_pickle(f"{file_save_path}proteomic_results_{exp_id}.pkl")


## Reading data 

This section can be run to read in the data used for the analysis. The initialisation code block must be run first so that the correct file paths are loaded.