In [None]:
import pandas as pd
import numpy as np
import shap 
import xgboost as xgb
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import pickle 
from sklearn.metrics import r2_score, mean_squared_error, make_scorer
import os
import multiprocessing as mp
from sklearn.model_selection import cross_val_score
from multiprocessing import Manager
import optuna
from sklearn.model_selection import KFold

In [None]:
# Reading in full data files
gene_expression = pd.read_csv(('~/Zhang-Lab/Zhang Lab Data/Full data files/Geneexpression (full).tsv'), sep='\t', header=0)
tf_expression = pd.read_csv(('~/Zhang-Lab/Zhang Lab Data/Full data files/TF(full).tsv'), sep='\t', header=0)

In [None]:
# Split into training, testing and validation sets and into numpy arrays + combining dataframes
x = tf_expression
y = gene_expression

combined_data = pd.concat([x, y], axis=1)

# First split: 70% train and 30% temp (test + val)
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y, test_size=0.3, random_state=42)

# Second split: split the temp set into 20% test and 10% val (which is 2/3 and 1/3 of temp)
x_test, x_val, y_test, y_val = train_test_split(
    x_temp, y_temp, test_size=1/3, random_state=42)


# For training set
x_train = x_train.to_numpy()
y_train = y_train.to_numpy()

# For validation set
x_val = x_val.to_numpy()
y_val = y_val.to_numpy()

# For testing set
x_test = x_test.to_numpy()
y_test = y_test.to_numpy()

In [None]:
# Load model in with pickle file 
with open('/home/christianl/Zhang-Lab/Zhang Lab Data/Saved models/Random Forest/RF_model.pkl', 'rb') as file:
    loaded = pickle.load(file)

In [None]:
 # Evaluate model 

 #loaded = pickle.load(file)
 #Loaded object is a list of models. n_models: 16101
 #y_pred.shape: (3187, 16101)
 #model.n_features_in_: None
 #model.n_outputs_: None
 #x_test.shape: (3187, 1198)
 #y_test.shape: (3187, 16101)
 #Multi-output R^2 (uniform_average): 0.7601639389405871


# List of estimators (one per target gene) so build predictions matrix 

if isinstance(loaded, list):
    models = loaded
    print("Loaded object is a list of models. n_models:", len(models))
    y_pred = np.column_stack([m.predict(x_test) for m in models])  # (n_samples, n_genes)
else:
    model = loaded
    print("Loaded object type:", type(model))
    # If single-output model but y_test is multi-column, this will raise -- handled later
    y_pred = model.predict(x_test)

print("y_pred.shape:", y_pred.shape)

# diagnostics (safe access)
def safe_attr(obj, name):
    return getattr(obj, name, None) if not isinstance(obj, list) else None

print("model.n_features_in_:", safe_attr(loaded, "n_features_in_"))
print("model.n_outputs_:", safe_attr(loaded, "n_outputs_"))
print("x_test.shape:", x_test.shape)
print("y_test.shape:", y_test.shape)

# Continue with your R^2 / MSE logic expecting y_pred shape (n_samples, n_targets)
from sklearn.metrics import r2_score, mean_squared_error

# If y_pred is 1D, treat as single-output
if y_pred.ndim == 1 or (y_pred.ndim == 2 and y_pred.shape[1] == 1):
    if y_test.ndim == 2 and y_test.shape[1] > 1:
        raise ValueError(
            "Model predicts a single target but y_test contains multiple targets (genes).\n"
            "Select the trained target column before splitting, e.g.:\n"
            "  target = 'GENE_NAME'\n"
            "  y = gene_expression[target]\n"
            "  then redo train_test_split and evaluation."
        )
    y_true = y_test.ravel()
    print("R^2:", r2_score(y_true, y_pred))
    print("MSE:", mean_squared_error(y_true, y_pred))
else:
    print("Multi-output R^2 (uniform_average):", r2_score(y_test, y_pred, multioutput='uniform_average'))
    
    
first = models[0]
print("first estimator type:", type(first))
print("n_features_in_:", getattr(first, "n_features_in_", None))
print("example feature_importances_ (first 10):", getattr(first, "feature_importances_", None)[:10])


In [None]:
def objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000)
    max_depth = trial.suggest_int("max_depth", 3, 20)

    subsample = trial.suggest_float("subsample", 0.5, 1.0)
    colsample_bytree = trial.suggest_float("colsample_bytree", 0.5, 1.0)
    reg_alpha = trial.suggest_float("reg_alpha", 0.0, 5.0)
    reg_lambda = trial.suggest_float("reg_lambda", 0.0, 10.0)
    learning_rate = trial.suggest_float("learning_rate", 0.1, 1.0)

    # Use the RF regressor class; note the name
    model = xgb.XGBRFRegressor(
        n_estimators=n_estimators,
        max_depth=max_depth,
        subsample=subsample,
        colsample_bytree=colsample_bytree,
        reg_alpha=reg_alpha,
        reg_lambda=reg_lambda,
        learning_rate=learning_rate,
        n_jobs=-1,
    )

    multi_r2 = make_scorer(r2_score, multioutput="uniform_average")

    score = cross_val_score(
        model, x_train, y_train, cv=5, n_jobs=-1, scoring=multi_r2
    ).mean()

    return score

In [None]:
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.RandomSampler(seed=42)) # Default is random Search

In [None]:
from tqdm import tqdm

class TqdmCallback:
    def __init__(self, total_trials):
        self.pbar = tqdm(total=total_trials, desc="Optuna Optimization")
    
    def __call__(self, study, trial):
        self.pbar.update(1)
        self.pbar.set_postfix({"best_r2": f"{study.best_value:.4f}"})
    
    def __del__(self):
        self.pbar.close()

callback = TqdmCallback(total_trials=100)
study.optimize(objective, n_trials=100, callbacks=[callback])

FULLY TEST SCRIPT FROM HERE ONWARDS

In [40]:
#running a CPU-unburdened Optuna search
manager = Manager()
gpu_queue = manager.Queue()
n_gpus = 1
for i in range(n_gpus):
    gpu_queue.put(i)

In [41]:
#compute splits pre-optimization
cv_indices = []
kf = KFold(n_splits=3, shuffle=True, random_state=42)
for train_idx, val_idx in kf.split(x_train):
    cv_indices.append((train_idx, val_idx))

In [42]:
#defining the objective function
def objective(trial):
    gpu_id = gpu_queue.get()
    os.environ["CUDA_VISIBLE_DEVICES"] = str(gpu_id)
    
    params = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 200),
        "max_depth": trial.suggest_int("max_depth", 3, 5),
        "subsample": trial.suggest_float("subsample", 0.5, 0.8),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 0.9),
        "reg_alpha": trial.suggest_float("reg_alpha", 1e-3, 10.0, log=True),
        "reg_lambda": trial.suggest_float("reg_lambda", 1e-3, 100.0, log=True),
        "learning_rate": trial.suggest_float("learning_rate", 0.03, 0.07),
    }

    try:
        fold_scores = []
    
        #training each fold without recomputing splits
        for fold, (train_idx, val_idx) in enumerate(cv_indices):
            x_tr, x_val = x_train[train_idx], x_train[val_idx]
            y_tr, y_val = y_train[train_idx], y_train[val_idx]
        
            #native XGBoost is 10-15% faster than scikit wrapper
            model = xgb.XGBRFRegressor(device="cuda", tree_method="hist", **params, n_jobs=1)
            model.fit(x_tr, y_tr, verbose=False)
        
            y_pred = model.predict(x_val)
            score = r2_score(y_val, y_pred, multioutput="uniform_average")
            fold_scores.append(score)
        
            # Pruning support
            trial.report(score, fold)
            if trial.should_prune():
                gpu_queue.put(gpu_id)
                raise optuna.TrialPruned()
        return np.mean(fold_scores)
    finally:
        gpu_queue.put(gpu_id)

In [43]:
#Run optimization once at top level
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=42, n_startup_trials=2),
    pruner=optuna.pruners.HyperbandPruner(
        min_resource=1,
        max_resource=2,  
        reduction_factor=3,
    ),
)

[I 2025-12-06 02:16:07,443] A new study created in memory with name: no-name-c2ac8817-64f8-4dd9-82b9-f6fbc105b1c9


In [44]:
#custom callback of logs for fitting
def log_progress(study, trial):
    print(f"Trial {trial.number} ended with state={trial.state}, "
          f"value={trial.value}, best={study.best_value}")

In [None]:
study.optimize(
    objective,
    n_trials=20,
    n_jobs=n_gpus,            # Optuna-level parallelism
    show_progress_bar=True,
    callbacks=[log_progress],
    show_progress_bar=False
)

# ---- now it's safe to access best_params ----
if len([t for t in study.trials if t.state == optuna.trial.TrialState.COMPLETE]) > 0:
    best_params = study.best_params
    print(f"Best Hyperparameters: {best_params}")
else:
    print("No completed trials; check logs for errors.")

In [45]:
import time

def timed_objective(trial):
    t0 = time.time()
    score = objective(trial)
    print(f"Trial {trial.number} took {time.time() - t0:.1f} s")
    return score

t_start = time.time()
study.optimize(timed_objective, 
               n_trials=5, 
               n_jobs=n_gpus, 
               callbacks=[log_progress],
               show_progress_bar=False)
print(f"Total time: {time.time() - t_start:.1f} s")

[W 2025-12-06 02:17:01,928] Trial 0 failed with parameters: {'n_estimators': 137, 'max_depth': 5, 'subsample': 0.7195981825434216, 'colsample_bytree': 0.7394633936788146, 'reg_alpha': 0.004207988669606638, 'reg_lambda': 0.0060252157362038605, 'learning_rate': 0.03232334448672798} because of the following error: XGBoostError('[02:17:01] /home/conda/feedstock_root/build_artifacts/xgboost-split_1764148514279/work/src/common/device_vector.cu:23: Memory allocation error on worker 0: std::bad_alloc: cudaErrorMemoryAllocation: out of memory\n- Free memory: 47MB\n- Requested memory: 456.722MB\n\nStack trace:\n  [bt] (0) /home/christianl/miniconda3/envs/remote_training/lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x6e) [0x7160a62d796e]\n  [bt] (1) /home/christianl/miniconda3/envs/remote_training/lib/libxgboost.so(dh::detail::ThrowOOMError(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, unsigned long)+0x46d) [0x7160a6ba29cd]\n  [bt] (2) /hom

XGBoostError: [02:17:01] /home/conda/feedstock_root/build_artifacts/xgboost-split_1764148514279/work/src/common/device_vector.cu:23: Memory allocation error on worker 0: std::bad_alloc: cudaErrorMemoryAllocation: out of memory
- Free memory: 47MB
- Requested memory: 456.722MB

Stack trace:
  [bt] (0) /home/christianl/miniconda3/envs/remote_training/lib/libxgboost.so(dmlc::LogMessageFatal::~LogMessageFatal()+0x6e) [0x7160a62d796e]
  [bt] (1) /home/christianl/miniconda3/envs/remote_training/lib/libxgboost.so(dh::detail::ThrowOOMError(std::__cxx11::basic_string<char, std::char_traits<char>, std::allocator<char> > const&, unsigned long)+0x46d) [0x7160a6ba29cd]
  [bt] (2) /home/christianl/miniconda3/envs/remote_training/lib/libxgboost.so(thrust::THRUST_300003_SM_500_520_600_610_700_750_800_860_890_900_1000_1030_1200_1210_NS::detail::vector_base<float, dh::detail::XGBDefaultDeviceAllocatorImpl<float> >::append(unsigned long)+0x332) [0x7160a6bf1c92]
  [bt] (3) /home/christianl/miniconda3/envs/remote_training/lib/libxgboost.so(xgboost::common::SampleMean(xgboost::Context const*, bool, xgboost::linalg::Tensor<float, 2> const&, xgboost::linalg::Tensor<float, 1>*)+0xbd4) [0x7160a64a3d44]
  [bt] (4) /home/christianl/miniconda3/envs/remote_training/lib/libxgboost.so(xgboost::obj::FitInterceptGlmLike::InitEstimation(xgboost::MetaInfo const&, xgboost::linalg::Tensor<float, 1>*) const+0x61) [0x7160a67d5131]
  [bt] (5) /home/christianl/miniconda3/envs/remote_training/lib/libxgboost.so(xgboost::LearnerImpl::UpdateOneIter(int, std::shared_ptr<xgboost::DMatrix>)+0x94e) [0x7160a672c57e]
  [bt] (6) /home/christianl/miniconda3/envs/remote_training/lib/libxgboost.so(XGBoosterUpdateOneIter+0x71) [0x7160a621e9f1]
  [bt] (7) /home/christianl/miniconda3/envs/remote_training/lib/python3.12/lib-dynload/../../libffi.so.8(+0xa052) [0x71621bf17052]
  [bt] (8) /home/christianl/miniconda3/envs/remote_training/lib/python3.12/lib-dynload/../../libffi.so.8(+0x8925) [0x71621bf15925]



Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




In [None]:

from sklearn.model_selection import GridSearchCV

param_test1 = {'n_estimators': range(20, 350, 30)}

clf = xgb.XGBRFRegressor(random_state = 42,
                         oob_score = True,
                         max_depth = 6, 
                         max_features = 'sqrt')

gsearch1 = GridSearchCV(
    estimator=clf, 
    param_grid=param_test1,
    scoring='r2',
    cv=5)

gsearch1.fit(x_train, y_train)
gsearch1.grid_scores_, gsearch1.best_params_, gsearch1.best_score_

Parameters: { "max_features", "oob_score" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
