In [1]:
import pandas as pd
import numpy as np
import shap 
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import pickle 
from sklearn.metrics import r2_score, mean_squared_error
import os
import multiprocessing as mp
from tqdm.contrib.concurrent import process_map, thread_map

In [2]:
# Reading in full data files
gene_expression = pd.read_csv(('~/Zhang-Lab/Zhang Lab Data/Full data files/Geneexpression (full).tsv'), sep='\t', header=0)
tf_expression = pd.read_csv(('~/Zhang-Lab/Zhang Lab Data/Full data files/TF(full).tsv'), sep='\t', header=0)

In [3]:
# Split into training, testing and validation sets and into numpy arrays + combining dataframes
x = tf_expression
y = gene_expression

combined_data = pd.concat([x, y], axis=1)

# First split: 70% train and 30% temp (test + val)
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y, test_size=0.3, random_state=42)

# Second split: split the temp set into 20% test and 10% val (which is 2/3 and 1/3 of temp)
x_test, x_val, y_test, y_val = train_test_split(
    x_temp, y_temp, test_size=1/3, random_state=42)


# For training set
x_train = x_train.to_numpy()
y_train = y_train.to_numpy()

# For validation set
x_val = x_val.to_numpy()
y_val = y_val.to_numpy()

# For testing set
x_test = x_test.to_numpy()
y_test = y_test.to_numpy()

In [4]:
models = []
n_targets = y_train.shape[1]

# progress bar 
pbar = tqdm(range(n_targets), desc="Training targets", unit="target",
            bar_format="{l_bar}{bar} {n_fmt}/{total_fmt} [{percentage:3.0f}%]")

for i in pbar:
    est = xgb.XGBRFRegressor(
        objective='reg:squarederror',
        random_state=42,
        n_estimators=100,
        max_depth=5,
        device='cuda',
        tree_method='hist'
    )
    # turn off verbose printing so tqdm stays clean (excess output from program)
    est.fit(x_train, y_train[:, i], eval_set=[(x_val, y_val[:, i])], verbose=False)
    models.append(est)
    pbar.set_postfix({'target': i})

predictions = np.column_stack([m.predict(x_test) for m in models])

display(predictions)

Training targets:   0%|           0/16101 [  0%]

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


  return func(**kwargs)


array([[1.2315812e+00, 3.0296464e+00, 6.6785753e-02, ..., 2.2102759e+00,
        1.5108478e+00, 1.3485320e+00],
       [2.1809340e-03, 2.8224082e+00, 2.5895961e-02, ..., 3.0538082e-02,
        1.3163745e-02, 3.7476420e-03],
       [6.2860358e-01, 2.7610064e+00, 2.5895961e-02, ..., 2.2753854e+00,
        1.0872849e+00, 1.1448900e+00],
       ...,
       [1.1926802e+00, 2.8280663e+00, 2.5895961e-02, ..., 2.1529679e+00,
        1.0944118e+00, 1.2094243e+00],
       [1.7747366e-01, 3.0109334e-01, 4.2595446e-02, ..., 2.3049946e+00,
        1.3215477e+00, 1.4591693e+00],
       [1.7719201e+00, 2.6453328e+00, 2.5895961e-02, ..., 2.3284318e+00,
        1.1106733e+00, 1.0753107e+00]], shape=(3187, 16101), dtype=float32)

In [8]:
 # Evaluate model 

from sklearn.metrics import r2_score
import numpy as np

# After your training loop and prediction code:

predictions = np.column_stack([m.predict(x_test) for m in models])

# Calculate R² values
r2_overall = r2_score(y_test, predictions, multioutput='uniform_average')
r2_per_target = r2_score(y_test, predictions, multioutput='raw_values')

print("Overall R² (uniform average):", r2_overall)

KeyboardInterrupt: 

In [9]:
# dedicated folder for this training run
model_dir = "/home/christianl/Zhang-Lab/Zhang Lab Data/Saved models/Random Forest/Saved_Models_XGBRF_v2"
os.makedirs(model_dir, exist_ok=True)

# each model saved individually
print(f"Saving {len(models)} models to {model_dir}...")

for i, model in enumerate(models):
    # Save as standard JSON (portable, safer than pickle)
    save_path = os.path.join(model_dir, f"target_{i}.json")
    model.save_model(save_path)

print("All models saved successfully.")

Saving 16101 models to /home/christianl/Zhang-Lab/Zhang Lab Data/Saved models/Random Forest/Saved_Models_XGBRF_v2...
All models saved successfully.
