In [None]:
import pandas as pd
import numpy as np
import shap 
import xgboost as xgb
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
import pickle 
from sklearn.metrics import r2_score, mean_squared_error

In [None]:
# Load the full gene-expression and TF-expression tables.
# Both files are read as tab-separated text with column names taken from the first row.
# NOTE(review): the absolute paths tie this cell to one machine; consider a configurable data directory.
gene_expression = pd.read_csv(
    '/Users/christianlangridge/Desktop/Zhang-Lab/Zhang Lab Data/Full data files/Geneexpression (full).tsv',
    header=0,
    sep='\t',
)
tf_expression = pd.read_csv(
    '/Users/christianlangridge/Desktop/Zhang-Lab/Zhang Lab Data/Full data files/TF(full).tsv',
    header=0,
    sep='\t',
)

In [None]:
# Build a 70/20/10 train/test/validation split and convert every piece to a NumPy array.
# TF expression supplies the model inputs (x); gene expression supplies the targets (y).
x = tf_expression
y = gene_expression

# Inputs and targets placed side by side (column-wise); the split below does not use this frame.
combined_data = pd.concat([x, y], axis=1)

display(combined_data)

# Stage 1: hold out 30% of the rows; the remaining 70% is the training set.
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.3, random_state=42)

# Stage 2: one third of the held-out rows (10% overall) becomes the validation set,
# and the other two thirds (20% overall) becomes the test set.
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=1/3, random_state=42)

# Replace each DataFrame with its underlying NumPy array (train, then validation, then test).
x_train, y_train = x_train.to_numpy(), y_train.to_numpy()
x_val, y_val = x_val.to_numpy(), y_val.to_numpy()
x_test, y_test = x_test.to_numpy(), y_test.to_numpy()

In [None]:
# Training RF model: one XGBoost random-forest regressor is fitted per target column.

n_targets = y_train.shape[1]
models = []

# tqdm bar counts how many target columns have been fitted so far
pbar = tqdm(
    range(n_targets),
    desc="Training targets",
    unit="target",
    bar_format="{l_bar}{bar} {n_fmt}/{total_fmt} [{percentage:3.0f}%]",
)

for i in pbar:
    est = xgb.XGBRFRegressor(
        n_estimators=3,
        objective='reg:squarederror',
        random_state=42,
        n_jobs=-1,      # let this single fit use all cores
        verbosity=0,
    )
    # verbose=False suppresses per-evaluation printing so the progress bar stays readable
    est.fit(
        x_train,
        y_train[:, i],
        eval_set=[(x_val, y_val[:, i])],
        verbose=False,
    )
    models.append(est)
    pbar.set_postfix(target=i)

# Column j of `predictions` holds the test-set predictions of models[j].
predictions = np.column_stack([fitted.predict(x_test) for fitted in models])

display(predictions)

In [None]:
# Persist the `models` list as a single pickle file.

with open(
    '/Users/christianlangridge/Desktop/Zhang-Lab/Zhang Lab Data/Saved models/Random Forest/RF_model.pkl',
    mode='wb',
) as model_file:
    pickle.dump(models, model_file)

In [None]:
# JUST A TEST SCRIPT - ADDING A MULTI-OUTPUT REGRESSOR WRAPPER TO DEAL WITH MULTI-OUTPUT REGRESSION, WHICH IS NOT NATIVE TO RF MODELS

import os
import multiprocessing as mp
from tqdm.contrib.concurrent import process_map, thread_map
from xgboost import XGBRFRegressor
from sklearn.multioutput import MultiOutputRegressor
import pickle
import numpy as np

n_targets = y_train.shape[1]


# Kept as a top-level function in this cell so the parallel map helpers can call it by name.
def _train_target(i):
    """Fit and return one random-forest regressor for target column ``i``.

    Reads the notebook-level arrays ``x_train``/``y_train`` for fitting and
    passes ``x_val``/``y_val`` as the evaluation set.

    Parameters
    ----------
    i : int
        Column index into ``y_train`` / ``y_val`` selecting the target to fit.

    Returns
    -------
    XGBRFRegressor
        The fitted estimator for that target column.
    """
    hyperparams = dict(
        objective='reg:squarederror',
        random_state=42,
        n_estimators=100,
        n_jobs=1,        # single-threaded fit: parallelism is applied across targets instead
        verbosity=0,
    )
    forest = XGBRFRegressor(**hyperparams)

    target_train = y_train[:, i]
    target_val = y_val[:, i]
    forest.fit(x_train, target_train, eval_set=[(x_val, target_val)], verbose=False)
    return forest

# Choose how the parallel training below will run.
# The "fork" start method is preferred (per the original note, on macOS) so that
# worker processes do not need notebook-local functions to be pickled.
# `use_process` ends up True when "fork" is active, False when it is unavailable.
# NOTE(review): this changes the session-wide multiprocessing start method, and
# forking after multi-threaded native code has already run in this kernel can
# hang in the children on some platforms -- confirm this is acceptable here.
try:
    if mp.get_start_method(allow_none=True) != "fork":
        # force=True overrides any previously selected start method, so this
        # call cannot fail with "context has already been set".
        mp.set_start_method("fork", force=True)
    use_process = True
except (ValueError, RuntimeError):
    # ValueError: "fork" is not offered on this platform (e.g. Windows).
    # RuntimeError is kept for safety; anything else is unexpected and should surface.
    use_process = False

# Fit every target in parallel behind a tqdm progress bar.
# With fork-based processes available, process_map is used (suited to CPU-bound work);
# otherwise thread_map is the fallback, which runs inside the notebook without pickling issues.
# NOTE(review): chunksize is roughly n_targets / CPU count, so results may arrive in a few
# large batches and the bar may advance in big jumps -- confirm this granularity is intended.
chunksize = max(1, n_targets // (os.cpu_count() or 1))
parallel_map, workers = (
    (process_map, None) if use_process else (thread_map, os.cpu_count() or 1)
)
models = parallel_map(
    _train_target,
    list(range(n_targets)),
    max_workers=workers,
    chunksize=chunksize,
)

# Sanity check on the test set: stack each estimator's predictions as one column,
# then report the resulting shape and the R^2 averaged uniformly over outputs.
y_pred = np.column_stack([fitted.predict(x_test) for fitted in models])
print("y_pred.shape:", y_pred.shape)
mean_r2 = r2_score(y_test, y_pred, multioutput='uniform_average')
print("multi-output R^2 (uniform_average):", mean_r2)

# Bundle the already-trained per-target estimators into one MultiOutputRegressor object.
# `base` is an unfitted template with the same hyperparameters used for the trained estimators.
base = XGBRFRegressor(
    n_estimators=100,
    objective='reg:squarederror',
    random_state=42,
    n_jobs=1,
    verbosity=0,
)
# n_jobs=None: no fitting happens through the wrapper, the estimators are attached below
multi = MultiOutputRegressor(estimator=base, n_jobs=None)

# Attach the fitted state by hand instead of calling multi.fit(...), so that multi.predict(...)
# can be used. NOTE(review): this relies on which attributes scikit-learn inspects to decide
# that an estimator is fitted -- confirm against the installed version.
multi.estimators_ = models
multi.n_features_in_ = x_train.shape[1]
multi.n_outputs_ = n_targets

# Write two pickles: the plain list of estimators, then the single wrapper object.
for pickle_path, payload in (
    ('/Users/christianlangridge/Desktop/Zhang-Lab/Zhang Lab Data/Saved models/Random Forest/RF_model_list.pkl', models),
    ('/Users/christianlangridge/Desktop/Zhang-Lab/Zhang Lab Data/Saved models/Random Forest/RF_model_multi.pkl', multi),
):
    with open(pickle_path, 'wb') as out_file:
        pickle.dump(payload, out_file)
# ...existing code...