In [1]:
# Import all necessary libraries
import pandas as pd
import numpy as np
import shap 
import xgboost as xgb
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from sklearn.model_selection import KFold
import json

In [None]:
# Load the two "pocket" expression tables. Both files are tab-separated and
# their first row is used as the column header.
# NOTE(review): these are absolute paths on one machine; anyone else running
# the notebook will need to point them at their own copy of the data.
gene_expression_path = '/Users/christianlangridge/Desktop/Zhang-Lab/Zhang Lab Data/Pocket data files/Geneexpression(pocket).tsv'
tf_expression_path = '/Users/christianlangridge/Desktop/Zhang-Lab/Zhang Lab Data/Pocket data files/TF(pocket).tsv'

gene_expression = pd.read_csv(gene_expression_path, sep='\t', header=0)
tf_expression = pd.read_csv(tf_expression_path, sep='\t', header=0)

In [None]:
# Build the train / test / validation partitions and convert them to numpy arrays.
# Features (x) come from the TF expression table, targets (y) from the gene
# expression table.
x, y = tf_expression, gene_expression

# Column-wise concatenation of features and targets, shown for inspection only.
combined_data = pd.concat([x, y], axis=1)
display(combined_data)

# Stage 1: keep 70% of the rows for training, hold out the remaining 30%.
x_train, x_temp, y_train, y_temp = train_test_split(
    x, y,
    test_size=0.3,
    random_state=42,
)

# Stage 2: divide the held-out rows 2/3 : 1/3, i.e. roughly 20% of all rows
# for testing and 10% for validation. The 1/3 share is the second element
# returned for each input, which is why the validation names come second.
x_test, x_val, y_test, y_val = train_test_split(
    x_temp, y_temp,
    test_size=1/3,
    random_state=42,
)

# Swap each DataFrame partition for its underlying numpy array.
x_train, y_train = x_train.to_numpy(), y_train.to_numpy()   # training set
x_val, y_val = x_val.to_numpy(), y_val.to_numpy()           # validation set
x_test, y_test = x_test.to_numpy(), y_test.to_numpy()       # testing set

In [None]:
# Model training: fit one independent XGBoost random-forest regressor per
# target column of y_train, then predict every target on the test set.
n_targets = y_train.shape[1]

# Hyperparameters shared by every per-target regressor.
rf_params = dict(
    objective='reg:squarederror',
    random_state=42,
    n_estimators=3,
    n_jobs=-1,  # use all cores
    verbosity=0,
)

# Progress bar reporting how many targets have been trained so far.
pbar = tqdm(
    range(n_targets),
    desc="Training targets",
    unit="target",
    bar_format="{l_bar}{bar} {n_fmt}/{total_fmt} [{percentage:3.0f}%]",
)

models = []
for i in pbar:
    est = xgb.XGBRFRegressor(**rf_params)
    target_train = y_train[:, i]
    target_val = y_val[:, i]
    # verbose=False silences per-round evaluation output so the bar stays clean
    est.fit(x_train, target_train, eval_set=[(x_val, target_val)], verbose=False)
    models.append(est)
    pbar.set_postfix({'target': i})

# One prediction vector per model, stacked as columns in target order.
per_target_predictions = [m.predict(x_test) for m in models]
predictions = np.column_stack(per_target_predictions)

display(predictions)

In [None]:
# Compute SHAP values on the test set for ONE of the per-target models.
#
# The model is picked explicitly from `models` (the last target trained)
# instead of relying on a loop variable left over from the training cell, so
# the choice is visible here and easy to change.
explained_target = len(models) - 1
booster = models[explained_target].get_booster()


def _scalar_base_score(raw_base_score):
    """Return the first entry of a list-form ``base_score`` as a float.

    Parameters
    ----------
    raw_base_score : object
        The ``base_score`` entry read from the booster's JSON config, e.g. a
        stringified list such as "[7.965566E-1]" or a plain scalar string
        such as "5E-1".

    Returns
    -------
    float or None
        The first element when the value is a string that parses to a
        non-empty list; ``None`` in every other case (already scalar, empty
        list, not a string, or not parseable as JSON), meaning the config
        needs no rewriting.
    """
    if not isinstance(raw_base_score, str):
        return None
    try:
        parsed = json.loads(raw_base_score)
    except ValueError:
        # Not JSON at all, so certainly not the list form this patch targets.
        return None
    if isinstance(parsed, list) and parsed:
        return float(parsed[0])
    return None


# Read the booster's JSON configuration and locate the serialized base_score.
model_config = json.loads(booster.save_config())
learner_model_param = model_config['learner']['learner_model_param']

# NOTE(review): presumably a workaround for SHAP not parsing a list-form
# base_score; whether load_config accepts the numeric value written below
# depends on the installed XGBoost/SHAP versions -- confirm.
base_score_float = _scalar_base_score(learner_model_param['base_score'])
if base_score_float is not None:
    # Only rewrite the config when base_score really is a stringified list;
    # a scalar value is left untouched instead of crashing on `[0]`.
    learner_model_param['base_score'] = base_score_float
    booster.load_config(json.dumps(model_config))

# Build the SHAP explainer from the (possibly patched) booster.
explainer = shap.TreeExplainer(booster)
shap_values = explainer.shap_values(x_test)

print(f"SHAP values calculated for {shap_values.shape[0]} predictions")
print(f"Each prediction explained by {shap_values.shape[1]} features")