In [33]:
# Import all necessary libraries
import os
import re
import tempfile

import numpy as np
import pandas as pd
import shap
import torch as nn
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from tqdm.notebook import tqdm

In [3]:
# Load the two "pocket" data files. Both are tab-separated and use their
# first row as the column header.
# NOTE(review): these are absolute, machine-specific paths; consider a
# configurable data directory so the notebook runs on other machines.
gene_expression = pd.read_csv(
    '/Users/christianlangridge/Desktop/Zhang-Lab/Zhang Lab Data/Pocket data files/Geneexpression(pocket).tsv',
    header=0,
    sep='\t',
)
tf_expression = pd.read_csv(
    '/Users/christianlangridge/Desktop/Zhang-Lab/Zhang Lab Data/Pocket data files/TF(pocket).tsv',
    header=0,
    sep='\t',
)

In [4]:
# Build train / validation / test partitions and convert them to NumPy arrays.
# The TF expression table supplies the features (x); the gene expression
# table supplies the targets (y).
x = tf_expression
y = gene_expression

# Features and targets joined column-wise into a single frame.
combined_data = pd.concat([x, y], axis=1)

# Stage 1: keep 70% of the rows for training, hold out the other 30%.
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.3, random_state=42)

# Stage 2: one third of the hold-out becomes validation (10% overall);
# the remaining two thirds become the test set (20% overall).
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=1/3, random_state=42)

# DataFrame -> ndarray for each partition (x_temp / y_temp stay as DataFrames).
x_train, y_train = x_train.to_numpy(), y_train.to_numpy()
x_val, y_val = x_val.to_numpy(), y_val.to_numpy()
x_test, y_test = x_test.to_numpy(), y_test.to_numpy()


In [7]:
# Train one independent XGBoost regressor per target column of y_train and
# stack the per-target test predictions into a single 2-D array.

# Hyper-parameters shared by every estimator in this cell. Keeping them in a
# single dict means `base_xgb` and the per-target estimators cannot silently
# drift apart when one of the values is tuned.
xgb_params = dict(
    objective='reg:squarederror',
    eval_metric='rmse',
    verbosity=1,
    n_estimators=100,
)

# Unfitted reference estimator carrying the shared configuration.
base_xgb = xgb.XGBRegressor(**xgb_params)

models = []
n_targets = y_train.shape[1]

# progress bar showing percent complete of target-level training
pbar = tqdm(range(n_targets), desc="Training targets", unit="target",
            bar_format="{l_bar}{bar} {n_fmt}/{total_fmt} [{percentage:3.0f}%]")

for i in pbar:
    # Fresh estimator per target, built from the same shared parameters.
    est = xgb.XGBRegressor(**xgb_params)
    # The validation split is passed as eval_set; verbose=False turns off
    # per-round printing so the tqdm bar stays clean. No early stopping is
    # configured in this cell.
    est.fit(x_train, y_train[:, i], eval_set=[(x_val, y_val[:, i])], verbose=False)
    models.append(est)
    pbar.set_postfix({'target': i})

# One column of predictions per trained model, in target order.
predictions = np.column_stack([m.predict(x_test) for m in models])



Training targets:   0%|           0/3960 [  0%]

In [None]:
# Sanity-check the training feature matrix before it is used for fitting.
print(x_train.dtype)  # element dtype of the array
print(x_train)        # NumPy's abbreviated view of the values
# Number of NaN entries first, then number of infinite entries.
for is_bad in (np.isnan, np.isinf):
    print(is_bad(x_train).sum())


float64
[[0.        0.        0.        ... 0.        1.93641   0.       ]
 [0.        0.        0.        ... 0.        1.7565801 0.       ]
 [2.7519422 0.        0.        ... 0.        1.2960573 0.       ]
 ...
 [0.        0.        0.        ... 0.        0.        0.       ]
 [0.        2.9892867 0.        ... 3.9883952 0.        0.       ]
 [0.        0.        0.        ... 0.        0.        0.       ]]
0
0


In [34]:
# Persist every trained per-target model to disk and write a manifest file
# listing the saved paths, one per line.
model_dir = '/Users/christianlangridge/Desktop/Zhang-Lab/Zhang Lab Data/Saved models/XGBoost'

# Create the output directory once, up front. Doing it inside the loop
# repeated the same call for every model, and skipped it entirely when
# `models` was empty, in which case writing the manifest below would fail
# if the directory did not exist yet.
os.makedirs(model_dir, exist_ok=True)

model_paths = []
for i, model in enumerate(models):
    # NOTE(review): the warnings captured in this cell's output point at
    # save_model(); presumably they concern the `.model` file extension.
    # Confirm against the XGBoost model IO docs before renaming, since other
    # code may load these files by name.
    model_path = f'{model_dir}/xgb_model_target_{i}.model'
    model.save_model(model_path)
    model_paths.append(model_path)

# Save the paths to a summary file
with open(f'{model_dir}/model_paths.txt', 'w') as f:
    f.writelines(f"{path}\n" for path in model_paths)

  self.get_booster().save_model(fname)
  self.get_booster().save_model(fname)
  self.get_booster().save_model(fname)


In [None]:

# Compute SHAP values for every per-target model on the training features and
# export them as one wide table with (target, feature) MultiIndex columns.

feature_names = tf_expression.columns.tolist()
target_names = gene_expression.columns.tolist()

# Matches a base_score stored as a bracketed string, e.g.
# "base_score": "[2.991465E-1]". Compiled once because it is reused for
# every model.
_BASE_SCORE_RE = re.compile(r'"base_score"\s*:\s*"\[([0-9Ee\+\-\.]+)\]"')


def _normalize_base_score_json(model_json):
    """Rewrite bracketed-string base_score entries as plain numbers.

    Parameters
    ----------
    model_json : str
        Text of a model file written by ``Booster.save_model`` with a
        ``.json`` suffix.

    Returns
    -------
    tuple of (str, int)
        The rewritten text and the number of substitutions performed.
    """
    return _BASE_SCORE_RE.subn(
        lambda m: f'"base_score": {float(m.group(1))}', model_json)


def _reload_with_normalized_base_score(booster):
    """Round-trip a booster through a temporary JSON file, normalizing base_score.

    The booster is saved to a temporary ``.json`` file, bracketed-string
    base_score values are replaced with plain numbers, and a new
    ``xgb.Booster`` is loaded from the (possibly rewritten) file. The
    temporary file is always removed, even when saving or loading fails.

    NOTE(review): this cell has not been executed in the saved notebook.
    Confirm that XGBoost accepts an unquoted numeric base_score on reload and
    that the model SHAP reads afterwards keeps the normalized value.
    """
    # delete=False plus an immediate close(): only the file name is needed,
    # so the file can be re-opened by name below.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
    tmp.close()
    try:
        booster.save_model(tmp.name)

        # Context manager so the read handle is closed deterministically
        # (the previous bare open(...).read() left it to the garbage collector).
        with open(tmp.name, "r", encoding="utf-8") as fr:
            raw_json = fr.read()

        fixed_json, n_subs = _normalize_base_score_json(raw_json)
        if n_subs > 0:
            with open(tmp.name, "w", encoding="utf-8") as fw:
                fw.write(fixed_json)

        # reload normalized Booster
        return xgb.Booster(model_file=tmp.name)
    finally:
        os.remove(tmp.name)


# NOTE(review): one DataFrame of SHAP values is kept in memory per target
# before concatenation; the training cell's progress bar shows 3960 targets,
# so the memory and CSV size may be substantial. Verify this is feasible.
shap_values_per_target = {}
for i, model in enumerate(models):
    # Accept either a scikit-learn style wrapper or a raw Booster.
    booster = model.get_booster() if hasattr(model, "get_booster") else model
    booster = _reload_with_normalized_base_score(booster)

    explainer = shap.TreeExplainer(booster)
    sv = explainer.shap_values(x_train)  # expected shape: (n_samples, n_features)

    # Fall back to a positional label if there are more models than target names.
    target_key = target_names[i] if i < len(target_names) else f"target_{i}"
    shap_values_per_target[target_key] = pd.DataFrame(sv, columns=feature_names)

# Combine into one table with MultiIndex columns (target, feature)
shap_df = pd.concat(shap_values_per_target, axis=1)
shap_df.to_csv('/Users/christianlangridge/Desktop/Zhang-Lab/Zhang Lab Data/Saved models/XGBoost/shap_values_table.csv', index=False)
shap_df.head()