In [None]:
import multiprocessing as mp
import os
import pickle

import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from sklearn.base import clone
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from tqdm.contrib.concurrent import process_map, thread_map
from tqdm.notebook import tqdm

In [None]:
# Load the two full tab-separated tables. Both files are parsed with the same
# options: tab delimiter, first row used as the column header.
tsv_options = dict(sep='\t', header=0)

gene_expression = pd.read_csv(
    '~/Desktop/Zhang-Lab/Zhang Lab Data/Full data files/Geneexpression (full).tsv',
    **tsv_options,
)
tf_expression = pd.read_csv(
    '~/Desktop/Zhang-Lab/Zhang Lab Data/Full data files/TF(full).tsv',
    **tsv_options,
)

In [None]:
# Partition the rows into train / test / validation sets (70% / 20% / 10%)
# and convert every partition to a NumPy array.
x = tf_expression    # model inputs
y = gene_expression  # regression targets

# Inputs and targets joined column-wise into a single DataFrame.
combined_data = pd.concat([x, y], axis=1)

# Stage 1: keep 70% of the rows for training; the remaining 30% is held out
# in x_temp / y_temp.
x_train, x_temp, y_train, y_temp = train_test_split(x, y, test_size=0.3, random_state=42)

# Stage 2: divide the held-out rows 2/3 : 1/3. The larger share becomes the
# test set (20% of all rows), the smaller share the validation set (10%).
x_test, x_val, y_test, y_val = train_test_split(x_temp, y_temp, test_size=1/3, random_state=42)

# Replace each DataFrame partition with its NumPy representation.
# x_temp / y_temp are intentionally left as DataFrames.
x_train, y_train, x_val, y_val, x_test, y_test = [
    part.to_numpy()
    for part in (x_train, y_train, x_val, y_val, x_test, y_test)
]

In [None]:
# Random-forest configuration used for the per-target regressions.
rf_settings = dict(
    n_estimators=25,     # number of trees (original note: 75 less than XGBRF().v2)
    max_depth=5,         # cap on the depth of each tree
    min_samples_leaf=3,  # minimum samples per leaf (RF-specific setting)
    n_jobs=-1,           # use all available CPU cores
    random_state=42,     # fixed seed
)
regressor = RandomForestRegressor(**rf_settings)

In [None]:
# Train one independent random forest per target column (with a tqdm progress
# bar), then predict every target on the test set.
#
# `regressor.fit(...)` returns the estimator itself. Fitting the shared
# `regressor` inside the loop would therefore fill `models` with references to
# one single object that is refit on every iteration, so every "model" would
# end up predicting the LAST target only. `clone(regressor)` creates a fresh,
# unfitted estimator with the same hyperparameters for each target.
models = []
n_targets = y_train.shape[1]

pbar = tqdm(range(n_targets), desc="Training targets", unit="target",
            bar_format="{l_bar}{bar} {n_fmt}/{total_fmt} [{percentage:3.0f}%]")

for i in pbar:
    # Fit a separate copy of the template regressor on target column i.
    est = clone(regressor).fit(x_train, y_train[:, i])
    models.append(est)
    pbar.set_postfix({'target': i})

# Column j of `predictions` holds the test-set predictions of models[j],
# i.e. the forest trained on target column j.
predictions = np.column_stack([m.predict(x_test) for m in models])

display(predictions)