In [1]:
# testing uncentered-trained RNN 
# importing pytorch
import torch
import pandas as pd
from RNN_reconstructor import load_model_from_checkpoint
from scipy.stats import pearsonr


In [4]:
# running data config file
%run '/home/christianl/Zhang-Lab/Zhang Lab Code/Boilerplate_datahandling/Remote boilerplate/uncentered_RNN_remote.py'


In [14]:
#!/usr/bin/env python3
"""
COMPREHENSIVE DIAGNOSTIC: Find exact feature mismatch between RNN and current data

This script will tell you EXACTLY what 1197 TFs the RNN was trained on,
so you can align MLR/XGBoost to use the same features for fair benchmarking.
"""

import pandas as pd
import numpy as np
import torch

print("=" * 80)
print("COMPREHENSIVE RNN FEATURE DIAGNOSTIC")
print("=" * 80)

# ============================================================================
# STEP 1: Load your current data
# ============================================================================
print("\n[STEP 1] Loading current data files...")

tf_expression = pd.read_csv('/home/christianl/Zhang-Lab/Zhang Lab Data/Full data files/TF(full).tsv', 
                            sep='\t', header=0, index_col=0)
net = pd.read_csv('/home/christianl/Zhang-Lab/Zhang Lab Data/Full data files/network(full).tsv', sep='\t')

expression_tfs = set(tf_expression.columns)
network_tfs = set(net['TF'].unique())

print(f"✓ Expression data loaded: {tf_expression.shape}")
print(f"  - TFs in expression data: {len(expression_tfs)}")
print(f"  - Includes 'TF' gene: {'TF' in expression_tfs}")

print(f"\n✓ Network data loaded: {net.shape}")
print(f"  - Unique TFs in network: {len(network_tfs)}")
print(f"  - Includes 'TF' gene: {'TF' in network_tfs}")

# ============================================================================
# STEP 2: Analyze current data intersection
# ============================================================================
print(f"\n" + "=" * 80)
print("[STEP 2] Current data intersection analysis")
print("=" * 80)

in_network_not_in_data = network_tfs - expression_tfs
in_data_not_in_network = expression_tfs - network_tfs
usable_tfs_current = expression_tfs & network_tfs

print(f"\nTFs in NETWORK but NOT in expression data: {len(in_network_not_in_data)}")
if in_network_not_in_data:
    print(f"  {sorted(in_network_not_in_data)}")

print(f"\nTFs in EXPRESSION DATA but NOT in network: {len(in_data_not_in_network)}")
if in_data_not_in_network:
    print(f"  {sorted(in_data_not_in_network)}")

print(f"\nCurrently usable TFs (intersection): {len(usable_tfs_current)}")

# ============================================================================
# STEP 3: Load RNN checkpoint to see what it expects
# ============================================================================
print(f"\n" + "=" * 80)
print("[STEP 3] Loading RNN checkpoint")
print("=" * 80)

checkpoint_path = '/home/christianl/Zhang-Lab/Zhang Lab Data/Saved models/RNN/uncentered_data_RNN/signaling_model.v1.pt'
checkpoint = torch.load(checkpoint_path, map_location='cpu', weights_only=False)

rnn_input_size = checkpoint['state_dict']['input_layer.weights'].shape[0]
rnn_output_size = checkpoint['state_dict']['output_layer.weights'].shape[0]

print(f"\n✓ Checkpoint loaded")
print(f"  - RNN expects INPUT features: {rnn_input_size}")
print(f"  - RNN expects OUTPUT features: {rnn_output_size}")

# ============================================================================
# STEP 4: Calculate the mystery
# ============================================================================
print(f"\n" + "=" * 80)
print("[STEP 4] THE MYSTERY")
print("=" * 80)

print(f"\nCurrent situation:")
print(f"  - Your expression data has: {len(expression_tfs)} TFs (including 'TF')")
print(f"  - Your network has: {len(network_tfs)} TFs (excluding 'TF')")
print(f"  - Current intersection: {len(usable_tfs_current)} TFs")
print(f"  - RNN was trained on: {rnn_input_size} TFs")
print(f"\n  Difference: {rnn_input_size} - {len(usable_tfs_current)} = {rnn_input_size - len(usable_tfs_current)} TFs")

# ============================================================================
# STEP 5: Reconstruct what the original training data must have been
# ============================================================================
print(f"\n" + "=" * 80)
print("[STEP 5] RECONSTRUCTING ORIGINAL TRAINING DATA")
print("=" * 80)

print(f"\nThe RNN was trained on {rnn_input_size} TFs.")
print(f"Your expression data has {len(expression_tfs)} TFs.")
print(f"Difference: {len(expression_tfs)} - {rnn_input_size} = {len(expression_tfs) - rnn_input_size}")

if len(expression_tfs) > rnn_input_size:
    print(f"\n⚠️  FINDING: Expression data has MORE TFs than RNN was trained on!")
    print(f"   This means {len(expression_tfs) - rnn_input_size} TF(s) were filtered out during training.")
    print(f"\n   Since 'TF' is in expression but NOT in current network,")
    print(f"   'TF' was likely one of the filtered TFs.")
    
elif len(expression_tfs) < rnn_input_size:
    print(f"\n⚠️  FINDING: Expression data has FEWER TFs than RNN expects!")
    print(f"   This means the original training had {rnn_input_size - len(expression_tfs)} additional TFs.")
    
else:
    print(f"\n✓ Expression data matches RNN input size exactly!")

# ============================================================================
# STEP 6: Determine the exact feature set for benchmarking
# ============================================================================
print(f"\n" + "=" * 80)
print("[STEP 6] BENCHMARKING STRATEGY")
print("=" * 80)

print(f"\nFor FAIR benchmarking, all models must use the SAME features.")
print(f"\nYou have THREE options:\n")

print(f"OPTION 1: Use current intersection ({len(usable_tfs_current)} TFs)")
print(f"  ✓ Pros: Uses only TFs that are in BOTH your data and network")
print(f"  ✗ Cons: Cannot use existing RNN checkpoint (expects {rnn_input_size})")
print(f"  → Action: Retrain ALL models (MLR, XGBoost, RNN) on {len(usable_tfs_current)} TFs")

print(f"\nOPTION 2: Try to match RNN's original {rnn_input_size} TFs")
print(f"  ✓ Pros: Can use existing RNN checkpoint")
print(f"  ✗ Cons: Need to figure out which {rnn_input_size} TFs the RNN used")
print(f"  → Action: Reconstruct or find original network.tsv that was used")

print(f"\nOPTION 3: Retrain RNN on current {len(usable_tfs_current)} TFs")
print(f"  ✓ Pros: Clear, reproducible benchmark using current data")
print(f"  ✗ Cons: Lose existing RNN weights, need to retrain")
print(f"  → Action: Retrain RNN with current network.tsv, then train MLR/XGBoost")

# ============================================================================
# STEP 7: Generate the correct feature list for current data
# ============================================================================
print(f"\n" + "=" * 80)
print("[STEP 7] RECOMMENDED APPROACH")
print("=" * 80)

print(f"\nRECOMMENDATION: Use OPTION 1 or 3")
print(f"\nHere's the exact code to use for fair benchmarking:")
print(f"\n" + "-" * 80)
print("""
# Load data WITH index_col=0
tf_expression = pd.read_csv('TF(full).tsv', sep='\\t', header=0, index_col=0)

# Filter to network TFs ONLY (don't manually drop anything!)
net = pd.read_csv('network(full).tsv', sep='\\t')
network_tfs = set(net['TF'].unique())
usable_tfs = [tf for tf in tf_expression.columns if tf in network_tfs]

# Use only the intersection
x = tf_expression[usable_tfs]  # Shape: (samples, {len(usable_tfs_current)})

# This gives you {len(usable_tfs_current)} features for ALL models
# Train MLR, XGBoost, and RNN on these SAME {len(usable_tfs_current)} TFs
""")
print("-" * 80)

print(f"\nThis approach:")
print(f"  1. Loads data correctly (with index_col=0)")
print(f"  2. Filters to network TFs automatically")
print(f"  3. Gives ALL models the SAME {len(usable_tfs_current)} features")
print(f"  4. Ensures fair comparison")

# ============================================================================
# STEP 8: Save the usable TF list for reference
# ============================================================================
print(f"\n" + "=" * 80)
print("[STEP 8] Saving reference files")
print("=" * 80)

# Save the current usable TF list
usable_tfs_sorted = sorted(usable_tfs_current)
output_file = '/home/christianl/Zhang-Lab/Zhang Lab Data/usable_tfs_for_benchmarking.txt'
with open(output_file, 'w') as f:
    f.write(f"# Usable TFs for benchmarking\n")
    f.write(f"# Total: {len(usable_tfs_sorted)}\n")
    f.write(f"# Generated from intersection of expression data and network.tsv\n")
    f.write(f"#\n")
    for tf in usable_tfs_sorted:
        f.write(f"{tf}\n")

print(f"\n✓ Saved usable TF list to: {output_file}")
print(f"  ({len(usable_tfs_sorted)} TFs)")

# ============================================================================
# SUMMARY
# ============================================================================
print(f"\n" + "=" * 80)
print("SUMMARY")
print("=" * 80)

print(f"""
KEY FINDINGS:
1. Your expression data has {len(expression_tfs)} TFs (including 'TF' gene)
2. Your network has {len(network_tfs)} TFs (excluding 'TF' gene)  
3. Current intersection: {len(usable_tfs_current)} TFs
4. RNN checkpoint expects: {rnn_input_size} TFs
5. Mismatch: {abs(rnn_input_size - len(usable_tfs_current))} TFs

FOR FAIR BENCHMARKING:
- Use the {len(usable_tfs_current)} TFs in the intersection
- Train ALL models (MLR, XGBoost, RNN) on these SAME features
- Your updated boilerplate script is already correct for this!

NEXT STEPS:
1. Either retrain the RNN on {len(usable_tfs_current)} TFs
   OR
2. Try to find/reconstruct the original network.tsv that gave {rnn_input_size} TFs

Current boilerplate script produces: {len(usable_tfs_current)} features ✓
Use this for MLR and XGBoost training ✓
Retrain RNN with same {len(usable_tfs_current)} features for fair comparison ✓
""")

print("=" * 80)
print("DIAGNOSTIC COMPLETE")
print("=" * 80)

COMPREHENSIVE RNN FEATURE DIAGNOSTIC

[STEP 1] Loading current data files...
✓ Expression data loaded: (15935, 1197)
  - TFs in expression data: 1197
  - Includes 'TF' gene: True

✓ Network data loaded: (1153904, 3)
  - Unique TFs in network: 1200
  - Includes 'TF' gene: False

[STEP 2] Current data intersection analysis

TFs in NETWORK but NOT in expression data: 4
  ['ARNTL', 'HKR1', 'HOMEZ', 'T']

TFs in EXPRESSION DATA but NOT in network: 1
  ['TF']

Currently usable TFs (intersection): 1196

[STEP 3] Loading RNN checkpoint

✓ Checkpoint loaded
  - RNN expects INPUT features: 1197
  - RNN expects OUTPUT features: 16100

[STEP 4] THE MYSTERY

Current situation:
  - Your expression data has: 1197 TFs (including 'TF')
  - Your network has: 1200 TFs (excluding 'TF')
  - Current intersection: 1196 TFs
  - RNN was trained on: 1197 TFs

  Difference: 1197 - 1196 = 1 TFs

[STEP 5] RECONSTRUCTING ORIGINAL TRAINING DATA

The RNN was trained on 1197 TFs.
Your expression data has 1197 TFs.
Dif

In [15]:
import pandas as pd
import numpy as np

# Load expression data correctly
tf_expression = pd.read_csv(
    '/home/christianl/Zhang-Lab/Zhang Lab Data/Full data files/TF(full).tsv',
    sep='\t',
    header=0,
    index_col=0
)

tf_col = tf_expression['TF']

print("=" * 80)
print("SANITY CHECK FOR 'TF' FEATURE")
print("=" * 80)

# 1. Missing values
print("\n[1] Missing values")
print(f"NaNs: {tf_col.isna().sum()} / {len(tf_col)}")

# 2. Variance check
print("\n[2] Variance")
print(f"Variance: {tf_col.var()}")
print(f"Unique values: {tf_col.nunique()}")

# 3. Basic statistics
print("\n[3] Distribution statistics")
print(tf_col.describe())

# 4. Compare scale with other TFs
print("\n[4] Scale comparison vs other TFs")
other_tfs = tf_expression.drop(columns=['TF'])

print(f"TF mean:        {tf_col.mean():.4f}")
print(f"Other TF mean:  {other_tfs.values.mean():.4f}")
print(f"TF std:         {tf_col.std():.4f}")
print(f"Other TF std:   {other_tfs.values.std():.4f}")

# 5. Check if TF is constant or binary
print("\n[5] Value inspection")
print("First 10 TF values:")
print(tf_col.head(10).to_list())

print("\nMin / Max:")
print(tf_col.min(), tf_col.max())

# 6. Correlation sanity (optional but informative)
print("\n[6] Correlation with other TFs (top 5 absolute)")
corrs = tf_expression.corr()['TF'].drop('TF').abs().sort_values(ascending=False)
print(corrs.head(5))

print("\n" + "=" * 80)


SANITY CHECK FOR 'TF' FEATURE

[1] Missing values
NaNs: 0 / 15935

[2] Variance
Variance: 1.1127584013917176
Unique values: 15162

[3] Distribution statistics
count    15935.000000
mean         1.911532
std          1.054874
min          0.000000
25%          1.053988
50%          2.315812
75%          2.745821
max          3.657355
Name: TF, dtype: float64

[4] Scale comparison vs other TFs
TF mean:        1.9115
Other TF mean:  0.5526
TF std:         1.0549
Other TF std:   0.5720

[5] Value inspection
First 10 TF values:
[0.9283725410804732, 0.5122308221456753, 1.9842021704106672, 1.962288404738203, 1.9692921684790228, 1.974130038763108, 1.925538077334, 2.4621607546884774, 0.1577601234054157, 1.4715700506346476]

Min / Max:
0.0 3.657354856936767

[6] Correlation with other TFs (top 5 absolute)
PROX1      0.797273
HLF        0.791430
MLXIPL     0.772802
CREB3L3    0.759595
SALL1      0.738318
Name: TF, dtype: float64



In [7]:
# reconstructing trained RNN from checkpoint file with learned weights, 
# the fixed network.tsv and the reconstructor script with the class initalisations 
loaded_RNN = load_model_from_checkpoint(
                checkpoint_path='/home/christianl/Zhang-Lab/Zhang Lab Data/Saved models/RNN/uncentered_data_RNN/signaling_model.v1.pt',
                net_path='/home/christianl/Zhang-Lab/Zhang Lab Data/Full data files/network(full).tsv',
                X_in_df=x_test_df,  # passing as df not tensors
                y_out_df=y_test_df,  # passing as df not tensors
                device='cpu',
                use_exact_training_params=True)

LOADING MODEL - EXACT TRAINING SCRIPT SEQUENCE

1. Loading checkpoint from: /home/christianl/Zhang-Lab/Zhang Lab Data/Saved models/RNN/uncentered_data_RNN/signaling_model.v1.pt

2. Loading network from: /home/christianl/Zhang-Lab/Zhang Lab Data/Full data files/network(full).tsv
   Network shape: (1153904, 3)
   Network columns: ['TF', 'Gene', 'Interaction']

3. Formatting network...

4. Using EXACT benchmark.py parameters
   projection_amplitude_in: 1.2
   projection_amplitude_out: 1.2
   bionet_params: {'target_steps': 150, 'max_steps': 10, 'exp_factor': 50, 'tolerance': 1e-20, 'leak': 0.01}

5. Initializing model with DataFrames...
   Input X_in shape: (3187, 1196)
   Input y_out shape: (3187, 16100)
  Filtered X_in: 1196 → 1196 features
  Filtered y_out: 16100 → 16100 features
   ✓ Model initialized (data automatically filtered)

6. Converting DataFrames to tensors...
   ✓ Tensors created

7. Applying training settings...
   ✓ Set input_layer.weights.requires_grad = False
   ✓ Appli

RuntimeError: Error(s) in loading state_dict for SignalingModel:
	size mismatch for input_layer.weights: copying a param with shape torch.Size([1197]) from checkpoint, the shape in current model is torch.Size([1196]).

In [6]:
# looking across test dataset to see what the RNN's predictive ability is on my data
# y_hat -> predictions made only across the 16,100 target genes we are looking at in our final output (returned)
# y_full -> predictions across all 16,371 network nodes in the .tsv file, including hidden states (intermediary calculations, returned)
# torch.no_grad() -> command to look across but not change the RNN's learned weights

with torch.no_grad():
    Y_hat, Y_full = loaded_RNN(loaded_RNN.X_in)
    
print(f"\nPredictions shape: {Y_hat.shape}")
print(f"Hidden states shape: {Y_full.shape}")


Predictions shape: torch.Size([3187, 16100])
Hidden states shape: torch.Size([3187, 16371])


In [None]:
# calculating Pearson correlation to check how well model is predicting results vs. a groundtruth
# Y_hat -> the cleaned predictions only including the target genes synched with the .tsv file
# loaded_RNN.y_out -> the test set 'y_test_df' with exact same dimensions, used here as a baseline to compare performance 
# data is run through detach().cpu().numpy() to convert from Pytorch tensors to Numpy arrays
# flatten() compresses multidimensional data into a 1D array
# Agg Pearson correlation coefficient of 0.8587 between y_test and y_hat predictions 

pr, _ = pearsonr(
    loaded_RNN.y_out.detach().flatten().cpu().numpy(),
    Y_hat.detach().flatten().cpu().numpy())

print(f"\n" + "=" * 70)
print(f"RESULTS")
print("=" * 70)
print(f"Pearson correlation coefficient: {pr:.4f}")



RESULTS
Pearson correlation coefficient: 0.8587


In [16]:
# saving predictions in an output file
output_file = "/home/christianl/Zhang-Lab/Zhang Lab Data/uncenteredRNN_on_uncentereddata_predictions.tsv"
pd.DataFrame(Y_hat.detach().cpu().numpy()).to_csv(
    output_file, sep="\t", index=False, header=False
)
print(f"\nPredictions saved to: {output_file}")


Predictions saved to: /home/christianl/Zhang-Lab/Zhang Lab Data/uncenteredRNN_on_uncentereddata_predictions.tsv


#### Retrying the code but with mean-centered data to check differences in Agg PCC ####

In [12]:
%run '/home/christianl/Zhang-Lab/Zhang Lab Code/Remote boilerplate/centered_RNN_remote.py'

In [13]:
centered_loaded_RNN = load_model_from_checkpoint(
                checkpoint_path='/home/christianl/Zhang-Lab/Zhang Lab Data/Saved models/RNN/signaling_model.v1.pt',
                net_path='/home/christianl/Zhang-Lab/Zhang Lab Data/Full data files/network(full).tsv',
                X_in_df=x_test_centered_df,  # passing as df not tensors (centered)
                y_out_df=y_test_centered_df,  # passing as df not tensors (centered)
                device='cpu',
                use_exact_training_params=True)

LOADING MODEL - EXACT TRAINING SCRIPT SEQUENCE

1. Loading checkpoint from: /home/christianl/Zhang-Lab/Zhang Lab Data/Saved models/RNN/signaling_model.v1.pt

2. Loading network from: /home/christianl/Zhang-Lab/Zhang Lab Data/Full data files/network(full).tsv
   Network shape: (1153904, 3)
   Network columns: ['TF', 'Gene', 'Interaction']

3. Formatting network...

4. Using EXACT benchmark.py parameters
   projection_amplitude_in: 1.2
   projection_amplitude_out: 1.2
   bionet_params: {'target_steps': 150, 'max_steps': 10, 'exp_factor': 50, 'tolerance': 1e-20, 'leak': 0.01}

5. Initializing model with DataFrames...
   Input X_in shape: (3187, 1198)
   Input y_out shape: (3187, 16101)
  Filtered X_in: 1198 → 1197 features
  Filtered y_out: 16101 → 16100 features
   ✓ Model initialized (data automatically filtered)

6. Converting DataFrames to tensors...
   ✓ Tensors created

7. Applying training settings...
   ✓ Set input_layer.weights.requires_grad = False
   ✓ Applied prescale_weights(

In [14]:
with torch.no_grad():
    Y_hat_centered, Y_full_centered = centered_loaded_RNN(loaded_RNN.X_in)
    
print(f"\nPredictions shape: {Y_hat_centered.shape}")
print(f"Hidden states shape: {Y_full_centered.shape}")


Predictions shape: torch.Size([3187, 16100])
Hidden states shape: torch.Size([3187, 16371])


In [None]:
# As expected, performance drop significantly with PCC dropping to 0.3320 when comparing 
# a model trained on uncentered data with data that has been mean-centered
# This illustrates how important a unified data preprocessing step is for benchmarking 

pr, _ = pearsonr(
    centered_loaded_RNN.y_out.detach().flatten().cpu().numpy(),
    Y_hat_centered.detach().flatten().cpu().numpy())

print(f"\n" + "=" * 70)
print(f"RESULTS")
print("=" * 70)
print(f"Pearson correlation coefficient on mean-centered data (uncentered-data training) : {pr:.4f}")


RESULTS
Pearson correlation coefficient on mean-centered data (uncentered-data training) : 0.3320


In [17]:
# saving predictions in an output file
output_file_1 = "/home/christianl/Zhang-Lab/Zhang Lab Data/uncenteredRNN_on_centereddata_predictions.tsv"
pd.DataFrame(Y_hat_centered.detach().cpu().numpy()).to_csv(
    output_file_1, sep="\t", index=False, header=False
)
print(f"\nPredictions saved to: {output_file_1}")


Predictions saved to: /home/christianl/Zhang-Lab/Zhang Lab Data/uncenteredRNN_on_centereddata_predictions.tsv
