**This notebook contains the figure-generating function (and supporting diagnostics) for Fig 2.A**

In [None]:
import os
from pathlib import Path
import json 

# 1. Find the Repo Root dynamically
# Walks up the parent folders of the current working directory until one
# containing README.md is found. Path.parents never includes the working
# directory itself, so it is tried last as a fallback: this keeps the original
# resolution order while also working when the notebook is launched from the
# repo root.
_cwd = Path.cwd()
_root = next(
    (p for p in (*_cwd.parents, _cwd) if (p / "README.md").exists()),
    None,
)
if _root is None:
    # A bare next() would raise an uninformative StopIteration here.
    raise FileNotFoundError(
        f"Could not locate the repo root: no README.md found in {_cwd} "
        "or any of its parent folders."
    )
REPO_ROOT = str(_root)

# 2. Add to sys.path so standard 'import' statements work
import sys
if REPO_ROOT not in sys.path:
    sys.path.insert(0, REPO_ROOT)

# Load the Data Root from the JSON file.
# The encoding is pinned so the file is read the same way on every platform.
with open(Path(REPO_ROOT) / "data_config.json", "r", encoding="utf-8") as f:
    config = json.load(f)
DATA_ROOT = config["DATA_ROOT"]

print(f"Data is being pulled from: {DATA_ROOT}")
print(f"Repo root identified as: {REPO_ROOT}")

In [None]:
############### GENERATE TEST/TRAIN PREDICTIONS ###############

# IPython %run magic: executes the script as a program and then copies the
# names it defined into this notebook's namespace.
# "$REPO_ROOT" is expanded by IPython from the REPO_ROOT variable set in the first cell.
%run "$REPO_ROOT/config/predictions/model_train_test_predictions.py"

In [None]:
############### LOADING DATA ###############

# IPython %run magic: executes the script and copies the names it defined into
# this notebook's namespace ("$REPO_ROOT" is expanded from the Python variable).
# NOTE(review): later cells use names that are never defined in this notebook
# (np, plt, y_test, x_test, predictions_test, MODEL_COLORS, FIGSIZE_TRIPLE, DPI,
# set_publication_style, compute_metrics, compute_metrics_per_gene_test,
# mlr_loaded, mlr_y_pred, xgbrf_y_pred, rnn_y_pred, r2_score, pearsonr,
# mean_squared_error, mean_absolute_error) - presumably they are supplied by
# this script or the one run in the previous cell; confirm.
%run "$REPO_ROOT/config/predictions/model_load.py"

In [None]:
############### DIAGNOSTICS ###############

# 1. Create the mask (y_test is a DataFrame, so we convert to values)
# Boolean array, True where the observed value is below zero; the plotting
# code later in this cell indexes the flattened arrays with it.
mask = y_test.values < 0

# 2. Update the function to handle flattening automatically
def get_local_metrics(y_true, y_pred, name, mask=None):
    """
    Print and return error metrics restricted to a subset of the observations.

    Parameters
    ----------
    y_true : array-like
        Observed values (NumPy array, DataFrame, nested list, ...).
    y_pred : array-like
        Predicted values with the same number of elements as `y_true`.
    name : str
        Model label used in the printed header.
    mask : array-like of bool, optional
        Selects which entries are evaluated. When omitted, the entries where
        `y_true < 0` are used, so the function no longer depends on a global
        `mask` variable being defined beforehand.

    Returns
    -------
    dict
        Keys `n_samples`, `mse`, `pred_variance` and `min_pred`. The three
        metrics are NaN when the selection is empty.

    Raises
    ------
    ValueError
        If `y_pred` or `mask` does not have as many elements as `y_true`.
    """
    # np.asarray lets DataFrames and lists through; calling .ravel() directly
    # only works for objects that implement it.
    y_t = np.asarray(y_true).ravel()
    y_p = np.asarray(y_pred).ravel()
    if y_t.shape != y_p.shape:
        raise ValueError(
            f"y_true has {y_t.size} elements but y_pred has {y_p.size}"
        )

    if mask is None:
        mask_flat = y_t < 0
    else:
        mask_flat = np.asarray(mask, dtype=bool).ravel()
        if mask_flat.shape != y_t.shape:
            raise ValueError(
                f"mask has {mask_flat.size} elements but y_true has {y_t.size}"
            )

    # Apply the mask to both
    y_t_masked = y_t[mask_flat]
    y_p_masked = y_p[mask_flat]
    n_samples = int(y_t_masked.size)

    print(f"--- {name} (Low Range Only) ---")
    print(f"Samples in range: {n_samples}")

    if n_samples == 0:
        # np.min raises on an empty array and np.mean/np.var emit warnings,
        # so report the empty selection explicitly instead.
        print("No samples selected; metrics are undefined.\n")
        return {
            "n_samples": 0,
            "mse": float("nan"),
            "pred_variance": float("nan"),
            "min_pred": float("nan"),
        }

    mse = float(np.mean((y_t_masked - y_p_masked) ** 2))
    pred_variance = float(np.var(y_p_masked))
    min_pred = float(np.min(y_p_masked))

    print(f"MSE: {mse:.4f}")
    print(f"Prediction Variance: {pred_variance:.6f}")
    print(f"Min Prediction: {min_pred:.4f}\n")

    return {
        "n_samples": n_samples,
        "mse": mse,
        "pred_variance": pred_variance,
        "min_pred": min_pred,
    }

# Display label paired with the matching key in the predictions_test dictionary
_low_range_models = [
    ("MLR", "MLR"),
    ("XGBRF", "XGBRFRegressor"),
    ("RNN", "RNN"),
]

# Print the low-range metrics for each model, in panel order
for _label, _key in _low_range_models:
    get_local_metrics(y_test.values, predictions_test[_key], _label)

# 3. Visualize
plt.figure(figsize=(15, 5))
# One (panel title, predictions) pair per subplot
plot_info = [(_label, predictions_test[_key]) for _label, _key in _low_range_models]

mask_flat = mask.ravel()
y_test_flat = y_test.values.ravel()
# The observed values are the same in every panel, so select them once
_observed_low = y_test_flat[mask_flat]

for panel, (name, pred) in enumerate(plot_info, start=1):
    plt.subplot(1, 3, panel)
    pred_flat = pred.ravel()

    plt.scatter(
        _observed_low,
        pred_flat[mask_flat],
        alpha=0.3,
        s=1,
        color=MODEL_COLORS.get(name, 'black'),
    )
    plt.axhline(0, color='red', linestyle='--', label='Zero Floor')
    plt.title(f"{name} Performance (Observed < 0)")
    plt.xlabel("Observed Expression (Log10)")
    plt.ylabel("Predicted Expression (Log10)")
    plt.ylim(-2, 2)  # Zoom in on the area around the zero line
    plt.legend()

plt.tight_layout()
plt.show()

In [None]:
############### DIAGNOSTICS ###############

# Pull the raw array out of the DataFrame once and reuse it below
_observed = y_test.values

# Report the extremes of the observed test data
_obs_min = _observed.min()
_obs_max = _observed.max()
print(f"Observed Test Min: {_obs_min:.6f}")
print(f"Observed Test Max: {_obs_max:.6f}")

# Tally how many observed values fall in each range of interest
_n_zero = np.sum(_observed == 0)
_n_negative = np.sum(_observed < 0)
_n_below_tenth = np.sum(_observed < 0.1)
print(f"Samples exactly 0: {_n_zero}")
print(f"Samples < 0:       {_n_negative}")
print(f"Samples < 0.1:     {_n_below_tenth}")

In [None]:
############### DIAGNOSTICS ###############

# For every model, report how many predictions fall below zero and the lowest value predicted
for model_label, model_pred in predictions_test.items():
    below_zero = np.sum(model_pred < 0)
    below_zero_pct = below_zero / model_pred.size * 100
    lowest = np.min(model_pred)

    print(
        f"{model_label}:",
        f"  Samples < 0: {below_zero} ({below_zero_pct:.2f}%)",
        f"  Lowest Prediction: {lowest:.4f}",
        sep="\n",
    )

In [None]:
############### UNIT TEST ###############

# Inspect the column index of y_test: its type and the first five labels
_cols = y_test.columns
print(f"y_test columns type: {type(_cols)}")
print(f"First 5 column names: {_cols[:5].tolist()}")

# Confirm what kind of container y_test itself is (DataFrame vs. numpy array)
print(f"y_test type: {type(y_test)}")

In [None]:
############### UNIT TEST ###############

# Print the source code of compute_metrics_per_gene_test for inspection.
# NOTE(review): the function is not defined in this notebook - presumably it
# comes from one of the %run scripts above; confirm.
import inspect
print(inspect.getsource(compute_metrics_per_gene_test))

In [None]:
############### DIAGNOSTICS ###############

# diagnostics of MLR r2 scores

# checking to ensure predictions and ground truth are comparable (they are: same shape)
assert y_test.shape == mlr_y_pred.shape

# compare r2 values
# NOTE(review): assuming mlr_loaded is a scikit-learn regressor, .score() returns the uniform-average R2:
# each gene's R2 is computed separately on the 2D (unflattened) arrays and the per-gene values are then
# averaged with equal weight - it is NOT the variance-weighted R2.
# mlr_loaded.score: 0.7986089431408789
print("mlr_loaded.score:", mlr_loaded.score(x_test, y_test))

# compute_metrics() reports one global R2 over all genes. These notes describe it both as "flattened" and as
# "variance-weighted"; in scikit-learn those are different quantities, and compute_metrics is defined outside
# this notebook - TODO confirm which one it implements.
# compute_metrics r2: 0.9323357932034118

# compute_metrics r2 > .score() r2 -> .score() with default settings does a uniform average over genes, whereas the
# global R2 lets genes with larger variance dominate.
# The recorded values differ by ~0.134 (0.9323 - 0.7986); an earlier version of this note quoted 0.365, which does not
# match them. A positive relation between variance and R2 indicates that the model performs better on genes with
# higher variance (more distinct expression patterns) than on lower-variance ones (i.e. housekeeping, "silenced", etc.)

metrics_flat_mlr = compute_metrics(y_test.values, mlr_y_pred)
print("compute global R²:", metrics_flat_mlr['r2'])

# compute_metrics_per_gene_test() looks at the individual r2 at per-gene resolution (using DFs to maintain biological relevance of each column)

# pasted output, presumably from an earlier run:
#compute_metrics_per_gene r2: 0        0.861767
#1        0.747973
#2        0.373796
#3        0.769920
#4        0.859807
#           ...   
#16095    0.736619
#16096    0.866326
#16097    0.924248
#16098    0.912513
#16099    0.900036
#Name: r2, Length: 16100, dtype: float64

metrics_flat_per_gene_mlr = compute_metrics_per_gene_test(y_test, mlr_y_pred)
print("compute_metrics_per_gene R²:", metrics_flat_per_gene_mlr['r2'])

In [None]:
############### DIAGNOSTICS ###############

# same diagnostics for XGBRF.v3 (trained on same x_train)
# checking to ensure predictions and ground truth are comparable (same shape)
assert y_test.shape == xgbrf_y_pred.shape

# computing XGBRF metrics (aggregate and per gene (per model in this case))
# pasted output, presumably from an earlier run:
# compute global R²: 0.9136
metrics_flat_xgbrf = compute_metrics(y_test.values, xgbrf_y_pred)
print(f"compute global R²: {metrics_flat_xgbrf['r2']:.4f}")

# pasted output, presumably from an earlier run:
#compute_metrics_per_gene R²: 0        0.798614
#1        0.732539
#2        0.390864
#3        0.717345
#4        0.819955
#           ...   
#16095    0.677608
#16096    0.801420
#16097    0.883455
#16098    0.861673
#16099    0.863954
#Name: r2, Length: 16100, dtype: float64

metrics_flat_per_gene_xgbrf = compute_metrics_per_gene_test(y_test, xgbrf_y_pred)
print("compute_metrics_per_gene R²:", metrics_flat_per_gene_xgbrf['r2'])


In [None]:
############### DIAGNOSTICS ###############


# same diagnostics for RNN.v1 (used as a reference for data preprocessing of other models)
# checking to ensure predictions and ground truth are comparable (same shape)
assert y_test.shape == rnn_y_pred.shape

# computing RNN metrics (aggregate and per gene (per model in this case))
# pasted output, presumably from an earlier run:
# compute global R²: 0.7366
metrics_flat_rnn = compute_metrics(y_test.values, rnn_y_pred)
print(f"compute global R²: {metrics_flat_rnn['r2']:.4f}")

# pasted output, presumably from an earlier run:
#compute_metrics_per_gene R²: 0        0.249111
#1       -0.058234
#2       -0.235948
#3        0.382344
#4        0.671084
#           ...   
#16095    0.336766
#16096    0.509802
#16097    0.657901
#16098    0.514623
#16099    0.622254
#Name: r2, Length: 16100, dtype: float64

metrics_flat_per_gene_rnn = compute_metrics_per_gene_test(y_test, rnn_y_pred)
print("compute_metrics_per_gene R²:", metrics_flat_per_gene_rnn['r2'])

# lower R² than I would have expected - maybe the test-train splitting was different for model training? 21/01/26

In [None]:
############### PLOTTING FUNCTION ###############

def _fit_summary(y_true, y_pred, y_true_flat, y_pred_flat):
    """Compute every reported statistic for one model exactly once."""
    pearson_r, p_value = pearsonr(y_true_flat, y_pred_flat)
    return {
        'pearson_r': pearson_r,
        'p_value': p_value,
        # Per-output R² weighted by each output's variance (per-output means).
        'r2_variance_weighted': r2_score(y_true, y_pred,
                                         multioutput='variance_weighted'),
        # Plain mean of the per-output R² values.
        'r2_uniform_average': r2_score(y_true, y_pred,
                                       multioutput='uniform_average'),
        'rmse': np.sqrt(mean_squared_error(y_true_flat, y_pred_flat)),
        'mae': mean_absolute_error(y_true_flat, y_pred_flat),
        # R² of the pooled values (single global mean). This is the number the
        # previous version reported under the name 'variance_weighted'.
        'r2_flattened': r2_score(y_true_flat, y_pred_flat),
    }


def _resolve_figure_path(output_path,
                         default_name="figure1_observed_vs_predicted.png"):
    """Turn output_path into a writable file path, creating its folder if needed."""
    if not isinstance(output_path, (str, os.PathLike)):
        # e.g. an already-open file object: hand it to savefig untouched.
        return output_path
    path_str = os.fspath(output_path)
    if not isinstance(path_str, str):
        return output_path
    # A directory (such as the default argument) cannot be written to directly,
    # so place a default file name inside it.
    if path_str.endswith(('/', '\\')) or os.path.isdir(path_str):
        path_str = os.path.join(path_str, default_name)
    folder = os.path.dirname(path_str)
    if folder:
        os.makedirs(folder, exist_ok=True)
    return path_str


def figure_1_observed_vs_predicted(y_true, predictions_dict, 
                                  r2_method='variance_weighted',
                                  output_path=f"{DATA_ROOT}/Saved figures/",
                                  axis_limits=(-8, 6)):
    """
    Plot observed vs. predicted values, one scatter panel per model, save the
    figure and return the summary statistics.

    Each panel shows the scatter, the identity line, a least-squares linear fit
    and a text box with Pearson's R, the selected R², RMSE, MAE and the p-value.
    For every model the number of points outside the axis limits is printed.

    Parameters
    ----------
    y_true : array-like
        Observed values; flattened for the scatter and the pooled statistics.
    predictions_dict : dict
        Maps model name -> predictions with as many elements as `y_true`.
        One panel is drawn per entry, in dictionary order.
    r2_method : {'variance_weighted', 'uniform_average', 'flattened'}
        Which R² is shown in the text box:
        'variance_weighted' -> r2_score(..., multioutput='variance_weighted'),
        'uniform_average'   -> r2_score(..., multioutput='uniform_average'),
        'flattened'         -> r2_score on the pooled (flattened) values.
        Earlier versions computed the flattened R² for 'variance_weighted' as
        well; pass 'flattened' to reproduce those numbers.
    output_path : str, path-like or file object
        Where the figure is written. A directory (including the default) gets
        a default file name appended; missing folders are created.
    axis_limits : tuple of (low, high) or None
        Limits applied to both axes of every panel; None keeps autoscaling.

    Returns
    -------
    dict
        model name -> dict with 'pearson_r', 'p_value', 'r2_variance_weighted',
        'r2_uniform_average', 'rmse', 'mae' and 'r2_flattened'.

    Raises
    ------
    ValueError
        For an unknown `r2_method`, an empty `predictions_dict`, or predictions
        whose element count differs from `y_true`.

    Notes
    -----
    Relies on names defined outside this cell: set_publication_style,
    FIGSIZE_TRIPLE, MODEL_COLORS, DPI, plt, np, pearsonr, r2_score,
    mean_squared_error and mean_absolute_error.
    """
    # Validate everything before a figure is created, so a bad argument does
    # not leave a half-built figure behind.
    r2_choices = {
        'variance_weighted': ('r2_variance_weighted', "R² (var-w)"),
        'uniform_average': ('r2_uniform_average', "R² (uni-avg)"),
        'flattened': ('r2_flattened', "R² (flat)"),
    }
    if r2_method not in r2_choices:
        raise ValueError(f"Unknown r2_method: {r2_method}")
    r2_key, r2_label = r2_choices[r2_method]

    model_names = list(predictions_dict.keys())
    if not model_names:
        raise ValueError("predictions_dict is empty; there is nothing to plot")
    n_models = len(model_names)

    # y_true is the same for every model: flatten it once.
    y_true_flat = np.asarray(y_true).ravel()
    flat_predictions = {}
    for model_name in model_names:
        y_pred_flat = np.asarray(predictions_dict[model_name]).ravel()
        if y_pred_flat.shape != y_true_flat.shape:
            raise ValueError(
                f"Predictions for {model_name!r} have {y_pred_flat.size} "
                f"elements but y_true has {y_true_flat.size}"
            )
        flat_predictions[model_name] = y_pred_flat

    set_publication_style()
    # Keep the established size for three models; scale the width otherwise.
    if n_models == 3:
        figsize = FIGSIZE_TRIPLE
    else:
        figsize = (FIGSIZE_TRIPLE[0] * n_models / 3, FIGSIZE_TRIPLE[1])
    # squeeze=False always returns a 2-D array of axes, even for one model.
    fig, axes = plt.subplots(1, n_models, figsize=figsize, squeeze=False)

    true_min = y_true_flat.min()
    true_max = y_true_flat.max()
    metrics_summary = {}

    for ax, model_name in zip(axes.ravel(), model_names):
        y_pred_flat = flat_predictions[model_name]

        # All statistics are computed once and reused for the text box and the
        # returned summary.
        stats = _fit_summary(y_true, predictions_dict[model_name],
                             y_true_flat, y_pred_flat)
        metrics_summary[model_name] = stats

        # Scatter plot
        ax.scatter(y_true_flat, y_pred_flat, alpha=0.5, s=5, 
                   color=MODEL_COLORS.get(model_name, '#1f77b4'),
                   edgecolors='none')

        # Perfect prediction diagonal line
        min_val = min(true_min, y_pred_flat.min())
        max_val = max(true_max, y_pred_flat.max())
        ax.plot([min_val, max_val], [min_val, max_val], 'k--', 
                lw=1, alpha=0.5, label='Perfect prediction')

        # Fit regression line
        linear_fit = np.poly1d(np.polyfit(y_true_flat, y_pred_flat, 1))
        x_line = np.linspace(true_min, true_max, 100)
        ax.plot(x_line, linear_fit(x_line), color="#000000",
                lw=1.5, alpha=0.8, label='Linear fit')

        # Labels and formatting
        ax.set_xlabel('Observed Expression (Log10)', fontsize=12, fontweight='bold')
        ax.set_ylabel('Predicted Expression (Log10)', fontsize=12, fontweight='bold')
        ax.set_title(model_name, fontsize=13, fontweight='bold')

        # Statistics text box
        stats_text = (f"Pearson's R = {stats['pearson_r']:.4f}\n"
                      f"{r2_label} = {stats[r2_key]:.4f}\n"
                      f"RMSE = {stats['rmse']:.4f}\n"
                      f"MAE = {stats['mae']:.4f}")
        if stats['p_value'] < 0.001:
            textstr = f"{stats_text}\np < 0.001"
        else:
            textstr = f"{stats_text}\np = {stats['p_value']:.3f}"

        ax.text(0.05, 0.95, textstr, transform=ax.transAxes, 
                fontsize=10, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.8))

        ax.legend(loc='lower right', fontsize=9)
        ax.grid(True, alpha=0.3)

        if axis_limits is not None:
            ax.set_xlim(*axis_limits)
            ax.set_ylim(*axis_limits)

        # Report how many points fall outside the visible range
        xlim = ax.get_xlim()
        ylim = ax.get_ylim()
        outside_x = ((y_true_flat < xlim[0]) | (y_true_flat > xlim[1])).sum()
        outside_y = ((y_pred_flat < ylim[0]) | (y_pred_flat > ylim[1])).sum()

        print(f"\n{model_name}:")
        print(f"  Axis limits: x={xlim}, y={ylim}")
        print(f"  Points outside x-range: {outside_x}")
        print(f"  Points outside y-range: {outside_y}")
        print(f"  Data range: x=[{true_min:.2f}, {true_max:.2f}], y=[{y_pred_flat.min():.2f}, {y_pred_flat.max():.2f}]")

    plt.tight_layout()
    save_target = _resolve_figure_path(output_path)
    plt.savefig(save_target, dpi=DPI, bbox_inches='tight')
    print(f"Figure 1 saved to {save_target}")
    plt.show()

    # Return metrics for reference
    return metrics_summary

In [None]:
############### UNIT TEST ###############

# Count the MLR predictions that lie inside the window -1 <= prediction <= 7
# NOTE(review): other cells refer to the MLR predictions as mlr_y_pred;
# confirm that mlr_y_pred_test is defined by the scripts run above.
_at_least_low = mlr_y_pred_test >= -1
_at_most_high = mlr_y_pred_test <= 7
visible_range = _at_least_low & _at_most_high

_n_visible = visible_range.sum()
_n_total = mlr_y_pred_test.size
print(f"MLR predictions in visible range: {_n_visible} / {_n_total}")
print(f"Percentage visible: {100 * _n_visible / _n_total:.2f}%")

In [None]:
############### UNIT TEST ###############

# The values printed are standard deviations (.std()), so the header says so;
# it previously read "Prediction variance:" while reporting the same numbers.
print("Prediction standard deviation:")
for model_name, y_pred in predictions_test.items():
    print(f"{model_name}: {y_pred.std():.4f}")

In [None]:
############### UNIT TEST ###############

# Huge outlier range for MLR, so auto-scaling is such that the datapoints are super compressed
# to accommodate the rare outliers (99.98% data within a normal 0-6 range like other plots)

# This is yet another reason MLR isn't a good choice for gene expression prediction tasks compared 
# to other models suitable for handling this data's non-linearity

# Print, per model, the spread between the highest and lowest prediction.
# NOTE(review): this loop reads `predictions`, whereas the other cells use `predictions_test` -
# confirm that `predictions` exists and is the intended dictionary.
print("Prediction ranges:")
for model_name, y_pred in predictions.items():
    pred_range = y_pred.max() - y_pred.min()
    print(f"{model_name}: {pred_range:.4f}")

In [None]:
############### PLOTTING STEP ###############

# Draw and save the production figure; left as the cell's final expression so
# the returned metrics dictionary is still displayed as the cell output.
figure_1_observed_vs_predicted(
    y_test,
    predictions_test,
    r2_method='variance_weighted',
    output_path=f"{DATA_ROOT}/Saved figures/Production_model_figures/figure1_(PRODUCTION.v4).png",
)