In [None]:
%run '/home/christianl/Zhang-Lab/Zhang Lab Code/Boilerplate/Fig_config_utilities.py'

In [None]:
# Residuals (observed - predicted) vs. predicted-value scatterplots, with a moving-average trend line and a quartile variance ratio, for MLR, XGBRF and RNN

def figure_2_residuals_vs_predicted(y_true, predictions_dict, output_path='figure_2.png'):
    """
    Generate residual vs. predicted scatter plots to detect heteroscedasticity
    and systematic bias.

    One panel is drawn per entry of ``predictions_dict``. Each panel shows the
    residuals (observed - predicted) against the predicted values, a dashed
    zero line, a centred moving-average trend of the residuals (when there are
    more points than the averaging window), and a text box with the mean
    residual, its standard deviation and a heteroscedasticity ratio. The ratio
    is the largest over the smallest residual variance among four groups of
    points obtained by splitting the data in order of predicted value.

    Parameters
    ----------
    y_true : array-like
        True target values. Converted to a flat 1-D float array, so lists,
        NumPy arrays and pandas objects (with any index) are all handled
        positionally.
    predictions_dict : dict
        Dictionary of predictions by model. Each value must contain the same
        number of elements as ``y_true``; it is flattened the same way.
    output_path : str
        Path to save figure

    Raises
    ------
    ValueError
        If ``predictions_dict`` is empty, or if a model's predictions do not
        have the same number of elements as ``y_true``.
    """
    model_names = list(predictions_dict.keys())
    n_models = len(model_names)
    if n_models == 0:
        raise ValueError("predictions_dict must contain at least one model")

    # Work on plain 1-D arrays: indexing a pandas Series with an integer array
    # is label-based, which silently picks the wrong rows (or raises KeyError)
    # when the index is not 0..n-1, e.g. after a shuffled train/test split.
    y_true_arr = np.asarray(y_true, dtype=float).ravel()

    # Validate every model before any figure is created so that a bad input
    # does not leave a half-drawn figure open.
    predictions = {}
    for model_name in model_names:
        y_pred = np.asarray(predictions_dict[model_name], dtype=float).ravel()
        if y_pred.size != y_true_arr.size:
            raise ValueError(
                f"Predictions for '{model_name}' have {y_pred.size} values "
                f"but y_true has {y_true_arr.size}"
            )
        predictions[model_name] = y_pred

    set_publication_style()

    # Three models use the configured triple-panel size unchanged; any other
    # count scales the width so each panel keeps the same footprint.
    if n_models == 3:
        figsize = FIGSIZE_TRIPLE
    else:
        figsize = (FIGSIZE_TRIPLE[0] * n_models / 3, FIGSIZE_TRIPLE[1])

    # squeeze=False always returns a 2-D axes array, so a single model works too.
    fig, axes = plt.subplots(1, n_models, figsize=figsize, squeeze=False)
    axes = axes[0]

    for ax, model_name in zip(axes, model_names):
        y_pred = predictions[model_name]
        residuals = y_true_arr - y_pred

        # Scatter plot
        ax.scatter(y_pred, residuals, alpha=0.5, s=30,
                   color=MODEL_COLORS.get(model_name, '#1f77b4'),
                   edgecolors='none')

        # Zero line (perfect predictions)
        ax.axhline(y=0, color='k', linestyle='--', lw=2, alpha=0.5)

        # Order the points by predicted value for the trend line and the
        # variance groups below.
        order = np.argsort(y_pred)
        y_pred_sorted = y_pred[order]
        residuals_sorted = residuals[order]

        # Centred moving average of the residuals as a simple non-parametric
        # trend (a lightweight stand-in for LOWESS). The window is 5% of the
        # points with a floor of 10; the edges are NaN and are not drawn.
        window = max(10, len(y_pred) // 20)
        has_trend = len(y_pred) > window
        if has_trend:
            moving_avg = pd.Series(residuals_sorted).rolling(window=window, center=True).mean()
            ax.plot(y_pred_sorted, moving_avg, color='red', lw=2.5,
                    alpha=0.7, label='Trend (moving avg)')

        # Heteroscedasticity metric: split the prediction-ordered residuals
        # into four groups and compare the largest and smallest variance.
        # Empty groups (fewer than four points) are skipped so they do not
        # inject NaN into the ratio.
        variances = [
            np.var(group)
            for group in np.array_split(residuals_sorted, 4)
            if group.size > 0
        ]
        if not variances:
            hetero_ratio = np.nan
        else:
            min_variance = np.min(variances)
            hetero_ratio = np.max(variances) / min_variance if min_variance > 0 else np.inf

        ax.set_xlabel('Predicted Expression', fontsize=12, fontweight='bold')
        ax.set_ylabel('Residuals (Observed - Predicted)', fontsize=12, fontweight='bold')
        ax.set_title(model_name, fontsize=13, fontweight='bold')

        # Text box with heteroscedasticity info
        if residuals.size > 0:
            mean_residual = np.mean(residuals)
            std_residual = np.std(residuals)
        else:
            mean_residual = std_residual = np.nan
        textstr = f"Mean Residual: {mean_residual:.4f}\nStd Dev: {std_residual:.4f}\nHetero Ratio: {hetero_ratio:.2f}x"
        ax.text(0.05, 0.95, textstr, transform=ax.transAxes,
                fontsize=9, verticalalignment='top',
                bbox=dict(boxstyle='round', facecolor='lightblue', alpha=0.8))

        # Only add a legend when the trend line (the sole labelled artist) exists.
        if has_trend:
            ax.legend(loc='lower right', fontsize=9)
        ax.grid(True, alpha=0.3)

    # Operate on this figure explicitly rather than on pyplot's current figure.
    fig.tight_layout()
    fig.savefig(output_path, dpi=DPI, bbox_inches='tight')
    print(f"Figure 2 saved to {output_path}")
    plt.show()