In [1]:
%run '/home/christianl/Zhang-Lab/Zhang Lab Code/Boilerplate/Fig_config_utilities.py'

<class 'numpy.ndarray'> (3187, 16101)
<class 'numpy.ndarray'> (3187, 16101)


In [2]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.multioutput import MultiOutputRegressor
from sklearn.linear_model import LinearRegression

# Each model is wrapped in a Pipeline so the scaler is re-fit on the training
# portion of every CV fold. Scaling the full dataset up front would leak
# statistics from the held-out fold into training.
mlr_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

# XGBoost's scikit-learn estimators accept a 2-D target matrix directly, so the
# regressor is used without sklearn's MultiOutputRegressor. That wrapper clones
# and fits one independent estimator per target column, re-ingesting the full
# feature matrix every time; with thousands of targets the previous run had to
# be interrupted inside XGBoost's data iterator. Fitting natively ingests the
# data once per fold.
# NOTE(review): `xgb` is not imported in this notebook section; presumably it
# comes from the boilerplate %run cell - confirm.
# NOTE(review): n_jobs=-1 here combined with n_jobs=-1 in cross_val_score can
# oversubscribe CPU cores; consider limiting one of the two.
xgbrf_pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', xgb.XGBRFRegressor(
        random_state=42,
        n_estimators=3,
        n_jobs=-1,
        verbosity=0,
    ))
])

# Models to compare during cross-validation, keyed by display name
model_dict_cv = {
    'MLR': mlr_pipeline,
    'XGBRFRegressor': xgbrf_pipeline
}


In [3]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

# Cross-validation helper (shuffled k-fold; 5 folds by default)
def run_cross_validation(models_dict, x, y, n_folds=5, random_state=42,
                         scoring='r2', n_jobs=-1):
    """
    Run shuffled k-fold cross-validation for several models on the same splits.

    Every model is scored with ``cross_val_score`` using one shared ``KFold``
    splitter, so all models are evaluated on identical folds.

    Parameters
    ----------
    models_dict : dict
        Mapping of model name -> unfitted estimator (or Pipeline) to evaluate.
        Example: {'MLR': mlr_pipeline, 'XGBRFRegressor': xgbrf_pipeline}
    x : np.ndarray
        Features
    y : np.ndarray
        Targets
    n_folds : int
        Number of CV folds
    random_state : int
        Random seed used to shuffle samples into folds (reproducibility)
    scoring : str or callable
        Scorer forwarded to ``cross_val_score``. Defaults to 'r2'.
    n_jobs : int or None
        Number of parallel workers forwarded to ``cross_val_score``.
        Defaults to -1 (all cores). Pass 1 when the estimators are already
        multi-threaded themselves to avoid oversubscribing the CPU.

    Returns
    -------
    cv_results : dict
        Mapping of model name -> array of per-fold scores (length ``n_folds``).
    """
    print("\n" + "="*80)
    print(f"RUNNING {n_folds}-FOLD CROSS-VALIDATION")
    print("="*80)

    # Label used in the progress output; keeps the familiar 'R²' for the default
    score_label = 'R²' if scoring == 'r2' else str(scoring)

    cv_results = {}
    # A single splitter with a fixed seed -> every model sees the same folds
    kfold = KFold(n_splits=n_folds, shuffle=True, random_state=random_state)

    for model_name, model in models_dict.items():
        print(f"\nEvaluating {model_name}...")

        # cross_val_score clones the estimator for each fold, so the objects
        # stored in models_dict are left unfitted
        scores = cross_val_score(model, x, y, cv=kfold,
                                 scoring=scoring, n_jobs=n_jobs)

        cv_results[model_name] = scores

        print(f"  {score_label} scores: {scores}")
        print(f"  Mean: {scores.mean():.4f} (±{scores.std():.4f})")

    print("="*80 + "\n")

    return cv_results

In [6]:
# Score every pipeline in model_dict_cv on the centered training data
cv_results_models = run_cross_validation(
    models_dict=model_dict_cv,
    x=x_train_centered,
    y=y_train_centered,
)


RUNNING 5-FOLD CROSS-VALIDATION

Evaluating MLR...
  R² scores: [0.78680039 0.77747036 0.77383015 0.77505197 0.77711993]
  Mean: 0.7781 (±0.0046)

Evaluating XGBRFRegressor...


Exception ignored on calling ctypes callback function: <bound method DataIter._next_wrapper of <xgboost.data.SingleBatchInternalIter object at 0x79c191d6b020>>
Traceback (most recent call last):
  File "/home/christianl/miniconda3/envs/remote_training/lib/python3.12/site-packages/xgboost/core.py", line 606, in _next_wrapper
    def _next_wrapper(self, this: None) -> int:  # pylint: disable=unused-argument

KeyboardInterrupt: 


KeyboardInterrupt: 

In [None]:
# Cross-validation stability plot (works for any number of folds)
def figure_6_cv_stability(cv_results_dict, 
                         output_path='/home/christianl/Zhang-Lab/Zhang Lab Code/Figures/'):
    """
    Plot per-fold CV scores and a stability comparison for each model.

    The left panel is a box plot of the fold scores with the individual folds
    overlaid; the right panel is a bar chart of each model's coefficient of
    variation (sample std / |mean|), sorted so the most stable model is on top.

    Relies on ``np``, ``pd``, ``plt``, ``set_publication_style``,
    ``MODEL_COLORS`` and ``DPI`` being available in the notebook namespace
    (presumably provided by the boilerplate %run cell).

    Parameters
    ----------
    cv_results_dict : dict
        Dictionary where keys are model names and values are arrays of 
        cross-validation fold scores.

        Simple format (single metric):
        {
            'MLR': [0.75, 0.76, 0.74, 0.75, 0.76],
            'XGBRFRegressor': [0.72, 0.73, 0.71, 0.72, 0.73]
        }

        Or multi-metric format:
        {
            'MLR': {
                'r2': [0.75, 0.76, 0.74, 0.75, 0.76],
                'rmse': [0.42, 0.41, 0.43, 0.42, 0.41]
            },
            'XGBRFRegressor': {
                'r2': [0.72, 0.73, 0.71, 0.72, 0.73],
                'rmse': [0.45, 0.44, 0.46, 0.45, 0.44]
            }
        }
        In the multi-metric format 'r2' is plotted when present, otherwise
        the first metric of the first model.
    output_path : str
        File path to save the figure to. If a directory is given (an existing
        directory or a path ending in a path separator), the figure is saved
        inside it as 'figure6_cv_stability.png'.

    Returns
    -------
    summary_df : pd.DataFrame
        One row per model with columns Model, Mean, Std, Min, Max, CV, Range.

    Raises
    ------
    ValueError
        If ``cv_results_dict`` is empty.
    """
    import os  # local import: only needed to resolve the output path

    if not cv_results_dict:
        raise ValueError("cv_results_dict is empty; there are no CV results to plot")

    # savefig needs a file name, but the default argument is a directory, so
    # fall back to a default file name inside that directory.
    output_path = os.fspath(output_path)
    if output_path.endswith(('/', os.sep)) or os.path.isdir(output_path):
        output_path = os.path.join(output_path, 'figure6_cv_stability.png')
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    set_publication_style()

    print("\n" + "="*80)
    print("CROSS-VALIDATION STABILITY ANALYSIS")
    print("="*80)

    model_names = list(cv_results_dict.keys())

    # Detect data structure from the first entry
    first_value = cv_results_dict[model_names[0]]
    is_multi_metric = isinstance(first_value, dict)

    if is_multi_metric:
        print("\nDetected multi-metric CV results")
        available_metrics = list(first_value.keys())
        print(f"Available metrics: {available_metrics}")

        # Use r2 as primary metric when it is available
        if 'r2' in available_metrics:
            primary_metric = 'r2'
            metric_label = 'R²'
        else:
            primary_metric = available_metrics[0]
            metric_label = primary_metric.upper()
    else:
        print("\nDetected single-metric CV results")
        primary_metric = None
        metric_label = 'R²'

    # Extract the plotted scores once so statistics, box plot and scatter
    # overlay are guaranteed to use the same numbers.
    scores_by_model = {}
    for model_name in model_names:
        raw_scores = cv_results_dict[model_name]
        if is_multi_metric:
            raw_scores = raw_scores[primary_metric]
        scores_by_model[model_name] = np.asarray(raw_scores, dtype=float)

    # Per-model summary statistics
    summary_stats = []
    for model_name in model_names:
        scores = scores_by_model[model_name]

        mean_score = np.mean(scores)
        # Sample std is undefined for fewer than two folds
        std_score = np.std(scores, ddof=1) if scores.size > 1 else np.nan
        min_score = np.min(scores)
        max_score = np.max(scores)
        # Coefficient of variation. |mean| keeps it non-negative: a negative
        # mean score (possible for R²) must not rank as "most stable".
        cv_coef = std_score / abs(mean_score) if mean_score != 0 else np.nan

        summary_stats.append({
            'Model': model_name,
            'Mean': mean_score,
            'Std': std_score,
            'Min': min_score,
            'Max': max_score,
            'CV': cv_coef,
            'Range': max_score - min_score
        })

        print(f"\n{model_name}:")
        print(f"  Mean {metric_label}: {mean_score:.4f}")
        print(f"  Std Dev:  {std_score:.4f}")
        print(f"  Range:    [{min_score:.4f}, {max_score:.4f}]")
        print(f"  CV (Std/Mean): {cv_coef:.4f}")

    summary_df = pd.DataFrame(summary_stats)

    # Create figure
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))

    # LEFT PLOT: Box plot with individual points
    ax1 = axes[0]

    bp = ax1.boxplot([scores_by_model[m] for m in model_names],
                     patch_artist=True,
                     widths=0.6,
                     showmeans=True,
                     meanprops=dict(marker='D', markerfacecolor='red', 
                                    markeredgecolor='red', markersize=8))
    # Tick labels are set explicitly instead of via boxplot(labels=...),
    # which newer Matplotlib releases deprecate in favour of tick_labels.
    ax1.set_xticks(list(range(1, len(model_names) + 1)))
    ax1.set_xticklabels(model_names)

    # Color boxes
    for patch, model_name in zip(bp['boxes'], model_names):
        patch.set_facecolor(MODEL_COLORS.get(model_name, '#1f77b4'))
        patch.set_alpha(0.7)

    # Overlay individual points. A dedicated seeded generator makes the jitter
    # (and so the saved figure) reproducible and leaves the global NumPy
    # random state untouched.
    jitter_rng = np.random.default_rng(42)
    for box_pos, model_name in enumerate(model_names, start=1):
        scores = scores_by_model[model_name]
        x = jitter_rng.normal(box_pos, 0.04, size=len(scores))
        ax1.scatter(x, scores, alpha=0.6, color='black', s=50, zorder=3)

    ax1.set_ylabel(f'{metric_label} Score', fontsize=12, fontweight='bold')
    ax1.set_xlabel('Model', fontsize=12, fontweight='bold')
    ax1.set_title(f'Cross-Validation Performance ({metric_label})', 
                  fontsize=13, fontweight='bold')
    ax1.grid(True, alpha=0.3, axis='y')

    # summary stats
    summary_text = "CV Summary:\n"
    for _, row in summary_df.iterrows():
        summary_text += f"{row['Model']:15s}: {row['Mean']:.4f} ± {row['Std']:.4f}\n"

    ax1.text(0.02, 0.98, summary_text, transform=ax1.transAxes,
            fontsize=9, verticalalignment='top', horizontalalignment='left',
            bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.9),
            family='monospace')

    # RIGHT PLOT: Stability comparison (Coefficient of Variation)
    ax2 = axes[1]

    # sorted CV for better visualization
    summary_df_sorted = summary_df.sort_values('CV')

    ax2.barh(summary_df_sorted['Model'], summary_df_sorted['CV'],
             color=[MODEL_COLORS.get(m, '#1f77b4') for m in summary_df_sorted['Model']],
             alpha=0.7, edgecolor='black', linewidth=1.5)

    # value labels (bar i sits at y = i, in sorted order)
    for bar_pos, cv_value in enumerate(summary_df_sorted['CV']):
        ax2.text(cv_value, bar_pos, f"  {cv_value:.4f}", 
                va='center', ha='left', fontsize=10, fontweight='bold')

    # reference line for "good" stability (CV < 0.05 = <5% variation)
    ax2.axvline(x=0.05, color='green', linestyle='--', linewidth=2, 
                alpha=0.7, label='Good stability (CV < 0.05)')

    ax2.set_xlabel('Coefficient of Variation (Std/Mean)', fontsize=12, fontweight='bold')
    ax2.set_ylabel('Model', fontsize=12, fontweight='bold')
    ax2.set_title('Model Stability Across CV Folds\n(Lower = More Stable)', 
                  fontsize=13, fontweight='bold')
    ax2.grid(True, alpha=0.3, axis='x')
    ax2.legend(loc='lower right', fontsize=9)
    ax2.invert_yaxis()  # best at top

    # interpretation: the verdict is driven by the least stable model
    max_cv = summary_df['CV'].max()
    if max_cv < 0.03:
        stability_text = "✓ All models very stable"
        color = 'lightgreen'
    elif max_cv < 0.05:
        stability_text = "✓ All models stable"
        color = 'lightgreen'
    elif max_cv < 0.10:
        stability_text = "~ Some instability"
        color = 'lightyellow'
    else:
        stability_text = "⚠ High variability"
        color = 'lightsalmon'

    ax2.text(0.98, 0.02, stability_text, transform=ax2.transAxes,
            fontsize=11, verticalalignment='bottom', horizontalalignment='right',
            bbox=dict(boxstyle='round', facecolor=color, alpha=0.9),
            fontweight='bold')

    plt.tight_layout()
    plt.savefig(output_path, dpi=DPI, bbox_inches='tight')
    print(f"\nFigure 6 saved to {output_path}")
    print("="*80 + "\n")
    plt.show()

    return summary_df

In [None]:
# Render the stability figure for the centered-data run; the returned summary
# table is displayed as the cell output
figure_6_cv_stability(
    cv_results_models,
    output_path='figure6_v1(centered).png',
)