# Batch Correction - Two-Phase Execution

- **Phase 1**: Data generation (calls `glycoforge/pipeline.py` to simulate data)
- **Phase 2**: Correction & evaluation


# Example 1: ComBat correction based on simplified-simulated data
Steps:

1. Define Dirichlet parameters directly (uniform alpha_H, heterogeneous alpha_U scaling)
2. Generate clean simulated data with biological ground truth
3. Apply batch effects 
4. Apply ComBat batch correction

Run batch correction pipeline across different parameter combinations to evaluate:
1. Batch effect correction effectiveness
2. Biological signal preservation
3. Differential expression recovery 

Parameter grid: Defined in `sample_config/simplified_mode_config.yaml`

In [None]:
import os
import sys
import yaml

# Make the repository root importable so the `use_cases` package resolves
# when the notebook is executed from its own directory.
project_root = '../..'
if project_root not in sys.path:
    sys.path.append(project_root)

from use_cases.batch_correction.correction import run_correction

# Configuration for the simplified simulation mode.
config_path = os.path.join(project_root, 'sample_config/simplified_mode_config.yaml')

# safe_load parses plain YAML only and never constructs arbitrary Python objects.
with open(config_path, 'r') as f:
    config = yaml.safe_load(f)

# Echo the key settings so the run can be sanity-checked before it starts.
print("Configuration loaded from:", config_path)
print(f"Output directory: {config.get('output_dir')}")
print(f"Seeds to run: {config.get('random_seeds')}")

In [None]:
# Execute the correction pipeline with the configuration loaded above.
# ~5mins for 10 seeds * 16 parameter combinations
results = run_correction(config)

# Report completion together with the number of returned results.
print("\nPipeline completed!", f"Total results: {len(results)}", sep="\n")

In [None]:
# Summarize the parameter grid search with the project's plotting helper.
from visualization import plot_parameter_grid_metrics

# Every config entry whose value is a list is treated as a grid parameter.
# NOTE(review): any other list-valued key (possibly `random_seeds`) would be
# counted as well — confirm this is intended.
grid_params = {key: value for key, value in config.items() if isinstance(value, list)}

if not grid_params:
    print("No parameter grid found (single run configuration). Skipping grid summary plots.")
else:
    print(f"Plotting results for parameters: {list(grid_params.keys())}")

    output_dir = config.get('output_dir')
    if output_dir:
        # Joining with '' appends a trailing separator, so the value can be
        # used as a save path prefix.
        save_path = os.path.join(output_dir, '')
    else:
        save_path = None

    plot_parameter_grid_metrics(results_dir=output_dir, save_path=save_path)

# Example 2: ComBat correction based on hybrid-simulated data

Steps:

1. Load real-world glycomics data (CSV)
2. Estimate biological effect sizes from real data (Robust CLR-space processing)
3. Generate clean simulated data preserving real biological signal
4. Apply batch effects
5. Apply ComBat batch correction

Run batch correction pipeline across different parameter combinations to evaluate:
1. Batch effect correction effectiveness 
2. Biological signal preservation 
3. Differential expression recovery 

Parameter grid: Defined in `sample_config/hybrid_mode_config.yaml`

In [None]:
import os
import sys
import yaml

# Make the repository root importable so the `use_cases` package resolves
# when the notebook is executed from its own directory.
project_root = '../..'
if project_root not in sys.path:
    sys.path.append(project_root)

from use_cases.batch_correction.correction import run_correction

# Configuration for the hybrid simulation mode.
config_path = os.path.join(project_root, 'sample_config/hybrid_mode_config.yaml')

# safe_load parses plain YAML only and never constructs arbitrary Python objects.
with open(config_path) as config_file:
    config = yaml.safe_load(config_file)

# Note: the data_file path does not need to be resolved here;
# load_data_from_glycowork() handles both local files and glycowork
# built-in datasets automatically.

# Echo the key settings so the run can be sanity-checked before it starts.
print("Configuration loaded from:", config_path)
print(f"Output directory: {config.get('output_dir')}")
print(f"Seeds to run: {config.get('random_seeds')}")
print(f"Data source: {config.get('data_source')}")
print(f"Data file: {config.get('data_file')}")

In [None]:
# Execute the correction pipeline with the hybrid-mode configuration loaded above.
# ~7mins for 10 seeds * 16 parameter combinations
results = run_correction(config)

# Report completion together with the number of returned results.
print("\nPipeline completed!", f"Total results: {len(results)}", sep="\n")


In [None]:
# Plot the grid-search results using the class-based plotter.
from visualization import ParameterGridPlotter

output_dir = config.get('output_dir')
if output_dir:
    # Joining with '' appends a trailing separator, so the value can be
    # used as a save path prefix.
    save_path = os.path.join(output_dir, '')
else:
    save_path = None

plotter = ParameterGridPlotter(results_dir=output_dir)
plotter.plot_all(save_path)

# Example 3: Plot Single Run Metrics

In [None]:
from visualization import SingleRunPlotter

# Results directory of the single run to visualize, e.g.
#   "results/simplified_mode/kappa_mu_2.0_var_b_0.5"
single_run_dir = "results/simplified_mode/kappa_mu_2.0_var_b_0.5"

# Build the plotter for that run and write the summary figure into the
# same directory.
plotter = SingleRunPlotter(single_run_dir, verbose=True)
plotter.plot_single_run_metrics(
    save_path=f"{single_run_dir}/single_run_summary.png",
)