# Feature Generation Pipeline

This notebook demonstrates the streamlined pipeline approach for DIA-Aspire-Rescore feature generation.

For step-by-step details, see `01_step_by_step_feature_generation.ipynb`.

In [None]:
import warnings
# Silence every Python warning for the rest of the session. This runs before
# the remaining imports, so warnings raised while those modules are imported
# are suppressed as well.
# NOTE(review): a blanket "ignore" also hides deprecation/numerical warnings
# from the pipeline itself — consider narrowing the filter.
warnings.filterwarnings("ignore")

from pathlib import Path
import matplotlib.pyplot as plt
from peptdeep.rescore.fdr import calc_fdr

from dia_aspire_rescore.pipeline import Pipeline
from dia_aspire_rescore.config import FineTuneConfig, IOConfig
from dia_aspire_rescore.plot import plot_target_decoy_dist, plot_qvalues

import logging
# Configure the root logger at INFO level so INFO-and-above log records are
# emitted (presumably this surfaces the pipeline's progress messages — confirm).
logging.basicConfig(level=logging.INFO)


## Configuration

In [None]:
# Directory that receives this notebook's outputs; created up front
# (including missing parents) and left untouched if it already exists.
output_dir = Path("../output/pipeline")
output_dir.mkdir(parents=True, exist_ok=True)

# Input/output settings handed to IOConfig.
# NOTE(review): paths are relative to the notebook's working directory —
# confirm they resolve when the notebook is run from another location.
_io_settings = {
    "report_file": "../../data/raw/SYS026_RA957/DDA_SYSMHC_bynam/lib-base-result-first-pass.parquet",
    "ms_file_dir": "../output",
    "ms_file_type": "hdf5",
    "output_dir": str(output_dir),
}
io_config = IOConfig(**_io_settings)

# Fine-tuning settings handed to FineTuneConfig.
# NOTE(review): instrument / nce presumably have to match how the raw data
# were acquired — verify against the data set.
_finetune_settings = {
    "fdr_threshold": 0.01,
    "instrument": "QE",
    "nce": 27,
    "psm_num_to_train_ms2": 8000,
    "epoch_to_train_ms2": 20,
    "epoch_to_train_rt_ccs": 25,
    "train_verbose": True,
}
finetune_config = FineTuneConfig(**_finetune_settings)

## Run Pipeline

In [None]:
# Names of the feature generators the pipeline is asked to run.
_requested_generators = ["basic", "ms2", "rt"]

# Build the pipeline from the configuration objects defined above.
pipeline = Pipeline(
    io_config=io_config,
    finetune_config=finetune_config,
    feature_generators=_requested_generators,
)

# Run feature generation; the returned PSM table is reused by the
# evaluation cells below.
psm_df = pipeline.run_feature_generation()

## Feature Evaluation

In [None]:
# Target vs. decoy distribution of the `spc` column of the generated features.
plot_target_decoy_dist(
    psm_df,
    metric="spc",
)

In [None]:
# Target vs. decoy distribution of the `abs_rt_delta` column of the generated features.
plot_target_decoy_dist(
    psm_df,
    metric="abs_rt_delta",
)

In [None]:
from dia_aspire_rescore.features import MS2FeatureGenerator, RTFeatureGenerator

# Cut-off marked on the discovery plot, and the looser cut-off used when a
# feature has no PSM at or below the default one (the plot would otherwise
# have nothing to show at that cut-off).
DEFAULT_FDR_THRESHOLD = 0.1
FALLBACK_FDR_THRESHOLD = 0.5

# The generators are instantiated here only to read their `feature_names`;
# the feature values themselves are already columns of `psm_df`.
ms2_generator = MS2FeatureGenerator(
    model_mgr=pipeline.finetuner.model_manager,
    ms_files=pipeline.ms_files,
    ms_file_type=io_config.ms_file_type,
    ms2_match_config=pipeline.ms2_match_config,
)
rt_generator = RTFeatureGenerator(model_mgr=pipeline.finetuner.model_manager)

# For every MS2 and RT feature: use the feature alone as the score, then save
# a two-panel PDF (target/decoy distribution | discoveries vs. FDR).
for feature in ms2_generator.feature_names + rt_generator.feature_names:
    # calc_fdr returns a frame with an 'fdr' column derived from `feature`.
    # NOTE(review): features where smaller is better (e.g. absolute deltas)
    # may be ranked in the wrong direction here — confirm how calc_fdr
    # orders the score column.
    psm_df_eval = calc_fdr(psm_df, score_column=feature)

    fig, axes = plt.subplots(1, 2, figsize=(14, 5))
    try:
        plot_target_decoy_dist(psm_df_eval, feature, ax=axes[0])
        axes[0].set_title(f'{feature} - Target/Decoy Distribution')

        threshold = DEFAULT_FDR_THRESHOLD
        if psm_df_eval['fdr'].min() > DEFAULT_FDR_THRESHOLD:
            threshold = FALLBACK_FDR_THRESHOLD
        plot_qvalues(psm_df_eval['fdr'], threshold=threshold, ax=axes[1])
        axes[1].set_title(f'{feature} - Discoveries at FDR')

        # Operate on `fig` explicitly rather than pyplot's implicit "current
        # figure", so the right figure is laid out and saved even if a helper
        # changes the current figure.
        fig.tight_layout()

        pdf_path = output_dir / f'{feature}.pdf'
        fig.savefig(pdf_path, bbox_inches='tight')
    finally:
        # Always release this figure, even when plotting or saving fails, so
        # figures do not accumulate across loop iterations.
        plt.close(fig)