# MS2FeatureGenerator Demo

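Demonstrates generating MS2 features with `MS2FeatureGenerator`: DIA-NN PSMs are loaded and matched against MS2 spectra, the pretrained MS2 model is fine-tuned on those matches, and the fine-tuned model is then used to compute the features.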


In [1]:
import warnings
# Silence every warning for the rest of the session. The filter is installed
# before the third-party imports below, so warnings raised while importing
# them are hidden as well.
# NOTE(review): a blanket "ignore" also hides deprecation and numerical
# warnings that may matter when debugging — consider narrowing it.
warnings.filterwarnings("ignore")

from alpharaw import register_all_readers
from alphabase.peptide.fragment import get_charged_frag_types
from dia_aspire_rescore.io import read_diann2
from dia_aspire_rescore.psm.matcher import DIAPeptideSpectrumMatcher
from dia_aspire_rescore.config import FineTuneConfig
from dia_aspire_rescore.finetuning import FineTuner
from dia_aspire_rescore.features import MS2FeatureGenerator

# Presumably registers alpharaw's raw-file readers so that the 'hdf5' reader
# type passed to the matcher later in this notebook is available — TODO confirm.
register_all_readers()




In [2]:
# Load the DIA-NN first-pass result table.
psm_df_all = read_diann2(
    "../../data/raw/SYS026_RA957/DDA_SYSMHC_bynam/lib-base-result-first-pass.parquet"
)

# Keep only rows whose `fdr1_search1` value is below 0.01; copy so the
# filtered frame is independent of the full table.
passes_fdr = psm_df_all['fdr1_search1'] < 0.01
psm_df_filtered = psm_df_all[passes_fdr].copy()

# Take the first 1000 filtered rows (not a random sample), then order them by
# `nAA` ascending and renumber the index from zero.
psm_df_sample = (
    psm_df_filtered[0:1000]
    .sort_values(by='nAA', ascending=True)
    .reset_index(drop=True)
)
print(f"Loaded {len(psm_df_sample)} PSMs for training")




Loaded 1000 PSMs for training


In [3]:
# Matcher used to pair the sampled PSMs with MS2 spectra.
matcher = DIAPeptideSpectrumMatcher(n_neighbors=0)

# Run name -> path of the converted spectrum file for that run.
# NOTE(review): keys presumably have to match the run names stored in the PSM
# table — verify against `match_ms2_multi_raw`.
ms_files = {
    '20200317_QE_HFX2_LC3_DIA_RA957_R01': '../output/20200317_QE_HFX2_LC3_DIA_RA957_R01.mzML.hdf5',
    '20200317_QE_HFX2_LC3_DIA_RA957_R02': '../output/20200317_QE_HFX2_LC3_DIA_RA957_R02.mzML.hdf5',
}

# Match across all listed files in one call, then unpack the four returned
# tables: the PSMs, fragment m/z, matched intensities and matched m/z errors.
match_result = matcher.match_ms2_multi_raw(psm_df_sample, ms_files, 'hdf5')
psm_df, fragment_mz_df, matched_intensity_df, matched_mz_err_df = match_result
print(f"Matched MS2 for {len(psm_df)} PSMs")


100%|██████████| 2/2 [00:04<00:00,  2.21s/it]

Matched MS2 for 1000 PSMs





In [4]:
# Fine-tuning settings, collected in one place and unpacked into the config.
finetune_settings = dict(
    instrument='QE',
    nce=27,
    psm_num_to_train_ms2=1000,
    epoch_to_train_ms2=10,
    train_verbose=False,
)
config = FineTuneConfig(**finetune_settings)

# Start from the 'generic' pretrained weights and fine-tune the MS2 model on
# the matched PSMs and their matched fragment intensities.
# NOTE(review): the recorded output of this cell reports 0.0 for PCC, COS and
# SA on all 1000 training PSMs and a negative mean SPC. Confirm that the
# matched intensities and the model's fragment types line up before relying
# on the features computed from this model.
finetuner = FineTuner(config)
finetuner.load_pretrained('generic')
finetuner.train_ms2(psm_df, matched_intensity_df)
print("MS2 model finetuned")




2025-12-06 02:48:50> 1000 PSMs for MS2 model training/transfer learning
2025-12-06 02:48:58> Testing refined MS2 model on training df:
          PCC     COS      SA          SPC
count  1000.0  1000.0  1000.0  1000.000000
mean      0.0     0.0     0.0    -0.454433
std       0.0     0.0     0.0     0.223398
min       0.0     0.0     0.0    -0.818555
25%       0.0     0.0     0.0    -0.618774
50%       0.0     0.0     0.0    -0.487137
75%       0.0     0.0     0.0    -0.319376
max       0.0     0.0     0.0     0.800493
>0.90     0.0     0.0     0.0     0.000000
>0.75     0.0     0.0     0.0     0.001000
MS2 model finetuned


In [5]:
# Fragment types handed to the feature generator.
# Presumably expands the 'b' and 'y' ion series up to charge 2 — confirm
# against alphabase's `get_charged_frag_types`.
ion_series = ['b', 'y']
max_frag_charge = 2
frag_types = get_charged_frag_types(ion_series, max_frag_charge)

# The generator reuses the model manager held by the fine-tuner from the
# previous cell.
generator_kwargs = {
    'model_mgr': finetuner.model_manager,
    'frag_types': frag_types,
    'spc_top_k': 10,
}
ms2_generator = MS2FeatureGenerator(**generator_kwargs)

# Report how many features the generator exposes and preview the first five.
print(f"Number of features: {len(ms2_generator.feature_names)}")
print(f"Feature names: {ms2_generator.feature_names[:5]}...")


Number of features: 51
Feature names: ['cos', 'sa', 'spc', 'pcc', 'cos_bion']...


In [6]:
# Compute the MS2 features from the matched intensities and m/z errors.
# `psm_df` is rebound to the returned frame, so re-running this cell feeds the
# already-processed frame back into `generate`.
psm_df = ms2_generator.generate(
    psm_df,
    matched_intensity_df,
    matched_mz_err_df,
)
print(f"Generated {len(ms2_generator.feature_names)} MS2 features")


2025-12-06 02:48:58> Predicting MS2 ...


100%|██████████| 1/1 [00:00<00:00,  4.27it/s]


Generated 51 MS2 features


In [7]:
# Column groups to preview from the feature table.
similarity_features = ['cos', 'sa', 'spc', 'pcc', 'cos_bion', 'cos_yion']
score_features = ['merr_weighted_frag_score', 'pred_weighted_frag_score']
stat_features = ['matched_frag_num', 'matched_frag_ratio', 'both_matched_pred_frag_to_matched']

# Each entry pairs the heading to print with the columns shown under it; the
# leading "\n" on later headings separates the groups with a blank line.
feature_groups = [
    ("Similarity Metrics:", similarity_features),
    ("\nWeighted Scores:", score_features),
    ("\nFragment Statistics:", stat_features),
]

# Print the first five rows of every group, in the order listed above.
for heading, columns in feature_groups:
    print(heading)
    print(psm_df[columns].head())


Similarity Metrics:
        cos        sa       spc       pcc  cos_bion  cos_yion
0  0.332606  0.215856  0.363636  0.227126  0.592010  0.000000
1  0.286655  0.185086  0.515152  0.171041  0.238466  0.309388
2  0.368627  0.240344 -0.103030  0.254163  0.694543  0.062748
3  0.052349  0.033342 -0.400000 -0.112871  0.047433  0.092635
4  0.492663  0.327953  0.642424  0.386667  0.217515  0.648712

Weighted Scores:
   merr_weighted_frag_score  pred_weighted_frag_score
0                 43.667080                  6.604607
1                 78.847804                 15.949612
2                 78.732410                 21.095405
3                 88.389080                  5.679304
4                 67.314554                 14.361843

Fragment Statistics:
   matched_frag_num  matched_frag_ratio  both_matched_pred_frag_to_matched
0               7.0            0.250000                           0.571429
1              11.0            0.392857                           0.818182
2              11.0