In [1]:
import pyopenms as oms
from collections import defaultdict

# Load protein and peptide IDs
protein_ids = []
peptide_ids = []
oms.IdXMLFile().load("output.idXML", protein_ids, peptide_ids)

# Load quantified feature map with peptide identifications annotated
feature_map = oms.FeatureMap()
oms.FeatureXMLFile().load("data_annotated.featureXML", feature_map)

# Build mapping: peptide sequence -> list of protein accessions
pep_to_proteins = defaultdict(set)
for prot_id in protein_ids:
    for hit in prot_id.getHits():
        accession = hit.getAccession()
        for pep_id in peptide_ids:
            for hit_pep in pep_id.getHits():
                pep_seq = hit_pep.getSequence().toString()
                # You may want to refine this mapping to be peptide-specific
                pep_to_proteins[pep_seq].add(accession)

# Aggregate intensities per protein
protein_intensity = defaultdict(float)

for feature in feature_map:
    peptide_ids = feature.getPeptideIdentifications()
    if not peptide_ids:
        continue
    for pep_id in peptide_ids:
        for hit in pep_id.getHits():
            pep_seq = hit.getSequence().toString()
            intensity = feature.getIntensity()
            # Add intensity to all proteins linked to this peptide
            for prot_acc in pep_to_proteins.get(pep_seq, []):
                protein_intensity[prot_acc] += intensity

# Sort proteins by intensity descending
sorted_proteins = sorted(protein_intensity.items(), key=lambda x: x[1], reverse=True)

# Print top 10 highly expressed proteins
print("Top 10 highly expressed proteins:")
for prot, inten in sorted_proteins[:10]:
    print(f"{prot}: {inten}")


Determination of memory status is not supported on this 
 platform, measuring for memoryleaks will never fail


Top 10 highly expressed proteins:
sp|Q9NUQ6|SPS2L_HUMAN: 7498565330.984375
sp|P04350|TBB4A_HUMAN: 7498565330.984375
sp|P01009|A1AT_HUMAN: 7498565330.984375
DECOY_sp|P11215|ITAM_HUMAN: 7498565330.984375
DECOY_sp|P15408|FOSL2_HUMAN: 7498565330.984375
sp|P26373|RL13_HUMAN: 7498565330.984375
sp|Q08AH3|ACS2A_HUMAN: 7498565330.984375
DECOY_sp|P26358|DNMT1_HUMAN: 7498565330.984375
sp|Q96CM8|ACSF2_HUMAN: 7498565330.984375
sp|Q58FF8|H90B2_HUMAN: 7498565330.984375


In [2]:
filtered_proteins = [(prot, inten) for prot, inten in sorted_proteins if not prot.startswith("DECOY")]
print("Top 10 real proteins:")
for prot, inten in filtered_proteins[:10]:
    print(f"{prot}: {inten}")


Top 10 real proteins:
sp|Q9NUQ6|SPS2L_HUMAN: 7498565330.984375
sp|P04350|TBB4A_HUMAN: 7498565330.984375
sp|P01009|A1AT_HUMAN: 7498565330.984375
sp|P26373|RL13_HUMAN: 7498565330.984375
sp|Q08AH3|ACS2A_HUMAN: 7498565330.984375
sp|Q96CM8|ACSF2_HUMAN: 7498565330.984375
sp|Q58FF8|H90B2_HUMAN: 7498565330.984375
sp|Q86X83|COMD2_HUMAN: 7498565330.984375
sp|P0C7V6|SP202_HUMAN: 7498565330.984375
sp|Q5T7N8|F27D1_HUMAN: 7498565330.984375
