In [1]:
# Native
import sys
from itertools import combinations, chain

# External general
from tqdm import tqdm
import pandas as pd

# Proteomics imports
from unimod_mapper import UnimodMapper
from pyteomics.mass.mass import std_aa_mass

In [2]:
# Get list of all unimod modifications
unimod = UnimodMapper()
modifications = unimod.data_list

# Define type of mods to include
types_mod_include = set((
    "O-linked glycosylation",
    "Post-translational",
    "Chemical derivative",
    "Isotopic label",
    #"AA substitution",
    "Other",
    "Pre-translational",
    "NoMod",
    "Multiple"
))

[32m2024-05-04 16:44:24.277[0m | [34m[1mDEBUG   [0m | [36munimod_mapper.unimod_mapper[0m:[36m_parseXML[0m:[36m537[0m - [34m[1mNo usermod.xml file found. Expected at /home/robbin/anaconda3/envs/py311/lib/python3.11/site-packages/unimod_mapper/usermod.xml[0m
[32m2024-05-04 16:44:24.278[0m | [34m[1mDEBUG   [0m | [36munimod_mapper.unimod_mapper[0m:[36m_parseXML[0m:[36m480[0m - [34m[1mParsing mods file (/home/robbin/anaconda3/envs/py311/lib/python3.11/site-packages/unimod_mapper/unimod.xml)[0m


In [3]:
# Create a dictionary that contains a mapping between AA and all modifications

# aa_to_modifications[{AA}] = [ [unimodID,unimodName,monoisotopicmass,modificationType] , ... ]
aa_to_modifications = {}

for mod in modifications:
    for loc in mod["specificity"]:
        # loc[0] == AA and loc[1] == type of modification (e.g., chemical derivative)
        if loc[0] in aa_to_modifications.keys():
            aa_to_modifications[loc[0]].append([mod["unimodID"],mod["unimodname"],mod["mono_mass"],loc[1]])
        else:
            aa_to_modifications[loc[0]] = [[mod["unimodID"],mod["unimodname"],mod["mono_mass"],loc[1]]]

In [4]:
aa_to_modifications

{'T': [['1', 'Acetyl', 42.010565, 'Post-translational'],
  ['5', 'Carbamyl', 43.005814, 'Chemical derivative'],
  ['21', 'Phospho', 79.966331, 'Post-translational'],
  ['2007', 'Methamidophos-S', 108.975121, 'Chemical derivative'],
  ['23', 'Dehydrated', -18.010565, 'Post-translational'],
  ['34', 'Methyl', 14.01565, 'Post-translational'],
  ['35', 'Oxidation', 15.994915, 'Chemical derivative'],
  ['40', 'Sulfo', 79.956815, 'Post-translational'],
  ['41', 'Hex', 162.052824, 'O-linked glycosylation'],
  ['43', 'HexNAc', 203.079373, 'O-linked glycosylation'],
  ['47', 'Palmitoyl', 238.229666, 'Post-translational'],
  ['54', 'Glucuronyl', 176.032088, 'O-linked glycosylation'],
  ['56', 'Acetyl:2H(3)', 45.029395, 'Isotopic label'],
  ['58', 'Propionyl', 56.026215, 'Isotopic label'],
  ['93', 'EDT-maleimide-PEO-biotin', 601.206246, 'Chemical derivative'],
  ['118', 'EDT-iodoacetyl-PEO-biotin', 490.174218, 'Chemical derivative'],
  ['121', 'GG', 114.042927, 'Other'],
  ['142', 'HexNAc(1)dHex

In [5]:
aa_to_modifications.keys()

dict_keys(['T', 'N-term', 'S', 'C', 'K', 'Y', 'H', 'C-term', 'U', 'M', 'W', 'R', 'F', 'E', 'D', 'N', 'Q', 'L', 'I', 'G', 'V', 'P', 'A'])

In [6]:
# List containing all the fragment ion and its monoisotopic mass
# [ [aminoacid+modification, monoisotopicMass] , ... ]
list_collision_possibility = []

for aa in std_aa_mass.keys():
    # O and J are not in unimod ...
    try:
        aa_mods = aa_to_modifications[aa]            
    except KeyError:
        print(f"Unable to analyze amino acid: {aa}, no modifications found in unimod")
        continue
    
    for modification in aa_mods:
        # Filter on modification type
        if modification[3] not in types_mod_include:
            continue
        # Filter on modifications that remove the actual amino acid + some extra
        if std_aa_mass[aa]+modification[2] < 0.0:
            continue
            
        # Write non-modified amino acids differently        
        if modification[3] == "NoMod":
            list_collision_possibility.append([f"{aa}",std_aa_mass[aa]+modification[2]])
        else:
            list_collision_possibility.append([f"{aa}[UNIMOD:{modification[0]}]",std_aa_mass[aa]+modification[2]])

Unable to analyze amino acid: J, no modifications found in unimod
Unable to analyze amino acid: O, no modifications found in unimod


In [7]:
print(f"Length of fragment ion list that is potentially colliding (single AA + mod): {len(list_collision_possibility)}")

Length of fragment ion list that is potentially colliding (single AA + mod): 1764


In [8]:
# Repeat for diamino acids, so combinations of two amino acids and their modifications
# TODO more general code where the number of amino acids is simply defined
for aa1, aa2 in combinations(std_aa_mass.keys(), 2):
    try:
        aa_mods1 = aa_to_modifications[aa1]
        aa_mods1.append(["","",0,"NoMod"])
    except KeyError:
        print(f"Unable to analyze amino acid: {aa1}, no modifications found in unimod")
        continue
    try:
        aa_mods2 = aa_to_modifications[aa2]
        aa_mods2.append(["","",0,"NoMod"])
    except KeyError:
        print(f"Unable to analyze amino acid: {aa2}, no modifications found in unimod")
        continue
        
    for modification1 in aa_mods1:
        for modification2 in aa_mods2:
            if modification1[3] not in types_mod_include:
                continue
            if std_aa_mass[aa1]+modification1[2] < 0.0:
                continue
            
            if modification2[3] not in types_mod_include:
                continue
            if std_aa_mass[aa2]+modification2[2] < 0.0:
                continue

            if modification1[3] == "NoMod" and modification2[3] == "NoMod":
                list_collision_possibility.append([f"{aa1}{aa2}",std_aa_mass[aa]+modification[2]])
            elif modification1[3] == "NoMod":
                list_collision_possibility.append([f"{aa1}{aa2}[UNIMOD:{modification2[0]}]",
                                                   std_aa_mass[aa1]+modification1[2]+std_aa_mass[aa2]+modification2[2]])
            elif modification2[3] == "NoMod":
                list_collision_possibility.append([f"{aa1}[UNIMOD:{modification1[0]}]{aa2}",
                                                   std_aa_mass[aa1]+modification1[2]+std_aa_mass[aa2]+modification2[2]])
            else:
                list_collision_possibility.append([f"{aa1}[UNIMOD:{modification1[0]}]{aa2}[UNIMOD:{modification2[0]}]",
                                                   std_aa_mass[aa1]+modification1[2]+std_aa_mass[aa2]+modification2[2]])


Unable to analyze amino acid: J, no modifications found in unimod
Unable to analyze amino acid: O, no modifications found in unimod
Unable to analyze amino acid: J, no modifications found in unimod
Unable to analyze amino acid: O, no modifications found in unimod
Unable to analyze amino acid: J, no modifications found in unimod
Unable to analyze amino acid: O, no modifications found in unimod
Unable to analyze amino acid: J, no modifications found in unimod
Unable to analyze amino acid: O, no modifications found in unimod
Unable to analyze amino acid: J, no modifications found in unimod
Unable to analyze amino acid: O, no modifications found in unimod
Unable to analyze amino acid: J, no modifications found in unimod
Unable to analyze amino acid: O, no modifications found in unimod
Unable to analyze amino acid: J, no modifications found in unimod
Unable to analyze amino acid: O, no modifications found in unimod
Unable to analyze amino acid: J, no modifications found in unimod
Unable to 

In [9]:
print(f"Length of fragment ion list that is potentially colliding (single AA + diamino + mod): {len(list_collision_possibility)}")

Length of fragment ion list that is potentially colliding (single AA + diamino + mod): 1649611


In [10]:
# Presort the list of potential fragment ions that could collide (makes it slightly faster)
list_collision_possibility = sorted(list_collision_possibility, key=lambda x:x[1])

In [12]:
# Define tolerance for colliding
tolerance_pos = 0.02
tolerance_neg = 0.02

dict_collission = {}

# Iterate over sorted list of all fragment ions
for idx,fragment in enumerate(tqdm(list_collision_possibility)):
    # Go up in the list of fragment ion masses until it goes over the tolerance value
    for diff_candidate in list_collision_possibility[idx+1:]:
        if diff_candidate[0] == fragment[0]:
            continue
        if diff_candidate[1]-fragment[1] > tolerance_pos:
            break
            
        dict_collission[f"{fragment[0]}->{diff_candidate[0]}"] = list(chain(*[fragment,diff_candidate]))
    
    # Go down in the list of fragment ion masses until it goes over the tolerance value
    for diff_candidate in list_collision_possibility[:idx+1][::-1]:
        if diff_candidate[0] == fragment[0]:
            continue
        if fragment[1]-diff_candidate[1] > tolerance_neg:
            break
            
        dict_collission[f"{fragment[0]}->{diff_candidate[0]}"] = list(chain(*[fragment,diff_candidate]))

  2%|▏         | 35263/1649611 [14:07<10:46:59, 41.59it/s]


KeyboardInterrupt: 

In [8]:
df_collision = pd.DataFrame(dict_collission).T
df_collision.columns = ["Fragment_selected","Monoisotopic_mass_fragment","Fragment_collision","Monoisotopic_mass_collision"]

In [9]:
df_collision.describe()

Unnamed: 0,Fragment_selected,Monoisotopic_mass_fragment,Fragment_collision,Monoisotopic_mass_collision
count,2004,2004.0,2004,2004.0
unique,900,841.0,900,841.0
top,S[UNIMOD:58],144.053494,K[UNIMOD:381],195.029661
freq,7,15.0,7,15.0


In [11]:
df_collision.to_csv("single_aa_modifications_collision.csv")