Autocomplete searching in all possible names <br>
Then dictionary with name and unimod_id <br>
I need a list of tuples with (unimod_id, modification name, chemical formula, Monoisotopic mass) <br>
Then the user can auto-complete search for the modification name<br> 
This is then automatically correctly formatted. <br>
If modification is not in the list they can type it but then they **always** need to add the monoisotopic mass of their modificiation.<br>
Then they can choose between:<br>
    type: fixed, variable, annotated<br>
    position: anywhere, protein c-term, protein n-term, c-term, n-term<br>
    amino acid: (one or multiple)<br>
<br><br>
<br><br>
Final format:

NT=Name;AC=unimod_accession;CF=Formula;MT=['Fixed', 'Variable', 'Annotated']; PP=['Anywhere', Protein N-term, Protein C-term, Any N-term, Any C-term];TA=[amino acids];MM=Monoisotopicmass

In [1]:
import pandas as pd
import numpy as np

In [26]:
df = pd.read_csv('C:\\Users\\tinec\\OneDrive - UGent\\git\\SDRF_GUI\\ontology\\unimod.tsv', sep='\t')
#if PSI-MS is empty, replace with interim name
df['PSI-MS Name'].fillna(df['Interim name'], inplace=True)
df.tail()

Unnamed: 0,Unimod_ID,PSI-MS Name,Interim name,Description,Monoisotopic mass,Average mass,Composition
1527,2073,N6pAMP,N6pAMP,probe; AMP analogon,367.06817,367.2539,H(14) C(13) N(5) O(6) P
1528,2074,DabMal,DabMal,Dabcyl C2 Maleimide,391.16444,391.4231,H(21) C(21) N(5) O(3)
1529,2079,NBF,NBF,Thiol blocking reagent,163.001791,163.0904,H C(6) N(3) O(3)
1530,2080,DCP,DCP,Dimedone-Based Chemical Probes,168.078644,168.1898,H(12) C(9) O(3)
1531,2081,Ethynylation,Ethynylation,Ethynlation of cysteine residues,24.0,24.0214,C(2)


In [28]:
df[df['Unimod_ID'] == 977]

Unnamed: 0,Unimod_ID,PSI-MS Name,Interim name,Description,Monoisotopic mass,Average mass,Composition
619,977,Carbofuran,Carbofuran,"2,3-dihydro-2,2-dimethyl-7-benzofuranol N-meth...",57.021464,57.0513,H(3) C(2) N O


In [29]:
df = df.applymap(lambda x: x.strip() if isinstance(x, str) else x)

In [30]:
df['Interim name'].values.tolist()[:5]

['Acetyl', 'Amide', 'Biotin', 'Carbamidomethyl', 'Carbamyl']

In [31]:
autocomplete_inputs = set()
autocomplete_inputs.update(df['PSI-MS Name'].values)
autocomplete_inputs.update(df['Interim name'].values)
autocomplete_inputs.update(df['Description'].values)

In [32]:
df.shape

(1532, 7)

In [33]:
len(autocomplete_inputs)

3235

In [34]:
# store list as gzipped json
import json
import gzip
with gzip.open('autocomplete_inputs.json.gz', 'wt') as f:
    json.dump(list(autocomplete_inputs), f)

Step 1: search for substring in autocomplete list<br>
Step 2: find according modification in unimod dataframe <br>

In [35]:
# assuming your DataFrame is named df
cols_to_search = ['Interim name', 'Description', 'PSI-MS Name'] # the columns to search for substrings in
substring = 'Acetylation' # the substring to search for

# use vectorized string methods to check for substring in specified columns
substring_mask = df[cols_to_search].apply(lambda x: x.str.contains(substring)).any(axis=1)

# use numpy's where method to get row indices where the substring appears
row_indices = np.where(substring_mask)[0]

# select the matching rows from the DataFrame
result = df.iloc[row_indices]

In [36]:
result

Unnamed: 0,Unimod_ID,PSI-MS Name,Interim name,Description,Monoisotopic mass,Average mass,Composition
0,1,Acetyl,Acetyl,Acetylation,42.010565,42.0367,H(2) C(2) O


xx NT=Name
xx AC=unimod_accession
xx CF=Formula
MT=['Fixed', 'Variable', 'Annotated']
PP=['Anywhere', Protein N-term, Protein C-term, Any N-term, Any C-term]
TA=['X', 'G', 'A', 'L', 'M', 'F', 'W', 'K', 'Q', 'E', 'S', 'P', 'V', 'I', 'C', 'Y', 'H', 'R', 'N', 'D', T]
xx MM=Monoisotopicmass

In [10]:
#if there is a PSI-MS Name, use that one, otherwise use the Interim name
if result['PSI-MS Name'].values[0] != '':
    nt = result['PSI-MS Name'].values[0]
else:
    nt = result['Interim name'].values[0]
mm = result['Monoisotopic mass'].values[0]
cf = result['Composition'].values[0]
ac = result['Unimod_id'].values[0]

mt = input() #from list
pp = input() #from list
ta = input() #from list ==> can be multiple!

final = f'NT = {nt}; AC={ac}; CF={cf}, MT={mt}, PP={pp}, TA={ta}, MM={mm}'

In [23]:
print(final)

NT = Acetyl; AC=1; CF=H(2) C(2) O, MT=fixed, PP=anywhere, TA=42, MM=42.010565


In [37]:
#the combo column merges the columns of interest into a single column, if PSI-MS column is nan, it will use the Interim column
df['combo'] = "NT="+df['PSI-MS Name'].astype(str)+";AC="+df['Unimod_ID'].astype(str)+";CF="+df['Composition'].astype(str)+";MM="+df['Monoisotopic mass'].astype(str)

In [38]:
df

Unnamed: 0,Unimod_ID,PSI-MS Name,Interim name,Description,Monoisotopic mass,Average mass,Composition,combo
0,1,Acetyl,Acetyl,Acetylation,42.010565,42.0367,H(2) C(2) O,NT=Acetyl;AC=1;CF=H(2) C(2) O;MM=42.010565
1,2,Amidated,Amide,Amidation,-0.984016,-0.9848,H N O(-1),NT=Amidated;AC=2;CF=H N O(-1);MM=-0.984016
2,3,Biotin,Biotin,Biotinylation,226.077598,226.2954,H(14) C(10) N(2) O(2) S,NT=Biotin;AC=3;CF=H(14) C(10) N(2) O(2) S;MM=2...
3,4,Carbamidomethyl,Carbamidomethyl,Iodoacetamide derivative,57.021464,57.0513,H(3) C(2) N O,NT=Carbamidomethyl;AC=4;CF=H(3) C(2) N O;MM=57...
4,5,Carbamyl,Carbamyl,Carbamylation,43.005814,43.0247,H C N O,NT=Carbamyl;AC=5;CF=H C N O;MM=43.005814
...,...,...,...,...,...,...,...,...
1527,2073,N6pAMP,N6pAMP,probe; AMP analogon,367.068170,367.2539,H(14) C(13) N(5) O(6) P,NT=N6pAMP;AC=2073;CF=H(14) C(13) N(5) O(6) P;M...
1528,2074,DabMal,DabMal,Dabcyl C2 Maleimide,391.164440,391.4231,H(21) C(21) N(5) O(3),NT=DabMal;AC=2074;CF=H(21) C(21) N(5) O(3);MM=...
1529,2079,NBF,NBF,Thiol blocking reagent,163.001791,163.0904,H C(6) N(3) O(3),NT=NBF;AC=2079;CF=H C(6) N(3) O(3);MM=163.001791
1530,2080,DCP,DCP,Dimedone-Based Chemical Probes,168.078644,168.1898,H(12) C(9) O(3),NT=DCP;AC=2080;CF=H(12) C(9) O(3);MM=168.078644


In [39]:
# turn pandas dataframe with two columns into dict
# key is column A, value is column B
def df_to_dict(df):
    return dict(zip(df.iloc[:,0], df.iloc[:,1]))
    

In [40]:
psi = df_to_dict(df[['PSI-MS Name', 'combo']])
interim = df_to_dict(df[['Interim name', 'combo']])
descr = df_to_dict(df[['Description', 'combo']])
monoisotopic_mass_dict = df_to_dict(df[['Monoisotopic mass', 'combo']])
average_mass_dict = df_to_dict(df[['Average mass', 'combo']])

In [41]:
len(psi.items()), len(interim.items()), len(descr.items())

(1532, 1532, 1532)

In [42]:
len(monoisotopic_mass_dict.items()), len(average_mass_dict.items())

(1381, 1379)

In [43]:
#psi.update(interim)
psi.update(descr)

In [44]:
len(psi.items())

3025

In [45]:
import ParsingModule


In [46]:
ParsingModule.store_as_gzipped_json(monoisotopic_mass_dict, 'monoisotopic_mass_dict.json.gz')
ParsingModule.store_as_gzipped_json(average_mass_dict, 'average_mass_dict.json.gz')

'Stored average_mass_dict.json.gz as gzipped json'

In [47]:
ParsingModule.store_as_gzipped_json(psi, 'unimod_dict')

'Stored unimod_dict as gzipped json'