In [1]:
import pandas as pd

In [2]:
# Load the GlycanML protein-glycan interaction table (tab-separated file).
interactions = pd.read_csv(
    '../pipeline/data/GlycanML/glycanML_glycan_interactions.tsv',
    sep='\t',
)

In [3]:
# Preview the first rows of the freshly loaded interaction table.
interactions.head()

Unnamed: 0,protein_sequence,glycan_iupac,interaction,split
0,AADSIPSISPTGIITPTPTQSGMVSNCNKFYDVHSNDGCSAIASSQ...,Fuc(a1-2)Gal,0.293462,train
1,AADSIPSISPTGIITPTPTQSGMVSNCNKFYDVHSNDGCSAIASSQ...,Fuc(a1-2)Gal(b1-3)GalNAc,-1.316793,train
2,AADSIPSISPTGIITPTPTQSGMVSNCNKFYDVHSNDGCSAIASSQ...,Fuc(a1-2)Gal(b1-3)GalNAc(a1-3)[Fuc(a1-2)]Gal(b...,-0.860744,train
3,AADSIPSISPTGIITPTPTQSGMVSNCNKFYDVHSNDGCSAIASSQ...,Fuc(a1-2)Gal(b1-3)GalNAc(a1-3)[Fuc(a1-2)]Gal(b...,-1.211838,train
4,AADSIPSISPTGIITPTPTQSGMVSNCNKFYDVHSNDGCSAIASSQ...,Fuc(a1-2)Gal(b1-3)GalNAc(b1-3)Gal,-0.335253,train


In [4]:
# Report how many distinct protein sequences occur and how many rows each has.
protein_column = interactions['protein_sequence']
print('number of unique proteins:', protein_column.nunique())

rows_per_protein = protein_column.value_counts().to_dict()
print('protein frequencies:', rows_per_protein.values())

number of unique proteins: 1388
protein frequencies: dict_values([831, 806, 785, 648, 567, 565, 562, 562, 556, 551, 550, 550, 550, 548, 547, 545, 545, 545, 545, 545, 545, 545, 544, 543, 543, 543, 543, 543, 542, 541, 541, 540, 540, 539, 539, 539, 539, 539, 539, 539, 539, 539, 539, 539, 538, 538, 538, 538, 537, 537, 537, 536, 536, 536, 536, 536, 536, 535, 535, 535, 535, 535, 535, 535, 535, 534, 534, 534, 534, 533, 533, 533, 531, 531, 530, 530, 530, 529, 529, 528, 528, 527, 527, 527, 527, 527, 527, 527, 526, 526, 525, 525, 525, 525, 525, 525, 525, 525, 525, 525, 522, 521, 519, 519, 515, 515, 515, 515, 515, 515, 515, 515, 515, 515, 515, 515, 514, 514, 514, 514, 514, 514, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513, 513,

In [5]:
# Report how many distinct glycan IUPAC strings occur and how many rows each has.
glycan_column = interactions['glycan_iupac']
print('number of unique glycans:', glycan_column.nunique())

rows_per_glycan = glycan_column.value_counts().to_dict()
print('glycan frequencies:', rows_per_glycan.values())

number of unique glycans: 927
glycan frequencies: dict_values([1374, 1370, 1370, 1370, 1370, 1370, 1370, 1370, 1370, 1370, 1370, 1370, 1370, 1370, 1370, 1370, 1367, 1367, 1366, 1361, 1360, 1360, 1358, 1358, 1356, 1356, 1356, 1356, 1356, 1356, 1354, 1354, 1352, 1352, 1352, 1352, 1352, 1352, 1352, 1352, 1352, 1352, 1351, 1351, 1351, 1351, 1349, 1349, 1349, 1349, 1349, 1349, 1349, 1349, 1349, 1349, 1349, 1349, 1349, 1349, 1349, 1349, 1349, 1349, 1349, 1349, 1349, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1345, 1343, 1340, 1340, 1336, 1334, 1331, 1331, 1331, 1331, 1331, 1331, 1331, 1331, 1331, 1331, 1330, 1327, 1327, 1327, 1327, 1327, 1327, 1327, 1327, 1327, 1327, 1327, 1327, 1327, 1327, 1327, 1327, 1327, 1327, 1327, 1327, 1327, 1327, 1327, 1

In [6]:
# protein_sequence -> protein_id (1-based integers, in order of first appearance)
unique_proteins = interactions['protein_sequence'].unique()
protein_mapping = {
    sequence: number
    for number, sequence in enumerate(unique_proteins, start=1)
}

# glycan_iupac -> glycan_id ('glycan_1', 'glycan_2', ... in order of first appearance)
unique_glycans = interactions['glycan_iupac'].unique()
glycan_mapping = {
    iupac_name: f'glycan_{number}'
    for number, iupac_name in enumerate(unique_glycans, start=1)
}

# Lookup table: one row per distinct protein sequence.
protein_ids = [protein_mapping[sequence] for sequence in unique_proteins]
proteins_df = pd.DataFrame({
    'ProteinGroup': protein_ids,
    'Amino Acid Sequence': unique_proteins
})

# Lookup table: one row per distinct glycan.
glycan_names = [glycan_mapping[iupac_name] for iupac_name in unique_glycans]
glycans_df = pd.DataFrame({
    'Name': glycan_names,
    'IUPAC': unique_glycans
})

In [7]:
# Show the first rows of the interaction table again for reference.
interactions.head()

Unnamed: 0,protein_sequence,glycan_iupac,interaction,split
0,AADSIPSISPTGIITPTPTQSGMVSNCNKFYDVHSNDGCSAIASSQ...,Fuc(a1-2)Gal,0.293462,train
1,AADSIPSISPTGIITPTPTQSGMVSNCNKFYDVHSNDGCSAIASSQ...,Fuc(a1-2)Gal(b1-3)GalNAc,-1.316793,train
2,AADSIPSISPTGIITPTPTQSGMVSNCNKFYDVHSNDGCSAIASSQ...,Fuc(a1-2)Gal(b1-3)GalNAc(a1-3)[Fuc(a1-2)]Gal(b...,-0.860744,train
3,AADSIPSISPTGIITPTPTQSGMVSNCNKFYDVHSNDGCSAIASSQ...,Fuc(a1-2)Gal(b1-3)GalNAc(a1-3)[Fuc(a1-2)]Gal(b...,-1.211838,train
4,AADSIPSISPTGIITPTPTQSGMVSNCNKFYDVHSNDGCSAIASSQ...,Fuc(a1-2)Gal(b1-3)GalNAc(b1-3)Gal,-0.335253,train


In [8]:
# Drop every protein whose sequence contains one of these symbols, one symbol
# at a time, printing the remaining table shape after each pass.
print(proteins_df.shape)
proteins_df_filtered = proteins_df
for symbol_pattern in (r'\*', r'X', r'B', r'2', r'1'):
    contains_symbol = proteins_df_filtered['Amino Acid Sequence'].str.contains(
        symbol_pattern, na=False
    )
    proteins_df_filtered = proteins_df_filtered[~contains_symbol]
    print(proteins_df_filtered.shape)

print('int shape', interactions.shape)
# Re-number the surviving sequences from 1 and keep only their interactions.
protein_mapping = {
    sequence: number
    for number, sequence in enumerate(proteins_df_filtered['Amino Acid Sequence'], start=1)
}
kept_sequences = proteins_df_filtered['Amino Acid Sequence']
interactions_filtered = interactions[interactions['protein_sequence'].isin(kept_sequences)]
print('int filt shape', interactions_filtered.shape)

(1388, 2)
(1387, 2)
(1374, 2)
(1373, 2)
(1372, 2)
(1371, 2)
int shape (564647, 4)
int filt shape (557115, 4)


In [None]:
from rdkit import Chem
def safe_mol_from_smiles(smiles):
    """Parse a SMILES string with RDKit without ever raising.

    Parameters
    ----------
    smiles : object
        Candidate SMILES value. Anything that is not a ``str`` (for example
        ``None`` or NaN) is treated as missing.

    Returns
    -------
    object or None
        Whatever ``Chem.MolFromSmiles(smiles)`` returns, or ``None`` when
        ``smiles`` is not a string or the parser raises.
    """
    # A type check alone rejects None/NaN and also avoids the ambiguous
    # truth-value error that pd.notna() would trigger on array-like input.
    if not isinstance(smiles, str):
        return None
    try:
        return Chem.MolFromSmiles(smiles)
    except Exception:
        # Signal failure with None; returning the Exception class itself would
        # hand callers a non-None object that looks like a successful parse.
        return None

In [12]:
# For each glycan, check whether its IUPAC name can be converted into a SMILES string so we can use our data split

import pubchempy as pcp

def CIRconvert(ids):
    """Look up a compound by name via PubChemPy and return its isomeric SMILES.

    Parameters
    ----------
    ids : str
        Name handed to ``pcp.get_compounds(ids, 'name')``.

    Returns
    -------
    str
        The ``isomeric_smiles`` of the first hit, or the sentinel string
        ``'Did not work'`` when the lookup raises, yields no hit, or the hit
        carries no SMILES string.
    """
    try:
        # The request sits inside the try so that a single failed lookup
        # yields the sentinel instead of aborting the caller's whole loop.
        matches = pcp.get_compounds(ids, 'name')
        smiles = matches[0].isomeric_smiles
        if not isinstance(smiles, str):
            return 'Did not work'
        # NOTE(review): the parse result is not inspected, so an unparsable
        # SMILES is still returned - confirm whether it should be rejected.
        safe_mol_from_smiles(smiles)
        return smiles
    except Exception:
        # `except Exception` (not a bare except) lets KeyboardInterrupt
        # through, so a long-running lookup loop can still be interrupted.
        return 'Did not work'


# smiles_arr gets one entry per glycan (None where the lookup failed);
# iupac_to_smiles_dict only records the successful lookups.
iupac_to_smiles_dict = {}
smiles_arr = []

for iupac_name in glycans_df.IUPAC.values.tolist():
    looked_up = CIRconvert(iupac_name)
    if looked_up == 'Did not work':
        smiles_arr.append(None)
        continue
    iupac_to_smiles_dict[iupac_name] = looked_up
    smiles_arr.append(looked_up)

In [13]:
import json
# Persist the successful IUPAC -> SMILES lookups as JSON in the working directory.
with open('iupac_to_smiles_dict.json', 'w') as json_file:
    json_file.write(json.dumps(iupac_to_smiles_dict))

In [14]:
# Total number of glycans looked up, then how many of them yielded no SMILES.
total_lookups = len(smiles_arr)
missing_lookups = sum(1 for smiles_value in smiles_arr if smiles_value is None)
print(total_lookups)
print(missing_lookups)

927
510


In [15]:
# Attach the looked-up SMILES as a new column (None where the lookup failed).
# smiles_arr was filled by iterating glycans_df.IUPAC in order, so positions line up.
glycans_df['SMILES'] = smiles_arr

In [None]:
#glycans_df = pd.read_csv('../pipeline/data/GlycanML/glycans_w_smiles.tsv', sep='\t')

In [16]:

# Keep only glycans with a SMILES string, then only the interactions that involve them.
has_smiles = glycans_df['SMILES'].notna()
glycans_with_smiles_df = glycans_df.loc[has_smiles]
valid_glycan_ids = set(glycans_with_smiles_df['Name'])
print(interactions_filtered.shape)
glycan_is_usable = interactions_filtered['glycan_iupac'].isin(glycans_with_smiles_df['IUPAC'])
interactions_with_smiles = interactions_filtered[glycan_is_usable]
print(interactions_with_smiles.shape)

(557115, 4)
(240035, 4)


In [17]:
# Re-number glycans and proteins from 1 using only the interactions that
# survived filtering, then rebuild both lookup tables from those ids.
unique_glycans = interactions_with_smiles['glycan_iupac'].unique()
glycan_mapping = {
    iupac_name: f'glycan_{number}'
    for number, iupac_name in enumerate(unique_glycans, start=1)
}

unique_proteins = interactions_with_smiles['protein_sequence'].unique()
protein_mapping = {
    sequence: number
    for number, sequence in enumerate(unique_proteins, start=1)
}

protein_ids = [protein_mapping[sequence] for sequence in unique_proteins]
proteins_df = pd.DataFrame({
    'ProteinGroup': protein_ids,
    'Amino Acid Sequence': unique_proteins
})

glycan_names = [glycan_mapping[iupac_name] for iupac_name in unique_glycans]
glycan_smiles = [iupac_to_smiles_dict[iupac_name] for iupac_name in unique_glycans]
glycans_df = pd.DataFrame({
    'Name': glycan_names,
    'IUPAC': unique_glycans,
    'SMILES': glycan_smiles
})

In [18]:
train_fractions_data = []
test_fractions_data = []
obj_id_start = 1000000

# Each record goes to the list matching its split label; rows whose label is
# neither 'train' nor 'test' are built but not collected anywhere.
records_by_split = {
    'train': train_fractions_data,
    'test': test_fractions_data,
}

for idx, row in interactions_with_smiles.iterrows():
    # idx is the row's index label in interactions_with_smiles, so ObjId
    # follows that label rather than a running counter.
    entry = {
        'ObjId': obj_id_start + idx,
        'ProteinGroup': protein_mapping[row['protein_sequence']],
        'Concentration': 1.0,
        'GlycanID': glycan_mapping[row['glycan_iupac']],
        'f': float(row['interaction']),  # raw value; no clipping applied
    }

    destination = records_by_split.get(row['split'])
    if destination is not None:
        destination.append(entry)

train_fractions_df = pd.DataFrame(train_fractions_data)
test_fractions_df = pd.DataFrame(test_fractions_data)

In [19]:
# Preview the first rows of the training fractions table.
train_fractions_df.head()

Unnamed: 0,ObjId,ProteinGroup,Concentration,GlycanID,f
0,1000000,1,1.0,glycan_1,0.293462
1,1000001,1,1.0,glycan_2,-1.316793
2,1000003,1,1.0,glycan_3,-1.211838
3,1000004,1,1.0,glycan_4,-0.335253
4,1000005,1,1.0,glycan_5,-1.12733


In [24]:
# Arithmetic mean of the 'f' values in the training table.
train_f_values = train_fractions_df['f'].values.tolist()
print(sum(train_f_values) / len(train_f_values))

-0.045645659326037535


In [20]:
# Summarise the dimensions of the four tables about to be exported.
print(
    f"Proteins Shape: {proteins_df.shape}",
    f"Glycans Shape: {glycans_df.shape}",
    f"Train Fractions Shape: {train_fractions_df.shape}",
    f"Test Fractions Shape: {test_fractions_df.shape}",
    sep='\n',
)

Proteins Shape: (1371, 2)
Glycans Shape: (417, 3)
Train Fractions Shape: (188767, 5)
Test Fractions Shape: (26628, 5)


In [21]:
# Write each table as a tab-separated file without the index column.
tables_to_export = (
    ('../pipeline/data/GlycanML/proteins.tsv', proteins_df),
    ('../pipeline/data/GlycanML/glycans.tsv', glycans_df),
    ('../pipeline/data/GlycanML/train_fractions.tsv', train_fractions_df),
    ('../pipeline/data/GlycanML/test_fractions.tsv', test_fractions_df),
)
for output_path, table in tables_to_export:
    table.to_csv(output_path, sep='\t', index=False)