In [1]:
import numpy as np
import pandas as pd
import os
from tqdm import tqdm
import torch
import torch.nn as nn

pd.set_option('display.max_columns', None)


In [2]:
# # glycosciences
# glycoscience_atom_embed = pd.read_csv('glycosciencedb/node_embedding/atom_embed.csv')

# glycoscience_residual_embed = pd.read_csv('glycosciencedb/node_embedding/residual_embed.csv')

# glycoscience_monosaccharide_embed = pd.read_csv('glycosciencedb/node_embedding/monosaccharide_embed.csv')

# glycoscience_bound_AB_embed = pd.read_csv('glycosciencedb/node_embedding/ab_embed.csv')

# glycoscience_DL_embed = pd.read_csv('glycosciencedb/node_embedding/dl_embed.csv')

# glycoscience_PF_embed = pd.read_csv('glycosciencedb/node_embedding/pf_embed.csv')

In [3]:
# # godess

# godess_atom_embed=pd.read_csv('godess/node_embedding/atom_name_embed.csv')

# godess_bound_orig_embed=pd.read_csv('godess/node_embedding/bound_orig.csv')

# godess_atom_type_embed=pd.read_csv('godess/node_embedding/atom_type.csv')

# godess_bound_AB_embed=pd.read_csv('godess/node_embedding/bound_ab.csv')

# godess_DL_embed=pd.read_csv('godess/node_embedding/bound_dl.csv')

# godess_PF_embed=pd.read_csv('godess/node_embedding/carbon_pf.csv')

# godess_monosaccharide_accurate_embed=pd.read_csv('godess/node_embedding/monosaccharide_accurate_embed.csv')

# godess_monosaccharide_simple_embed=pd.read_csv('godess/node_embedding/monosaccharide_simple_embed.csv')

# godess_me_embed=pd.read_csv('godess/node_embedding/root_me_embed.csv')

# godess_ser_embed=pd.read_csv('godess/node_embedding/root_ser_embed.csv')

# godess_s_embed=pd.read_csv('godess/node_embedding/component_s_embed.csv')

# godess_ac_embed=pd.read_csv('godess/node_embedding/component_ac_embed.csv')

# godess_gc_embed=pd.read_csv('godess/node_embedding/component_gc_embed.csv')

In [4]:
godess_monosaccharide_simple_embed.columns

Index(['17hoole', '3,6anhgal', 'ac', 'allyl', 'ara', 'asn', 'bu', 'bz', 'caf',
       'cho', 'fer', 'fru', 'fuc', 'fucn', 'gal', 'gala', 'gallic', 'galn',
       'gc', 'glc', 'glca', 'glcn', 'kdo', 'mal', 'man', 'mana', 'mann', 'me',
       'missing monosaccharide', 'myoino', 'neu', 'p', 'pr', 'rha', 's', 'ser',
       'xyl'],
      dtype='object')

In [5]:
glycoscience_monosaccharide_embed.columns

Index(['ara', 'fuc', 'fucnac', 'gal', 'gala', 'galanac', 'galnac', 'glc',
       'glca', 'glcn', 'glcnac', 'gula', 'hep', 'ido', 'idoa', 'kdo', 'lyx',
       'man', 'mana', 'mannac', 'missing_refornulated_mono', 'neu5ac', 'quip',
       'quipnac', 'rha', 'rhanac', 'rib', 'tal', 'xyl'],
      dtype='object')

#### We need to first merge two data source together

##### 1, Reformulate godess monosaccharide name to align it with glycoscience, create new atom type name

In [6]:
# notice that godess has

replace_value = {'Kdo_C1': 'C1_KDO',
                 'Kdo_C2': 'C2_KDO', 
                 'Kdo_C3': 'C3_KDO', 
                 'Kdo_C4': 'C4_KDO', 
                 'Kdo_C5': 'C5_KDO', 
                 'Kdo_C6': 'C6_KDO', 
                 'Kdo_C7': 'C7_KDO',
                 'Kdo_C8': 'C8_KDO'}

common_atom_names = ['C', 'C1', 'C10', 'C11', 'C12', 'C13', 'C2', 'C3', 'C4', 'C5',
              'C6', 'C7', 'C8', 'C9', 'H', 'H1', 'H10', 'H11', 'H12', 'H13',
              'H14', 'H15', 'H16', 'H17', 'H18', 'H2', 'H21', 'H22', 'H23', 'H3',
              'H4', 'H5', 'H51', 'H6', 'H61', 'H62', 'H7', 'H8', 'H9', 'N2',
              'N4', 'N5', 'O', 'O1', 'O2', 'O3', 'O4', 'O5', 'O6', 'O7', 'O8',
              'S1'] + ['C1_KDO', 'C2_KDO', 'C3_KDO', 'C4_KDO', 'C5_KDO', 'C6_KDO', 'C7_KDO', 'C8_KDO']


godess_data_dir = 'godess/data/'

out_godess_data_dir = 'godess/data_reformulate/'

godess_data_files = os.listdir(godess_data_dir)

for i in tqdm(range(len(godess_data_files))):
    
    godess_file_name = godess_data_files[i]
    
    godess_file_path = os.path.join(godess_data_dir, godess_file_name)
    
    df_godess = pd.read_csv(godess_file_path)
    
    df_godess['New_Atom_name'] = df_godess['New_Atom_name'].replace(replace_value)
    
    merged_atom_list = []
    
    merged_mono_list = []
    
    for j in range(len(df_godess)):
        temp_atom_name = df_godess.loc[j, :]['New_Atom_name']
        
        temp_atom_type = df_godess.loc[j, :]['Atom_type']
        
        temp_mono_name = df_godess.loc[j, :]['reformulated_standard_mono']
        
        temp_attached_ac = df_godess.loc[j, :]['Ac_component']
        
        # reformulate atom names
        if temp_atom_name in common_atom_names:
            merged_atom_list.append(temp_atom_name)
        else:
            merged_atom_list.append(temp_atom_type)
        
        # reformulate monosaccharide names
        if temp_attached_ac:
            
            temp_new_mono_name = temp_mono_name + 'ac'
            
        else:
            
            temp_new_mono_name = temp_mono_name
            
        merged_mono_list.append(temp_new_mono_name)
    
    df_godess['Merged_Atom_name'] = merged_atom_list
    df_godess['Merged_standard_mono'] = merged_mono_list
    df_godess['Merged_standard_mono'] = df_godess['Merged_standard_mono'].replace('missing monosaccharide', 
                                                                                  'missing_refornulated_mono')
    out_godess_data_path = os.path.join(out_godess_data_dir, godess_file_name)
    df_godess.to_csv(out_godess_data_path, index = False)

100%|███████████████████████████████████████| 2310/2310 [02:00<00:00, 19.11it/s]


##### 2, Reformulate glycoscience to align it with the newly created atom name of godess

In [7]:
glycoscience_data_dir = 'glycosciencedb/data/'

out_glycoscience_data_dir = 'glycosciencedb/data_reformulate/'

glycoscience_data_files = os.listdir(glycoscience_data_dir)

for i in tqdm(range(len(glycoscience_data_files))):
    
    glycoscience_file_name = glycoscience_data_files[i]
    
    glycoscience_file_path = os.path.join(glycoscience_data_dir, glycoscience_file_name)
    
    df_glycoscience = pd.read_csv(glycoscience_file_path)
    
    merged_atom_list = []
    
    for j in range(len(df_glycoscience)):
        
        # naming error here 'Atom_Type' actually refers to atom name
        temp_atom_name = df_glycoscience.loc[j, :]['Atom_Type']
        
        # 'atom_simplify' here actually refers to atom type e.g C, H, O, N
        temp_atom_type = df_glycoscience.loc[j, :]['atoms_simplify']
        
       
        # reformulate atom names
        if temp_atom_name in common_atom_names:
            merged_atom_list.append(temp_atom_name)
        else:
            merged_atom_list.append(temp_atom_type)
    
    df_glycoscience['Merged_Atom_name'] = merged_atom_list
    out_glycoscience_data_path = os.path.join(out_glycoscience_data_dir, glycoscience_file_name)
    df_glycoscience.to_csv(out_glycoscience_data_path, index = False)

100%|█████████████████████████████████████████| 299/299 [00:04<00:00, 68.67it/s]
