In [1]:
import pandas as pd 
import numpy as np
import os
import re
from tqdm import tqdm
# Raise pandas' display limits to 2000 rows / 2000 columns so that frames
# shown in this notebook are not truncated.
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 2000)

##### 1. Read in temperature

In [2]:
# Folder holding the labelled per-glycan tables (files named '<glycan>.pdb.csv').
data_dir = 'experimental_data_combined/FullyAnnotatedPDB_pdb_labeled_combined/'

# Destination folder for the copies that gain the temperature columns.
out_data_dir_1 = 'experimental_data_combined/FullyAnnotatedPDB_pdb_labeled_combined_add_temperature/'

# Summary table with one row per (glycan, residue); its 'type' column tells
# whether a glycan is 'linear' or 'nonlinear'.
matching_table = pd.read_csv('matching_table/combined_all_formulated_monosaccharide_v2.csv')
# File names found in data_dir, in whatever order the OS returns them.
combined_glycans = os.listdir(data_dir)

# Folders with the raw experimental '<glycan>.csv' files, split by glycan type.
linear_dir = 'experimental_data/FullyAnnotatedPDB_V2/'
nonlinear_dir = 'experimental_data_nonlinear/FullyAnnotatedPDB_V2/'

In [3]:
# Plain arrays of the 'type' and 'glycan name' columns, positionally aligned
# with the rows of matching_table.
glycan_types = matching_table['type'].values
glycan_names = matching_table['glycan name'].values

In [4]:
# Peek at the first row to see the column layout of the matching table.
matching_table.head(1)

Unnamed: 0,csv,pdb,glycan name,residual num,type,bound_AB,fischer_projection_DL,origin_mono,reformulated_standard_mono,carbon_number_PF
0,b-D-GlcpNAc,NAG,a-D-Kdop-(2-8)-a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b...,1,linear,b,d,glcpnac,glcnac,P


In [5]:
# Compare a file name with a table entry: file names carry a '.pdb.csv'
# suffix that the 'glycan name' values do not have.
combined_glycans[0], glycan_names[0]

('Repeat-6)-a-D-Glcp-(1-4)-b-D-GlcpA-(1-4)-b-D-GalpNAc3Ac-(1-3)-a-D-Galp-(1-3)-b-D-GalpNAc-(1-.pdb.csv',
 'a-D-Kdop-(2-8)-a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-1)-Allyl')

In [6]:
def extract_mhz_temperature_solvent(temp_path, temperature_correction = 273):
    """Scan a text file for its MHz, temperature and solvent lines.

    Each line is lower-cased and checked for the substrings ``'mhz'``,
    ``'temperature'`` and ``'solvent'``. For a matching line, every run of
    digits on that line is collected with ``re.findall(r'\\d+', line)``.
    If several lines match the same keyword, the last one wins.

    Parameters
    ----------
    temp_path : str
        Path of the file to scan.
    temperature_correction : int, optional
        Currently unused; kept so existing callers keep working.
        NOTE(review): presumably meant as a Celsius-to-Kelvin offset - confirm
        before wiring it in.

    Returns
    -------
    tuple
        ``(MHz, Temperature, Solvent)``. Each element is a list of digit
        strings taken from the matching line, or the string ``'missing'``
        when no line contained the keyword.
    """
    MHz = 'missing'
    Temperature = 'missing'
    Solvent = 'missing'

    # Iterate over the handle directly instead of rebinding it to a list of
    # lines; the debugging prints of the path and of every line were removed.
    with open(temp_path) as handle:
        for raw_line in handle:
            line = raw_line.lower()
            if 'mhz' in line:
                MHz = re.findall(r'\d+', line)
            if 'temperature' in line:
                Temperature = re.findall(r'\d+', line)
            if 'solvent' in line:
                # NOTE(review): only digits are captured here, so a textual
                # solvent name is reduced to whatever digits it contains.
                # Left unchanged because the expected file format is not
                # visible from this notebook.
                Solvent = re.findall(r'\d+', line)

    return MHz, Temperature, Solvent

In [7]:
def assign_temperature_label(temperature_list):
    """Map each temperature to a class label, keeping the input length.

    Labels: ``0`` for ``t < 300``, ``1`` for ``300 <= t < 330`` and ``2`` for
    ``t >= 330``.

    Parameters
    ----------
    temperature_list : iterable of numbers
        Temperatures to bin.

    Returns
    -------
    list
        One entry per input value. A value that satisfies none of the
        comparisons (e.g. NaN) yields ``None`` instead of being dropped, so
        the result stays positionally aligned with the input.
    """
    new_list = []
    for t in temperature_list:
        if t < 300:
            label = 0
        elif t < 330:
            label = 1
        elif t >= 330:
            label = 2
        else:
            # NaN fails every comparison above; keep a placeholder so the
            # output does not silently become shorter than the input.
            label = None
        new_list.append(label)
    return new_list

In [8]:
def assign_temperature_label_single(temperature):
    """Return the class label for one temperature.

    ``0`` when the value is below 300, ``1`` when it lies in ``[300, 330)``
    and ``2`` when it is 330 or above. A value for which none of these
    comparisons holds (e.g. NaN) gives ``None``.
    """
    if temperature < 300:
        return 0
    if temperature < 330:
        # Reaching this point already rules out the "< 300" case.
        return 1
    if temperature >= 330:
        return 2
    return None

In [9]:
# For every labelled glycan table in data_dir:
#   1. find the glycan's type ('linear' / 'nonlinear') in matching_table,
#   2. read MHz (row 0, column 1) and temperature (row 1, column 1) from the
#      matching raw experimental CSV,
#   3. insert 'temperature_f' and 'temperature_label' columns and write the
#      table to out_data_dir_1.
# hz_list and t_list collect the per-glycan values in the same order as
# combined_glycans.
glycan_list = []
hz_list = []
t_list = []
solvent_list = []

# Temperatures below this value get the offset added to them.
# NOTE(review): presumably these are Celsius readings being shifted to Kelvin.
kelvin_offset = 273

# DataFrame.to_csv does not create missing folders, so make sure it exists.
os.makedirs(out_data_dir_1, exist_ok=True)

for i in range(len(combined_glycans)):

    current_glycan_name = combined_glycans[i]
    current_glycan_name_plain = current_glycan_name.replace('.pdb.csv', '')
    current_glycan_name_csv = current_glycan_name.replace('.pdb.csv', '.csv')

    if current_glycan_name_plain not in glycan_names:
        raise ValueError('Missing glycans')

    # All matching_table rows of this glycan must agree on a single type.
    temp_indexes = np.where(glycan_names == current_glycan_name_plain)[0]
    temp_types = matching_table.loc[temp_indexes]['type'].values
    temp_types = list(np.unique(temp_types))

    if len(temp_types) > 1:
        raise ValueError(current_glycan_name, 'The glycan is considered twice')

    temp_types = ''.join(temp_types)

    if temp_types == 'linear':
        current_path = os.path.join(linear_dir, current_glycan_name_csv)
        df = pd.read_csv(current_path, header = None)

        current_mhz = float(df.loc[0, 1])
        current_temperature = float(df.loc[1, 1])

    elif temp_types == 'nonlinear':
        current_path = os.path.join(nonlinear_dir, current_glycan_name_csv)

        try:
            df = pd.read_csv(current_path, header = None)

            current_mhz = float(df.loc[0, 1])
            current_temperature = float(df.loc[1, 1])

        except Exception:
            # Fallback for files the comma parser cannot handle: skip bad
            # lines and split the single parsed column on tabs instead.
            # `Exception` (not a bare except) so Ctrl-C still interrupts.
            df = pd.read_csv(current_path, on_bad_lines='skip', header = None)

            current_mhz = float(df.loc[0].values[0].split('\t')[1])
            current_temperature = float(df.loc[1].values[0].split('\t')[1])

    else:
        raise ValueError('A glyan should be either linear or branch')

    if current_temperature < kelvin_offset:
        current_temperature += kelvin_offset

    hz_list.append(current_mhz)
    t_list.append(current_temperature)

    labeled_pdb = pd.read_csv(os.path.join(data_dir, current_glycan_name))

    # Fixed insert positions: both new columns end up at positions 9 and 10.
    labeled_pdb.insert(loc = 9, column = 'temperature_f', value = current_temperature)

    current_temperature_label = assign_temperature_label_single(current_temperature)

    labeled_pdb.insert(loc = 10, column = 'temperature_label', value = current_temperature_label)

    # Report the files whose name contains 'ido'.
    if 'ido' in current_glycan_name.lower():
        print(current_glycan_name)

    labeled_pdb.to_csv(os.path.join(out_data_dir_1, current_glycan_name), index = False)

t_list_label = assign_temperature_label(t_list)

b-D-Idop.pdb.csv
a-D-Idop.pdb.csv
Repeat-4)-a-L-IdopA-(1-4)-a-D-GlcpNAc-(1-.pdb.csv


In [10]:
# Sanity check: there should be one temperature label per glycan file.
len(t_list_label), len(combined_glycans)

(301, 301)

##### 1.1 Missing temperature

In [11]:
# A stored temperature of exactly 273 marks a missing temperature: the loop
# above adds 273 to every value below 273, so a raw value of 0 ends up as 273.
idx_omit = np.where(np.array(t_list) == 273)[0]
# File names of the glycans affected.
np.array(combined_glycans)[idx_omit]

array(['a-D-Kdop-(2-8)-a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b-D-GlcpN-(1-6)-a-D-GlcpN1PO4.pdb.csv',
       'b-D-Glcp-(1-4)-b-D-Glcp-(1-4)-b-D-Galp.pdb.csv',
       'DB26383.pdb.csv', 'P-(O-6)-a-D-Glcp-(1-1)-a-D-Glcp.pdb.csv',
       'b-D-Glcp-(1-4)-b-D-Glcp.pdb.csv', 'DB26928.pdb.csv',
       'P-(O-2)-a-D-Glcp-(1-1)-a-D-Glcp.pdb.csv',
       'P-(O-4)-a-D-Glcp-(1-1)-a-D-Glcp.pdb.csv',
       'b-D-Glcp-(1-4)-a-D-Glcp.pdb.csv',
       'b-D-Glcp-(1-4)-b-D-Glcp-(1-4)-a-D-Galp.pdb.csv',
       'P-(O-3)-a-D-Glcp-(1-1)-a-D-Glcp.pdb.csv', 'DB9023.pdb.csv'],
      dtype='<U112')

In [12]:
# Load the raw lines of one nonlinear experimental file for manual inspection.
exp_glycan = 'experimental_data_nonlinear/FullyAnnotatedPDB_V2/DB8939.csv'
with open(exp_glycan) as f:
    # Note: this rebinds `f` from the file handle to the list of lines.
    f = f.readlines()

In [13]:
# Try the tab-splitting fallback on the example file: parse while skipping bad
# lines, then take the second tab-separated field of the first row.
df1 = pd.read_csv(exp_glycan, on_bad_lines='skip', header = None)
float(df1.loc[0].values[0].split('\t')[1])

400.0

##### 2. Reformulate the summary matching table and impute missing monosaccharides for labeled glycans

In [14]:
def split_single_mono_bound(mono):
    """Split a monosaccharide name into (a/b prefix, d/l prefix, abbreviation).

    The name is lower-cased and split on ``'-'``. How the pieces are read
    depends on how many there are:

    * 1 piece  - the whole name is the abbreviation.
    * 2 pieces - the first piece is an a/b prefix, a d/l prefix (``'delta'``
      counts as ``'d'``) or neither; the second is the abbreviation
      (seen for names such as ``D-GLCPA`` or ``DELTA-GLCPAN``).
    * 3 pieces - ``<a|b>-<d|l>-<abbreviation>``.
    * 4 pieces - the leading piece is ignored and the remaining three are read
      as above (``D-A-D-HEPP`` and ``L-A-D-HEPP`` are treated as ``A-D-HEPP``).
    * 5 pieces - only two hard-coded names are recognised:
      ``L-gro-a-D-manHepp`` -> ``('a', 'd', 'manhepp')`` and
      ``a-L-6-deoxy-Talp`` -> ``('a', 'l', 'talp')``.

    Anything else keeps the defaults ``'missing_a_b'``, ``'missing_L_D'`` and
    an empty abbreviation. The abbreviation ``'monosaccharid'`` is reported as
    ``'missing_monosaccharide'``.

    Returns
    -------
    tuple of str
        ``(ab, ld, abbrev)``.
    """
    lowered = mono.lower()
    pieces = lowered.split('-')
    piece_count = len(pieces)

    anomer, chirality, name = 'missing_a_b', 'missing_L_D', ''

    if piece_count == 1:
        name = lowered

    elif piece_count == 2:
        prefix, name = pieces
        if prefix in ('a', 'b'):
            anomer = prefix
        elif prefix in ('d', 'l'):
            chirality = prefix
        elif prefix == 'delta':
            chirality = 'd'

    elif piece_count == 3:
        anomer, chirality, name = pieces

    elif piece_count == 4:
        _, anomer, chirality, name = pieces

    elif piece_count == 5:
        # 'tal' is tested first because it takes precedence when both
        # substrings are present.
        if 'tal' in lowered:
            anomer, chirality, name = 'a', 'l', 'talp'
        elif 'man' in lowered:
            anomer, chirality, name = 'a', 'd', 'manhepp'

    if name == 'monosaccharid':
        name = 'missing_monosaccharide'

    return anomer, chirality, name

In [15]:
def reformulate_monosaccharide(abbrev, g_list):
    """Reduce a monosaccharide abbreviation to a standard name from ``g_list``.

    Steps:

    1. If the abbreviation contains ``'p'`` the ring marker is ``'P'`` and
       every ``'p'`` is removed; otherwise, if it contains ``'f'`` the marker
       is ``'F'`` and every ``'f'`` is removed; otherwise the marker is
       ``'missing_carbon_number'`` and the text is left alone.
       NOTE(review): P/F presumably stand for pyranose/furanose. The test is
       a plain substring check, so names that contain these letters for other
       reasons are affected as well.
    2. Among the entries of ``g_list`` that occur as a substring of the
       stripped text, the longest one is chosen (the first one on ties).

    The earlier implementation compared every later match only against the
    first match, so with an unsorted ``g_list`` a shorter, later entry could
    override a longer one. For a list sorted by descending length the result
    is unchanged.

    Parameters
    ----------
    abbrev : str
        Lower-case monosaccharide abbreviation, e.g. ``'glcpnac'``.
    g_list : iterable of str
        Candidate standard names.

    Returns
    -------
    tuple of str
        ``(abbrev, new_mono, carbon_number)`` where ``abbrev`` is returned
        unchanged and ``new_mono`` is ``'missing_monosaccharide'`` when no
        candidate matches.
    """
    if 'p' in abbrev:
        carbon_number = 'P'
        new_abbrev = abbrev.replace('p', '')
    elif 'f' in abbrev:
        carbon_number = 'F'
        new_abbrev = abbrev.replace('f', '')
    else:
        carbon_number = 'missing_carbon_number'
        new_abbrev = abbrev

    matches = [m for m in g_list if m in new_abbrev]
    # max() returns the first of several equally long candidates, which keeps
    # the tie-breaking of the original first-match behaviour.
    new_mono = max(matches, key=len) if matches else 'missing_monosaccharide'

    return abbrev, new_mono, carbon_number

In [16]:
def sorting(lst):
    """Return a new list ordered from the longest item to the shortest.

    Items of equal length keep their relative input order; the input list
    itself is not modified.
    """
    return sorted(lst, key=lambda item: -len(item))
# Base vocabulary of standard monosaccharide names.
# Note: this rebinds `glycan_list`, which an earlier cell initialised as an
# empty list.
glycan_list = ['gal', 'galnac', 'galn', 'gala', 'glc', 'glcnac', 'glcn', 'glca', 'fuc', 
               'man', 'mannac', 'mann', 'mana', 'kdn', 'neu5ac', 'xyl', 'idoa', 'neu5gc']

# Extended vocabulary used for matching below.
# NOTE(review): 'kdn', 'neu5ac' and 'rhanac' are listed twice; the duplicates
# survive the sort and show up in the printed list.
glycan_list_extend = ['gal', 'galnac', 'galn', 'gala', 'galanac', 'glc', 'glcnac', 'glcn', 'glca', 'fuc', 
                      'man', 'mannac', 'mann', 'mana', 'kdn', 'neu5ac', 'xyl', 'idoa', 'neu5gc', 
                      'kdo', 'kdn', 'quip', 'rhap', 'ara', 'gula', 'rib', 'neu5ac', 'tal', 'rha', 'rhanac',
                      'fucnac', 'rhanac', 'ido', 'hep']

# Longest names first, so that a first-match search prefers the most specific
# name (e.g. 'galnac' before 'gal').
glycan_list_extend = sorting(glycan_list_extend)
print(glycan_list_extend)

['galanac', 'galnac', 'glcnac', 'mannac', 'neu5ac', 'neu5gc', 'neu5ac', 'rhanac', 'fucnac', 'rhanac', 'galn', 'gala', 'glcn', 'glca', 'mann', 'mana', 'idoa', 'quip', 'rhap', 'gula', 'gal', 'glc', 'fuc', 'man', 'kdn', 'xyl', 'kdo', 'kdn', 'ara', 'rib', 'tal', 'rha', 'ido', 'hep']


In [17]:
# Reload the summary table and pick the rows whose 'csv' entry is the
# placeholder 'Monosaccharid'; these are re-derived from 'glycan name' below.
df_summary = pd.read_csv('matching_table/combined_all_formulated_monosaccharide_v2.csv')
# np.where returns a tuple of arrays; keep the row-position array itself so it
# is an ordinary list-like row indexer for .loc.
csv_missing_mono_index = np.where(df_summary['csv'].values == 'Monosaccharid')[0]
# Explicit copy: the columns of this subset are overwritten later, which would
# otherwise raise SettingWithCopyWarning.
df_summary_missing = df_summary.loc[csv_missing_mono_index].copy()

In [18]:
# Glycan names of the placeholder rows; these are the strings parsed below.
all_mono = df_summary_missing['glycan name'].values

In [19]:
# Parse every placeholder row's glycan name into its a/b prefix, d/l prefix,
# original abbreviation, standard name and ring marker. The five result lists
# are positionally aligned with all_mono.
ab_list = []
ld_list = []
abbrev_list = []
pf_list = []

standard_abbrev_list = []

# Abbreviations that contain neither 'p' nor 'f' and are rewritten to a
# 'p'-containing form before standardisation. abbrev_list keeps the original
# spelling; only the value passed on for matching is replaced.
ring_less_aliases = {
    'glc1ome': 'glcp',
    'galnac': 'galpnac',
    'glc': 'glcp',
    'kdn': 'kdnp',
    'tyr': 'tyrp',
}

for current_mono in all_mono:
    ab, ld, abbrev = split_single_mono_bound(current_mono)
    ab_list.append(ab)
    ld_list.append(ld)
    abbrev_list.append(abbrev)

    abbrev = ring_less_aliases.get(abbrev, abbrev)

    abbrev, new_mono, carbon_number = reformulate_monosaccharide(abbrev, glycan_list_extend)
    print(abbrev, new_mono, carbon_number)

    standard_abbrev_list.append(new_mono)
    pf_list.append(carbon_number)

idop ido P
araf ara F
glcpa glca P
rhap rha P
ribp rib P
ribp rib P
xylp xyl P
rhap rha P
galpanac galanac P
araf ara F
glcpa glca P
fucp fuc P
glcp glc P
galpa gala P
galpa gala P
glcpnac glcnac P
glcpnac glcnac P
fucp fuc P
lyxp missing_monosaccharide P
lyxp missing_monosaccharide P
galpnac galnac P
manpnac mannac P
galp gal P
manp man P
ribf rib F
manpa mana P
glcp glc P
idop ido P
manp man P
xylp xyl P
ribf rib F
galp gal P
manpnac mannac P


In [20]:
# for j in df_summary['csv'].values:
#     if 'rha' in j.lower():
#         print(j)

In [21]:
# for j in df_summary['glycan name'].values:
#     if 'ido' in j.lower():
#         print(j)

In [22]:
# Overwrite the label columns of the placeholder rows with the parsed values
# and replace the 'csv' placeholder by the glycan name. .assign builds a new
# frame instead of writing into a slice of df_summary, which avoids
# SettingWithCopyWarning.
df_summary_missing = df_summary_missing.assign(**{
    'bound_AB': ab_list,
    'fischer_projection_DL': ld_list,
    'origin_mono': abbrev_list,
    'reformulated_standard_mono': standard_abbrev_list,
    'carbon_number_PF': pf_list,
    'csv': df_summary_missing['glycan name'],
})

In [23]:
# Write the re-derived rows back into df_summary in place; update() aligns on
# the index, so only the placeholder rows are touched.
df_summary.update(df_summary_missing)

In [24]:
# Sanity check: the number of updated rows equals the size of the subset.
len(df_summary.loc[csv_missing_mono_index]), len(df_summary_missing)

(33, 33)

##### 2.2 Rarely occurring monosaccharides

In [25]:
# Rows whose standard name is still the 'missing_monosaccharide' placeholder.
idx_rarely_mono = np.where(df_summary['reformulated_standard_mono'].values == 'missing_monosaccharide')[0]
# Explicit copy: a column of this subset is overwritten later, which would
# otherwise raise SettingWithCopyWarning.
df_summary_rarely_missing = df_summary.loc[idx_rarely_mono].copy()

In [26]:
# Hand-written standard names for abbreviations that the vocabulary matching
# could not resolve.
rare_mono_standard_names = {
    'quip4n': 'quip',
    'quip': 'quip',
    'lyxp': 'lyx',
    'quip3nac': 'quipnac',
    'quip4nac': 'quipnac',
    'hepp': 'hep',
    'tyr': 'tyr',
}

imputate_mono_list = []
for mono in df_summary_rarely_missing['origin_mono'].values:
    if mono in rare_mono_standard_names:
        imputate_mono_list.append(rare_mono_standard_names[mono])
    else:
        # Unknown abbreviation: report it, but still append the placeholder so
        # the list stays the same length as df_summary_rarely_missing.
        # Previously nothing was appended, which left the list too short for
        # the column assignment that follows.
        print(mono)
        imputate_mono_list.append('missing_monosaccharide')

In [27]:
# Attach the imputed names (via .assign, so no write into a slice of
# df_summary happens) and merge them back into df_summary in place; update()
# aligns on the index.
df_summary_rarely_missing = df_summary_rarely_missing.assign(
    reformulated_standard_mono=imputate_mono_list
)
df_summary.update(df_summary_rarely_missing)

In [28]:
# Persist the completed summary table under a new file name; the source CSV
# read above is left untouched.
df_summary.to_csv('matching_table/combined_all_formulated_monosaccharide_final_version.csv', index = False)

In [29]:
# Display the final summary table.
# NOTE(review): this renders the whole frame (up to the 2000-row display
# limit set at the top); df_summary.head() would keep the output small.
df_summary

Unnamed: 0,csv,pdb,glycan name,residual num,type,bound_AB,fischer_projection_DL,origin_mono,reformulated_standard_mono,carbon_number_PF
0,b-D-GlcpNAc,NAG,a-D-Kdop-(2-8)-a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b...,1.0,linear,b,d,glcpnac,glcnac,P
1,a-D-Kdop,KDO,a-D-Kdop-(2-8)-a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b...,2.0,linear,a,d,kdop,kdo,P
2,a-D-Kdop,KDO,a-D-Kdop-(2-8)-a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b...,3.0,linear,a,d,kdop,kdo,P
3,a-D-Kdop,KDO,a-D-Kdop-(2-8)-a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b...,4.0,linear,a,d,kdop,kdo,P
4,a-L-Fucp,FUC,repeat-2)-b-D-GlcpA-(1-3)-a-D-GalpNAc-(1-3)-a-...,1.0,linear,a,l,fucp,fuc,P
5,a-D-GalpNAc,A2G,repeat-2)-b-D-GlcpA-(1-3)-a-D-GalpNAc-(1-3)-a-...,2.0,linear,a,d,galpnac,galnac,P
6,a-D-GalpNAc,A2G,repeat-2)-b-D-GlcpA-(1-3)-a-D-GalpNAc-(1-3)-a-...,3.0,linear,a,d,galpnac,galnac,P
7,b-D-GlcpA,GLC,repeat-2)-b-D-GlcpA-(1-3)-a-D-GalpNAc-(1-3)-a-...,4.0,linear,b,d,glcpa,glca,P
8,b-D-Glcp,GLC,a-D-Xylp-(1-6)-b-D-Glcp-(1-4)-D-Glc-ol,1.0,linear,b,d,glcp,glc,P
9,a-D-Xylp,XYS,a-D-Xylp-(1-6)-b-D-Glcp-(1-4)-D-Glc-ol,2.0,linear,a,d,xylp,xyl,P


##### 3. Relabel PDB files

In [30]:
# Destination folder for the tables that gain the monosaccharide label columns.
out_data_dir_2 = 'experimental_data_combined/FullyAnnotatedPDB_pdb_labeled_combined_add_temperature_reformulate_pdb/'

In [31]:
# For every temperature-annotated table in out_data_dir_1, add the five
# monosaccharide label columns from df_summary (matched on glycan name and on
# residue number) and write the result to out_data_dir_2.
pdb_orig_list = os.listdir(out_data_dir_1)

# Label columns and the placeholder used for rows without a matching residue.
label_defaults = {
    'bound_AB': 'missing_a_b',
    'fischer_projection_DL': 'missing_L_D',
    'origin_mono': 'missing_mono',
    'reformulated_standard_mono': 'missing_refornulated_mono',
    'carbon_number_PF': 'missing_p_f',
}

# DataFrame.to_csv does not create missing folders, so make sure it exists.
os.makedirs(out_data_dir_2, exist_ok=True)

for pdb in tqdm(pdb_orig_list):
    pdb_plain = pdb.replace('.pdb.csv', '')

    temp_label_df = df_summary.loc[df_summary['glycan name'] == pdb_plain]

    temp_pdb = pd.read_csv(os.path.join(out_data_dir_1, pdb))

    for label_col, default_value in label_defaults.items():
        temp_pdb[label_col] = default_value

    for res_num_csv in temp_label_df['residual num'].values:
        residue_rows = temp_pdb['residual'] == res_num_csv
        # If several summary rows share a residue number, the first one wins.
        label_row = temp_label_df.loc[temp_label_df['residual num'] == res_num_csv].iloc[0]

        for label_col in label_defaults:
            temp_pdb.loc[residue_rows, label_col] = label_row[label_col]

    # Written once per file, after all residues are labelled. This used to sit
    # inside the residue loop, which rewrote the file for every residue and
    # skipped glycans that have no row in df_summary.
    temp_pdb.to_csv(os.path.join(out_data_dir_2, pdb), index = False)

100%|█████████████████████████████████████████| 301/301 [00:05<00:00, 52.41it/s]


In [32]:
# Inspect the table left over from the last iteration of the loop above.
temp_pdb

Unnamed: 0,Atom,Atom_Num,Atom_Type,Monosaccharide_belong,residual,x,y,z,atoms_simplify,temperature_f,temperature_label,labels,bound_AB,fischer_projection_DL,origin_mono,reformulated_standard_mono,carbon_number_PF
0,HETATM,1,C1,GDL,1,-1.717,2.847,-2.449,C,343.0,2,102.5,a,d,glcpnac,glcnac,P
1,HETATM,2,C2,GDL,1,-3.167,2.473,-2.158,C,343.0,2,70.9,a,d,glcpnac,glcnac,P
2,HETATM,3,C3,GDL,1,-3.293,1.603,-0.919,C,343.0,2,70.6,a,d,glcpnac,glcnac,P
3,HETATM,4,C4,GDL,1,-2.342,0.431,-0.996,C,343.0,2,75.8,a,d,glcpnac,glcnac,P
4,HETATM,5,C5,GDL,1,-0.922,0.932,-1.261,C,343.0,2,70.9,a,d,glcpnac,glcnac,P
5,HETATM,6,C6,GDL,1,0.125,-0.153,-1.383,C,343.0,2,175.1,a,d,glcpnac,glcnac,P
6,HETATM,7,OH3,GDL,1,-4.637,1.112,-0.838,O,343.0,2,-1.0,a,d,glcpnac,glcnac,P
7,HETATM,8,O4,GDL,1,-2.335,-0.3,0.221,O,343.0,2,-1.0,a,d,glcpnac,glcnac,P
8,HETATM,9,O5,GDL,1,-0.939,1.645,-2.509,O,343.0,2,-1.0,a,d,glcpnac,glcnac,P
9,HETATM,10,OH6,GDL,1,-0.171,-1.06,-2.433,O,343.0,2,-1.0,a,d,glcpnac,glcnac,P


In [33]:
# Summary rows that were used to label that last table.
temp_label_df

Unnamed: 0,csv,pdb,glycan name,residual num,type,bound_AB,fischer_projection_DL,origin_mono,reformulated_standard_mono,carbon_number_PF
280,a-D-GlcpNAc,GDL,Repeat-4)-a-L-IdopA-(1-4)-a-D-GlcpNAc-(1-,1.0,linear,a,d,glcpnac,glcnac,P
281,a-L-IdopA,IDE,Repeat-4)-a-L-IdopA-(1-4)-a-D-GlcpNAc-(1-,2.0,linear,a,l,idopa,idoa,P


##### Missing example in monosaccharide

In [34]:
# Example file in which rows carry 'Missing monosaccharide' in the
# Monosaccharide_belong column; load its temperature-annotated table.
missing_example = 'a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf.pdb.csv'
pd.read_csv(os.path.join(out_data_dir_1, missing_example))

Unnamed: 0,Atom,Atom_Num,Atom_Type,Monosaccharide_belong,residual,x,y,z,atoms_simplify,temperature_f,temperature_label,labels
0,HETATM,1,C,Missing monosaccharide,0,1.326,1.889,5.136,C,300.0,1,-1.0
1,HETATM,2,C,Missing monosaccharide,0,1.583,0.651,4.262,C,300.0,1,-1.0
2,HETATM,3,C,Missing monosaccharide,0,2.744,-0.237,4.748,C,300.0,1,-1.0
3,HETATM,4,C,Missing monosaccharide,0,3.214,-0.877,3.445,C,300.0,1,-1.0
4,HETATM,5,C,Missing monosaccharide,0,3.04,0.287,2.452,C,300.0,1,-1.0
5,HETATM,6,C,Missing monosaccharide,0,2.79,-0.135,0.996,C,300.0,1,-1.0
6,HETATM,7,O,Missing monosaccharide,0,1.908,1.048,2.927,O,300.0,1,-1.0
7,HETATM,8,O1,FRU,4,0.381,-0.093,4.164,O,300.0,1,-1.0
8,HETATM,9,O,Missing monosaccharide,0,0.996,1.515,6.475,O,300.0,1,-1.0
9,HETATM,10,O,Missing monosaccharide,0,2.319,-1.209,5.701,O,300.0,1,-1.0


In [35]:
# Summary rows available for the example glycan.
df_summary.loc[df_summary['glycan name'] == missing_example.replace('.pdb.csv', '')]

Unnamed: 0,csv,pdb,glycan name,residual num,type,bound_AB,fischer_projection_DL,origin_mono,reformulated_standard_mono,carbon_number_PF
81,A-D-GLCP,GLC,a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-b...,1.0,linear,a,d,glcp,glc,P
82,A-D-GLCP,GLC,a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-b...,2.0,linear,a,d,glcp,glc,P
83,A-D-GLCP,GLC,a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-b...,3.0,linear,a,d,glcp,glc,P


In [36]:
# Still the label rows from the last iteration of the relabel loop, not those
# of missing_example.
temp_label_df

Unnamed: 0,csv,pdb,glycan name,residual num,type,bound_AB,fischer_projection_DL,origin_mono,reformulated_standard_mono,carbon_number_PF
280,a-D-GlcpNAc,GDL,Repeat-4)-a-L-IdopA-(1-4)-a-D-GlcpNAc-(1-,1.0,linear,a,d,glcpnac,glcnac,P
281,a-L-IdopA,IDE,Repeat-4)-a-L-IdopA-(1-4)-a-D-GlcpNAc-(1-,2.0,linear,a,l,idopa,idoa,P


In [37]:
# Still the table from the last iteration of the relabel loop, not the
# missing_example table.
temp_pdb

Unnamed: 0,Atom,Atom_Num,Atom_Type,Monosaccharide_belong,residual,x,y,z,atoms_simplify,temperature_f,temperature_label,labels,bound_AB,fischer_projection_DL,origin_mono,reformulated_standard_mono,carbon_number_PF
0,HETATM,1,C1,GDL,1,-1.717,2.847,-2.449,C,343.0,2,102.5,a,d,glcpnac,glcnac,P
1,HETATM,2,C2,GDL,1,-3.167,2.473,-2.158,C,343.0,2,70.9,a,d,glcpnac,glcnac,P
2,HETATM,3,C3,GDL,1,-3.293,1.603,-0.919,C,343.0,2,70.6,a,d,glcpnac,glcnac,P
3,HETATM,4,C4,GDL,1,-2.342,0.431,-0.996,C,343.0,2,75.8,a,d,glcpnac,glcnac,P
4,HETATM,5,C5,GDL,1,-0.922,0.932,-1.261,C,343.0,2,70.9,a,d,glcpnac,glcnac,P
5,HETATM,6,C6,GDL,1,0.125,-0.153,-1.383,C,343.0,2,175.1,a,d,glcpnac,glcnac,P
6,HETATM,7,OH3,GDL,1,-4.637,1.112,-0.838,O,343.0,2,-1.0,a,d,glcpnac,glcnac,P
7,HETATM,8,O4,GDL,1,-2.335,-0.3,0.221,O,343.0,2,-1.0,a,d,glcpnac,glcnac,P
8,HETATM,9,O5,GDL,1,-0.939,1.645,-2.509,O,343.0,2,-1.0,a,d,glcpnac,glcnac,P
9,HETATM,10,OH6,GDL,1,-0.171,-1.06,-2.433,O,343.0,2,-1.0,a,d,glcpnac,glcnac,P
