In [1]:
import numpy as np
import pandas as pd
import os
# Raise the pandas display limits so that frames of up to 2000 rows / 2000 columns
# are rendered in full instead of being truncated in cell output.
pd.set_option('display.max_rows', 2000)
pd.set_option('display.max_columns', 2000)

##### 1, Relabel KDO carbon shifts

In [2]:
# Template label values for the KDO carbons, in the same order as kdo_carbon_number
# (presumably 13C chemical shifts in ppm -- TODO confirm the unit).
kdo_carbon_template = [176.3, 102.3, 35.2, 67.1, 67.2, 72.7, 70.7, 64.0]
# Atom_Type names that are treated as KDO carbons / hydrogens throughout the notebook.
kdo_carbon_number = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8']
kdo_hydrogen_number = ['H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'H7', 'H8']

In [3]:
# Names of the entries in the KDO input directory (os.listdir returns them in arbitrary order).
kdo_files = os.listdir('experimental_data_combined/kdo/')
# Names of the entries in the second-round-check directory.
# NOTE(review): this list is not referenced again in the visible cells.
kdo_label_files = os.listdir('experimental_data_combined/kdo_csv_second_round_check/')
# Directory that receives the relabeled tables.
out_dir_1 = 'experimental_data_combined/kdo_relabeled/'

In [4]:
# Step 1: overwrite the labels of KDO carbons with the values from the
# second-round-check tables, leaving every other label untouched.
kdo_pdb_dir = 'experimental_data_combined/kdo/'
kdo_label_dir = 'experimental_data_combined/kdo_csv_second_round_check'

# DataFrame.to_csv does not create missing directories.
os.makedirs(out_dir_1, exist_ok=True)

for f in kdo_files:
    temp_pdb = pd.read_csv(os.path.join(kdo_pdb_dir, f))

    # The checked shift table shares the file stem but drops the '.pdb' part.
    csv_name = f.replace('.pdb.csv', '.csv')
    temp_label_f = pd.read_csv(os.path.join(kdo_label_dir, csv_name))

    # Start from the existing labels; only KDO carbons with a checked value are replaced.
    new_labels = temp_pdb['labels'].to_numpy(dtype=float, copy=True)

    is_kdo_carbon = ((temp_pdb['Monosaccharide_belong'] == 'KDO')
                     & temp_pdb['Atom_Type'].isin(kdo_carbon_number)).to_numpy()

    for i in np.flatnonzero(is_kdo_carbon):
        current_atom = temp_pdb['Atom_Type'].iat[i]
        current_residual_num = temp_pdb['residual'].iat[i]

        new_kdo_val = temp_label_f.loc[(temp_label_f['Atom'] == current_atom) &
                                       (temp_label_f['Residual Num'] == current_residual_num),
                                       'Shift'].to_numpy()

        if new_kdo_val.size > 1:
            # An ambiguous lookup cannot be stored in a single label slot.
            raise ValueError('{}: {} shifts found for atom {} on residual number {}'.format(
                csv_name, new_kdo_val.size, current_atom, current_residual_num))

        if new_kdo_val.size == 1:
            # Store the scalar, not the one-element array: implicit array-to-scalar
            # conversion is deprecated in recent NumPy releases.
            new_labels[i] = new_kdo_val[0]
        # No match in the checked table: the existing label is kept.

    temp_pdb['labels'] = new_labels
    temp_pdb.to_csv(os.path.join(out_dir_1, f), index = False)

##### 2, Apply manual corrections and check carbon shifts against the template

In [5]:
# Directory that receives the tables written at the end of the correction / template-check loop.
out_dir_2 = 'experimental_data_combined/kdo_relabeled_correct/'

In [6]:
def correct_kdo_carbon_shift(df, res_num, new_carbon_values, 
                             kdo_carbon_number = ['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8']):
    """Return a copy of ``df`` with the carbon labels of one residue replaced.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns 'Atom_Type', 'residual' and 'labels'.
        It is not modified.
    res_num :
        Value of the 'residual' column selecting the residue to correct.
    new_carbon_values : sequence
        Replacement labels, one per entry of ``kdo_carbon_number`` and in the
        same order.
    kdo_carbon_number : sequence of str
        Atom_Type names to overwrite. The default list is only read, never
        mutated.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` in which every row matching ``res_num`` and one of the
        atom names carries the corresponding new label. Rows are matched on
        'Atom_Type' and 'residual' only; atom names absent from the residue
        are silently skipped.

    Raises
    ------
    AssertionError
        If the number of values differs from the number of atom names.
    """
    # Compare against the actual atom list rather than a hard-coded 8 so that
    # a custom ``kdo_carbon_number`` can be used.
    assert len(new_carbon_values) == len(kdo_carbon_number), \
        'expected one value per carbon name'

    df_out = df.copy()
    in_residue = df_out['residual'] == res_num

    for current_c, current_val in zip(kdo_carbon_number, new_carbon_values):
        df_out.loc[in_residue & (df_out['Atom_Type'] == current_c), 'labels'] = current_val

    return df_out

In [7]:
# example: right shift = 1 
# 1 -> 2, orig val 1 set to -1 value 
# 2 -> 3
# ...
# 7 -> 8
# 8 -> drop
def apply_kdo_hydrogen_shift(df, res_num, shift_num, 
                             kdo_hydrogen_number = ['H1', 'H2', 'H3', 'H4', 'H5', 'H6', 'H7', 'H8']):
    """Return a copy of ``df`` with the hydrogen labels of one residue shifted right.

    Position ``i`` of ``kdo_hydrogen_number`` receives the label that position
    ``i - shift_num`` held before the call. Positions without a source (the
    first ``shift_num`` entries, or a source atom that is absent from the
    residue) receive -1.0. Labels of the last ``shift_num`` positions are
    dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with at least the columns 'Atom_Type', 'residual' and 'labels'.
        It is not modified.
    res_num :
        Value of the 'residual' column selecting the residue to shift.
    shift_num : int
        Number of positions to shift; must satisfy
        ``0 < shift_num < len(kdo_hydrogen_number)``.
    kdo_hydrogen_number : sequence of str
        Ordered Atom_Type names taking part in the shift. The default list is
        only read, never mutated.

    Returns
    -------
    pandas.DataFrame
        Shifted copy of ``df``. Only rows that exist for a listed atom name in
        the residue are written; no rows are added.

    Raises
    ------
    AssertionError
        If ``shift_num`` is outside the allowed range.
    ValueError
        If a source atom name matches more than one row in the residue.
    """
    n_positions = len(kdo_hydrogen_number)
    assert shift_num > 0 
    assert shift_num < n_positions

    df_out = df.copy()
    in_residue = df_out['residual'] == res_num
    # One row mask per hydrogen name, reused for both reading and writing.
    row_masks = [in_residue & (df_out['Atom_Type'] == h) for h in kdo_hydrogen_number]

    # Collect every new value before writing anything so that each source is
    # read in its pre-shift state.
    new_shift_val = []
    for i in range(n_positions):
        new_val = -1.0
        if (i - shift_num) >= 0:
            source_labels = df_out.loc[row_masks[int(i - shift_num)], 'labels'].to_numpy()
            if source_labels.size > 1:
                raise ValueError('{} rows found for atom {} on residual number {}'.format(
                    source_labels.size, kdo_hydrogen_number[int(i - shift_num)], res_num))
            if source_labels.size == 1:
                # Take the scalar explicitly; float() on a Series is deprecated in recent pandas.
                new_val = float(source_labels[0])
        new_shift_val.append(new_val)

    for mask, val in zip(row_masks, new_shift_val):
        # Skip atoms that are absent from the residue.
        if mask.any():
            df_out.loc[mask, 'labels'] = val

    return df_out

In [8]:
# apply_kdo_hydrogen_shift(prev_pdb, 1, 2)

In [9]:
# prev_pdb.loc[(prev_pdb['Atom_Type'] == 'H4') & (prev_pdb['residual'] == 1)]['labels']

In [10]:
# Manual corrections according to kdo_template_comparision_before_reshift.txt.
# file name -> list of (residue number, replacement C1..C8 labels, hydrogen shift)
kdo_manual_corrections = {
    'DB26475.pdb.csv': [(1, [-1.0, -1.0, 35.0, 66.1, 73.1, 72.1, 70.1, 64], 2)],
    'DB26467.pdb.csv': [(4, [-1.0, -1.0, 35.0, 69.5, 70.2, 73.8, 70.8, 64.6], 2),
                        (5, [-1.0, -1.0, 35.0, 66.5, 68.4, 74.0, 70.5, 64.7], 2)],
    'DB26431.pdb.csv': [(4, [-1.0, 101.2, 35.6, 71.6, 75.2, 75.5, 67.3, 44.8], 1)],
    'DB26928.pdb.csv': [(10, [-1.0, 102.8, 35.3, 68.3, 66., 74.1, 69.8, 65.], 1)],
    # NOTE(review): the hydrogen shift for this file previously targeted residue 10
    # while its carbon correction targets residue 1. Every other entry uses one
    # residue for both, so the shift is applied to residue 1 here -- please confirm.
    'DB26910.pdb.csv': [(1, [-1., -1, 30.2, 72.8, 79.1, 71.5, 71. ,64. ], 2)],
}

# A KDO carbon label further than this from its template value is reported.
kdo_shift_tolerance = 3

# DataFrame.to_csv does not create missing directories.
os.makedirs(out_dir_2, exist_ok=True)

relabeled_kdo_files = os.listdir(out_dir_1)
for f in relabeled_kdo_files:
    temp_pdb = pd.read_csv(os.path.join(out_dir_1, f))

    # Carbon and hydrogen corrections touch disjoint atom names, so applying
    # them residue by residue gives the same result as all carbons first.
    for res_num, carbon_values, hydrogen_shift in kdo_manual_corrections.get(f, []):
        temp_pdb = correct_kdo_carbon_shift(temp_pdb, res_num, carbon_values)
        temp_pdb = apply_kdo_hydrogen_shift(temp_pdb, res_num, hydrogen_shift)

    if f == 'DB26910.pdb.csv':
        # Kept so the inspection cells below can look at this table.
        prev_pdb = temp_pdb

    temp_atom_type = temp_pdb['Atom_Type'].values
    temp_mono = temp_pdb['Monosaccharide_belong'].values
    temp_labels = temp_pdb['labels'].values
    temp_residual_num = temp_pdb['residual'].values

    # All rows whose atom name is one of C1..C8 (any monosaccharide), re-indexed from 0.
    extract_carbon_df = temp_pdb.loc[temp_pdb['Atom_Type'].isin(kdo_carbon_number)].reset_index(drop=True)

    for i in range(len(temp_atom_type)):

        current_atom = temp_atom_type[i]
        current_mono = temp_mono[i]
        current_label = temp_labels[i]
        current_residual_num = temp_residual_num[i]

        # Carbon labels / names of the current residue, printed as context for a report.
        # Computed for every row because later cells inspect the values left behind.
        residue_carbons = extract_carbon_df.loc[extract_carbon_df['residual'] == current_residual_num]
        temp_carbon_value_by_residual_num = residue_carbons['labels'].values
        temp_carbon_name_by_residual_num = residue_carbons['Atom_Type'].values

        if (current_atom in kdo_carbon_number) and (current_mono == 'KDO'):

            # get the corresponding label value in carbon template by atom type
            template_value = kdo_carbon_template[kdo_carbon_number.index(current_atom)]
            difference = np.abs(template_value - current_label)

            # -1 marks a missing label and is never reported.
            if (difference > kdo_shift_tolerance) and (current_label != -1):
                print(f)
                print('with carbon number:', current_atom)
                print('at residual number:', current_residual_num)
                print('Template value:', template_value, 'Actual value:', current_label)
                print('difference:', difference)
                print(temp_carbon_value_by_residual_num)
                print(temp_carbon_name_by_residual_num)
                print('temperature:', np.unique(temp_pdb['temperature_f'].values)[0])
                print('')

    temp_pdb.to_csv(os.path.join(out_dir_2, f), index = False)

a-D-Kdop-(2-8)-a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-1)-Allyl.pdb.csv
with carbon number: C4
at residual number: 2
Template value: 67.1 Actual value: 70.5
difference: 3.4000000000000057
[175.4 100.7  34.3  70.5  65.6  72.1  70.4  64.1]
['C1' 'C2' 'C3' 'C4' 'C5' 'C6' 'C7' 'C8']
temperature: 300.0

a-D-GlcpNAc-(1-4)-a-D-GlcpNAc-(1-4)-a-D-Kdop.pdb.csv
with carbon number: C2
at residual number: 1
Template value: 102.3 Actual value: 105.7
difference: 3.4000000000000057
[176.8 105.7  41.9  74.6  83.5  62.7  76.   61.7]
['C1' 'C2' 'C3' 'C4' 'C5' 'C6' 'C7' 'C8']
temperature: 300.0

a-D-GlcpNAc-(1-4)-a-D-GlcpNAc-(1-4)-a-D-Kdop.pdb.csv
with carbon number: C3
at residual number: 1
Template value: 35.2 Actual value: 41.9
difference: 6.699999999999996
[176.8 105.7  41.9  74.6  83.5  62.7  76.   61.7]
['C1' 'C2' 'C3' 'C4' 'C5' 'C6' 'C7' 'C8']
temperature: 300.0

a-D-GlcpNAc-(1-4)-a-D-GlcpNAc-(1-4)-a-D-Kdop.pdb.csv
with carbon number: C4
at residual number: 1
Template value: 67.1 Actual value:

DB26379.pdb.csv
with carbon number: C4
at residual number: 4
Template value: 67.1 Actual value: 72.9
difference: 5.800000000000011
[-1.  -1.  34.8 72.9 68.  73.3 70.2 64.2]
['C1' 'C2' 'C3' 'C4' 'C5' 'C6' 'C7' 'C8']
temperature: 300.0

DB26379.pdb.csv
with carbon number: C5
at residual number: 6
Template value: 67.2 Actual value: 63.5
difference: 3.700000000000003
[-1.  -1.  33.4 70.  63.5 72.9 71.  64.2]
['C1' 'C2' 'C3' 'C4' 'C5' 'C6' 'C7' 'C8']
temperature: 300.0

DB26378.pdb.csv
with carbon number: C4
at residual number: 4
Template value: 67.1 Actual value: 73.1
difference: 6.0
[175.8 103.2  33.7  73.1  69.5  73.6  70.2  64.5]
['C1' 'C2' 'C3' 'C4' 'C5' 'C6' 'C7' 'C8']
temperature: 300.0

DB26378.pdb.csv
with carbon number: C4
at residual number: 6
Template value: 67.1 Actual value: 74.0
difference: 6.900000000000006
[177.  101.1  35.4  74.   65.4  72.7  71.8  63.6]
['C1' 'C2' 'C3' 'C4' 'C5' 'C6' 'C7' 'C8']
temperature: 300.0



In [11]:
# First 'labels' value among the C1 rows of residue 1 in prev_pdb
# (prev_pdb is assigned inside the DB26910 branch of the correction loop).
prev_pdb.loc[(prev_pdb['residual'] == 1) & (prev_pdb['Atom_Type'] == 'C1')]['labels'].values[0]

-1.0

In [12]:
# Show every row of residue 1 in prev_pdb.
prev_pdb.loc[prev_pdb['residual'] == 1]

Unnamed: 0,Atom,Atom_Num,Atom_Type,Monosaccharide_belong,residual,x,y,z,atoms_simplify,temperature_f,temperature_label,labels,bound_AB,fischer_projection_DL,origin_mono,reformulated_standard_mono,carbon_number_PF
0,HETATM,1,C1,KDO,1,4.059,4.224,6.92,C,300.0,1,-1.0,missing_a_b,missing_L_D,kdop,kdo,P
1,HETATM,2,C2,KDO,1,3.449,3.195,7.893,C,300.0,1,-1.0,missing_a_b,missing_L_D,kdop,kdo,P
2,HETATM,3,C3,KDO,1,1.917,3.047,7.655,C,300.0,1,30.2,missing_a_b,missing_L_D,kdop,kdo,P
3,HETATM,4,C4,KDO,1,1.324,1.847,8.433,C,300.0,1,72.8,missing_a_b,missing_L_D,kdop,kdo,P
4,HETATM,5,C5,KDO,1,2.115,0.536,8.185,C,300.0,1,79.1,missing_a_b,missing_L_D,kdop,kdo,P
5,HETATM,6,C6,KDO,1,3.63,0.761,8.479,C,300.0,1,71.5,missing_a_b,missing_L_D,kdop,kdo,P
6,HETATM,7,C7,KDO,1,4.526,-0.474,8.198,C,300.0,1,71.0,missing_a_b,missing_L_D,kdop,kdo,P
7,HETATM,8,C8,KDO,1,6.006,-0.239,8.551,C,300.0,1,64.0,missing_a_b,missing_L_D,kdop,kdo,P
8,HETATM,9,O1,KDO,1,3.703,5.516,7.227,O,300.0,1,-1.0,missing_a_b,missing_L_D,kdop,kdo,P
9,HETATM,10,OH4,KDO,1,-0.041,1.672,8.072,O,300.0,1,-1.0,missing_a_b,missing_L_D,kdop,kdo,P


In [13]:
# NOTE: `.values[0]` selects the first row's temperature before np.unique is applied,
# so this yields a one-element array for that row only. temp_pdb is whatever table the
# most recently executed loop left behind.
np.unique(temp_pdb['temperature_f'].values[0])

array([297.])

In [14]:
# Leftover loop variable: carbon labels of the residue visited in the last iteration
# of the template-check loop.
temp_carbon_value_by_residual_num

array([176. , 101. ,  35.2,  67.2,  67.5,  72.7,  70.7,  64.5])

In [15]:
# Carbon atom names for the same leftover residue (extract_carbon_df and
# current_residual_num both come from the last iteration of the template-check loop).
extract_carbon_df.loc[extract_carbon_df['residual'] == current_residual_num]['Atom_Type'].values

array(['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8'], dtype=object)

##### 3, Rename KDO carbon and hydrogen atom types

In [16]:
# Directory that receives the tables with renamed KDO atom types.
out_dir_3 = 'experimental_data_combined/kdo_relabeled_correct_reformulate/'
# NOTE: rebinds kdo_files (previously the listing of the raw KDO directory) to the
# listing of out_dir_2; cells that expect the earlier listing must not be re-run after this one.
kdo_files = os.listdir(out_dir_2)

In [17]:
# Step 3: append '_KDO' to the Atom_Type of every KDO row whose atom name is one of
# the listed carbons (C1..C8) or hydrogens (H1..H8); all other rows are left as they are.
kdo_renamed_atoms = kdo_carbon_number + kdo_hydrogen_number

# DataFrame.to_csv does not create missing directories.
os.makedirs(out_dir_3, exist_ok=True)

for f in kdo_files:
    temp_pdb = pd.read_csv(os.path.join(out_dir_2, f))

    # One vectorised update instead of a per-row `.loc[...] +=`.
    rename_mask = ((temp_pdb['Monosaccharide_belong'] == 'KDO')
                   & temp_pdb['Atom_Type'].isin(kdo_renamed_atoms))
    temp_pdb.loc[rename_mask, 'Atom_Type'] = temp_pdb.loc[rename_mask, 'Atom_Type'] + '_KDO'

    temp_pdb.to_csv(os.path.join(out_dir_3, f), index = False)

In [18]:
# Literal list of the KDO carbon names with the '_KDO' suffix, displayed for reference.
['C1_KDO', 'C2_KDO', 'C3_KDO', 'C4_KDO', 'C5_KDO', 'C6_KDO', 'C7_KDO', 'C8_KDO']

['C1_KDO',
 'C2_KDO',
 'C3_KDO',
 'C4_KDO',
 'C5_KDO',
 'C6_KDO',
 'C7_KDO',
 'C8_KDO']

In [19]:
# Atom_Type values of the KDO rows in temp_pdb, i.e. the table left behind by the
# most recently executed loop; used to eyeball which names received the '_KDO' suffix.
temp_pdb.loc[temp_pdb['Monosaccharide_belong'] == 'KDO']['Atom_Type'].values

array(['C1_KDO', 'C2_KDO', 'C3_KDO', 'C4_KDO', 'C5_KDO', 'C6_KDO',
       'C7_KDO', 'C8_KDO', 'O1', 'OH4', 'O1', 'OH7', 'OH8', 'O6', 'OH5',
       'HO1', 'H3A', 'HO4', 'H4_KDO', 'H5_KDO', 'HO8', 'H8R', 'H8S',
       'HO7', 'H7_KDO', 'H6_KDO', 'HO5', 'H3E'], dtype=object)