In [1]:
import numpy as np
import pandas as pd
import os
import re
from tqdm import tqdm

In [2]:
# Let pandas print up to 1000 rows before truncating a displayed table.
pd.options.display.max_rows = 1000

In [3]:
# Folder with the input CSV files that the cells below read and correct.
data_dir = 'experimental_data/FullyAnnotatedPDB_label_assigned_residual_num/'
# os.listdir returns entries in arbitrary order and also lists non-CSV entries
# (e.g. .ipynb_checkpoints, .DS_Store) that pd.read_csv cannot parse, so keep only
# the '.csv' files and sort them to make every run process the same sequence.
csv_list = sorted(name for name in os.listdir(data_dir) if name.endswith('.csv'))

In [4]:
# Hand-curated file names that are merged into `new_list` further down.
# Every entry MUST end with a comma: two adjacent string literals are silently
# concatenated by Python into one (non-existent) file name.
non_mono_list = [
    'a-D-Kdop-(2-8)-a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-1)-Allyl.csv',
    'a-D-Xylp-(1-6)-b-D-Glcp-(1-4)-D-Glc-ol.csv',
    'a-L-Rhap-(1-3)-a-L-Rhap-(1-2)-Groal.csv',
    'a-L-Fucp-(1-2)-b-D-Galp-(1-3)-D-GalNAc-ol.csv',
    'a-D-GalpNAc-(1-6)-a-D-Galp-(1-6)-a-D-Galp-(1-3)-D-Gal-Ol.csv',
    'a-D-Galp-(1-6)-a-D-Galp-(1-3)-D-Gal-Ol.csv',
    'a-D-Kdop-(2-8)-a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-6)-a-D-GlcpNAc-(1-1)-Allyl.csv',
    'a-D-Kdop-(2-8)-a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-6)-b-D-GlcpNAc-(1-1)-Allyl.csv',
    'a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-6)-b-D-GlcpNAc-(1-1)-Allyl.csv',
    'a-D-FucpNAc-(1-4)-a-D-GalpNac-(1-3)-b-D-Manp-(1-2)-erythritol.csv',
    'a-D-Apif.csv',
    'a-D-Galp-(1-6)-a-D-Galp-(1-3)-ononitol.csv',
    'a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf.csv',
    'a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf.csv',
    'a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf.csv',
    'a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf.csv',
    'a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf.csv',
    'a-L-Fucp2SO33SO34SO3-(1-2)-a-L-Fucp3SO34SO3-(1-1)-propyl.csv',
    'b-D-Apif.csv',
    'b-D-Xylp-(1-3)-L-Ara-ol.csv',
    'a-D-GlcpNAc-(1-4)-b-D-Galp-(1-3)-D-GalNAc-ol.csv',
    'a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-6)-a-D-GlcpNAc-(1-1)-Allyl.csv',
    'a-D-Manp-(1-2)-a-D-Manp-(1-3)-Ser.csv',
    'a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-1)-Allyl.csv',
    'a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-6)-b-D-GlcpNAc-(1-1)-Allyl.csv',
    'a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-6)-a-D-GlcpNAc-(1-1)-Allyl.csv',
    'a-D-Kdop-(2-4)-a-D-Kdop-(2-1)-Allyl.csv',
    # The comma after the next entry was missing, which fused it with the last one.
    'a-D-Kdop-(2-8)-a-D-Kdop-(2-1)-Allyl.csv',
    'a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-1)-Allyl.csv',
]

In [5]:
# case 1: One Non-monosaccharide component, both Carbon and Hydrogen
# for some glycans 
# 1 -> x, 
# 2 -> 1
# 3 -> 2 ....
def correct_csv_attached_none_glycan_case1(temp_df, non_mono_label=999):
    """Renumber residues when residue 1 is the single non-monosaccharide component.

    Rows whose 'Residual Num' equals 1 receive ``non_mono_label``; every other row
    is shifted down by one (2 -> 1, 3 -> 2, ...).

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Table with a numeric 'Residual Num' column. It is left untouched.
    non_mono_label : int, default 999
        Value written for the non-monosaccharide rows.

    Returns
    -------
    pandas.DataFrame
        A copy of ``temp_df`` with the corrected 'Residual Num' column.
    """
    original_num = temp_df['Residual Num'].to_numpy()

    # The subtraction allocates a new array, so nothing is written through the
    # array handed out by pandas (it aliases the caller's frame and is read-only
    # when Copy-on-Write is active).
    corrected_num = original_num - 1
    corrected_num[original_num == 1] = non_mono_label

    df = temp_df.copy()
    df['Residual Num'] = corrected_num

    return df

def correct_csv_attached_none_glycan_case1_consider_components(temp_df):
    """Renumber residues, moving the single non-monosaccharide component to the end.

    Rows whose 'Residual Num' equals 1 receive the largest number present in the
    column; every other row is shifted down by one (2 -> 1, 3 -> 2, ...).

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Table with a numeric 'Residual Num' column. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        A copy of ``temp_df`` with the corrected 'Residual Num' column.
    """
    original_num = temp_df['Residual Num'].to_numpy()

    # Nothing to renumber (and np.max would raise) for an empty table.
    if original_num.size == 0:
        return temp_df.copy()

    highest = np.max(original_num)

    # Work on a fresh array so the caller's frame is never modified in place.
    corrected_num = original_num - 1
    # Equivalent to relabelling 1 as (max + 1) and then subtracting 1.
    corrected_num[original_num == 1] = highest

    df = temp_df.copy()
    df['Residual Num'] = corrected_num

    return df

In [6]:
# case 2: Two non-monosaccharide components, both Carbon and Hydrogen
# 1 -> x, 
# 2 -> x
# 3 -> 1 
# 4 -> 2 ...
def correct_csv_attached_none_glycan_case2(temp_df, non_mono_label=999):
    """Renumber residues when residues 1 and 2 are non-monosaccharide components.

    Rows whose 'Residual Num' equals 1 or 2 receive ``non_mono_label``; every
    other row is shifted down by two (3 -> 1, 4 -> 2, ...).

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Table with a numeric 'Residual Num' column. It is left untouched.
    non_mono_label : int, default 999
        Value written for the non-monosaccharide rows.

    Returns
    -------
    pandas.DataFrame
        A copy of ``temp_df`` with the corrected 'Residual Num' column.
    """
    original_num = temp_df['Residual Num'].to_numpy()

    # The subtraction allocates a new array, so the caller's frame is never
    # written through the array handed out by pandas.
    corrected_num = original_num - 2
    corrected_num[(original_num == 1) | (original_num == 2)] = non_mono_label

    df = temp_df.copy()
    df['Residual Num'] = corrected_num

    return df

def correct_csv_attached_none_glycan_case2_consider_components(temp_df):
    """Renumber residues, moving two non-monosaccharide components to the end.

    With ``m`` the largest 'Residual Num' present: rows numbered 1 become
    ``m - 1``, rows numbered 2 become ``m``, and every other row is shifted
    down by two (3 -> 1, 4 -> 2, ...).

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Table with a numeric 'Residual Num' column. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        A copy of ``temp_df`` with the corrected 'Residual Num' column.
    """
    original_num = temp_df['Residual Num'].to_numpy()

    # Nothing to renumber (and np.max would raise) for an empty table.
    if original_num.size == 0:
        return temp_df.copy()

    highest = np.max(original_num)

    # Masks are taken from the untouched original numbers, so a freshly assigned
    # label can never be picked up again by a later relabelling step.
    corrected_num = original_num - 2
    for component in (1, 2):
        # Same value as relabelling to (max + component) and then subtracting 2.
        corrected_num[original_num == component] = highest + component - 2

    df = temp_df.copy()
    df['Residual Num'] = corrected_num

    return df

In [7]:
# case 3: Three non-monosaccharide components, both Carbon and Hydrogen
# 1 -> x, 
# 2 -> x
# 3 -> x 
# 4 -> 1
# 5 -> 2...
def correct_csv_attached_none_glycan_case3(temp_df, non_mono_label=999):
    """Renumber residues when residues 1, 2 and 3 are non-monosaccharide components.

    Rows whose 'Residual Num' equals 1, 2 or 3 receive ``non_mono_label``; every
    other row is shifted down by three (4 -> 1, 5 -> 2, ...).

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Table with a numeric 'Residual Num' column. It is left untouched.
    non_mono_label : int, default 999
        Value written for the non-monosaccharide rows.

    Returns
    -------
    pandas.DataFrame
        A copy of ``temp_df`` with the corrected 'Residual Num' column.
    """
    original_num = temp_df['Residual Num'].to_numpy()

    # The subtraction allocates a new array, so the caller's frame is never
    # written through the array handed out by pandas.
    corrected_num = original_num - 3
    is_non_mono = (original_num == 1) | (original_num == 2) | (original_num == 3)
    corrected_num[is_non_mono] = non_mono_label

    df = temp_df.copy()
    df['Residual Num'] = corrected_num

    return df

def correct_csv_attached_none_glycan_case3_consider_components(temp_df):
    """Renumber residues, moving three non-monosaccharide components to the end.

    With ``m`` the largest 'Residual Num' present: rows numbered 1, 2 and 3
    become ``m - 2``, ``m - 1`` and ``m`` respectively, and every other row is
    shifted down by three (4 -> 1, 5 -> 2, ...).

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Table with a numeric 'Residual Num' column. It is left untouched.

    Returns
    -------
    pandas.DataFrame
        A copy of ``temp_df`` with the corrected 'Residual Num' column.
    """
    original_num = temp_df['Residual Num'].to_numpy()

    # Nothing to renumber (and np.max would raise) for an empty table.
    if original_num.size == 0:
        return temp_df.copy()

    highest = np.max(original_num)

    # Masks are taken from the untouched original numbers, so a freshly assigned
    # label can never be picked up again by a later relabelling step.
    corrected_num = original_num - 3
    for component in (1, 2, 3):
        # Same value as relabelling to (max + component) and then subtracting 3.
        corrected_num[original_num == component] = highest + component - 3

    df = temp_df.copy()
    df['Residual Num'] = corrected_num

    return df

In [8]:
# case 4: All Carbon and Hydrogen are from Non-monosaccharide component.
# 1 -> x
# 2 -> x 
# 3 -> x ...
def correct_csv_attached_none_glycan_case4(temp_df, non_mono_label=999):
    """Label every row as belonging to a non-monosaccharide component.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Input table. It is left untouched.
    non_mono_label : int, default 999
        Value written to the whole 'Residual Num' column.

    Returns
    -------
    pandas.DataFrame
        A copy of ``temp_df`` whose 'Residual Num' column is ``non_mono_label``
        in every row.
    """
    # Copy first so the overwrite never reaches the caller's frame.
    df = temp_df.copy()
    df['Residual Num'] = non_mono_label

    return df

In [9]:
# case 5: One Non-monosaccharide component, only Carbon
# for some glycans 
# 1 -> x, 
# 2 -> 1
# 3 -> 2 ....
def correct_csv_attached_none_glycan_case5(temp_df, non_mono_label=999, non_mono_residual='Allyl'):
    """Renumber residues when the non-monosaccharide component occupies a block of rows.

    Every row from the first ``non_mono_residual`` row to the end of the table is
    shifted down by one; then all rows from the first to the last
    ``non_mono_residual`` row (inclusive) are set to ``non_mono_label``. Rows
    before the first ``non_mono_residual`` row keep their number.

    Parameters
    ----------
    temp_df : pandas.DataFrame
        Table with 'Residual' and numeric 'Residual Num' columns. It is left
        untouched, and its index may be anything (rows are addressed by position).
    non_mono_label : int, default 999
        Value written for the non-monosaccharide rows.
    non_mono_residual : str, default 'Allyl'
        Value of the 'Residual' column that identifies the non-monosaccharide rows.

    Returns
    -------
    pandas.DataFrame
        A copy of ``temp_df`` with the corrected 'Residual Num' column.

    Raises
    ------
    ValueError
        If no row has ``non_mono_residual`` in the 'Residual' column.
    """
    # Row positions (not index labels) of the non-monosaccharide rows.
    non_mono_pos = np.flatnonzero((temp_df['Residual'] == non_mono_residual).to_numpy())
    if non_mono_pos.size == 0:
        raise ValueError(
            "no rows with Residual == {!r}; this correction does not apply".format(non_mono_residual)
        )
    first_pos, last_pos = non_mono_pos[0], non_mono_pos[-1]

    # Explicit copy: the array returned by pandas aliases the caller's frame.
    corrected_num = temp_df['Residual Num'].to_numpy().copy()
    corrected_num[first_pos:] -= 1
    corrected_num[first_pos:last_pos + 1] = non_mono_label

    df = temp_df.copy()
    df['Residual Num'] = corrected_num

    return df

In [10]:
# File-name substrings that select a file for the non-monosaccharide correction step.
# Matching is case-sensitive, hence both '-ol' and '-Ol'.
non_mono_markers = ('-Allyl', '-ol', '-Ser', '-Groal', '-Ol',
                    '-erythritol', '-ononitol', '-propyl', '-Apif', '-Fruf')

# Only the file names are inspected here, so the CSV contents are not loaded.
marker_matches = {csv_name for csv_name in csv_list
                  if any(marker in csv_name for marker in non_mono_markers)}

# Add the hand-curated names, drop duplicates and sort for a stable listing.
new_list = sorted(marker_matches.union(non_mono_list))

In [11]:
# Display the collected file names for a visual check.
new_list

['a-D-Apif.csv',
 'a-D-FucpNAc-(1-4)-a-D-GalpNac-(1-3)-b-D-Manp-(1-2)-erythritol.csv',
 'a-D-Galp-(1-6)-a-D-Galp-(1-3)-D-Gal-Ol.csv',
 'a-D-Galp-(1-6)-a-D-Galp-(1-3)-ononitol.csv',
 'a-D-GalpNAc-(1-6)-a-D-Galp-(1-6)-a-D-Galp-(1-3)-D-Gal-Ol.csv',
 'a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf.csv',
 'a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf.csv',
 'a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf.csv',
 'a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf.csv',
 'a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf.csv',
 'a-D-GlcpNAc-(1-4)-b-D-Galp-(1-3)-D-GalNAc-ol.csv',
 'a-D-Kdop-(2-4)-a-D-Kdop-(2-1)-Allyl.csv',
 'a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-1)-Allyl.csv',
 'a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-6)-a-D-GlcpNAc-(1-1)-Allyl.csv',
 'a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-6)-b-D-GlcpNAc-(1-1)-Allyl.csv',
 'a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-1)-

In [12]:
# Files routed to correct_csv_attached_none_glycan_case1 by the processing loop.
case1_list = [
    'a-D-Kdop-(2-8)-a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-1)-Allyl.csv',
    'a-D-Xylp-(1-6)-b-D-Glcp-(1-4)-D-Glc-ol.csv',
    'a-L-Rhap-(1-3)-a-L-Rhap-(1-2)-Groal.csv',
    'a-D-Kdop-(2-4)-a-D-Kdop-(2-1)-Allyl.csv',
    'b-D-Xylp-(1-3)-L-Ara-ol.csv',
    'a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-1)-Allyl.csv',
    'a-L-Fucp-(1-2)-b-D-Galp-(1-3)-D-GalNAc-ol.csv',
    'a-D-GalpNAc-(1-6)-a-D-Galp-(1-6)-a-D-Galp-(1-3)-D-Gal-Ol.csv',
    'a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-1)-Allyl.csv',
    'a-D-Galp-(1-6)-a-D-Galp-(1-3)-D-Gal-Ol.csv',
    'a-D-FucpNAc-(1-4)-a-D-GalpNac-(1-3)-b-D-Manp-(1-2)-erythritol.csv',
    'a-D-Kdop-(2-8)-a-D-Kdop-(2-1)-Allyl.csv',
    'a-D-Manp-(1-2)-a-D-Manp-(1-3)-Ser.csv',
    'a-D-GlcpNAc-(1-4)-b-D-Galp-(1-3)-D-GalNAc-ol.csv',
]

In [13]:
# Files routed to correct_csv_attached_none_glycan_case2 by the processing loop.
case2_list = [
    'a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf.csv',
    'a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf.csv',
    'a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf.csv',
]

In [14]:
# Files routed to correct_csv_attached_none_glycan_case3 by the processing loop.
case3_list = [
    'a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf.csv',
    'a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-a-D-Glcp-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf-(1-2)-b-D-Fruf.csv',
]

In [15]:
# Files routed to correct_csv_attached_none_glycan_case4; the processing loop
# does not write a corrected CSV for any file named here.
case4_list = [
    'b-D-Fruf-(2-6)-b-D-Fruf.csv',
    'b-D-Apif.csv',
    'a-D-Apif.csv',
]

In [16]:
# Files routed to correct_csv_attached_none_glycan_case5 by the processing loop.
case5_list = [
    'a-D-Kdop-(2-8)-a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-6)-a-D-GlcpNAc-(1-1)-Allyl.csv',
    'a-D-Kdop-(2-8)-a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-6)-b-D-GlcpNAc-(1-1)-Allyl.csv',
    'a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-6)-b-D-GlcpNAc-(1-1)-Allyl.csv',
    'a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-6)-a-D-GlcpNAc-(1-1)-Allyl.csv',
    'a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b-D-GlcpNAc-(1-6)-a-D-GlcpNAc-(1-1)-Allyl.csv',
]

##### Second-round preprocessing: correct residual numbers using the matching table stored in the PDF 'Correct by matching table'

In [17]:
def switch_residual(df, original_list=(1, 2, 3, 4, 5), modified_list=(2, 3, 4, 5, 1)):
    """Remap the 'Residual Num' column through an explicit old -> new table.

    ``original_list[i]`` is replaced by ``modified_list[i]``. Rows whose number
    does not appear in ``original_list`` are set to 0. If a number occurs more
    than once in ``original_list``, its last pairing wins.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a 'Residual Num' column. It is left untouched.
    original_list : sequence of int
        Residual numbers as they currently appear in ``df``.
    modified_list : sequence of int
        Replacement numbers, aligned element-wise with ``original_list``.

    Returns
    -------
    pandas.DataFrame
        A copy of ``df`` with the remapped integer 'Residual Num' column.

    Raises
    ------
    ValueError
        If the two lists do not have the same length.
    """
    if len(original_list) != len(modified_list):
        raise ValueError(
            "original_list and modified_list must have the same length "
            "(got {} and {})".format(len(original_list), len(modified_list))
        )

    orig_residual_list = df['Residual Num'].to_numpy()
    # Starts at 0 everywhere, which is what unmapped rows end up with.
    new_residual_list = np.zeros(len(orig_residual_list), dtype=int)

    # Every mask is computed on the untouched original numbers, so one
    # replacement can never feed into another (e.g. a 1 <-> 2 swap is safe).
    for orig_idx, new_idx in zip(original_list, modified_list):
        new_residual_list[orig_residual_list == orig_idx] = new_idx

    df_new = df.copy()
    df_new['Residual Num'] = new_residual_list
    return df_new

In [18]:
# Destination folder for the corrected tables; created up front so that to_csv
# cannot fail on a missing directory.
output_dir = 'experimental_data/FullyAnnotatedPDB_label_corrected/'
os.makedirs(output_dir, exist_ok=True)

# First-round corrections for files in `new_list`: (file names, correction function),
# tried in this order. case4 is absent on purpose: files in `case4_list` are never
# written, so they are skipped before any work is done.
case_corrections = [
    (case1_list, correct_csv_attached_none_glycan_case1),
    (case2_list, correct_csv_attached_none_glycan_case2),
    (case3_list, correct_csv_attached_none_glycan_case3),
    (case5_list, correct_csv_attached_none_glycan_case5),
]

# Second-round corrections for files NOT in `new_list`:
# exact file name -> (current residual numbers, replacement residual numbers).
residual_remaps = {
    'Repeat-4)-a-D-GlcpA-(1-3)-a-D-Glcp-(1-3)-b-D-ManpNAc-(1-4)-b-D-Glcp-(1-4)-a-D-GlcpNAc-(1-.csv':
        ([1, 2, 3, 4, 5], [2, 3, 4, 5, 1]),
    'a-L-Rhap-(1-4)-a-D-GalpA-(1-3)-b-D-GlcpNAc-(1-2)-a-L-Rhap-(3-1)-a-L-Rhap.csv':
        ([1, 2, 3, 4, 5], [5, 1, 2, 3, 4]),
    'b-D-Xylp-(1-4)-b-D-Xylp-(1-4)-a-L-Galp-(2-1)-b-D-Xylp.csv':
        ([1, 2, 3, 4], [2, 1, 3, 4]),
    'b-D-Xylp-(1-4)-b-D-Xylp-(1-4)-b-L-Galp-(2-1)-b-D-Xylp.csv':
        ([1, 2, 3, 4], [2, 1, 3, 4]),
    'b-D-Galp-(3-1)-a-D-Manp.csv':
        ([1, 2], [2, 1]),
    'b-D-Galp-(1-3)-b-D-Glcp-(1-3)-b-D-Xylp-(1-4)-b-D-Xylp-(1-4)-b-L-Galp-(2-1)-b-D-Xylp.csv':
        ([1, 2, 3, 4, 5, 6], [2, 1, 3, 4, 5, 6]),
    'P-(O-2)-a-D-Glcp-(1-1)-a-D-Glcp.csv':
        ([1, 2], [1, 3]),
    'P-(O-3)-a-D-Glcp-(1-1)-a-D-Glcp.csv':
        ([1, 2], [1, 3]),
    'P-(O-4)-a-D-Glcp-(1-1)-a-D-Glcp.csv':
        ([1, 2], [1, 3]),
    'P-(O-6)-a-D-Glcp-(1-1)-a-D-Glcp.csv':
        ([1, 2], [1, 3]),
    'a-D-Kdop-(2-8)-a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b-D-GlcpN4PO4-(1-6)-a-D-GlcpN1PO4.csv':
        ([1, 2, 3, 4, 5], [1, 2, 4, 5, 6]),
    'Repeat-4)-a-L-IdopA-(1-4)-a-D-GlcpNAc-(1-.csv':
        ([1, 2], [2, 1]),
}

# Remapped files whose corrected table is printed for a visual check.
files_to_print = {'Repeat-4)-a-L-IdopA-(1-4)-a-D-GlcpNAc-(1-.csv'}

for csv_name in csv_list:
    # No corrected CSV is produced for case4 files, so do not even load them.
    if csv_name in case4_list:
        continue

    temp_df = pd.read_csv(os.path.join(data_dir, csv_name))

    if csv_name in new_list:
        # First matching case list wins; None when the file is in none of them.
        corrector = next(
            (fix for names, fix in case_corrections if csv_name in names), None
        )
        if corrector is not None:
            df_out = corrector(temp_df)
        # Added second round preprocessing correct matching
        elif 'a-D-Galp-(1-6)-a-D-Galp-(1-3)-ononitol' in csv_name:
            df_out = switch_residual(temp_df, [1, 2, 3], [999, 1, 2])
        else:
            df_out = temp_df
    elif csv_name in residual_remaps:
        # Added second round preprocessing correct matching
        current_nums, replacement_nums = residual_remaps[csv_name]
        df_out = switch_residual(temp_df, current_nums, replacement_nums)
        if csv_name in files_to_print:
            print(df_out)
    else:
        df_out = temp_df

    df_out.to_csv(os.path.join(output_dir, csv_name), index=False)

       Residual  Connection  Atom   Shift  Residual Num
0   a-D-GlcpNAc     4,until    H1    4.88             2
1   a-D-GlcpNAc     4,until    H2    3.64             2
2   a-D-GlcpNAc     4,until    H3    3.82             2
3   a-D-GlcpNAc     4,until    H4    4.03             2
4   a-D-GlcpNAc     4,until    H5    4.67             2
5   a-D-GlcpNAc     4,until    NH    8.01             2
6   a-D-GlcpNAc     4,until   NAC    1.99             2
7     a-L-IdopA   4,4,until    H1    5.13             1
8     a-L-IdopA   4,4,until    H2    3.92             1
9     a-L-IdopA   4,4,until    H3    3.72             1
10    a-L-IdopA   4,4,until    H4    3.68             1
11    a-L-IdopA   4,4,until    H5    3.84             1
12    a-L-IdopA   4,4,until  H6PS    3.84             1
13    a-L-IdopA   4,4,until  H6PR    3.78             1
14  a-D-GlcpNAc     4,until    C1  102.50             2
15  a-D-GlcpNAc     4,until    C2   70.90             2
16  a-D-GlcpNAc     4,until    C3   70.60       

In [19]:
# temp_df = pd.read_csv(os.path.join(data_dir, 
#                                    'a-D-Galp-(1-6)-a-D-Galp-(1-3)-ononitol.csv'))
# correct_csv_attached_none_glycan_case1(temp_df)