In [1]:
import numpy as np
import pandas as pd
import os
import re
from tqdm import tqdm

In [2]:
pd.set_option('display.max_rows', 1000)

In [3]:
data_dir = 'experimental_data_nonlinear/FullyAnnotatedPDB_V2_csv/'
csv_list = os.listdir(data_dir)

In [4]:
# normal preprocessing, all C appears before all H
# Contain C1 or H1, H11, H12 as initial
def assign_label_normal_glycan(temp_df, H_initial = ['H11', 'H12', 'H1'], C_initial = ['C1']):
    
    residual_list = temp_df['Residual'].values
    connection_list = temp_df['Connection'].values
    atom_list = temp_df['Atom'].values

    H_res = 0
    C_res = 0

    previous_residual = 'some residual'
    previous_connection = 'some connection'
    previous_atom = 'some atom'

    residual_list = []

    for i in range(len(temp_df)):
        current_residual = temp_df.loc[i, :].Residual
        current_connection = temp_df.loc[i, :].Connection
        current_atom = temp_df.loc[i, :].Atom

        if (current_atom in H_initial) and (previous_atom not in H_initial):
            H_res += 1

        if (C_res == 0) and (current_atom not in C_initial):
            residual_list.append(H_res)
        else:
            if current_atom in C_initial:
                C_res += 1
            residual_list.append(C_res)

        previous_residual = current_residual
        previous_connection = current_connection
        previous_atom = current_atom


    temp_df_out = temp_df.copy()
    temp_df_out['Residual Num'] = residual_list

    return temp_df_out

In [5]:
def check_carbon_hydrogen_same(df_out, print_anyway=False):
    df_atom_list = df_out['Atom'].values
    df_res_list = df_out['Residual Num'].values
    max_h = 1
    max_c = 1
    for i in range(len(df_atom_list)):
        if 'H' in df_atom_list[i]:
            if df_res_list[i] > max_h:
                max_h = df_res_list[i]
        if 'C' in df_atom_list[i]:
            if df_res_list[i] > max_c:
                max_c = df_res_list[i]
    if max_c != max_h:
        print(csv_f)
        print('carbon:', max_c, 'hydrogen:', max_h)
        print('')
    if print_anyway:
        print(csv_f)
        print('carbon:', max_c, 'hydrogen:', max_h)
        print('')
    
    return max_c, max_h

In [6]:
def assign_label_glycan_case1_1(temp_df, H_initial = ['H11', 'H12', 'H1'], C_initial = ['C1']):
    
    residual_list = temp_df['Residual'].values
    connection_list = temp_df['Connection'].values
    atom_list = temp_df['Atom'].values

    H_res = 0
    C_res = 0

    previous_residual = 'some residual'
    previous_connection = 'some connection'
    previous_atom = 'some atom'

    residual_list = []

    for i in range(len(temp_df)):
        current_residual = temp_df.loc[i, :].Residual
        current_connection = temp_df.loc[i, :].Connection
        current_atom = temp_df.loc[i, :].Atom

        if (current_atom in H_initial):
            H_res += 1

        if (C_res == 0) and (current_atom not in C_initial):
            residual_list.append(H_res)
        else:
            if current_atom in C_initial:
                C_res += 1
            residual_list.append(C_res)

        previous_residual = current_residual
        previous_connection = current_connection
        previous_atom = current_atom


    temp_df_out = temp_df.copy()
    temp_df_out['Residual Num'] = residual_list

    return temp_df_out

In [7]:
def assign_label_glycan_case1_3(temp_df, H_initial = ['H11', 'H12', 'H1'], C_initial = ['C1']):
    
    temp_df = temp_df.loc[temp_df['Residual'] != 'ME']
    temp_df.index = range(len(temp_df))
    
    residual_list = temp_df['Residual'].values
    connection_list = temp_df['Connection'].values
    atom_list = temp_df['Atom'].values

    H_res = 0
    C_res = 0

    previous_residual = 'some residual'
    previous_connection = 'some connection'
    previous_atom = 'some atom'

    residual_list = []

    for i in range(len(temp_df)):
        current_residual = temp_df.loc[i, :].Residual
        current_connection = temp_df.loc[i, :].Connection
        current_atom = temp_df.loc[i, :].Atom
        

        if (current_atom in H_initial):
            H_res += 1

        if (C_res == 0) and (current_atom not in C_initial):
            residual_list.append(H_res)
        else:
            if current_atom in C_initial:
                C_res += 1
            residual_list.append(C_res)

        previous_residual = current_residual
        previous_connection = current_connection
        previous_atom = current_atom


    temp_df_out = temp_df.copy()
    temp_df_out['Residual Num'] = residual_list

    return temp_df_out

In [8]:
# DO NOT CHANGE THIS FUNCTION
def assign_label_wierd_glycan_by_connection(temp_df):
    H_initial = ['H11', 'H12', 'H1', 'H31', 'H32', 'H3 a', 'H1 a']
    C_initial = ['C1']

    residual_list = temp_df['Residual'].values
    connection_list = temp_df['Connection'].values
    atom_list = temp_df['Atom'].values

    H_res = 0
    C_res = 0

    previous_residual = 'some residual'
    previous_connection = 'some connection'
    previous_atom = 'some atom'

    residual_list = []

    for i in range(len(temp_df)):
        current_residual = temp_df.loc[i, :].Residual
        current_connection = temp_df.loc[i, :].Connection
        current_atom = temp_df.loc[i, :].Atom

        if (current_connection != previous_connection) and (C_res == 0):
            H_res += 1
            

        if (C_res == 0) and (current_atom not in C_initial):
            residual_list.append(H_res)
        else:
            if (current_atom in C_initial) and (previous_atom not in C_initial):
                C_res += 1
            elif (C_res != 0) and (current_connection != previous_connection):
                C_res += 1
            residual_list.append(C_res)

        previous_residual = current_residual
        previous_connection = current_connection
        previous_atom = current_atom


    temp_df_out = temp_df.copy()
    temp_df_out['Residual Num'] = residual_list
    return temp_df_out

In [9]:
def assign_label_wierd_glycan_by_connection_1_10(temp_df):
    H_initial = ['H11', 'H12', 'H1', 'H31', 'H32', 'H3 a', 'H1 a']
    C_initial = ['C1']

    residual_list = temp_df['Residual'].values
    connection_list = temp_df['Connection'].values
    atom_list = temp_df['Atom'].values

    H_res = 0
    C_res = 0

    previous_residual = 'some residual'
    previous_connection = 'some connection'
    previous_atom = 'some atom'

    residual_list = []
    
    dummy = 1
    
    for i in range(len(temp_df)):
        current_residual = temp_df.loc[i, :].Residual
        current_connection = temp_df.loc[i, :].Connection
        current_atom = temp_df.loc[i, :].Atom
        
        if current_atom == 'C2':
            dummy = 0

        if (current_connection != previous_connection) and (C_res == 0):
            H_res += 1
            

        if (C_res == 0) and (current_atom not in C_initial) and dummy:
            residual_list.append(H_res)
        else:
            
            if (current_atom in C_initial) and (previous_atom not in C_initial):
                C_res += 1
            elif (C_res != 0) and (current_connection != previous_connection):
                C_res += 1
            residual_list.append(C_res+1)

        previous_residual = current_residual
        previous_connection = current_connection
        previous_atom = current_atom


    temp_df_out = temp_df.copy()
    temp_df_out['Residual Num'] = residual_list
    return temp_df_out

In [10]:
def print_atom_list(csv_df, csv_name):
    csv_atom_list = csv_df['Atom'].values
    print(csv_name)
    print(csv_atom_list)
    print('')

In [11]:
def count_residual_from_swecon(csv_name, orig_dir = 'experimental_data_nonlinear/FullyAnnotatedPDB_V2/'):
    pdb_name = csv_name.replace('csv', 'pdb')
    max_val = 0
    l_f = []
    l_t = []
    with open(os.path.join(orig_dir, pdb_name)) as f:
        f = f.readlines()
        for l in f:
            if 'SWECON' in l:
                l1 = l.split(' ')
                l1 = [i for i in l1 if i != '']
#                 print(l1, csv_name, len(l1))
                if len(l1) == 5:
                    l_f.append(int(l1[2]))
                    l_t.append(int(l1[3]))
                elif len(l1) == 4:
                    l_f.append(int(l1[1]))
                    l_t.append(int(l1[2]))
    max_val = max(max(l_f), max(l_t))
    return max_val

In [12]:
incorrect_list = [
'DB26476.csv',
'DB26370.csv',
'DB9023.csv',
'DB26479.csv',
'DB26475.csv',
'DB26431.csv',
'DB26378.csv',
'DB22507.csv',
'DB26467.csv',
'DB26928.csv',
'DB26959.csv',
'NeuAca2-3Galb1-3( NeuAca2-3Galb1-4GlcNAc b1-6)GalNAc.csv',
'DB12870.csv',
'DB26879.csv',
'DB26379.csv',
'DB26380.csv',
'DB22506.csv',
'DB22505.csv',
'DB26910.csv',
]

In [13]:
not_equal_list = []
for csv_f in csv_list:
    csv_df = pd.read_csv(os.path.join(data_dir, csv_f))
#     print_atom_list(csv_df, csv_f)
#     df_out = assign_label_normal_glycan(csv_df)
#     check_carbon_hydrogen_same(df_out)
    if csv_f not in incorrect_list:
    
        df_out = assign_label_normal_glycan(csv_df)
    else:
        if csv_f == 'DB26476.csv':
            df_out = assign_label_glycan_case1_1(csv_df)
        if csv_f == 'DB26370.csv':
            # case 1_2
            df_out = assign_label_normal_glycan(csv_df, H_initial = ['H11', 'H12', 'H1', 'H31'])
        if csv_f == 'DB9023.csv':
            df_out = assign_label_normal_glycan(csv_df, H_initial = ['H11', 'H12', 'H1', 'H31'])
        if csv_f == 'DB26479.csv':
            df_out = assign_label_glycan_case1_3(csv_df)
        if csv_f == 'DB26475.csv':
            # case 1_4
            df_out = assign_label_normal_glycan(csv_df, H_initial = ['H11', 'H12', 'H1', 'H3 ax'])
        if csv_f == 'DB26431.csv':
            df_out = assign_label_normal_glycan(csv_df, H_initial = ['H11', 'H12', 'H1', 'H31'])
        if csv_f == 'DB26378.csv':
            df_out = assign_label_normal_glycan(csv_df, H_initial = ['H11', 'H12', 'H1', 'H3 ax'])
        if csv_f == 'DB22507.csv':
            df_out = assign_label_normal_glycan(csv_df, H_initial = ['H11', 'H12', 'H1', 'H31'])
        if csv_f == 'DB26467.csv':
            df_out = assign_label_wierd_glycan_by_connection(csv_df)
        if csv_f == 'DB26928.csv':
            df_out = assign_label_normal_glycan(csv_df, H_initial = ['H11', 'H12', 'H1', 'H3ax'])
        if csv_f == 'DB26959.csv':
            df_out = assign_label_normal_glycan(csv_df, H_initial = ['H11', 'H12', 'H1', 'H31'])
        if csv_f == 'NeuAca2-3Galb1-3( NeuAca2-3Galb1-4GlcNAc b1-6)GalNAc.csv':
            df_out = assign_label_wierd_glycan_by_connection(csv_df)
        if csv_f == 'DB12870.csv':
            df_out = assign_label_normal_glycan(csv_df, H_initial = ['H11', 'H12', 'H1', 'H31'])
        if csv_f == 'DB26879.csv':
            df_out = assign_label_wierd_glycan_by_connection(csv_df)
        if csv_f == 'DB26379.csv':
            df_out = assign_label_wierd_glycan_by_connection(csv_df)
        if csv_f == 'DB26380.csv':
            df_out = assign_label_wierd_glycan_by_connection(csv_df)
        if csv_f == 'DB22506.csv':
            df_out = assign_label_wierd_glycan_by_connection(csv_df)
        if csv_f == 'DB22505.csv':
            df_out = assign_label_wierd_glycan_by_connection(csv_df)
        if csv_f == 'DB26910.csv':
            df_out = assign_label_wierd_glycan_by_connection_1_10(csv_df)
            print(df_out)
    max_c, max_h = check_carbon_hydrogen_same(df_out, print_anyway=False)
    total_res = count_residual_from_swecon(csv_f)
    
    if (total_res != max_c) and (total_res != max_h):
        print(csv_f)
        not_equal_list.append(csv_f)
    df_out.to_csv(os.path.join('experimental_data_nonlinear/FullyAnnotatedPDB_V2_csv_assigned_labeled_first_correction/', 
                               csv_f), index = False)

DB26370.csv
DB26379.csv
DB26510.csv
DB26431.csv
DB26378.csv
DB22507.csv
DB26682.csv
DB17331.csv
DB26380.csv
DB26467.csv
DB26502.csv
DB26908.csv
NeuAca2-3Galb1-3( NeuAca2-3Galb1-4GlcNAc b1-6)GalNAc.csv
DB12870.csv
DB22506.csv
LFucpa1-3[DGalpb1-4]DGlcpNAcb1-3DGalpb1-4DGlc.csv
DB22505.csv
DB26879.csv
       Residual Connection    Atom   Shift  Residual Num
0          KDOP                 H2    4.17             1
1          KDOP               H3ax    1.68             1
2          KDOP               H3eq    1.90             1
3          KDOP                 H4    4.18             1
4          KDOP                 H5    3.82             1
5          KDOP                 H6    3.79             1
6          KDOP                 H7    3.72             1
7          KDOP                H8a    3.60             1
8          KDOP                H8b    3.76             1
9    L-A-D-HEPP          5      H1    5.36             2
10   L-A-D-HEPP          5      H2    4.56             2
11   L-A-D-HEPP  

In [14]:
not_equal_list

['DB26370.csv',
 'DB26379.csv',
 'DB26510.csv',
 'DB26431.csv',
 'DB26378.csv',
 'DB22507.csv',
 'DB26682.csv',
 'DB17331.csv',
 'DB26380.csv',
 'DB26467.csv',
 'DB26502.csv',
 'DB26908.csv',
 'NeuAca2-3Galb1-3( NeuAca2-3Galb1-4GlcNAc b1-6)GalNAc.csv',
 'DB12870.csv',
 'DB22506.csv',
 'LFucpa1-3[DGalpb1-4]DGlcpNAcb1-3DGalpb1-4DGlc.csv',
 'DB22505.csv',
 'DB26879.csv',
 'DB26910.csv']

In [15]:
csv_list

['DB22549.csv',
 'DB9948.csv',
 'DB26476.csv',
 'DB26813.csv',
 'DB26712.csv',
 'DB22557.csv',
 'DB26709.csv',
 'DB26405.csv',
 'DB26521.csv',
 'DB4858.csv',
 'DB27271.csv',
 'DB26719.csv',
 'DB26306.csv',
 'DB26370.csv',
 'DB7424.csv',
 'DB26404.csv',
 'DB22550.csv',
 'DB26379.csv',
 'DB9023.csv',
 'DB26919.csv',
 'Repeat-4)-a-D-Manp-(1-4)-a-D-GalpA-(1-3)-b-D-GlcpNAc-(1-2)-a-D-Galp-(1-3)-a-L-Rhap2Ac-(1-.csv',
 'DB9490.csv',
 'DB26907.csv',
 'DB9949.csv',
 'DB9539.csv',
 'DB26938.csv',
 'DB9213.csv',
 'DB26479.csv',
 'DB22551.csv',
 'DB26510.csv',
 'DB26475.csv',
 'DB26723.csv',
 'DB8939.csv',
 'DB27272.csv',
 'DB26722.csv',
 'DB22553.csv',
 'DB26431.csv',
 'DB26716.csv',
 'DB26715.csv',
 'DB12744.csv',
 'DB26378.csv',
 'DB26517.csv',
 'DB26553.csv',
 'Galb3(Fuca4)GlcNAcb.csv',
 'DB12745.csv',
 'DB26428.csv',
 'DB22556.csv',
 'DB3501.csv',
 'DB26918.csv',
 'DB22507.csv',
 'DB26682.csv',
 'DB17331.csv',
 'DB26810.csv',
 'DB26922.csv',
 'DB26380.csv',
 'Gala1-3(Fuca1-2)Galb.csv',
 'DB267