In [1]:
import pandas as pd
import numpy as np
import re
import os
import json
import math
from tqdm import tqdm
pd.set_option('display.max_rows', 1000+1)

In [2]:
pdb_out_dir = 'dataset/Godess_gnn/preprocess_data/Godess_carbon_unmatched_pdb/'

csv_out_dir = 'dataset/Godess_gnn/preprocess_data/Godess_carbon_unmatched_carbon_csv/'
csv_out_dir_hydrogen = 'dataset/Godess_gnn/preprocess_data/Godess_carbon_unmatched_hydrogen_csv/'


matching_out_dir = 'dataset/Godess_gnn/preprocess_data/Godess_carbon_match_dict/'
connection_out_dir = 'dataset/Godess_gnn/preprocess_data/Godess_carbon_connection_dict/'

pdb_file_list = os.listdir(pdb_out_dir)

partial_labeled_dir = 'dataset/Godess_gnn/preprocess_data/Godess_carbon_labeled_pdb_monosaccharide/'
all_labeled_dir = 'dataset/Godess_gnn/preprocess_data/Godess_carbon_labeled_pdb_all/'

In [3]:
def match_residual_accurate_name(pdb_f, csv_f):
    for i in range(len(pdb_f)):
        current_pdb_lineage = pdb_f.loc[i, :]['Lineage']
        if current_pdb_lineage in csv_f['Linkage'].values:

            current_pdb_residue_list = csv_f.loc[csv_f['Linkage'] == current_pdb_lineage]['Residue'].values
            
            if ('Ac' in current_pdb_residue_list) and (len(current_pdb_residue_list) >= 2):
                current_ac_index = np.where(current_pdb_residue_list == 'Ac')[0]
                current_pdb_residue_list = np.delete(current_pdb_residue_list, current_ac_index)
            
            assert (len(current_pdb_residue_list) == 1)
            
            current_corresponding_residue = current_pdb_residue_list[0]
        #         print(current_corresponding_residue)
            pdb_f.loc[i, ['Residual_accurate_name']] = current_corresponding_residue
    return pdb_f

##### Additional parts

In [4]:
# The connection information from the pdb file is cannot be fully trusted, some file have the patterns that A -> B, but B is not connected to A
def check_and_modify_graph(graph):
    """
    Checks if a graph satisfies the property that if node A is connected to node B, then node B is connected to node A.
    If the property is not satisfied, modifies the graph by adding the missing connections.
    
    Arguments:
    graph -- a dictionary representing a graph where the keys are the nodes and the values are lists of connected nodes
    
    Returns:
    The modified graph.
    """
    for node in graph:
        for neighbor in graph[node]:
            if node not in graph[neighbor]:
                graph[neighbor].append(node)
    return graph

In [5]:
# read in connection from step 2 (create linkage for pdb file)
def read_in_connection(connection_dir, pdb_name):
    
#     name = pdb_name.split('labeled_all_')[1].split('.pdb')[0]
    name = pdb_name.split('.pdb.csv')[0]
    connect_name = str(name) + '_connection.json'
    
    dict_f_path = os.path.join(connection_dir, connect_name)
    
    with open(dict_f_path) as dict_f_json:
        dict_f = json.load(dict_f_json)
    
    dict_f = {int(k):v for k,v in dict_f.items()}
    
    dict_f = check_and_modify_graph(dict_f)
    
    return dict_f

In [6]:
# given target root, for each atom, find the shortest distance to the root atom (Carbon) using BFS
def bfs_shortest_path(graph, start_node, end_node):
    """
    Compute the shortest path from start_node to end_node in graph using Breadth-First Search.
    
    Arguments:
    graph -- a dictionary representing the graph, where keys are nodes and values are lists of neighbors
    start_node -- the node to start the search from
    end_node -- the node to search for
    
    Returns:
    A list containing the shortest path as a list of nodes.
    """
    # Initialize the visited set, the queue, and the parent dictionary with start_node as the only node visited.
    visited = set([start_node])
    queue = deque([(start_node, [])])
    parent = {start_node: None}
    
    while queue:
        # Dequeue the next node and its path.
        current_node, path = queue.popleft()
        
        # If we've found the end_node, return its path.
        if current_node == end_node:
            return path + [end_node]
        
        # Loop through the neighbors of the current_node and enqueue them if they haven't been visited.
        for neighbor in graph[current_node]:
            if neighbor not in visited:
                visited.add(neighbor)
                queue.append((neighbor, path + [current_node]))
                parent[neighbor] = current_node
    
    # If we've gone through all the nodes and haven't found the end_node, return an empty path.
    return []

In [7]:
# identify Ac components with the help of connection information, branched info from labedled csv and pdb residue names
def identify_AC_components(df_pdb, df_pdb_name, df_label, connection_dir, j):

    dict_f = read_in_connection(connection_dir, df_pdb_name)

    df_pdb_copy = df_pdb.copy()

    df_label = df_label

    Ac_components_list = np.repeat(0, len(df_pdb_copy))

    for i in range(len(df_label)):
        current_linkage = df_label.loc[i, :]['Linkage']

        if isinstance(current_linkage, float):
            current_linkage = int(current_linkage)
            current_linkage = str(current_linkage)

        current_residue = df_label.loc[i, :]['Residue']

        if current_residue == 'Ac':

            attached_linkage = current_linkage[0:-2]

            attached_index = df_pdb_copy.loc[df_pdb_copy['Lineage'] == attached_linkage].index

            attached_atom_name = df_pdb_copy.loc[df_pdb_copy['Lineage'] == attached_linkage, :]['Atom_name'].values

            if 'C11' in attached_atom_name:
                c11_index_list = np.where(attached_atom_name == 'C11')[0]

                # Attached only one monosaccharide
                if len(c11_index_list) == 1:

                    # c11 index in csv
                    c11_index = attached_index[c11_index_list[0]]

                    # c11 index in atom names
                    c11_atom_num = df_pdb_copy.loc[c11_index, :]['Atom_num']

                    c11_connection_list = dict_f[c11_atom_num]

                    Ac_residue_index_list = [c11_atom_num]

                    for atom_num in c11_connection_list:

                        current_atom_connect = dict_f[atom_num]
                        
                        # in case of the connection file failed
                        if c11_atom_num in current_atom_connect:
                            current_atom_connect.remove(c11_atom_num)

                        # identify the Oxx
                        if len(current_atom_connect) == 0:

                            Ac_residue_index_list.append(atom_num)

                        else:
                            len_atom_connect = 0
                            for atom_num_connect in current_atom_connect:


                                atom_num_connect_atom_list = dict_f[atom_num_connect]
                                
                                if atom_num in atom_num_connect_atom_list:

                                    atom_num_connect_atom_list.remove(atom_num)
    #                             print(atom_num_connect, atom_num_connect_atom_list, atom_num)

                                len_atom_connect += len(atom_num_connect_atom_list)

                            # identify the C21, and three HXXs 
                            if len_atom_connect == 0:

                                Ac_residue_index_list.append(atom_num)
                                Ac_residue_index_list.extend(dict_f[atom_num])

                    Ac_residue_index_list = df_pdb_copy.loc[df_pdb_copy['Atom_num'].isin(Ac_residue_index_list)].index
                    Ac_components_list[Ac_residue_index_list] = 1
    #                     print(atom_num, current_atom_connect)
                else:
                    print(df_pdb_name, j, 'contain more than one C11')


            else:

                print(df_pdb_name, j, current_residue, 'the attached monosaccharides does not contain a C11')

                current_index = df_pdb_copy.loc[df_pdb_copy['Lineage'] == current_linkage].index

                current_three_letter_residual = df_pdb_copy.loc[current_index, :]['Residual_name'].values

                # the attached monosaccharide 
                assert(np.all(current_three_letter_residual == 'ACY'))

    # some ACs are behave like are separate components
    if not df_pdb_copy.loc[df_pdb_copy['Residual_name'] == 'ACY'].empty:
        ACY_index = df_pdb_copy.loc[df_pdb_copy['Residual_name'] == 'ACY'].index
        Ac_components_list[ACY_index] = 1

    df_pdb_copy['Ac_component'] = Ac_components_list
    
    return df_pdb_copy

##### label the shift for monosaccharides

In [8]:
def match_pdb_csv_lineage(pdb_f, csv_f, carbon_list, ac_carbon_list = ['C11', 'C21'], trust_threshold = 50):
    
    csv_f['Linkage'] = csv_f[['Linkage']].fillna('')
    csv_f = csv_f.astype({'Linkage':str})
    if ('1.0' in csv_f['Linkage'].values) or ('2.0' in csv_f['Linkage'].values) or ('3.0' in csv_f['Linkage'].values) or \
        ('4.0' in csv_f['Linkage'].values) or ('5.0' in csv_f['Linkage'].values) or ('6.0' in csv_f['Linkage'].values):
        print('error in pdb lineage')
    
    for i in range(len(pdb_f)):
        current_pdb_lineage = pdb_f.loc[i, :]['Lineage']
        current_pdb_atom = pdb_f.loc[i, :]['Atom_name']
        current_pdb_residual_name = pdb_f.loc[i, :]['Residual_name'] 

        # stores the monosaccharide labels 
        current_corresponding_csv_f = csv_f.loc[csv_f['Linkage'] == current_pdb_lineage]

        # stores the Ac labels
        current_corresponding_csv_f_ac = pd.DataFrame(columns=csv_f.columns)

        if 'Ac' in current_corresponding_csv_f['Residue'].values:

            current_corresponding_csv_f_ac = current_corresponding_csv_f.\
                loc[current_corresponding_csv_f['Residue'] == 'Ac']

            current_corresponding_csv_f = current_corresponding_csv_f.\
                loc[current_corresponding_csv_f['Residue'] != 'Ac']

        # current monosaccharide, assign shift
        if (current_pdb_atom in carbon_list) and (not current_corresponding_csv_f.empty):

            current_shift = current_corresponding_csv_f[current_pdb_atom].values[0]
            
            if (len(current_corresponding_csv_f['Trust'].values) == 0) or (current_corresponding_csv_f['Trust'].values ==''):
                print(current_pdb_residual_name)
                print('Missing trust confidence interval: ', current_corresponding_csv_f['Residue'])
                current_trust = -1

            elif isinstance(current_corresponding_csv_f['Trust'].values[0], float) and \
                np.isnan(current_corresponding_csv_f['Trust'].values[0]):

                print('Missing trust confidence interval: ', current_corresponding_csv_f['Residue'])
                current_trust = -1
            else:

                current_trust = current_corresponding_csv_f['Trust'].values[0]
                current_trust = int(current_trust.split('%')[0])

            pdb_f.loc[i, ['shift']] = current_shift
            pdb_f.loc[i, ['trust']] = current_trust

        # current Ac and the pdb file has a separate three letter code
        elif (current_pdb_atom in carbon_list) and (not current_corresponding_csv_f_ac.empty) \
            and (current_pdb_residual_name == 'ACY'):

            current_shift = current_corresponding_csv_f_ac[current_pdb_atom].values[0]

            if (len(current_corresponding_csv_f['Trust'].values) == 0) or (current_corresponding_csv_f['Trust'].values ==''):
                print(current_pdb_residual_name)
                print('Missing trust confidence interval: ', current_corresponding_csv_f['Residue'])
                current_trust = -1

            elif isinstance(current_corresponding_csv_f_ac['Trust'].values[0], float) and \
                np.isnan(current_corresponding_csv_f_ac['Trust'].values[0]):

                print('Missing trust confidence interval: ', current_corresponding_csv_f_ac['Residue'])
                current_trust = -1
            else:
                current_trust = current_corresponding_csv_f_ac['Trust'].values[0]
                current_trust = int(current_trust.split('%')[0])

            pdb_f.loc[i, ['shift']] = current_shift
            pdb_f.loc[i, ['trust']] = current_trust

        elif (current_pdb_atom in ac_carbon_list):
#             print(i)
            pass
        
    return pdb_f

In [9]:
def match_pdb_csv_lineage_hydrogen(pdb_f, csv_hydrogen_f, hydrogen_list, ac_hydrogen_list = ['H2'], 
                                   trust_threshold = 50):
    
    csv_hydrogen_f['Linkage'] = csv_hydrogen_f[['Linkage']].fillna('')
    csv_hydrogen_f = csv_hydrogen_f.astype({'Linkage':str})
    if ('1.0' in csv_hydrogen_f['Linkage'].values) or ('2.0' in csv_hydrogen_f['Linkage'].values) or ('3.0' in csv_hydrogen_f['Linkage'].values) or \
        ('4.0' in csv_hydrogen_f['Linkage'].values) or ('5.0' in csv_hydrogen_f['Linkage'].values) or ('6.0' in csv_hydrogen_f['Linkage'].values):
        print('error in pdb lineage')
    
    for i in range(len(pdb_f)):
        current_pdb_lineage = pdb_f.loc[i, :]['Lineage']
        current_pdb_atom = pdb_f.loc[i, :]['Atom_name']
        current_pdb_residual_name = pdb_f.loc[i, :]['Residual_name'] 

        # stores the monosaccharide labels 
        current_corresponding_csv_hydrogen_f = csv_hydrogen_f.loc[csv_hydrogen_f['Linkage'] == current_pdb_lineage]

        # stores the Ac labels
        current_corresponding_csv_hydrogen_f_ac = pd.DataFrame(columns=csv_hydrogen_f.columns)

        if 'Ac' in current_corresponding_csv_hydrogen_f['Residue'].values:

            current_corresponding_csv_hydrogen_f_ac = current_corresponding_csv_hydrogen_f.\
                loc[current_corresponding_csv_hydrogen_f['Residue'] == 'Ac']

            current_corresponding_csv_hydrogen_f = current_corresponding_csv_hydrogen_f.\
                loc[current_corresponding_csv_hydrogen_f['Residue'] != 'Ac']

        # current monosaccharide, assign shift,
        if (current_pdb_atom in hydrogen_list) and (not current_corresponding_csv_hydrogen_f.empty):

            current_shift = current_corresponding_csv_hydrogen_f[current_pdb_atom].values[0]
            
            if (len(current_corresponding_csv_hydrogen_f['Trust'].values) == 0) or (current_corresponding_csv_hydrogen_f['Trust'].values ==''):
                print(current_pdb_residual_name)
                print('Missing trust confidence interval: ', current_corresponding_csv_hydrogen_f['Residue'])
                current_trust = -1

            elif isinstance(current_corresponding_csv_hydrogen_f['Trust'].values[0], float) and \
                np.isnan(current_corresponding_csv_hydrogen_f['Trust'].values[0]):

                print('Missing trust confidence interval: ', current_corresponding_csv_hydrogen_f['Residue'])
                current_trust = -1
            else:

                current_trust = current_corresponding_csv_hydrogen_f['Trust'].values[0]
                current_trust = int(current_trust.split('%')[0])

            pdb_f.loc[i, ['shift']] = current_shift
            pdb_f.loc[i, ['trust']] = current_trust

        # current Ac and the pdb file has a separate three letter code
        elif (current_pdb_atom in hydrogen_list) and (not current_corresponding_csv_hydrogen_f_ac.empty) \
            and (current_pdb_residual_name == 'ACY'):

            current_shift = current_corresponding_csv_hydrogen_f_ac[current_pdb_atom].values[0]
            
            if (len(current_corresponding_csv_hydrogen_f['Trust'].values) == 0) or (current_corresponding_csv_hydrogen_f['Trust'].values ==''):
                print(current_pdb_residual_name)
                print('Missing trust confidence interval: ', current_corresponding_csv_hydrogen_f['Residue'])
                current_trust = -1

            elif isinstance(current_corresponding_csv_hydrogen_f_ac['Trust'].values[0], float) and \
                np.isnan(current_corresponding_csv_hydrogen_f_ac['Trust'].values[0]):

                print('Missing trust confidence interval: ', current_corresponding_csv_hydrogen_f_ac['Residue'])
                current_trust = -1
            else:
                current_trust = current_corresponding_csv_hydrogen_f_ac['Trust'].values[0]
                current_trust = int(current_trust.split('%')[0])

            pdb_f.loc[i, ['shift']] = current_shift
            pdb_f.loc[i, ['trust']] = current_trust

        elif (current_pdb_atom in ac_hydrogen_list):
#             print(i)
            pass
        
    return pdb_f

In [10]:
def match_Ac_carbon_hydrogen_in_monosaccharide(new_pdb_f, csv_f, csv_hydrogen_f, j):
    pdb_f = new_pdb_f.copy()
    pdb_f_lineage_list = pdb_f['Lineage'].values
    for i in range(len(csv_f)):
        current_csv_lineage = csv_f.loc[i, :]['Linkage']
        current_csv_residue = csv_f.loc[i, :]['Residue']

        if (current_csv_lineage not in pdb_f_lineage_list) and \
            (current_csv_lineage[:-2] in pdb_f_lineage_list):
                current_csv_attached_lineage = current_csv_lineage[:-2]

                temp_ac_pdb_f = pdb_f.loc[(pdb_f['Ac_component'] == 1) & \
                                            (pdb_f['Lineage'] == current_csv_attached_lineage)]
                
                if not temp_ac_pdb_f.empty:
                
                    temp_ac_pdb_f_atoms = temp_ac_pdb_f['Atom_name'].values

                    # assign carbon shift
                    # ['C1', 'C11', 'C2', 'C21', 'H2', 'H21', 'H22', 'H23', 'O1', 'O11','O12']

                    ac_c1_shift_gt = csv_f.loc[i, :]['C1']
                    ac_c2_shift_gt = csv_f.loc[i, :]['C2']
                    ac_c_trust_gt = csv_f.loc[i, :]['Trust']

                    if '%' in ac_c_trust_gt:
                        ac_c_trust_gt = int(ac_c_trust_gt.split('%')[0])
                    else:
                        ac_c_trust_gt = -1


                    if 'C11' in temp_ac_pdb_f_atoms:
                        c11_idx = temp_ac_pdb_f.loc[temp_ac_pdb_f['Atom_name'] == 'C11'].index
                        pdb_f.loc[c11_idx, ['shift']] = ac_c1_shift_gt
                        pdb_f.loc[c11_idx, ['Trust']] = ac_c_trust_gt

                    if 'C1' in temp_ac_pdb_f_atoms:
                        c11_idx = temp_ac_pdb_f.loc[temp_ac_pdb_f['Atom_name'] == 'C1'].index
                        pdb_f.loc[c11_idx, ['shift']] = ac_c1_shift_gt
                        pdb_f.loc[c11_idx, ['Trust']] = ac_c_trust_gt

                    assert( not (('C11' in temp_ac_pdb_f_atoms) and ('C1' in temp_ac_pdb_f_atoms)) )

                    if 'C21' in temp_ac_pdb_f_atoms:
                        c21_idx = temp_ac_pdb_f.loc[temp_ac_pdb_f['Atom_name'] == 'C21'].index
                        pdb_f.loc[c21_idx, ['shift']] = ac_c2_shift_gt

                    if 'C2' in temp_ac_pdb_f_atoms:
                        c21_idx = temp_ac_pdb_f.loc[temp_ac_pdb_f['Atom_name'] == 'C21'].index
                        pdb_f.loc[c21_idx, ['shift']] = ac_c2_shift_gt

                    assert( not (('C21' in temp_ac_pdb_f_atoms) and ('C2' in temp_ac_pdb_f_atoms)) )

                    # assign carbon shift
                    # # ['C1', 'C11', 'C2', 'C21', 'H2', 'H21', 'H22', 'H23', 'O1', 'O11','O12']
                    ac_h1_shift_gt = csv_hydrogen_f.loc[i, :]['H1']
                    ac_h_trust_gt = csv_hydrogen_f.loc[i, :]['Trust']

                    if '%' in ac_h_trust_gt:
                        ac_h_trust_gt = int(ac_h_trust_gt.split('%')[0])
                    else:
                        ac_h_trust_gt = -1

                    if (('H2' in temp_ac_pdb_f_atoms) and ('H21' in temp_ac_pdb_f_atoms) and 
                        ('H22' in temp_ac_pdb_f_atoms)):

                        h2_idx = temp_ac_pdb_f.loc[temp_ac_pdb_f['Atom_name'] == 'H2'].index
                        pdb_f.loc[h2_idx, ['shift']] = ac_h1_shift_gt
                        pdb_f.loc[h2_idx, ['Trust']] = ac_h_trust_gt

                    elif (('H21' in temp_ac_pdb_f_atoms) and ('H22' in temp_ac_pdb_f_atoms) and 
                          ('H23' in temp_ac_pdb_f_atoms)):

                        h2_idx = temp_ac_pdb_f.loc[temp_ac_pdb_f['Atom_name'] == 'H21'].index
                        pdb_f.loc[h2_idx, ['shift']] = ac_h1_shift_gt
                        pdb_f.loc[h2_idx, ['Trust']] = ac_h_trust_gt
                    else:
                        print(j, temp_ac_pdb_f_atoms, 'wierd h2')
    return pdb_f

In [11]:
# def match_Ac_in_monosaccharide(csv_f, pdb_f, ac_carbon_list = ['C11', 'C21'], 
#                                corres_ac_carbon_list = ['C1', 'C2'], exclude_mono_list = ['Pyr']):
    
#     pdb_f_lineage_list = pdb_f['Lineage'].values
#     for i in range(len(csv_f)):
#         current_csv_lineage = csv_f.loc[i, :]['Linkage']
#         current_csv_residue = csv_f.loc[i, :]['Residue']
        
#         current_csv_res = csv_f.loc[i, :]['Residue']
        
#         if (current_csv_lineage not in pdb_f_lineage_list) and (current_csv_res not in exclude_mono_list):
#             current_csv_attached_lineage = current_csv_lineage[:-2]
# #             print(current_csv_lineage, current_csv_attached_lineage)

#             for j in range(len(ac_carbon_list)):
#                 ac_c_shift = csv_f.loc[i, :][corres_ac_carbon_list[j]]
#     #             print(ac_c_shift)
    
#                 ac_c_trust = csv_f.loc[i, :]['Trust']
            
#                 if '%' in ac_c_trust:
#                     ac_c_trust = int(ac_c_trust.split('%')[0])
#                 else:
#                     ac_c_trust = -1
    
#                 pdb_f.loc[(pdb_f['Lineage'] == current_csv_attached_lineage) & 
#                           (pdb_f['Atom_name'] == ac_carbon_list[j]), ['shift']] = ac_c_shift
        
#                 pdb_f.loc[(pdb_f['Lineage'] == current_csv_attached_lineage) & 
#                       (pdb_f['Atom_name'] == ac_carbon_list[j]), ['trust']] = ac_c_trust
#     return pdb_f

In [12]:
# def match_Ac_in_monosaccharide_hydrogen(csv_hydrogen_f, pdb_f, ac_hydrogen_list = ['H2'], 
#                                corres_ac_hydrogen_list = ['H2'], exclude_mono_list = ['Pyr']):
    
#     pdb_f_lineage_list = pdb_f['Lineage'].values
#     for i in range(len(csv_hydrogen_f)):
#         current_csv_lineage = csv_hydrogen_f.loc[i, :]['Linkage']
#         current_csv_residue = csv_hydrogen_f.loc[i, :]['Residue']
        
#         current_csv_res = csv_f.loc[i, :]['Residue']
        
#         if (current_csv_lineage not in pdb_f_lineage_list) and (current_csv_res not in exclude_mono_list):
#             current_csv_attached_lineage = current_csv_lineage[:-2]
# #             print(current_csv_lineage, current_csv_attached_lineage)

#             for j in range(len(ac_hydrogen_list)):
#                 ac_c_shift = csv_hydrogen_f.loc[i, :][corres_ac_hydrogen_list[j]]
#     #             print(ac_c_shift)
    
#                 ac_c_trust = csv_hydrogen_f.loc[i, :]['Trust']
            
#                 if '%' in ac_c_trust:
#                     ac_c_trust = int(ac_c_trust.split('%')[0])
#                 else:
#                     ac_c_trust = -1
    
#                 pdb_f.loc[(pdb_f['Lineage'] == current_csv_attached_lineage) & 
#                           (pdb_f['Atom_name'] == ac_hydrogen_list[j]), ['shift']] = ac_c_shift
        
#                 pdb_f.loc[(pdb_f['Lineage'] == current_csv_attached_lineage) & 
#                       (pdb_f['Atom_name'] == ac_hydrogen_list[j]), ['trust']] = ac_c_trust
#     return pdb_f

In [13]:
# for i in tqdm(range(len(pdb_file_list))):
# #     if i in illegal_list:
# #         continue
    
#     pdb_f_name = pdb_file_list[i]
#     if '1494' in pdb_f_name:
#         print(i, pdb_f_name)

In [14]:
for i in tqdm(range(len(pdb_file_list))):


# for i in [0]:
# for i in [14]:
# for i in [59]:
# for i in [488]:
# for i in tqdm(range(1024, len(pdb_file_list))):

# for i in tqdm(range(2350, len(pdb_file_list))):

# for i in [407]:

#     if i in illegal_list:
#         continue
    
    pdb_f_name = pdb_file_list[i]
    
#     print(i, pdb_f_name)
    
    csv_f_name = pdb_f_name.replace('.pdb.csv', '.csv')
    dict_f_name = pdb_f_name.replace('.pdb.csv', '_matching.json')
    connect_f_name = pdb_f_name.replace('.pdb.csv', '_connection.json')
    
    # read in the dataset
    pdb_f_path = os.path.join(pdb_out_dir, pdb_f_name)
    
    csv_f_path = os.path.join(csv_out_dir, csv_f_name)
    csv_hydrogen_f_path = os.path.join(csv_out_dir_hydrogen, csv_f_name)
    dict_f_path = os.path.join(matching_out_dir, dict_f_name)
    connect_f_path = os.path.join(connection_out_dir, connect_f_name)
    
    pdb_f = pd.read_csv(pdb_f_path)
    
    # carbon 
    csv_f = pd.read_csv(csv_f_path, keep_default_na=False)
    csv_f = csv_f.loc[csv_f['Residue'] != 'PDa']
    csv_f.index = range(len(csv_f))
    
    #hydrogen
    csv_hydrogen_f = pd.read_csv(csv_hydrogen_f_path, keep_default_na=False)
    csv_hydrogen_f = csv_hydrogen_f.loc[csv_hydrogen_f['Residue'] != 'PDa']
    csv_hydrogen_f.index = range(len(csv_hydrogen_f))
    
    
#     lineage_length = len(np.unique(csv_f['Lineage'].values.astype(str)))
    
#     csv_length = len(csv_f)
    
#     assert(lineage_length == csv_length)
    
#     if csv_f.loc[0, ['Residue']].values[0] == 'Ac':
#         print(i)
    
    with open(dict_f_path) as dict_f_json:
        dict_f = json.load(dict_f_json)
    
    with open(connect_f_path) as connect_f_json:
        connect_f = json.load(connect_f_json)
    
    # assign pdb files by lineage from connection dict file
    pdb_f['Residual_accurate_name'] = 'Missing Monosaccharide'
    pdb_f['Lineage'] = 'Missing Lineage'
    pdb_f['shift'] = -1
    pdb_f['trust'] = -1
    
    C1_index = np.where(csv_f.columns == 'C1')[0][0]
    carbon_list = csv_f.columns[C1_index:].values
    
    H1_index = np.where(csv_hydrogen_f.columns == 'H1')[0][0]
    hydrogen_list = csv_hydrogen_f.columns[H1_index:].values
    
    
    for current_dict_residual, current_dict_lineage in sorted(dict_f.items()):
        current_dict_residual = int(current_dict_residual)
        pdb_f.loc[pdb_f['Residual_num'] == current_dict_residual, ['Lineage']] = current_dict_lineage
    
    
    # use assigned lineage, assign pdb files by shift and monosaccharides from label file
    
    if 'ACY' in pdb_f['Residual_name'].values:
        print(i, pdb_f_name)
        pass

#     for j in pdb_f['Atom_name'].values:
#         if len(j) >= 4:
#             print(i, j, pdb_f_name)
#     print(pdb_f_name)
    pdb_f = match_residual_accurate_name(pdb_f, csv_f)
    
    
    # match monosaccharide shift for carbon and hydrogen 
    
    new_pdb_f = match_pdb_csv_lineage(pdb_f, csv_f, carbon_list)
    
    
    new_pdb_f = match_pdb_csv_lineage_hydrogen(new_pdb_f, csv_hydrogen_f, hydrogen_list)
    
    
    
    #############################################################
    ####identify the Ac componenets
    #############################################################
    
    
    new_pdb_f = identify_AC_components(new_pdb_f, pdb_f_name, csv_f, connection_out_dir, i)
    
    
    # match monosaccharide shift for carbon and hydrogen
    
    
    new_pdb_f_with_ac = match_Ac_carbon_hydrogen_in_monosaccharide(new_pdb_f, csv_f, csv_hydrogen_f, i)
#     new_pdb_f_with_ac = match_Ac_in_monosaccharide(csv_f, new_pdb_f)
    
    
#     new_pdb_f_with_ac = match_Ac_in_monosaccharide_hydrogen(csv_hydrogen_f, new_pdb_f_with_ac)
    
    
    current_out_pdb_path = os.path.join(partial_labeled_dir, 'labeled_mono_' + pdb_f_name)
    current_out_pdb_path_complete = os.path.join(all_labeled_dir, 'labeled_all_' + pdb_f_name)
    
    
    new_pdb_f.to_csv(current_out_pdb_path, index = False)
    new_pdb_f_with_ac.to_csv(current_out_pdb_path_complete, index = False) 

  1%|▎                                        | 17/2402 [00:10<18:01,  2.20it/s]

17 209.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


  1%|▎                                        | 19/2402 [00:11<14:59,  2.65it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


  3%|█▏                                       | 71/2402 [00:40<13:42,  2.84it/s]

71 2360.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


  3%|█▏                                       | 72/2402 [00:40<18:43,  2.07it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2360.pdb.csv 71 Ac the attached monosaccharides does not contain a C11
2360.pdb.csv 71 Ac the attached monosaccharides does not contain a C11


  5%|██                                      | 127/2402 [01:06<19:38,  1.93it/s]

127 2361.pdb.csv


  5%|██▏                                     | 128/2402 [01:07<20:13,  1.87it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2361.pdb.csv 127 Ac the attached monosaccharides does not contain a C11


  7%|██▌                                     | 157/2402 [01:21<19:06,  1.96it/s]

157 251.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


  7%|██▋                                     | 158/2402 [01:21<19:39,  1.90it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 10%|████▏                                   | 252/2402 [02:01<12:36,  2.84it/s]

252 2242.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 11%|████▏                                   | 253/2402 [02:02<12:35,  2.84it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2242.pdb.csv 252 Ac the attached monosaccharides does not contain a C11


 11%|████▎                                   | 258/2402 [02:04<16:51,  2.12it/s]

258 2407.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 11%|████▎                                   | 259/2402 [02:05<15:21,  2.32it/s]

2407.pdb.csv 258 Ac the attached monosaccharides does not contain a C11


 16%|██████▏                                 | 374/2402 [03:03<13:16,  2.55it/s]

373 2183.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2183.pdb.csv 373 Ac the attached monosaccharides does not contain a C11
2183.pdb.csv 373 Ac the attached monosaccharides does not contain a C11


 16%|██████▎                                 | 380/2402 [03:05<14:56,  2.26it/s]

516.pdb.csv 378 Ac the attached monosaccharides does not contain a C11


 16%|██████▎                                 | 382/2402 [03:06<13:04,  2.58it/s]

382 2030.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 16%|██████▍                                 | 383/2402 [03:06<12:31,  2.69it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2030.pdb.csv 382 Ac the attached monosaccharides does not contain a C11
2030.pdb.csv 382 Ac the attached monosaccharides does not contain a C11
2030.pdb.csv 382 Ac the attached monosaccharides does not contain a C11
2030.pdb.csv 382 Ac the attached monosaccharides does not contain a C11
2030.pdb.csv 382 Ac the attached monosaccharides does not contain a C11


 18%|███████▏                                | 435/2402 [03:34<19:22,  1.69it/s]

457.pdb.csv 434 Ac the attached monosaccharides does not contain a C11


 19%|███████▍                                | 450/2402 [03:42<12:32,  2.59it/s]

171.pdb.csv 448 Ac the attached monosaccharides does not contain a C11
171.pdb.csv 448 Ac the attached monosaccharides does not contain a C11
171.pdb.csv 448 Ac the attached monosaccharides does not contain a C11


 20%|████████▏                               | 489/2402 [04:04<18:19,  1.74it/s]

PO4
Missing trust confidence interval:  1    P
Name: Residue, dtype: object
PO4
Missing trust confidence interval:  3    P
Name: Residue, dtype: object


 24%|█████████▍                              | 568/2402 [04:41<14:40,  2.08it/s]

567 2024.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2024.pdb.csv 567 Ac the attached monosaccharides does not contain a C11


 24%|█████████▌                              | 575/2402 [04:44<17:52,  1.70it/s]

575 2336.pdb.csv


 24%|█████████▌                              | 576/2402 [04:45<16:33,  1.84it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2336.pdb.csv 575 Ac the attached monosaccharides does not contain a C11


 24%|█████████▋                              | 585/2402 [04:48<10:03,  3.01it/s]

585 2363.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 24%|█████████▊                              | 586/2402 [04:48<10:35,  2.86it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2363.pdb.csv 585 Ac the attached monosaccharides does not contain a C11
2363.pdb.csv 585 Ac the attached monosaccharides does not contain a C11


 26%|██████████▎                             | 620/2402 [05:08<18:39,  1.59it/s]

620 2402.pdb.csv


 26%|██████████▎                             | 621/2402 [05:08<17:34,  1.69it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2402.pdb.csv 620 Ac the attached monosaccharides does not contain a C11


 31%|████████████▏                           | 735/2402 [06:08<11:47,  2.36it/s]

735 2334.pdb.csv


 31%|████████████▎                           | 736/2402 [06:09<11:55,  2.33it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2334.pdb.csv 735 Ac the attached monosaccharides does not contain a C11
2334.pdb.csv 735 Ac the attached monosaccharides does not contain a C11


 31%|████████████▎                           | 738/2402 [06:09<10:02,  2.76it/s]

738 2227.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 31%|████████████▎                           | 739/2402 [06:10<10:33,  2.63it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2227.pdb.csv 738 Ac the attached monosaccharides does not contain a C11
2227.pdb.csv 738 Ac the attached monosaccharides does not contain a C11


 33%|█████████████                           | 785/2402 [06:39<14:00,  1.92it/s]

784 2130.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2130.pdb.csv 784 Ac the attached monosaccharides does not contain a C11
2130.pdb.csv 784 Ac the attached monosaccharides does not contain a C11
2130.pdb.csv 784 Ac the attached monosaccharides does not contain a C11


 33%|█████████████                           | 787/2402 [06:39<09:34,  2.81it/s]

786 2115.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2115.pdb.csv 786 Ac the attached monosaccharides does not contain a C11


 33%|█████████████▏                          | 793/2402 [06:44<24:01,  1.12it/s]

793 2137.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 33%|█████████████▏                          | 794/2402 [06:44<20:28,  1.31it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2137.pdb.csv 793 Ac the attached monosaccharides does not contain a C11
2137.pdb.csv 793 Ac the attached monosaccharides does not contain a C11


 33%|█████████████▎                          | 803/2402 [06:48<14:17,  1.87it/s]

803 2126.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 33%|█████████████▍                          | 804/2402 [06:48<12:07,  2.20it/s]

2126.pdb.csv 803 Ac the attached monosaccharides does not contain a C11
2126.pdb.csv 803 Ac the attached monosaccharides does not contain a C11
2126.pdb.csv 803 Ac the attached monosaccharides does not contain a C11


 34%|█████████████▌                          | 818/2402 [06:55<14:47,  1.79it/s]

818 315.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 34%|█████████████▋                          | 819/2402 [06:56<13:32,  1.95it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
315.pdb.csv 818 Ac the attached monosaccharides does not contain a C11


 35%|█████████████▉                          | 840/2402 [07:08<11:31,  2.26it/s]

840 2382.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 35%|██████████████                          | 841/2402 [07:09<12:12,  2.13it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2382.pdb.csv 840 Ac the attached monosaccharides does not contain a C11


 36%|██████████████▎                         | 858/2402 [07:18<18:43,  1.37it/s]

PO4
Missing trust confidence interval:  7    P
Name: Residue, dtype: object


 36%|██████████████▍                         | 866/2402 [07:21<08:21,  3.06it/s]

PO4
Missing trust confidence interval:  2    P
Name: Residue, dtype: object


 36%|██████████████▌                         | 872/2402 [07:23<07:50,  3.25it/s]

PO4
Missing trust confidence interval:  1    P
Name: Residue, dtype: object
PO4
Missing trust confidence interval:  2    P
Name: Residue, dtype: object


 37%|██████████████▌                         | 877/2402 [07:26<14:14,  1.78it/s]

877 2121.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 37%|██████████████▌                         | 878/2402 [07:26<12:30,  2.03it/s]

2121.pdb.csv 877 Ac the attached monosaccharides does not contain a C11
2121.pdb.csv 877 Ac the attached monosaccharides does not contain a C11


 37%|██████████████▋                         | 882/2402 [07:28<17:10,  1.48it/s]

882 2385.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 37%|██████████████▋                         | 884/2402 [07:29<11:40,  2.17it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2385.pdb.csv 882 Ac the attached monosaccharides does not contain a C11


 37%|██████████████▉                         | 896/2402 [07:37<13:13,  1.90it/s]

17.pdb.csv 895 Ac the attached monosaccharides does not contain a C11
17.pdb.csv 895 Ac the attached monosaccharides does not contain a C11


 41%|████████████████▍                       | 987/2402 [08:25<10:35,  2.23it/s]

987 2332.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 41%|████████████████▍                       | 988/2402 [08:25<09:56,  2.37it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2332.pdb.csv 987 Ac the attached monosaccharides does not contain a C11


 42%|████████████████▎                      | 1005/2402 [08:32<07:27,  3.12it/s]

1006 2401.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 42%|████████████████▎                      | 1007/2402 [08:32<06:30,  3.57it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2401.pdb.csv 1006 Ac the attached monosaccharides does not contain a C11
1007 2136.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 42%|████████████████▎                      | 1008/2402 [08:32<06:12,  3.74it/s]

2136.pdb.csv 1007 Ac the attached monosaccharides does not contain a C11


 42%|████████████████▌                      | 1019/2402 [08:39<08:06,  2.84it/s]

1019 338.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 42%|████████████████▌                      | 1020/2402 [08:39<10:23,  2.22it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 43%|████████████████▉                      | 1042/2402 [08:47<06:05,  3.72it/s]

1041 2164.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2164.pdb.csv 1041 Ac the attached monosaccharides does not contain a C11
2164.pdb.csv 1041 Ac the attached monosaccharides does not contain a C11


 43%|████████████████▉                      | 1044/2402 [08:48<10:18,  2.20it/s]

1044 2133.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 44%|████████████████▉                      | 1045/2402 [08:49<10:08,  2.23it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2133.pdb.csv 1044 Ac the attached monosaccharides does not contain a C11
2133.pdb.csv 1044 Ac the attached monosaccharides does not contain a C11


 44%|█████████████████                      | 1050/2402 [08:50<07:01,  3.21it/s]

1050 2103.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 44%|█████████████████                      | 1051/2402 [08:50<06:54,  3.26it/s]

2103.pdb.csv 1050 Ac the attached monosaccharides does not contain a C11
2103.pdb.csv 1050 Ac the attached monosaccharides does not contain a C11


 50%|███████████████████▋                   | 1213/2402 [10:12<07:39,  2.59it/s]

317.pdb.csv 1212 Ac the attached monosaccharides does not contain a C11


 52%|████████████████████▍                  | 1260/2402 [10:34<07:20,  2.59it/s]

1260 2165.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 52%|████████████████████▍                  | 1261/2402 [10:35<07:23,  2.57it/s]

2165.pdb.csv 1260 Ac the attached monosaccharides does not contain a C11


 54%|████████████████████▉                  | 1290/2402 [10:51<10:33,  1.75it/s]

1290 2071.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 54%|████████████████████▉                  | 1291/2402 [10:51<09:42,  1.91it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2071.pdb.csv 1290 Ac the attached monosaccharides does not contain a C11
2071.pdb.csv 1290 Ac the attached monosaccharides does not contain a C11


 54%|█████████████████████▏                 | 1306/2402 [10:58<07:51,  2.32it/s]

PO4
Missing trust confidence interval:  1    P
Name: Residue, dtype: object
PO4
Missing trust confidence interval:  3    P
Name: Residue, dtype: object


 55%|█████████████████████▎                 | 1313/2402 [11:01<06:27,  2.81it/s]

PO4
Missing trust confidence interval:  2    P
Name: Residue, dtype: object


 55%|█████████████████████▍                 | 1323/2402 [11:07<06:10,  2.92it/s]

PO4
Missing trust confidence interval:  1    P
Name: Residue, dtype: object


 55%|█████████████████████▌                 | 1330/2402 [11:10<09:21,  1.91it/s]

1330 2397.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 55%|█████████████████████▌                 | 1331/2402 [11:11<09:09,  1.95it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2397.pdb.csv 1330 Ac the attached monosaccharides does not contain a C11


 57%|██████████████████████                 | 1359/2402 [11:26<07:25,  2.34it/s]

1359 2079.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 57%|██████████████████████                 | 1360/2402 [11:27<06:35,  2.63it/s]

2079.pdb.csv 1359 Ac the attached monosaccharides does not contain a C11
2079.pdb.csv 1359 Ac the attached monosaccharides does not contain a C11
2079.pdb.csv 1359 Ac the attached monosaccharides does not contain a C11


 57%|██████████████████████▏                | 1368/2402 [11:31<07:44,  2.23it/s]

1367 2025.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2025.pdb.csv 1367 Ac the attached monosaccharides does not contain a C11


 58%|██████████████████████▌                | 1386/2402 [11:40<10:19,  1.64it/s]

1386 2043.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 58%|██████████████████████▌                | 1387/2402 [11:40<08:31,  1.99it/s]

2043.pdb.csv 1386 Ac the attached monosaccharides does not contain a C11


 58%|██████████████████████▋                | 1400/2402 [11:47<08:13,  2.03it/s]

1400 2063.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 58%|██████████████████████▋                | 1401/2402 [11:48<07:05,  2.35it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2063.pdb.csv 1400 Ac the attached monosaccharides does not contain a C11
2063.pdb.csv 1400 Ac the attached monosaccharides does not contain a C11
2063.pdb.csv 1400 Ac the attached monosaccharides does not contain a C11


 59%|███████████████████████                | 1418/2402 [11:56<08:31,  1.92it/s]

1417 2037.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2037.pdb.csv 1417 Ac the attached monosaccharides does not contain a C11


 60%|███████████████████████▎               | 1436/2402 [12:07<12:12,  1.32it/s]

1436 2087.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missin

 60%|███████████████████████▎               | 1437/2402 [12:07<10:41,  1.50it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2087.pdb.csv 1436 Ac the attached monosaccharides does not contain a C11
2087.pdb.csv 1436 Ac the attached monosaccharides does not contain a C11
2087.pdb.csv 1436 Ac the attached monosaccharides does not contain a C11
2087.pdb.csv 1436 Ac the attached monosaccharides do

 62%|████████████████████████▎              | 1494/2402 [12:35<06:56,  2.18it/s]

1494 2392.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 62%|████████████████████████▎              | 1495/2402 [12:35<05:56,  2.54it/s]

2392.pdb.csv 1494 Ac the attached monosaccharides does not contain a C11


 64%|████████████████████████▉              | 1537/2402 [12:55<04:39,  3.09it/s]

PO4
Missing trust confidence interval:  1    P
Name: Residue, dtype: object
PO4
Missing trust confidence interval:  3    P
Name: Residue, dtype: object
PO4
Missing trust confidence interval:  5    P
Name: Residue, dtype: object


 65%|█████████████████████████▎             | 1560/2402 [13:06<06:10,  2.27it/s]

PO4
Missing trust confidence interval:  1    P
Name: Residue, dtype: object
PO4
Missing trust confidence interval:  3    P
Name: Residue, dtype: object


 65%|█████████████████████████▌             | 1571/2402 [13:11<04:33,  3.04it/s]

1570 2005.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2005.pdb.csv 1570 Ac the attached monosaccharides does not contain a C11


 67%|██████████████████████████▎            | 1621/2402 [13:37<06:15,  2.08it/s]

1620 2020.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2020.pdb.csv 1620 Ac the attached monosaccharides does not contain a C11


 68%|██████████████████████████▋            | 1640/2402 [13:46<05:37,  2.26it/s]

PO4
Missing trust confidence interval:  1    P
Name: Residue, dtype: object


 69%|██████████████████████████▉            | 1658/2402 [13:55<06:29,  1.91it/s]

1657 2185.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2185.pdb.csv 1657 Ac the attached monosaccharides does not contain a C11


 69%|██████████████████████████▉            | 1659/2402 [13:55<05:37,  2.20it/s]

PO4
Missing trust confidence interval:  1    P
Name: Residue, dtype: object
PO4
Missing trust confidence interval:  3    P
Name: Residue, dtype: object
PO4
Missing trust confidence interval:  5    P
Name: Residue, dtype: object


 69%|██████████████████████████▉            | 1660/2402 [13:56<04:46,  2.59it/s]

1660 2002.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 69%|██████████████████████████▉            | 1661/2402 [13:56<04:15,  2.90it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2002.pdb.csv 1660 Ac the attached monosaccharides does not contain a C11
2002.pdb.csv 1660 Ac the attached monosaccharides does not contain a C11


 69%|███████████████████████████            | 1666/2402 [13:58<04:53,  2.51it/s]

1666 2066.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 69%|███████████████████████████            | 1667/2402 [13:58<04:32,  2.69it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2066.pdb.csv 1666 Ac the attached monosaccharides does not contain a C11


 71%|███████████████████████████▌           | 1699/2402 [14:19<06:59,  1.68it/s]

PO4
Missing trust confidence interval:  4    P
Name: Residue, dtype: object


 71%|███████████████████████████▊           | 1716/2402 [14:30<06:30,  1.75it/s]

1715 2395.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
PO4
Missing trust confidence interval:  8    P
Name: Residue, dtype: object
2395.pdb.csv 1715 Ac the attached monosaccharides does not contain a C11
2395.pdb.csv 1715 Ac the attached monosaccharides does not contain a C11


 72%|███████████████████████████▉           | 1718/2402 [14:32<07:07,  1.60it/s]

1718 2143.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 72%|███████████████████████████▉           | 1719/2402 [14:32<06:27,  1.76it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2143.pdb.csv 1718 Ac the attached monosaccharides does not contain a C11
2143.pdb.csv 1718 Ac the attached monosaccharides does not contain a C11


 74%|████████████████████████████▋          | 1766/2402 [14:57<05:15,  2.02it/s]

1766 2065.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 74%|████████████████████████████▋          | 1767/2402 [14:58<04:57,  2.14it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2065.pdb.csv 1766 Ac the attached monosaccharides does not contain a C11
2065.pdb.csv 1766 Ac the attached monosaccharides does not contain a C11


 74%|████████████████████████████▊          | 1775/2402 [15:01<03:16,  3.19it/s]

1775 2084.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 74%|████████████████████████████▊          | 1776/2402 [15:01<03:06,  3.35it/s]

2084.pdb.csv 1775 Ac the attached monosaccharides does not contain a C11
2084.pdb.csv 1775 Ac the attached monosaccharides does not contain a C11
2084.pdb.csv 1775 Ac the attached monosaccharides does not contain a C11


 77%|██████████████████████████████         | 1848/2402 [15:39<02:46,  3.33it/s]

1847 2300.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2300.pdb.csv 1847 Ac the attached monosaccharides does not contain a C11


 78%|██████████████████████████████▍        | 1878/2402 [15:54<03:13,  2.70it/s]

1878 2325.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 78%|██████████████████████████████▌        | 1879/2402 [15:54<03:19,  2.62it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2325.pdb.csv 1878 Ac the attached monosaccharides does not contain a C11


 80%|███████████████████████████████        | 1916/2402 [16:17<05:18,  1.53it/s]

1916 2345.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 80%|███████████████████████████████▏       | 1917/2402 [16:17<04:34,  1.77it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2345.pdb.csv 1916 Ac the attached monosaccharides does not contain a C11


 80%|███████████████████████████████▏       | 1922/2402 [16:19<03:05,  2.59it/s]

1922 2107.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 80%|███████████████████████████████▏       | 1924/2402 [16:20<02:16,  3.51it/s]

2107.pdb.csv 1922 Ac the attached monosaccharides does not contain a C11
2107.pdb.csv 1922 Ac the attached monosaccharides does not contain a C11


 81%|███████████████████████████████▋       | 1955/2402 [16:38<03:49,  1.94it/s]

PO4
Missing trust confidence interval:  1    P
Name: Residue, dtype: object
PO4
Missing trust confidence interval:  3    P
Name: Residue, dtype: object


 84%|████████████████████████████████▌      | 2006/2402 [17:05<03:35,  1.84it/s]

2006 437.pdb.csv


 84%|████████████████████████████████▌      | 2007/2402 [17:06<03:21,  1.96it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 84%|████████████████████████████████▊      | 2024/2402 [17:15<01:59,  3.17it/s]

PO4
Missing trust confidence interval:  2    P
Name: Residue, dtype: object


 86%|█████████████████████████████████▎     | 2054/2402 [17:35<03:46,  1.54it/s]

2054 237.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 86%|█████████████████████████████████▎     | 2055/2402 [17:35<03:09,  1.84it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
237.pdb.csv 2054 Ac the attached monosaccharides does not contain a C11


 87%|█████████████████████████████████▉     | 2088/2402 [17:48<03:14,  1.61it/s]

2088 2403.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 87%|█████████████████████████████████▉     | 2089/2402 [17:49<03:13,  1.62it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2403.pdb.csv 2088 Ac the attached monosaccharides does not contain a C11


 87%|██████████████████████████████████     | 2100/2402 [17:53<01:53,  2.67it/s]

2100 2355.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 87%|██████████████████████████████████     | 2101/2402 [17:54<01:57,  2.57it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2355.pdb.csv 2100 Ac the attached monosaccharides does not contain a C11


 88%|██████████████████████████████████▍    | 2122/2402 [18:08<02:43,  1.71it/s]

PO4
Missing trust confidence interval:  1    P
Name: Residue, dtype: object
PO4
Missing trust confidence interval:  7    P
Name: Residue, dtype: object


 89%|██████████████████████████████████▋    | 2140/2402 [18:17<02:25,  1.80it/s]

2140 2018.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 89%|██████████████████████████████████▊    | 2141/2402 [18:17<02:05,  2.08it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2018.pdb.csv 2140 Ac the attached monosaccharides does not contain a C11


 90%|██████████████████████████████████▉    | 2151/2402 [18:23<02:29,  1.68it/s]

2151 2288.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 90%|██████████████████████████████████▉    | 2152/2402 [18:24<02:06,  1.98it/s]

2288.pdb.csv 2151 Ac the attached monosaccharides does not contain a C11
2288.pdb.csv 2151 Ac the attached monosaccharides does not contain a C11
2288.pdb.csv 2151 Ac the attached monosaccharides does not contain a C11


 91%|███████████████████████████████████▎   | 2178/2402 [18:38<01:29,  2.52it/s]

PO4
Missing trust confidence interval:  4    P
Name: Residue, dtype: object
PO4
Missing trust confidence interval:  3    P
Name: Residue, dtype: object


 92%|███████████████████████████████████▉   | 2212/2402 [18:52<00:48,  3.91it/s]

PO4
Missing trust confidence interval:  2    P
Name: Residue, dtype: object


 92%|███████████████████████████████████▉   | 2214/2402 [18:54<01:32,  2.04it/s]

PO4
Missing trust confidence interval:  1    P
Name: Residue, dtype: object
PO4
Missing trust confidence interval:  3    P
Name: Residue, dtype: object


 96%|█████████████████████████████████████▌ | 2316/2402 [19:47<00:44,  1.94it/s]

2316 2331.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 96%|█████████████████████████████████████▌ | 2317/2402 [19:47<00:41,  2.06it/s]

2331.pdb.csv 2316 Ac the attached monosaccharides does not contain a C11


 97%|█████████████████████████████████████▊ | 2331/2402 [19:54<00:33,  2.14it/s]

PO4
Missing trust confidence interval:  1    P
Name: Residue, dtype: object


 97%|█████████████████████████████████████▉ | 2335/2402 [19:56<00:35,  1.88it/s]

2335 2249.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 97%|█████████████████████████████████████▉ | 2336/2402 [19:56<00:32,  2.01it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2249.pdb.csv 2335 Ac the attached monosaccharides does not contain a C11


 98%|██████████████████████████████████████ | 2344/2402 [20:00<00:26,  2.20it/s]

366.pdb.csv 2343 Ac the attached monosaccharides does not contain a C11


 99%|██████████████████████████████████████▍| 2368/2402 [20:13<00:20,  1.65it/s]

PO4
Missing trust confidence interval:  1    P
Name: Residue, dtype: object
PO4
Missing trust confidence interval:  3    P
Name: Residue, dtype: object


 99%|██████████████████████████████████████▋| 2381/2402 [20:20<00:12,  1.73it/s]

PO4
Missing trust confidence interval:  1    P
Name: Residue, dtype: object


 99%|██████████████████████████████████████▋| 2382/2402 [20:20<00:10,  1.82it/s]

2382 2219.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


 99%|██████████████████████████████████████▋| 2384/2402 [20:21<00:06,  2.60it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
2219.pdb.csv 2382 Ac the attached monosaccharides does not contain a C11


100%|██████████████████████████████████████▉| 2401/2402 [20:32<00:00,  1.26it/s]

2401 2305.pdb.csv
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)
ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)


100%|███████████████████████████████████████| 2402/2402 [20:32<00:00,  1.95it/s]

ACY
Missing trust confidence interval:  Series([], Name: Residue, dtype: object)





In [15]:
# new_pdb_f

In [16]:
# csv_f

In [17]:
# new_pdb_f_with_ac.loc[100:120]

In [18]:
# ac_hydrogen_list = ['H2']

# trust_threshold = 50

# csv_hydrogen_f['Linkage'] = csv_hydrogen_f[['Linkage']].fillna('')
# csv_hydrogen_f = csv_hydrogen_f.astype({'Linkage':str})
# if ('1.0' in csv_hydrogen_f['Linkage'].values) or ('2.0' in csv_hydrogen_f['Linkage'].values) or ('3.0' in csv_hydrogen_f['Linkage'].values) or \
#     ('4.0' in csv_hydrogen_f['Linkage'].values) or ('5.0' in csv_hydrogen_f['Linkage'].values) or ('6.0' in csv_hydrogen_f['Linkage'].values):
#     print('error in pdb lineage')

# for i in range(len(pdb_f)):
#     current_pdb_lineage = pdb_f.loc[i, :]['Lineage']
#     current_pdb_atom = pdb_f.loc[i, :]['Atom_name']
#     current_pdb_residual_name = pdb_f.loc[i, :]['Residual_name'] 

#     # stores the monosaccharide labels 
#     current_corresponding_csv_hydrogen_f = csv_hydrogen_f.loc[csv_hydrogen_f['Linkage'] == current_pdb_lineage]

#     # stores the Ac labels
#     current_corresponding_csv_hydrogen_f_ac = pd.DataFrame(columns=csv_hydrogen_f.columns)

#     if 'Ac' in current_corresponding_csv_hydrogen_f['Residue'].values:

#         current_corresponding_csv_hydrogen_f_ac = current_corresponding_csv_hydrogen_f.\
#             loc[current_corresponding_csv_hydrogen_f['Residue'] == 'Ac']

#         current_corresponding_csv_hydrogen_f = current_corresponding_csv_hydrogen_f.\
#             loc[current_corresponding_csv_hydrogen_f['Residue'] != 'Ac']

#     # current monosaccharide, assign shift,
#     if (current_pdb_atom in hydrogen_list) and (not current_corresponding_csv_hydrogen_f.empty):

#         current_shift = current_corresponding_csv_hydrogen_f[current_pdb_atom].values[0]
        
#         if current_corresponding_csv_hydrogen_f['Trust'].values[0] =='':
#             print('Missing trust confidence interval: ', current_corresponding_csv_hydrogen_f['Residue'])
#             current_trust = -1

#         elif isinstance(current_corresponding_csv_hydrogen_f['Trust'].values[0], float) and \
#             np.isnan(current_corresponding_csv_hydrogen_f['Trust'].values[0]):

#             print('Missing trust confidence interval: ', current_corresponding_csv_hydrogen_f['Residue'])
#             current_trust = -1
#         else:
#             # not monosaccharide and missing trust
#             current_trust = current_corresponding_csv_hydrogen_f['Trust'].values[0]
#             current_trust = int(current_trust.split('%')[0])

#         pdb_f.loc[i, ['shift']] = current_shift
#         pdb_f.loc[i, ['trust']] = current_trust

#     # current Ac and the pdb file has a separate three letter code
#     elif (current_pdb_atom in hydrogen_list) and (not current_corresponding_csv_hydrogen_f_ac.empty) \
#         and (current_pdb_residual_name == 'ACY'):

#         current_shift = current_corresponding_csv_hydrogen_f_ac[current_pdb_atom].values[0]
        
#         if current_corresponding_csv_hydrogen_f['Trust'].values[0] =='':
#             print('Missing trust confidence interval: ', current_corresponding_csv_hydrogen_f['Residue'])
#             current_trust = -1

#         elif isinstance(current_corresponding_csv_hydrogen_f_ac['Trust'].values[0], float) and \
#             np.isnan(current_corresponding_csv_hydrogen_f_ac['Trust'].values[0]):

#             print('Missing trust confidence interval: ', current_corresponding_csv_hydrogen_f_ac['Residue'])
#             current_trust = -1
#         else:
#             # not monosaccharide and missing trust
#             current_trust = current_corresponding_csv_hydrogen_f_ac['Trust'].values[0]
#             current_trust = int(current_trust.split('%')[0])
        
#         pdb_f.loc[i, ['shift']] = current_shift
#         pdb_f.loc[i, ['trust']] = current_trust

#     elif (current_pdb_atom in ac_hydrogen_list):
# #             print(i)
#         pass

In [19]:
[''] == ''

False

In [20]:
# for i in range(len(pdb_f)):
#     current_pdb_lineage = pdb_f.loc[i, :]['Lineage']
#     if current_pdb_lineage in csv_f['Linkage'].values:
#         current_corresponding_residue = csv_f.loc[csv_f['Linkage'] == current_pdb_lineage]['Residue'].values[0]
#     #         print(current_corresponding_residue)
#         pdb_f.loc[i, ['Residual_accurate_name']] = current_corresponding_residue

In [21]:
np.where(csv_hydrogen_f.columns == 'H1')[0][0]
csv_hydrogen_f.columns[3:].values

array(['H1', 'H2', 'H3', 'H4', 'H5', 'H6'], dtype=object)

In [22]:
# csv_f.loc[csv_f['Linkage'] == current_pdb_lineage]['Residue']

In [23]:
new_pdb_f_with_ac

Unnamed: 0,HETATM,Atom_num,Atom_name,Residual_name,Bound,Residual_num,x,y,z,Atom_type,Residual_accurate_name,Lineage,shift,trust,Ac_component,Trust
0,HETATM,1,R,NGA,A,1,0.896,9.168,17.021,X,b-D-GalpN,,-1,-1,0,
1,HETATM,2,O3,NGA,A,1,-0.656,-1.103,2.529,O,b-D-GalpN,,-1,-1,0,
2,HETATM,3,C3,NGA,A,1,-0.416,-0.412,3.763,C,b-D-GalpN,,77.1,80,0,
3,HETATM,4,C2,NGA,A,1,-1.788,-0.157,4.458,C,b-D-GalpN,,52.1,80,0,
4,HETATM,5,N2,NGA,A,1,-2.356,-1.524,4.695,N,b-D-GalpN,,-1,-1,0,
5,HETATM,6,C1,NGA,A,1,-1.598,0.81,5.679,C,b-D-GalpN,,103.2,80,0,
6,HETATM,7,R1,NGA,A,1,-2.912,1.531,6.119,X,b-D-GalpN,,-1,-1,0,
7,HETATM,8,O5,NGA,A,1,-0.698,1.906,5.457,O,b-D-GalpN,,-1,-1,0,
8,HETATM,9,C5,NGA,A,1,0.544,1.569,4.851,C,b-D-GalpN,,76.5,80,0,
9,HETATM,10,C6,NGA,A,1,1.406,2.828,4.727,C,b-D-GalpN,,62.1,80,0,


In [24]:
csv_hydrogen_f

Unnamed: 0,Linkage,Residue,Trust,H1,H2,H3,H4,H5,H6
0,,b-D-GalpN,81%,4.54,4.08,3.84,4.17,3.68,3.72-3.85
1,2.0,Ac,99%,-,2.05,,,,
2,3.0,a-D-Galp,89%,5.14,3.88,3.8,4.2,3.86,3.75-3.87
3,33.0,b-D-GalpN,89%,4.82,4.18,5.15,4.31,3.79,3.76-3.88
4,333.0,Ac,98%,-,2.13,,,,
5,332.0,Ac,99%,-,2.05,,,,
6,334.0,b-D-GlcpA,64%,4.62,3.45,3.64,3.73,3.94,-
7,3344.0,b-D-GlcpA,59%,4.51,3.53,3.76,3.86,3.88,-
8,33444.0,a-D-Glcp,88%,5.49,3.49,3.68,3.42,3.75,3.96-4.08


In [25]:
dict_f

{'1': '',
 '2': '3',
 '3': '3,3',
 '5': '3,3,4',
 '4': '3,3,3',
 '6': '3,3,4,4',
 '7': '3,3,4,4,4'}

In [26]:
dict_f

{'1': '',
 '2': '3',
 '3': '3,3',
 '5': '3,3,4',
 '4': '3,3,3',
 '6': '3,3,4,4',
 '7': '3,3,4,4,4'}

In [27]:
csv_f = csv_f.loc[csv_f['Residue'] != 'PDa']

In [28]:
csv_f['Linkage'] = csv_f[['Linkage']].fillna('')
csv_f = csv_f.astype({'Linkage':str})

trust_threshold = 60
ac_carbon_list = ['C11', 'C21']

for i in range(len(pdb_f)):
    current_pdb_lineage = pdb_f.loc[i, :]['Lineage']
    current_pdb_atom = pdb_f.loc[i, :]['Atom_name']
    current_pdb_residual_name = pdb_f.loc[i, :]['Residual_name']
    
    # stores the monosaccharide labels 
    current_corresponding_csv_f = csv_f.loc[csv_f['Linkage'] == current_pdb_lineage]

    # stores the Ac labels
    current_corresponding_csv_f_ac = pd.DataFrame(columns=csv_f.columns)

    if 'Ac' in current_corresponding_csv_f['Residue'].values:

        current_corresponding_csv_f_ac = current_corresponding_csv_f.\
            loc[current_corresponding_csv_f['Residue'] == 'Ac']
        
        current_corresponding_csv_f = current_corresponding_csv_f.\
            loc[current_corresponding_csv_f['Residue'] != 'Ac']
        
    # current monosaccharide, assign shift,
    if (current_pdb_atom in carbon_list) and (not current_corresponding_csv_f.empty):
        
        current_shift = current_corresponding_csv_f[current_pdb_atom].values[0]
        
        if isinstance(current_corresponding_csv_f['Trust'].values[0], float) and \
            np.isnan(current_corresponding_csv_f['Trust'].values[0]):
            
            print('Missing trust confidence interval: ', current_corresponding_csv_f['Residue'])
            current_trust = -1
        else:
            
            current_trust = current_corresponding_csv_f['Trust'].values[0]
            current_trust = int(current_trust.split('%')[0])
            
        pdb_f.loc[i, ['shift']] = current_shift
        pdb_f.loc[i, ['trust']] = current_trust
    
    # current Ac and the pdb file has a separate three letter code
    elif (current_pdb_atom in carbon_list) and (not current_corresponding_csv_f_ac.empty) \
        and (current_pdb_residual_name == 'ACY'):
        
        current_shift = current_corresponding_csv_f_ac[current_pdb_atom].values[0]
        
        if isinstance(current_corresponding_csv_f_ac['Trust'].values[0], float) and \
            np.isnan(current_corresponding_csv_f_ac['Trust'].values[0]):

            print('Missing trust confidence interval: ', current_corresponding_csv_f_ac['Residue'])
            current_trust = -1
        else:
            current_trust = current_corresponding_csv_f_ac['Trust'].values[0]
            current_trust = int(current_trust.split('%')[0])

        pdb_f.loc[i, ['shift']] = current_shift
        pdb_f.loc[i, ['trust']] = current_trust
    
    elif (current_pdb_atom in ac_carbon_list):
        print(i)

23
24
70
71


In [29]:
ac_carbon_list = ['C11', 'C21']
corres_ac_carbon_list = ['C1', 'C2']
pdb_f_lineage_list = pdb_f['Lineage'].values
for i in range(len(csv_f)):
    current_csv_lineage = csv_f.loc[i, :]['Linkage']
    current_csv_residue = csv_f.loc[i, :]['Residue']
    if current_csv_lineage not in pdb_f_lineage_list:
        current_csv_attached_lineage = current_csv_lineage[:-2]
        print(current_csv_lineage, current_csv_attached_lineage)
        
        for j in range(len(ac_carbon_list)):
            ac_c_shift = csv_f.loc[i, :][corres_ac_carbon_list[j]]
            
            ac_c_trust = csv_f.loc[i, :]['Trust']
            
            if '%' in ac_c_trust:
                ac_c_trust = int(ac_c_trust.split('%')[0])
            else:
                ac_c_trust = -1
            
#             print(ac_c_shift)
            pdb_f.loc[(pdb_f['Lineage'] == current_csv_attached_lineage) & 
                      (pdb_f['Atom_name'] == ac_carbon_list[j]), ['shift']] = ac_c_shift
    
            pdb_f.loc[(pdb_f['Lineage'] == current_csv_attached_lineage) & 
                      (pdb_f['Atom_name'] == ac_carbon_list[j]), ['trust']] = ac_c_trust

2 
3,3,2 3,3


In [30]:
pdb_f

Unnamed: 0,HETATM,Atom_num,Atom_name,Residual_name,Bound,Residual_num,x,y,z,Atom_type,Residual_accurate_name,Lineage,shift,trust
0,HETATM,1,R,NGA,A,1,0.896,9.168,17.021,X,b-D-GalpN,,-1,-1
1,HETATM,2,O3,NGA,A,1,-0.656,-1.103,2.529,O,b-D-GalpN,,-1,-1
2,HETATM,3,C3,NGA,A,1,-0.416,-0.412,3.763,C,b-D-GalpN,,77.1,80
3,HETATM,4,C2,NGA,A,1,-1.788,-0.157,4.458,C,b-D-GalpN,,52.1,80
4,HETATM,5,N2,NGA,A,1,-2.356,-1.524,4.695,N,b-D-GalpN,,-1,-1
5,HETATM,6,C1,NGA,A,1,-1.598,0.81,5.679,C,b-D-GalpN,,103.2,80
6,HETATM,7,R1,NGA,A,1,-2.912,1.531,6.119,X,b-D-GalpN,,-1,-1
7,HETATM,8,O5,NGA,A,1,-0.698,1.906,5.457,O,b-D-GalpN,,-1,-1
8,HETATM,9,C5,NGA,A,1,0.544,1.569,4.851,C,b-D-GalpN,,76.5,80
9,HETATM,10,C6,NGA,A,1,1.406,2.828,4.727,C,b-D-GalpN,,62.1,80


In [31]:
csv_hydrogen_f

Unnamed: 0,Linkage,Residue,Trust,H1,H2,H3,H4,H5,H6
0,,b-D-GalpN,81%,4.54,4.08,3.84,4.17,3.68,3.72-3.85
1,2.0,Ac,99%,-,2.05,,,,
2,3.0,a-D-Galp,89%,5.14,3.88,3.8,4.2,3.86,3.75-3.87
3,33.0,b-D-GalpN,89%,4.82,4.18,5.15,4.31,3.79,3.76-3.88
4,333.0,Ac,98%,-,2.13,,,,
5,332.0,Ac,99%,-,2.05,,,,
6,334.0,b-D-GlcpA,64%,4.62,3.45,3.64,3.73,3.94,-
7,3344.0,b-D-GlcpA,59%,4.51,3.53,3.76,3.86,3.88,-
8,33444.0,a-D-Glcp,88%,5.49,3.49,3.68,3.42,3.75,3.96-4.08


In [32]:
csv_f

Unnamed: 0,Linkage,Residue,Trust,C1,C2,C3,C4,C5,C6
0,,b-D-GalpN,80%,103.2,52.1,77.1,65.4,76.5,62.1
1,2.0,Ac,76%,175.7,23.6,,,,
2,3.0,a-D-Galp,80%,96.8,68.9,80.7,70.5,72.2,62.1
3,33.0,b-D-GalpN,80%,104.0,52.6,73.7,72.9,75.6,61.8
4,333.0,Ac,91%,174.3,21.4,,,,
5,332.0,Ac,76%,175.7,23.6,,,,
6,334.0,b-D-GlcpA,61%,105.5,75.4,75.2,81.3,77.0,174.5
7,3344.0,b-D-GlcpA,65%,103.5,72.8,75.8,77.9,76.9,175.0
8,33444.0,a-D-Glcp,81%,99.6,72.9,74.1,70.5,71.9,69.1


In [33]:
pdb_f.loc[(pdb_f['Lineage'] == current_csv_attached_lineage) & 
          (pdb_f['Atom_name'] == 'C11'), ['shift']]

Unnamed: 0,shift
70,175.7


In [34]:
current_pdb_lineage

'3,3,4,4,4'

In [35]:
current_corresponding_csv_f['Residue']

8    a-D-Glcp
Name: Residue, dtype: object

In [36]:
csv_f.loc[csv_f['Linkage'] == '4,3,8,9']

Unnamed: 0,Linkage,Residue,Trust,C1,C2,C3,C4,C5,C6


In [37]:
pdb_f

Unnamed: 0,HETATM,Atom_num,Atom_name,Residual_name,Bound,Residual_num,x,y,z,Atom_type,Residual_accurate_name,Lineage,shift,trust
0,HETATM,1,R,NGA,A,1,0.896,9.168,17.021,X,b-D-GalpN,,-1,-1
1,HETATM,2,O3,NGA,A,1,-0.656,-1.103,2.529,O,b-D-GalpN,,-1,-1
2,HETATM,3,C3,NGA,A,1,-0.416,-0.412,3.763,C,b-D-GalpN,,77.1,80
3,HETATM,4,C2,NGA,A,1,-1.788,-0.157,4.458,C,b-D-GalpN,,52.1,80
4,HETATM,5,N2,NGA,A,1,-2.356,-1.524,4.695,N,b-D-GalpN,,-1,-1
5,HETATM,6,C1,NGA,A,1,-1.598,0.81,5.679,C,b-D-GalpN,,103.2,80
6,HETATM,7,R1,NGA,A,1,-2.912,1.531,6.119,X,b-D-GalpN,,-1,-1
7,HETATM,8,O5,NGA,A,1,-0.698,1.906,5.457,O,b-D-GalpN,,-1,-1
8,HETATM,9,C5,NGA,A,1,0.544,1.569,4.851,C,b-D-GalpN,,76.5,80
9,HETATM,10,C6,NGA,A,1,1.406,2.828,4.727,C,b-D-GalpN,,62.1,80


In [38]:
csv_f

Unnamed: 0,Linkage,Residue,Trust,C1,C2,C3,C4,C5,C6
0,,b-D-GalpN,80%,103.2,52.1,77.1,65.4,76.5,62.1
1,2.0,Ac,76%,175.7,23.6,,,,
2,3.0,a-D-Galp,80%,96.8,68.9,80.7,70.5,72.2,62.1
3,33.0,b-D-GalpN,80%,104.0,52.6,73.7,72.9,75.6,61.8
4,333.0,Ac,91%,174.3,21.4,,,,
5,332.0,Ac,76%,175.7,23.6,,,,
6,334.0,b-D-GlcpA,61%,105.5,75.4,75.2,81.3,77.0,174.5
7,3344.0,b-D-GlcpA,65%,103.5,72.8,75.8,77.9,76.9,175.0
8,33444.0,a-D-Glcp,81%,99.6,72.9,74.1,70.5,71.9,69.1


In [39]:
pdb_file_list[0]

'1882.pdb.csv'

In [40]:
dict_f

{'1': '',
 '2': '3',
 '3': '3,3',
 '5': '3,3,4',
 '4': '3,3,3',
 '6': '3,3,4,4',
 '7': '3,3,4,4,4'}

In [41]:
pdb_f

Unnamed: 0,HETATM,Atom_num,Atom_name,Residual_name,Bound,Residual_num,x,y,z,Atom_type,Residual_accurate_name,Lineage,shift,trust
0,HETATM,1,R,NGA,A,1,0.896,9.168,17.021,X,b-D-GalpN,,-1,-1
1,HETATM,2,O3,NGA,A,1,-0.656,-1.103,2.529,O,b-D-GalpN,,-1,-1
2,HETATM,3,C3,NGA,A,1,-0.416,-0.412,3.763,C,b-D-GalpN,,77.1,80
3,HETATM,4,C2,NGA,A,1,-1.788,-0.157,4.458,C,b-D-GalpN,,52.1,80
4,HETATM,5,N2,NGA,A,1,-2.356,-1.524,4.695,N,b-D-GalpN,,-1,-1
5,HETATM,6,C1,NGA,A,1,-1.598,0.81,5.679,C,b-D-GalpN,,103.2,80
6,HETATM,7,R1,NGA,A,1,-2.912,1.531,6.119,X,b-D-GalpN,,-1,-1
7,HETATM,8,O5,NGA,A,1,-0.698,1.906,5.457,O,b-D-GalpN,,-1,-1
8,HETATM,9,C5,NGA,A,1,0.544,1.569,4.851,C,b-D-GalpN,,76.5,80
9,HETATM,10,C6,NGA,A,1,1.406,2.828,4.727,C,b-D-GalpN,,62.1,80


In [42]:
# pdb_f.to_csv('209_labeled.csv', index = False)

In [43]:
csv_f

Unnamed: 0,Linkage,Residue,Trust,C1,C2,C3,C4,C5,C6
0,,b-D-GalpN,80%,103.2,52.1,77.1,65.4,76.5,62.1
1,2.0,Ac,76%,175.7,23.6,,,,
2,3.0,a-D-Galp,80%,96.8,68.9,80.7,70.5,72.2,62.1
3,33.0,b-D-GalpN,80%,104.0,52.6,73.7,72.9,75.6,61.8
4,333.0,Ac,91%,174.3,21.4,,,,
5,332.0,Ac,76%,175.7,23.6,,,,
6,334.0,b-D-GlcpA,61%,105.5,75.4,75.2,81.3,77.0,174.5
7,3344.0,b-D-GlcpA,65%,103.5,72.8,75.8,77.9,76.9,175.0
8,33444.0,a-D-Glcp,81%,99.6,72.9,74.1,70.5,71.9,69.1


In [44]:
csv_hydrogen_f

Unnamed: 0,Linkage,Residue,Trust,H1,H2,H3,H4,H5,H6
0,,b-D-GalpN,81%,4.54,4.08,3.84,4.17,3.68,3.72-3.85
1,2.0,Ac,99%,-,2.05,,,,
2,3.0,a-D-Galp,89%,5.14,3.88,3.8,4.2,3.86,3.75-3.87
3,33.0,b-D-GalpN,89%,4.82,4.18,5.15,4.31,3.79,3.76-3.88
4,333.0,Ac,98%,-,2.13,,,,
5,332.0,Ac,99%,-,2.05,,,,
6,334.0,b-D-GlcpA,64%,4.62,3.45,3.64,3.73,3.94,-
7,3344.0,b-D-GlcpA,59%,4.51,3.53,3.76,3.86,3.88,-
8,33444.0,a-D-Glcp,88%,5.49,3.49,3.68,3.42,3.75,3.96-4.08


In [45]:
current_corresponding_csv_f['Residue']

8    a-D-Glcp
Name: Residue, dtype: object

In [46]:
a = csv_f.loc[csv_f['Linkage'] == '1,2,4,6']
a.loc[a['Residue'] != 'Ac']

Unnamed: 0,Linkage,Residue,Trust,C1,C2,C3,C4,C5,C6


In [47]:
csv_f

Unnamed: 0,Linkage,Residue,Trust,C1,C2,C3,C4,C5,C6
0,,b-D-GalpN,80%,103.2,52.1,77.1,65.4,76.5,62.1
1,2.0,Ac,76%,175.7,23.6,,,,
2,3.0,a-D-Galp,80%,96.8,68.9,80.7,70.5,72.2,62.1
3,33.0,b-D-GalpN,80%,104.0,52.6,73.7,72.9,75.6,61.8
4,333.0,Ac,91%,174.3,21.4,,,,
5,332.0,Ac,76%,175.7,23.6,,,,
6,334.0,b-D-GlcpA,61%,105.5,75.4,75.2,81.3,77.0,174.5
7,3344.0,b-D-GlcpA,65%,103.5,72.8,75.8,77.9,76.9,175.0
8,33444.0,a-D-Glcp,81%,99.6,72.9,74.1,70.5,71.9,69.1


In [48]:
pdb_f

Unnamed: 0,HETATM,Atom_num,Atom_name,Residual_name,Bound,Residual_num,x,y,z,Atom_type,Residual_accurate_name,Lineage,shift,trust
0,HETATM,1,R,NGA,A,1,0.896,9.168,17.021,X,b-D-GalpN,,-1,-1
1,HETATM,2,O3,NGA,A,1,-0.656,-1.103,2.529,O,b-D-GalpN,,-1,-1
2,HETATM,3,C3,NGA,A,1,-0.416,-0.412,3.763,C,b-D-GalpN,,77.1,80
3,HETATM,4,C2,NGA,A,1,-1.788,-0.157,4.458,C,b-D-GalpN,,52.1,80
4,HETATM,5,N2,NGA,A,1,-2.356,-1.524,4.695,N,b-D-GalpN,,-1,-1
5,HETATM,6,C1,NGA,A,1,-1.598,0.81,5.679,C,b-D-GalpN,,103.2,80
6,HETATM,7,R1,NGA,A,1,-2.912,1.531,6.119,X,b-D-GalpN,,-1,-1
7,HETATM,8,O5,NGA,A,1,-0.698,1.906,5.457,O,b-D-GalpN,,-1,-1
8,HETATM,9,C5,NGA,A,1,0.544,1.569,4.851,C,b-D-GalpN,,76.5,80
9,HETATM,10,C6,NGA,A,1,1.406,2.828,4.727,C,b-D-GalpN,,62.1,80


In [49]:
csv_f

Unnamed: 0,Linkage,Residue,Trust,C1,C2,C3,C4,C5,C6
0,,b-D-GalpN,80%,103.2,52.1,77.1,65.4,76.5,62.1
1,2.0,Ac,76%,175.7,23.6,,,,
2,3.0,a-D-Galp,80%,96.8,68.9,80.7,70.5,72.2,62.1
3,33.0,b-D-GalpN,80%,104.0,52.6,73.7,72.9,75.6,61.8
4,333.0,Ac,91%,174.3,21.4,,,,
5,332.0,Ac,76%,175.7,23.6,,,,
6,334.0,b-D-GlcpA,61%,105.5,75.4,75.2,81.3,77.0,174.5
7,3344.0,b-D-GlcpA,65%,103.5,72.8,75.8,77.9,76.9,175.0
8,33444.0,a-D-Glcp,81%,99.6,72.9,74.1,70.5,71.9,69.1


In [50]:
type(csv_f.loc[1, ]['C1'])

numpy.float64

In [51]:
pdb_f

Unnamed: 0,HETATM,Atom_num,Atom_name,Residual_name,Bound,Residual_num,x,y,z,Atom_type,Residual_accurate_name,Lineage,shift,trust
0,HETATM,1,R,NGA,A,1,0.896,9.168,17.021,X,b-D-GalpN,,-1,-1
1,HETATM,2,O3,NGA,A,1,-0.656,-1.103,2.529,O,b-D-GalpN,,-1,-1
2,HETATM,3,C3,NGA,A,1,-0.416,-0.412,3.763,C,b-D-GalpN,,77.1,80
3,HETATM,4,C2,NGA,A,1,-1.788,-0.157,4.458,C,b-D-GalpN,,52.1,80
4,HETATM,5,N2,NGA,A,1,-2.356,-1.524,4.695,N,b-D-GalpN,,-1,-1
5,HETATM,6,C1,NGA,A,1,-1.598,0.81,5.679,C,b-D-GalpN,,103.2,80
6,HETATM,7,R1,NGA,A,1,-2.912,1.531,6.119,X,b-D-GalpN,,-1,-1
7,HETATM,8,O5,NGA,A,1,-0.698,1.906,5.457,O,b-D-GalpN,,-1,-1
8,HETATM,9,C5,NGA,A,1,0.544,1.569,4.851,C,b-D-GalpN,,76.5,80
9,HETATM,10,C6,NGA,A,1,1.406,2.828,4.727,C,b-D-GalpN,,62.1,80


In [52]:
a = pd.read_csv('dataset/Godess_gnn/preprocess_data/Godess_carbon_name.csv')
a.loc[a['0'] == 150]

Unnamed: 0.1,Unnamed: 0,0,1,2
149,149,150,2065,Ac(1-6)bDGlcp(1-2)[Ac(1-6)]bDGlcp(1-17)lR17HOO...


In [53]:
# def match_pdb_csv_lineage(pdb_f, csv_f, carbon_list, ac_carbon_list = ['C11', 'C21'], trust_threshold = 50):
    
#     csv_f['Linkage'] = csv_f[['Linkage']].fillna('')
#     csv_f = csv_f.astype({'Linkage':str})
#     if ('1.0' in csv_f['Linkage'].values) or ('2.0' in csv_f['Linkage'].values) or ('3.0' in csv_f['Linkage'].values) or \
#         ('4.0' in csv_f['Linkage'].values) or ('5.0' in csv_f['Linkage'].values) or ('6.0' in csv_f['Linkage'].values):
#         print('error in pdb lineage')
    
#     for i in range(len(pdb_f)):
#         current_pdb_lineage = pdb_f.loc[i, :]['Lineage']
#         current_pdb_atom = pdb_f.loc[i, :]['Atom_name']
#         current_pdb_residual_name = pdb_f.loc[i, :]['Residual_name'] 

#         # stores the monosaccharide labels 
#         current_corresponding_csv_f = csv_f.loc[csv_f['Linkage'] == current_pdb_lineage]

#         # stores the Ac labels
#         current_corresponding_csv_f_ac = pd.DataFrame(columns=csv_f.columns)

#         if 'Ac' in current_corresponding_csv_f['Residue'].values:

#             current_corresponding_csv_f_ac = current_corresponding_csv_f.\
#                 loc[current_corresponding_csv_f['Residue'] == 'Ac']

#             current_corresponding_csv_f = current_corresponding_csv_f.\
#                 loc[current_corresponding_csv_f['Residue'] != 'Ac']

#         # current monosaccharide, assign shift,
#         if (current_pdb_atom in carbon_list) and (not current_corresponding_csv_f.empty):

#             current_shift = current_corresponding_csv_f[current_pdb_atom].values[0]

#             if isinstance(current_corresponding_csv_f['Trust'].values[0], float) and \
#                 np.isnan(current_corresponding_csv_f['Trust'].values[0]):

#                 print('Missing trust confidence interval: ', current_corresponding_csv_f['Residue'])
#                 current_trust = -1
#             else:

#                 current_trust = current_corresponding_csv_f['Trust'].values[0]
#                 current_trust = int(current_trust.split('%')[0])

#             pdb_f.loc[i, ['shift']] = current_shift
#             pdb_f.loc[i, ['trust']] = current_trust

#         # current Ac and the pdb file has a separate three letter code
#         elif (current_pdb_atom in carbon_list) and (not current_corresponding_csv_f_ac.empty) \
#             and (current_pdb_residual_name == 'ACY'):

#             current_shift = current_corresponding_csv_f_ac[current_pdb_atom].values[0]

#             if isinstance(current_corresponding_csv_f_ac['Trust'].values[0], float) and \
#                 np.isnan(current_corresponding_csv_f_ac['Trust'].values[0]):

#                 print('Missing trust confidence interval: ', current_corresponding_csv_f_ac['Residue'])
#                 current_trust = -1
#             else:
#                 current_trust = current_corresponding_csv_f_ac['Trust'].values[0]
#                 current_trust = int(current_trust.split('%')[0])

#             pdb_f.loc[i, ['shift']] = current_shift
#             pdb_f.loc[i, ['trust']] = current_trust

#         elif (current_pdb_atom in ac_carbon_list):
# #             print(i)
#             pass
        
#     return pdb_f