In [1]:
import numpy as np
import pandas as pd
import os
import re
from tqdm import tqdm
# Widen pandas' notebook rendering so long glycan names and large tables
# are shown in full instead of being truncated.
pd.options.display.max_rows = 1000
pd.options.display.max_columns = 1000
pd.options.display.max_colwidth = None


In [2]:
# Input directories: the per-glycan CSV tables (read for their 'Atom',
# 'Residual Num', 'Shift' and 'Residual' columns) and the PDB-derived atom
# tables that receive the labels.
csv_dir = 'experimental_data/FullyAnnotatedPDB_label_corrected/'
pdb_dir = 'experimental_data/FullyAnnotatedPDB_pdb_added_atom/'
# pdb_dir = 'experimental_data/FullyAnnotatedPDB_pdb/'

# os.listdir returns entries in arbitrary order; sort so that every
# order-dependent result further down is reproducible across machines/runs.
# Only '.csv' entries are kept in csv_list so stray files (editor checkpoints,
# OS metadata) are never handed to pd.read_csv.
csv_list = sorted(name for name in os.listdir(csv_dir) if name.endswith('.csv'))
pdb_list = sorted(os.listdir(pdb_dir))

In [3]:
# Atom names eligible for a label: ring/exocyclic carbons C1..C6 and the
# hydrogens H1..H6 plus the two C6 hydrogens H61/H62.
carbon_list = [f'C{position}' for position in range(1, 7)]
hydrogen_list_1_6 = [f'H{position}' for position in range(1, 7)] + ['H61', 'H62']

In [4]:
# Accumulators filled by the labeling loop. The first three plus
# residual_num_list are appended to together (one entry per labeled atom).
csv_mono_list, pdb_mono_list, file_list = [], [], []
# mono_all collects every residue name seen; max_residual_num_list holds one
# value per processed file.
mono_all, residual_num_list, max_residual_num_list = [], [], []

In [5]:
# Attach a label to every eligible atom of each PDB-derived table and write the
# labeled table out. While doing so, record (csv residue name, pdb residue name,
# glycan, residue number) for every atom that received a label.

# Atom names that may carry a label (set membership instead of two list scans).
labelled_atom_names = set(carbon_list) | set(hydrogen_list_1_6)

# labeled_dir = 'experimental_data/FullyAnnotatedPDB_pdb_labeled'
labeled_dir = 'experimental_data/FullyAnnotatedPDB_pdb_labeled_add_atom'
# DataFrame.to_csv does not create missing directories.
os.makedirs(labeled_dir, exist_ok=True)

for csv_name in csv_list:
    # Strip only the trailing extension; str.replace would rewrite every
    # '.csv' occurrence inside the name.
    glycan_name = os.path.splitext(csv_name)[0]
    pdb_name = glycan_name + '.pdb.csv'

    csv_f = pd.read_csv(os.path.join(csv_dir, csv_name))
    pdb_f = pd.read_csv(os.path.join(pdb_dir, pdb_name))
    # -1.0 marks atoms that did not receive a label.
    temp_labels = np.repeat(-1.0, len(pdb_f))

    # Record atom and residue names.
    temp_all_atoms = pdb_f['Atom_name'].values
    temp_all_residual_name = pdb_f['Residual_name'].values
    temp_all_residual_number = pdb_f['Residual_num'].values

    max_residual_num_list.append(np.max(temp_all_residual_number))
    mono_all.extend(list(temp_all_residual_name))

    for i in range(len(temp_labels)):
        current_atom = temp_all_atoms[i]
        if current_atom not in labelled_atom_names:
            continue
        current_residual_number = temp_all_residual_number[i]
        current_residual_name = temp_all_residual_name[i]

        # Look the atom up once and reuse the result for both columns.
        matched = csv_f.loc[(csv_f['Atom'] == current_atom) &
                            (csv_f['Residual Num'] == current_residual_number)]
        if matched.empty:
            # No entry for this atom in the CSV: leave it at -1.0.
            continue
        try:
            temp_labels[i] = matched['Shift'].values[0]
        except (TypeError, ValueError):
            # Entry exists but is not numeric: treat as unlabeled.
            # Anything else (e.g. a missing column) now surfaces instead of
            # being silently swallowed by a bare `except`.
            continue

        csv_mono_list.append(matched['Residual'].values[0])
        pdb_mono_list.append(current_residual_name)
        residual_num_list.append(current_residual_number)
        file_list.append(glycan_name)

    pdb_f['labels'] = temp_labels
    # Positional rename: requires pdb_f to have exactly nine input columns
    # (plus 'labels'); the input column order is not visible here -- TODO confirm.
    pdb_f.columns = ['Atom', 'Atom_Num', 'Atom_Type', 'Monosaccharide_belong',
                     'residual', 'x', 'y', 'z', 'atoms_simplify', 'labels']
    pdb_f.to_csv(os.path.join(labeled_dir, pdb_name), index=False)

In [6]:
# Sanity check: column names of `pdb_f`, which at this point is the table of
# the last file handled by the labeling loop (after the positional rename).
pdb_f.columns

Index(['Atom', 'Atom_Num', 'Atom_Type', 'Monosaccharide_belong', 'residual',
       'x', 'y', 'z', 'atoms_simplify', 'labels'],
      dtype='object')

In [7]:
# One row per labeled atom; the four parallel lists become columns 0..3
# (csv residue name, pdb residue name, glycan name, residue number).
match_rows_as_columns = [csv_mono_list, pdb_mono_list, file_list, residual_num_list]
df_match = pd.DataFrame(match_rows_as_columns).transpose()

In [8]:
# Collapse the per-atom rows to one row per distinct combination, renumber the
# index from 0, and give the columns readable names.
df_match_s = df_match.drop_duplicates().reset_index(drop=True)
df_match_s.columns = ['csv', 'pdb', 'glycan name', 'residual num']

In [9]:
# Persist the de-duplicated match table (index column omitted).
df_match_s.to_csv(path_or_buf='matching_table/mono_match_linear_v2.csv', index=False)

In [10]:
# Order the match table by glycan name (descending) and renumber rows from 0.
df_match_s = (
    df_match_s
    .sort_values(by='glycan name', ascending=False)
    .reset_index(drop=True)
)

In [11]:
# Display the de-duplicated match table, sorted by glycan name (descending).
df_match_s

Unnamed: 0,csv,pdb,glycan name,residual num
0,b-D-ManpNAc,BMA,repeat-4)-b-D-ManpNAc-(1-4)-a-D-GlcpNAc-(1-,2
1,a-D-GlcpNAc,GDL,repeat-4)-b-D-ManpNAc-(1-4)-a-D-GlcpNAc-(1-,1
2,b-D-Galp,GLB,repeat-4)-b-D-Galp-(1-4)-a-D-GlcpNAc-(1-4)-b-D-Galp-(1-3)-a-D-GlcpNAc-(1-2)-b-D-Ribf-(1-,3
3,b-D-Galp,GLB,repeat-4)-b-D-Galp-(1-4)-a-D-GlcpNAc-(1-4)-b-D-Galp-(1-3)-a-D-GlcpNAc-(1-2)-b-D-Ribf-(1-,5
4,a-D-GlcpNAc,GDL,repeat-4)-b-D-Galp-(1-4)-a-D-GlcpNAc-(1-4)-b-D-Galp-(1-3)-a-D-GlcpNAc-(1-2)-b-D-Ribf-(1-,4
5,a-D-GlcpNAc,GDL,repeat-4)-b-D-Galp-(1-4)-a-D-GlcpNAc-(1-4)-b-D-Galp-(1-3)-a-D-GlcpNAc-(1-2)-b-D-Ribf-(1-,2
6,b-D-Ribf,RIB,repeat-4)-b-D-Galp-(1-4)-a-D-GlcpNAc-(1-4)-b-D-Galp-(1-3)-a-D-GlcpNAc-(1-2)-b-D-Ribf-(1-,1
7,a-D-Galp,GLA,repeat-4)-a-D-Glcp-(1-4)-a-D-GlcpA-(1-3)-a-D-Galp-(1-3)-b-D-ManpNAc-(1-4)-b-D-Glcp-(1-,3
8,b-D-Glcp,GLC,repeat-4)-a-D-Glcp-(1-4)-a-D-GlcpA-(1-3)-a-D-Galp-(1-3)-b-D-ManpNAc-(1-4)-b-D-Glcp-(1-,1
9,b-D-ManpNAc,BMA,repeat-4)-a-D-Glcp-(1-4)-a-D-GlcpA-(1-3)-a-D-Galp-(1-3)-b-D-ManpNAc-(1-4)-b-D-Glcp-(1-,2


In [12]:
# Number of match-table rows per glycan name.
df_match_s.groupby(['glycan name'], as_index=False).size()

Unnamed: 0,glycan name,size
0,P-(O-2)-a-D-Glcp-(1-1)-a-D-Glcp,2
1,P-(O-3)-a-D-Glcp-(1-1)-a-D-Glcp,2
2,P-(O-4)-a-D-Glcp-(1-1)-a-D-Glcp,2
3,P-(O-6)-a-D-Glcp-(1-1)-a-D-Glcp,2
4,Repeat-2)-a-L-Rhap-(1-3)-a-D-Galp-(1-,2
5,Repeat-3)-a-D-Rhap-(1-3)-a-D-Rhap-(1-4)-a-D-GalpNAc-(1-,3
6,Repeat-3)-a-L-Rhap-(1-2)-a-L-Rhap-(1-4)-a-D-GlcpA-(1-2)-a-D-Manp-(1-2)-a-D-Manp-(1-3)-a-D-Galp-(1-,6
7,Repeat-3)-b-D-Galp-(1-3)-b-D-GalpNAc-(1-4)-b-D-Galp-(1-,3
8,Repeat-3)-b-D-GlcpA-(1-3)-a-L-Rhap-(1-,2
9,Repeat-4)-a-D-GlcpA-(1-3)-a-D-Glcp-(1-3)-b-D-ManpNAc-(1-4)-b-D-Glcp-(1-4)-a-D-GlcpNAc-(1-,5


P-(O-2)-a-D-Glcp-(1-1)-a-D-Glcp	

P-(O-3)-a-D-Glcp-(1-1)-a-D-Glcp

P-(O-4)-a-D-Glcp-(1-1)-a-D-Glcp

P-(O-6)-a-D-Glcp-(1-1)-a-D-Glcp	

a-D-Kdop-(2-8)-a-D-Kdop-(2-4)-a-D-Kdop-(2-6)-b-D-GlcpN4PO4-(1-6)-a-D-GlcpN1PO4	


In [13]:
# Show the match-table rows that belong to one specific glycan.
df_match_s[df_match_s['glycan name'] == 'P-(O-2)-a-D-Glcp-(1-1)-a-D-Glcp']

Unnamed: 0,csv,pdb,glycan name,residual num
507,a-D-Glcp,GLC,P-(O-2)-a-D-Glcp-(1-1)-a-D-Glcp,3
508,a-D-Glcp,ALX,P-(O-2)-a-D-Glcp-(1-1)-a-D-Glcp,1


In [14]:
# Count how many per-atom rows share each (csv, pdb, glycan, residue number)
# combination, then sort by glycan name (descending) and save the table.
combination_counts = df_match.groupby(df_match.columns.tolist(), as_index=False).size()
combination_counts.columns = ['csv', 'pdb', 'glycan name', 'residual num', 'size']
df_match_count_duplicatie = (
    combination_counts
    .sort_values(by='glycan name', ascending=False)
    .reset_index(drop=True)
)
df_match_count_duplicatie.to_csv('matching_table/mono_match_linear_count_v2.csv', index=False)

In [15]:
# Display the per-combination row counts ('size' column), sorted by glycan name.
df_match_count_duplicatie

Unnamed: 0,csv,pdb,glycan name,residual num,size
0,b-D-ManpNAc,BMA,repeat-4)-b-D-ManpNAc-(1-4)-a-D-GlcpNAc-(1-,2,11
1,a-D-GlcpNAc,GDL,repeat-4)-b-D-ManpNAc-(1-4)-a-D-GlcpNAc-(1-,1,11
2,b-D-Ribf,RIB,repeat-4)-b-D-Galp-(1-4)-a-D-GlcpNAc-(1-4)-b-D-Galp-(1-3)-a-D-GlcpNAc-(1-2)-b-D-Ribf-(1-,1,9
3,b-D-Galp,GLB,repeat-4)-b-D-Galp-(1-4)-a-D-GlcpNAc-(1-4)-b-D-Galp-(1-3)-a-D-GlcpNAc-(1-2)-b-D-Ribf-(1-,5,10
4,a-D-GlcpNAc,GDL,repeat-4)-b-D-Galp-(1-4)-a-D-GlcpNAc-(1-4)-b-D-Galp-(1-3)-a-D-GlcpNAc-(1-2)-b-D-Ribf-(1-,2,10
5,a-D-GlcpNAc,GDL,repeat-4)-b-D-Galp-(1-4)-a-D-GlcpNAc-(1-4)-b-D-Galp-(1-3)-a-D-GlcpNAc-(1-2)-b-D-Ribf-(1-,4,10
6,b-D-Galp,GLB,repeat-4)-b-D-Galp-(1-4)-a-D-GlcpNAc-(1-4)-b-D-Galp-(1-3)-a-D-GlcpNAc-(1-2)-b-D-Ribf-(1-,3,10
7,a-D-GlcpA,GLC,repeat-4)-a-D-Glcp-(1-4)-a-D-GlcpA-(1-3)-a-D-Galp-(1-3)-b-D-ManpNAc-(1-4)-b-D-Glcp-(1-,4,10
8,a-D-Galp,GLA,repeat-4)-a-D-Glcp-(1-4)-a-D-GlcpA-(1-3)-a-D-Galp-(1-3)-b-D-ManpNAc-(1-4)-b-D-Glcp-(1-,3,10
9,a-D-Glcp,GLC,repeat-4)-a-D-Glcp-(1-4)-a-D-GlcpA-(1-3)-a-D-Galp-(1-3)-b-D-ManpNAc-(1-4)-b-D-Glcp-(1-,5,10


In [16]:
# Display the sorted match table again, for comparison with the count table.
df_match_s

Unnamed: 0,csv,pdb,glycan name,residual num
0,b-D-ManpNAc,BMA,repeat-4)-b-D-ManpNAc-(1-4)-a-D-GlcpNAc-(1-,2
1,a-D-GlcpNAc,GDL,repeat-4)-b-D-ManpNAc-(1-4)-a-D-GlcpNAc-(1-,1
2,b-D-Galp,GLB,repeat-4)-b-D-Galp-(1-4)-a-D-GlcpNAc-(1-4)-b-D-Galp-(1-3)-a-D-GlcpNAc-(1-2)-b-D-Ribf-(1-,3
3,b-D-Galp,GLB,repeat-4)-b-D-Galp-(1-4)-a-D-GlcpNAc-(1-4)-b-D-Galp-(1-3)-a-D-GlcpNAc-(1-2)-b-D-Ribf-(1-,5
4,a-D-GlcpNAc,GDL,repeat-4)-b-D-Galp-(1-4)-a-D-GlcpNAc-(1-4)-b-D-Galp-(1-3)-a-D-GlcpNAc-(1-2)-b-D-Ribf-(1-,4
5,a-D-GlcpNAc,GDL,repeat-4)-b-D-Galp-(1-4)-a-D-GlcpNAc-(1-4)-b-D-Galp-(1-3)-a-D-GlcpNAc-(1-2)-b-D-Ribf-(1-,2
6,b-D-Ribf,RIB,repeat-4)-b-D-Galp-(1-4)-a-D-GlcpNAc-(1-4)-b-D-Galp-(1-3)-a-D-GlcpNAc-(1-2)-b-D-Ribf-(1-,1
7,a-D-Galp,GLA,repeat-4)-a-D-Glcp-(1-4)-a-D-GlcpA-(1-3)-a-D-Galp-(1-3)-b-D-ManpNAc-(1-4)-b-D-Glcp-(1-,3
8,b-D-Glcp,GLC,repeat-4)-a-D-Glcp-(1-4)-a-D-GlcpA-(1-3)-a-D-Galp-(1-3)-b-D-ManpNAc-(1-4)-b-D-Glcp-(1-,1
9,b-D-ManpNAc,BMA,repeat-4)-a-D-Glcp-(1-4)-a-D-GlcpA-(1-3)-a-D-Galp-(1-3)-b-D-ManpNAc-(1-4)-b-D-Glcp-(1-,2


In [17]:
# Full glycan name stored in row 2 of the match table.
df_match_s.loc[2, 'glycan name']

'repeat-4)-b-D-Galp-(1-4)-a-D-GlcpNAc-(1-4)-b-D-Galp-(1-3)-a-D-GlcpNAc-(1-2)-b-D-Ribf-(1-'

In [18]:
# Full glycan name stored in row 3 of the count table.
df_match_count_duplicatie.loc[3, 'glycan name']

'repeat-4)-b-D-Galp-(1-4)-a-D-GlcpNAc-(1-4)-b-D-Galp-(1-3)-a-D-GlcpNAc-(1-2)-b-D-Ribf-(1-'

In [19]:
# Build one monosaccharide name per labeled residue (input to the frequency
# table). csv_mono_list holds one entry per labeled atom, so it has to be
# reduced to one entry per residue.
#
# The previous version dropped only *consecutive* repeats of the name, which
# merges two neighbouring residues that share a name (e.g. the two a-D-Glcp
# residues of 'P-(O-2)-a-D-Glcp-(1-1)-a-D-Glcp') and therefore undercounts.
# Residues are identified here by (glycan name, residue number) instead.
# NOTE(review): assumes the frequency table is meant to count residues --
# confirm against the intended use of freq_table_v2.csv.
unique_mono_l = []
seen_residues = set()
for mono_name, glycan_name, residue_number in zip(csv_mono_list, file_list, residual_num_list):
    residue_key = (glycan_name, residue_number)
    if residue_key in seen_residues:
        continue
    seen_residues.add(residue_key)
    unique_mono_l.append(mono_name)

In [20]:
from itertools import groupby

# Frequency table: sort the names so equal ones are adjacent, then emit
# [name, run length] pairs.
a = unique_mono_l
b = [[mono_name, sum(1 for _ in run)] for mono_name, run in groupby(sorted(a))]

In [21]:
# Persist the [name, count] pairs as a two-column CSV (index column omitted).
freq_table = pd.DataFrame(b)
freq_table.to_csv('matching_table/freq_table_v2.csv', index=False)

In [22]:
# Largest residue number found in each processed file, in processing order.
np.asarray(max_residual_num_list)

array([5, 4, 3, 4, 4, 3, 3, 1, 5, 4, 3, 5, 1, 1, 1, 3, 3, 3, 1, 5, 7, 5,
       4, 5, 2, 2, 4, 4, 3, 1, 5, 5, 3, 4, 5, 3, 3, 1, 6, 3, 4, 3, 3, 4,
       3, 3, 2, 2, 1, 1, 6, 2, 4, 2, 2, 2, 6, 3, 1, 2, 4, 3, 1, 5, 1, 4,
       5, 3, 1, 3, 1, 4, 1, 4, 3, 6, 4, 1, 5, 5, 3, 2, 6, 1, 4, 6, 4, 4,
       3, 1, 5, 4, 3, 3, 3, 6, 2, 1, 1, 2, 2, 7, 3, 2, 3, 1, 6, 3, 3, 2,
       4, 2, 2, 4, 2, 1, 1, 4, 5, 3, 3, 2, 3, 1, 4, 1, 5, 2, 2, 4, 2, 3,
       2, 1, 3, 4, 2, 3, 3, 4, 5, 2, 4, 4, 3, 4, 1, 3, 3, 2, 5, 4, 4, 1,
       2, 5, 2, 3, 3, 3, 3, 2, 3, 2, 3, 1, 1, 1, 1, 5, 5, 3, 2, 4, 2, 3,
       6, 3, 4, 2, 2, 3, 5, 4, 1])

In [23]:
# Check whether two near-identical glycan names are the same string. They
# differ only in one residue prefix: 'a-L-Galp' vs 'b-L-Galp'.
# Uses `==` rather than calling the `__eq__` dunder directly.
(
    'b-D-Galp-(1-3)-b-D-Glcp-(1-3)-b-D-Xylp-(1-4)-b-D-Xylp-(1-4)-a-L-Galp-(2-1)-b-D-Xylp'
    == 'b-D-Galp-(1-3)-b-D-Glcp-(1-3)-b-D-Xylp-(1-4)-b-D-Xylp-(1-4)-b-L-Galp-(2-1)-b-D-Xylp'
)

False