In [1]:
import os
from Bond import Bond
from types import MappingProxyType
from collections import namedtuple
from Residue import Residue as Res
from ChMMCIFParser import ChMMCIFParser as ChmParser
from TopoDefinitions import ResidueDefinition



In [2]:
# Development aid: reload the local Residue and ChMMCIFParser modules so that
# edits made to them on disk are picked up without restarting the kernel.
import importlib
import Residue
import ChMMCIFParser
importlib.reload(Residue)
importlib.reload(ChMMCIFParser)
# Re-bind the short aliases so they point at the classes of the reloaded modules.
from Residue import Residue as Res
from ChMMCIFParser import ChMMCIFParser as ChmParser

In [3]:
def find_local_cif_path(pdb_id, entry_point='/mnt/backup/PDB/'):
    """Locate the mmCIF file of a PDB entry inside a local mirror.

    The file is looked up at
    ``<entry_point>/<characters 2-3 of the id>/<id>.cif`` using the
    lower-cased identifier.

    Parameters
    ----------
    pdb_id : str
        PDB identifier; matched case-insensitively.
    entry_point : str, optional
        Root directory of the mirror. Defaults to the location that was
        previously hard-coded, so existing calls behave as before.

    Returns
    -------
    str or None
        Path of the ``.cif`` file, or ``None`` when no such file exists.
    """
    pdb_id = pdb_id.lower()
    # Files are sharded by the two middle characters of the identifier.
    subdir = pdb_id[1:3]
    file_path = os.path.join(entry_point, subdir, pdb_id + '.cif')
    if os.path.exists(file_path):
        return file_path
    # Make the "not found" outcome explicit for callers.
    return None

In [4]:
def skip_line(x):
    """Tell whether a line should be ignored when reading the topology file.

    A line is ignored when it is blank (only whitespace) or when its very
    first character is ``!`` or ``*``.

    Returns
    -------
    bool
    """
    if x.strip() == '':
        return True
    return x.startswith(('!', '*'))

# Read the topology file and keep only the lines that skip_line() does not
# reject. The test is applied to the stripped text, but the retained lines are
# stored unmodified (including their line endings).
with open('../allosteric-miner/dock_probes/toppar/top_all36_prot.rtf', 'r') as f:
    lines = [raw for raw in f if not skip_line(raw.strip())]

In [5]:
def comment_parser(line):
    """Separate the data part of a line from its ``!`` comment.

    Parameters
    ----------
    line : str
        One line of the topology file.

    Returns
    -------
    tuple
        ``(fields, comment)``. ``fields`` is everything before the first
        ``!`` (left unstripped); ``comment`` is the stripped text after it.
        When the line holds no ``!`` the whole line is returned together
        with ``None``.
    """
    data_part, marker, remark = line.partition('!')
    if not marker:
        # No comment marker present on this line
        return line, None
    return data_part, remark.strip()

def mass_parser(line):
    """Parse a ``MASS`` record.

    Expected layout:
    ``MASS <deprecated index> <atom type> <mass> [<element>] [! description]``

    Only the third and fourth fields are read, so records that carry a
    trailing element-symbol column (found in some CHARMM topology files) are
    accepted in addition to the plain four-field form.

    Parameters
    ----------
    line : str
        A line starting with the MASS keyword.

    Returns
    -------
    tuple
        ``(atom_type, mass, desc)``. ``mass`` is returned as the string
        found in the file; ``desc`` is the trailing comment or ``None``.

    Raises
    ------
    ValueError
        If the record holds fewer than four fields.
    """
    field_str, desc = comment_parser(line)
    # fields: [key, deprecated_entry, atom_type, atomic_mass, (element)]
    fields = field_str.split()
    if len(fields) < 4:
        raise ValueError(
            'Invalid MASS record, at least 4 fields required: {}'.format(line)
        )
    atom_type, mass = fields[2], fields[3]
    return atom_type, mass, desc

def decl_parser(line):
    """Parse a ``DECLare`` record and return the declared atom name.

    The atom name is the last whitespace-separated field of the line once
    any trailing comment has been removed.
    """
    data_part, _ = comment_parser(line)
    return data_part.split()[-1]

def defa_parser(line):
    """Parse the record naming the default terminal patches.

    Returns
    -------
    tuple
        ``(first_patch, last_patch)``: the fields that directly follow the
        ``FIRS`` and ``LAST`` keywords on the line.

    Raises
    ------
    ValueError
        If either keyword is absent from the line.
    """
    tokens = comment_parser(line)[0].split()
    # Locate both keywords first; each patch name sits right after its keyword.
    first_pos, last_pos = tokens.index('FIRS'), tokens.index('LAST')
    return tokens[first_pos + 1], tokens[last_pos + 1]

def auto_parser(line):
    """Parse an ``AUTO`` record.

    Returns the list of fields that follow the keyword, with any trailing
    comment removed.
    """
    data_part, _ = comment_parser(line)
    tokens = data_part.split()
    # Drop the leading keyword, keep the options
    return tokens[1:]

def resi_parser(line):
    """Parse a residue header record (used for ``RESI`` and ``PRES`` lines).

    Returns
    -------
    tuple
        ``(res_name, total_charge, desc)``. ``total_charge`` is the string
        found in the file; ``desc`` is the trailing comment or ``None``.

    Raises
    ------
    ValueError
        If the data part does not consist of exactly three fields.
    """
    data_part, description = comment_parser(line)
    # Layout: keyword, residue name, total charge
    _keyword, name, charge = data_part.split()
    return name, charge, description

def atom_parser(line):
    """Parse an ``ATOM`` record.

    Returns
    -------
    tuple
        ``(atom_name, atom_type, charge)`` with ``charge`` left as the string
        found in the file.

    Raises
    ------
    ValueError
        If the data part does not consist of exactly four fields.
    """
    data_part, _ = comment_parser(line)
    # Layout: keyword, atom name, atom type, partial charge
    _keyword, name, kind, partial_charge = data_part.split()
    return name, kind, partial_charge

def pairwise_parser(line):
    """Parse a record that lists atoms in consecutive pairs.

    The first field is the keyword; the remaining fields are grouped two by
    two in the order they appear.

    Returns
    -------
    list of tuple
        One ``(atom_a, atom_b)`` tuple per pair.

    Raises
    ------
    ValueError
        If the number of atoms after the keyword is odd.
    """
    atoms = comment_parser(line)[0].split()[1:]
    if len(atoms) % 2 != 0:
        raise ValueError('Odd number of atoms! Cannot group into pairs: {}'.format(line))
    # Zipping one iterator with itself consumes the atoms two at a time.
    cursor = iter(atoms)
    return list(zip(cursor, cursor))

def quad_parser(line):
    """Parse an ``IMPR`` record into groups of four atoms.

    The first field is the keyword; the remaining fields are split into
    consecutive groups of four, in order.

    Parameters
    ----------
    line : str
        A line starting with the IMPR keyword.

    Returns
    -------
    tuple of tuple
        One 4-tuple of atom names per group. A line with eight atoms gives
        two groups, a line with four atoms gives one group, and so on.

    Raises
    ------
    ValueError
        If the number of atoms is not a multiple of four.
    """
    field_str = comment_parser(line)[0]
    fields = field_str.split()[1:]
    if len(fields) % 4 != 0:
        raise ValueError('Invalid length of topology specification: {}\n \
        Multiples  of 4 atoms required'.format(line))
    # Chunk by four so that every accepted length is grouped correctly, not
    # only the eight-atom case (a fixed "first four / rest" split would add
    # an empty group for four atoms and an over-long one for twelve).
    quad = tuple(
        tuple(fields[start:start + 4]) for start in range(0, len(fields), 4)
    )
    return quad

def octa_parser(line):
    """Parse a ``CMAP`` record made of two groups of four atoms.

    Returns
    -------
    tuple
        ``(first_quad, second_quad)``, each a 4-tuple of atom names.

    Raises
    ------
    ValueError
        If the record does not list exactly eight atoms after the keyword.
    """
    atoms = comment_parser(line)[0].split()[1:]
    if len(atoms) != 8:
        raise ValueError('Invalid length of topology specification: {}\n \
        8 atoms required'.format(line))
    # Shape (2, 4): first four atoms, then the last four
    first_quad = tuple(atoms[:4])
    second_quad = tuple(atoms[4:])
    return first_quad, second_quad

def ic_parser(line):
    """Parse an internal-coordinate (``IC``) record.

    After the keyword the record holds nine fields: four atoms I, J, K, L
    followed by five numbers. A star (``*``) in front of atom K marks an
    improper (branch) entry with K as the center; otherwise the entry
    describes a chain. The meaning of the first two numbers depends on that:

    * chain:    ``R(I-J)`` distance I-J, ``T(I-J-K)`` angle I-J-K
    * improper: ``R(I-K)`` distance I-K, ``T(I-K-J)`` angle I-K-J

    The remaining numbers are the dihedral ``Phi``, the angle ``T(J-K-L)``
    and the distance ``R(K-L)``.

    Returns
    -------
    dict
        Atom names under ``'I'``, ``'J'``, ``'K'``, ``'L'`` (K keeps its
        ``*`` prefix when present) and the five numbers as floats under the
        keys described above.

    Raises
    ------
    ValueError
        If the record does not hold exactly nine fields after the keyword,
        or a numeric field cannot be converted to float.
    """
    values = comment_parser(line)[0].split()[1:]
    if len(values) != 9:
        raise ValueError('Invalid length of topology specification: {}\n \
        9 fields required'.format(line))
    atom_i, atom_j, atom_k, atom_l = values[:4]
    lead_dist, lead_angle, dihedral, tail_angle, tail_dist = (
        float(number) for number in values[4:]
    )
    # '*' on K flags an improper entry, which changes what the first
    # distance and angle refer to.
    is_improper = atom_k.startswith('*')
    dist_key = 'R(I-K)' if is_improper else 'R(I-J)'
    angle_key = 'T(I-K-J)' if is_improper else 'T(I-J-K)'
    entry = {'I': atom_i, 'J': atom_j, 'K': atom_k, 'L': atom_l}
    entry[dist_key] = lead_dist
    entry[angle_key] = lead_angle
    entry['Phi'] = dihedral
    entry['T(J-K-L)'] = tail_angle
    entry['R(K-L)'] = tail_dist
    return entry

def delete_parser(line):
    """Parse a ``DELETE`` record.

    The keyword must be followed by exactly two fields: the type of the data
    to delete and the value identifying it.

    Returns
    -------
    dict
        ``{'key': <type field>, 'data': <value field>}``

    Raises
    ------
    ValueError
        If the record does not hold exactly two fields after the keyword.
    """
    arguments = comment_parser(line)[0].split()[1:]
    if len(arguments) != 2:
        raise ValueError('Invalid length of topology specification: {}\n \
        2 fields required: key, data'.format(line))
    data_kind, data_value = arguments
    return {'key': data_kind, 'data': data_value}

In [6]:
# Walk the filtered RTF lines once and fill the tables below.
#   rtf_ver            - fields of the first retained line joined with '.'
#   mass_dict          - atom type -> (mass as float, description)
#   decl_peptide_atoms - atom names announced by DECL records
#   default_patchs     - default FIRST/LAST patch names from the DEFA record
#   default_autogen    - options of the AUTO record
#   residue_topo_dict  - residue/patch name -> topology record (dict)
#   unparsed_lines     - lines whose keyword is not handled by this loop
rtf_ver = '.'.join(lines[0].strip().split())
mass_dict = dict()
decl_peptide_atoms = []
default_patchs = {'FIRST':None,'LAST':None}
default_autogen = None
residue_topo_dict = dict()
unparsed_lines = []
for l in lines[1:]:
    if l.startswith('MASS'):
        symbol, mass, desc = mass_parser(l)
        mass_dict.update({symbol: (float(mass), desc)})
    elif l.startswith('DECL'):
        atom = decl_parser(l)
        decl_peptide_atoms.append(atom)
    elif l.startswith('DEFA'):
        first_patch, last_patch = defa_parser(l)
        default_patchs['FIRST'] = first_patch
        default_patchs['LAST'] = last_patch
    elif l.startswith('AUTO'):
        default_autogen = auto_parser(l)
    elif l.startswith('RESI') or l.startswith('PRES'):
        res_name, total_charge, desc = resi_parser(l)
        # Every RESI/PRES record opens a fresh entry, and all following
        # records are stored in it. A repeated name therefore replaces the
        # earlier entry; it must never re-register the residue that happened
        # to be parsed just before it.
        # -1 means "no GROUP seen yet for this entry"
        cur_group_i = -1
        cur_res = {
            'desc': desc,
            'total_charge': float(total_charge),
            'atoms':{
            },
            'bonds':{
                'single':[],
                'double':[],
                'triple':[],
                'aromatic':[]
            },
            'impropers':[],
            'cmap':[],
            'ic':[],
            'is_patch': l.startswith('PRES')
        }
        residue_topo_dict[res_name] = cur_res
    elif l.startswith('GROUP'):
        # Open the next atom group of the current residue
        cur_group_i += 1
        cur_atom_group = {cur_group_i: {}}
        cur_res['atoms'].update(cur_atom_group)
    elif l.startswith('ATOM'):
        if cur_group_i == -1:
            # if no GROUP keyword exist for patch, create a single group
            cur_group_i = 0
            cur_atom_group = {cur_group_i: {}}
            cur_res['atoms'].update(cur_atom_group)
        atom_name, atom_type, atom_charge = atom_parser(l)
        # Mass and description come from the MASS record of the atom type,
        # which must have been read earlier (KeyError otherwise).
        cur_atom_dict = {
            atom_name:
            {
                'atom_type': atom_type,
                'charge': float(atom_charge),
                'mass': mass_dict[atom_type][0],
                'desc': mass_dict[atom_type][1]
            }
        }
        cur_res['atoms'][cur_group_i].update(cur_atom_dict)
    elif l.startswith('DONO'):
        # Strip a trailing comment first, as done for every other record
        donors = tuple(comment_parser(l)[0].split()[1:])
        cur_res.setdefault('H_donors', []).append(donors)
    elif l.startswith('ACCE'):
        acceptors = tuple(comment_parser(l)[0].split()[1:])
        cur_res.setdefault('H_acceptors', []).append(acceptors)
    elif l.startswith('BOND'):
        single_bonds = pairwise_parser(l)
        cur_res['bonds']['single'].extend(single_bonds)
    elif l.startswith('DOUBLE'):
        double_bonds = pairwise_parser(l)
        cur_res['bonds']['double'].extend(double_bonds)
    elif l.startswith('TRIPLE'):
        triple_bonds = pairwise_parser(l)
        cur_res['bonds']['triple'].extend(triple_bonds)
    elif l.startswith('AROMATIC'):
        # Aromatic bonds go to their own list, not to the triple bonds
        aromatic_bonds = pairwise_parser(l)
        cur_res['bonds']['aromatic'].extend(aromatic_bonds)
    elif l.startswith('IMPR'):
        # Improper (branching structures)
        impropers = quad_parser(l)
        cur_res['impropers'].append(impropers)
    elif l.startswith('CMAP'):
        # Dihedral crossterm energy correction map
        cmap = octa_parser(l)
        cur_res['cmap'].append(cmap)
    elif l.startswith('IC') or l.startswith('ic'):
        # Internal Coordinates
        ic_dict = ic_parser(l)
        cur_res['ic'].append(ic_dict)
    elif l.startswith('DELETE'):
        delete_entry = delete_parser(l)
        cur_res.setdefault('delete', []).append(delete_entry)
    elif l.startswith('DIHE'):
        # Recognized but deliberately not stored
        pass
    elif l.startswith('ANGLE'):
        # Recognized but deliberately not stored
        pass
    elif l.startswith('PATCH') or l.startswith('patch'):
        # Recognized but deliberately not stored
        pass
    elif l.startswith('END') or l.startswith('end'):
        # End of the topology file: stop reading
        break
    else:
        # Keep anything unrecognized for later inspection
        unparsed_lines.append(l)

In [14]:
# Build the mmCIF parser with include_solvent=False
# (presumably leaves solvent out of the parsed structure — confirm in ChMMCIFParser).
Parser = ChmParser(
    include_solvent=False
)
pdb_id = '1CDL'
# NOTE: find_local_cif_path returns None when the file is not present locally;
# that case is not checked before the path is handed to the parser.
file_path = find_local_cif_path(pdb_id)
structure = Parser.get_structure(file_path)

In [15]:
# Wrap every parsed residue/patch topology record in a ResidueDefinition,
# keyed by residue name.
all_res_def = {}
for resname, res_topo in residue_topo_dict.items():
    res_def = ResidueDefinition(rtf_ver, resname, res_topo)
    all_res_def[resname] = res_def
# Map all histidines to HSE: the 'HIS' key refers to the very same
# definition object as 'HSE' (an alias, not a copy).
all_res_def['HIS'] = all_res_def['HSE']

In [16]:
# Select chain 'A' of the entry indexed 1 in the structure
# (presumably model 1 — confirm against the ChMMCIFParser structure layout).
chainA = structure[1]['A']

In [17]:
# Hand the residue-definition table to chain A; later cells read the result
# through chainA.topo_definitions and the atoms' topo_definition attribute.
chainA.load_topo_definition(all_res_def)

In [19]:
# Collect every residue of chain A whose residue name is 'HIS'.
histidines = []
for res in chainA:
    if res.resname == 'HIS':
        histidines.append(res)

In [30]:
# Show the raw attributes of the topology definition attached to the CA atom
# of the first histidine.
histidines[0]['CA'].topo_definition.__dict__

{'parent_def': <Residue Definition name=HSE atoms=17>,
 'name': 'CA',
 'atom_type': 'CT1',
 'is_donor': False,
 'is_acceptor': False,
 'charge': 0.07,
 'mass': 12.011,
 'desc': 'aliphatic sp3 C for CH'}

In [48]:
# Mass reported by the atom object itself for that CA atom.
# NOTE(review): the recorded value (12.0107) differs from the 'mass' of the
# topology definition (12.011), so the two apparently come from different
# sources — confirm which one is intended.
histidines[0]['CA'].mass

12.0107

In [50]:
# Mass reported by the atom object for the N atom of the first histidine.
histidines[0]['N'].mass

14.0067

In [22]:
# Pick the residue stored under key 5 of chain 'A' for closer inspection.
# This rebinds `res`, which is also the loop variable of the histidine search.
res = structure[1]['A'][5]

In [39]:
# Display the selected residue.
res

<Residue THR het=  resseq=5 icode= >

In [24]:
# List the bonds currently attached to the selected residue.
res.bonds

[Bond(<Atom CB>, <Atom CA>, type=single, order=1, length=1.522034),
 Bond(<Atom N>, <Atom CA>, type=single, order=1, length=1.481669),
 Bond(<Atom C>, <Atom CA>, type=single, order=1, length=1.521941),
 Bond(<Atom O>, <Atom C>, type=double, order=2, length=1.227294)]

In [26]:
# Atom names the residue reports as missing
# (presumably relative to its topology definition — confirm in Residue).
res.missing_atoms

['HN', 'HA', 'HB', 'OG1', 'HG1', 'CG2', 'HG21', 'HG22', 'HG23']

In [43]:
# Look up the CA atom of the selected residue by name.
res['CA']

<Atom CA>

In [40]:
# Internal-coordinate entries of the THR definition loaded into chain A.
thr_ic = chainA.topo_definitions['THR'].ic

In [41]:
# Display the THR internal-coordinate entries.
thr_ic

[{'I': '-C',
  'J': 'CA',
  'K': '*N',
  'L': 'HN',
  'R(I-K)': 1.3471,
  'T(I-K-J)': 124.12,
  'Phi': 180.0,
  'T(J-K-L)': 114.26,
  'R(K-L)': 0.9995},
 {'I': '-C',
  'J': 'N',
  'K': 'CA',
  'L': 'C',
  'R(I-J)': 1.3471,
  'T(I-J-K)': 124.12,
  'Phi': 180.0,
  'T(J-K-L)': 106.09,
  'R(K-L)': 1.5162},
 {'I': 'N',
  'J': 'CA',
  'K': 'C',
  'L': '+N',
  'R(I-J)': 1.4607,
  'T(I-J-K)': 106.09,
  'Phi': 180.0,
  'T(J-K-L)': 117.69,
  'R(K-L)': 1.3449},
 {'I': '+N',
  'J': 'CA',
  'K': '*C',
  'L': 'O',
  'R(I-K)': 1.3449,
  'T(I-K-J)': 117.69,
  'Phi': 180.0,
  'T(J-K-L)': 120.3,
  'R(K-L)': 1.2294},
 {'I': 'CA',
  'J': 'C',
  'K': '+N',
  'L': '+CA',
  'R(I-J)': 1.5162,
  'T(I-J-K)': 117.69,
  'Phi': 180.0,
  'T(J-K-L)': 124.66,
  'R(K-L)': 1.4525},
 {'I': 'N',
  'J': 'C',
  'K': '*CA',
  'L': 'CB',
  'R(I-K)': 1.4607,
  'T(I-K-J)': 106.09,
  'Phi': 126.46,
  'T(J-K-L)': 112.74,
  'R(K-L)': 1.5693},
 {'I': 'N',
  'J': 'C',
  'K': '*CA',
  'L': 'HA',
  'R(I-K)': 1.4607,
  'T(I-K-J)': 106