# Structure Preparation with `pyCHARMM` and `crimm`

In this example, we fetch a structure directly from RCSB by PDB ID, then use `crimm` to build any missing loops, set the protonation states of the protein residues, and load the structure directly into `CHARMM`.

We use **1A8I** as an example to demonstrate protonation state assignment and loop building, but the routine works on any other protein or RNA structure. You can try **7ZAP** to load a structure with both protein and RNA present, and **4PTI** for disulfide bond patching.

In [1]:
from crimm.Fetchers import fetch_rcsb
from crimm.Modeller import TopologyGenerator
from crimm.Modeller.LoopBuilder import ChainLoopBuilder
from crimm.Modeller.TopoFixer import fix_chain
from crimm.StructEntities import Model
import crimm.Adaptors.pyCHARMMAdaptors as pcm_interface
from crimm.Adaptors.PropKaAdaptors import PropKaProtonator

from pycharmm.psf import delete_atoms as pcm_del_atoms
from pycharmm.psf import get_natom as pcm_get_natom
from pycharmm.generate import patch as pcm_patch
from pycharmm.settings import set_verbosity as pcm_set_verbosity



In [2]:
def minimize_chain(chain, sd_nstep, abnr_nstep):
    """Minimize a single chain in CHARMM and copy the result back to crimm.

    If the CHARMM PSF already holds atoms, they are deleted before the
    chain is loaded.

    Args:
        chain: crimm chain to load into CHARMM; its coordinates are
            synchronized from CHARMM once the minimization is done.
        sd_nstep: forwarded to ``pcm_interface.minimize`` as ``sd_nstep``.
        abnr_nstep: forwarded to ``pcm_interface.minimize`` as
            ``abnr_nstep``.
    """
    # Clear whatever is currently loaded in CHARMM before loading the chain
    natom_loaded = pcm_get_natom()
    if natom_loaded > 0:
        pcm_del_atoms()

    pcm_interface.load_chain(chain)
    pcm_interface.minimize(sd_nstep=sd_nstep, abnr_nstep=abnr_nstep)

    # Update the coordinates in the crimm structure
    pcm_interface.sync_coords(chain)

## Parameters
Parameters used for the preparation routine.

In [3]:
# PDB ID of the entry to fetch; also used as the stem of the output
# PDB/PSF file names
pdb_id = '1a8i'
# First/last patches passed to the topology generator for protein chains
prot_first_patch = 'ACE'
prot_last_patch = 'CT3'
# First/last patches passed to the topology generator for nucleic acid chains
na_first_patch = '5TER'
na_last_patch = '3PHO'
# Step counts passed to the minimizer as sd_nstep and abnr_nstep
sd_nstep = 300
abnr_nstep = 0
# CHARMM verbosity level set while a chain is being minimized
charmm_verbosity_level = 0
# pH value given to the protonator
pH = 7.4

## Fetch structure from RCSB

In [4]:
# Fetch the entry identified by pdb_id from RCSB
structure = fetch_rcsb(
    pdb_id,
    # NOTE(review): presumably requests the biological assembly instead of
    # the asymmetric unit -- confirm against the crimm fetcher
    use_bio_assembly=True,
    include_solvent=False,
    # any existing hydrogen will be removed and rebuilt later
    include_hydrogens=False,
    first_model_only=True
)
# Show the structure
structure



NGLWidget()

<Structure id=1A8I Models=1>
│
├───<Model id=1 Chains=4>
	│
	├───<Polypeptide(L) id=A Residues=813>
	├──────Description: GLYCOGEN PHOSPHORYLASE B
	│
	├───<Heterogens id=B Molecules=1>
	├──────Description: BETA-D-GLUCOPYRANOSE SPIROHYDANTOIN
	│
	├───<Polypeptide(L) id=C Residues=813>
	├──────Description: GLYCOGEN PHOSPHORYLASE B
	│
	├───<Heterogens id=D Molecules=1>
	├──────Description: BETA-D-GLUCOPYRANOSE SPIROHYDANTOIN


As we can see in the sequence below, this structure has missing loops (shown in red). We are going to build these loops with `crimm`.

In [5]:
# Show the masked sequence of the first chain in the first model
# (missing residues are highlighted in red in the output)
structure.models[0].chains[0].masked_seq.show()

[91mS[0m[91mR[0m[91mP[0m[91mL[0m[91mS[0m[91mD[0mQEKRKQISVRGLAGVENVTELKKNFNRHLHFTLVKDRNVATPRDYYFALAHTVRDHLVGRWIRTQQHYYEKDPKRIYYLSLEFYMGRTLQNTMVNLALENACDEATYQLGLDMEELEEIEEDAGLGNGGLGRLAACFLDSMATLGLAAYGYGIRYEFGIFNQKICGGWQMEEADDWLRYGNPWEKARPEFTLPVHFYGRVEHTSQGAKWVDTQVVLAMPYDTPVPGYRNNVVNTMRLWSAKAPN[91mD[0m[91mF[0m[91mN[0m[91mL[0m[91mK[0m[91mD[0m[91mF[0m[91mN[0m[91mV[0m[91mG[0mGYIQAVLDRNLAENISRVLYPNDNFFEGKELRLKQEYFVVAATLQDIIRRFKSSKF[91mG[0m[91mC[0m[91mR[0m[91mD[0m[91mP[0m[91mV[0mRTNFDAFPDKVAIQLNDTHPSLAIPELMRVLVDLERLDWDKAWEVTVKTCAYTNHTVIPEALERWPVHLLETLLPRHLQIIYEINQRFLNRVAAAFPGDVDRLRRMSLVEEGAVKRINMAHLCIAGSHAVNGVARIHSEILKKTIFKDFYELEPHKFQNKTNGITPRRWLVLCNPGLAEIIAERIGEEYISDLDQLRKLLSYVDDEAFIRDVAKVKQENKLKFAAYLEREYKVHINPNSLFDVQVKRIHEYKRQLLNCLHVITLYNRIKKEPNKFVVPRTVMIGGKAAPGYHMAKMIIKLITAIGDVVNHDPVVGDRLRVIFLENYRVSLAEKVIPAADLSEQISTAGTEASGTGNMKFMLNGALTIGTMDGANVEMAEEAGEENFFIFGMRVEDVDRLDQRGYNAQEYYDRIPELRQIIEQLSSGFFSPKQPDLFKDIVNMLMHHDRFKVFADYEEYVKCQERVSALYKNPREWTRMVI

## Separate Chains by Chain Type
First, we need to separate the chains by type. Although this example does not contain an RNA chain, the routine is built to accommodate both types.

In [6]:
# Chains of interest, keyed by chain id and grouped by polymer type
prot_chains, na_chains = {}, {}
# The first model's id is used to look the model up and is reused for
# the replacement model
model_id = structure.models[0].id
# New empty model that will store the chains of interest
new_model = Model(model_id)
# Which dict each chain type is collected into; chains of any other type
# are not collected
chains_by_type = {
    'Polypeptide(L)': prot_chains,
    'Polyribonucleotide': na_chains,
    'Polydeoxyribonucleotide': na_chains,
}
for chain in structure[model_id].chains:
    target = chains_by_type.get(chain.chain_type)
    if target is not None:
        target[chain.id] = chain

## Generate Topology and Loop Building with crimm First

### Protein Chains

In [7]:
# A single generator instance is shared by the protein and nucleic acid
# chain loops below
topo = TopologyGenerator()

In [8]:
# Build missing loops (if any), generate the topology for every protein
# chain, minimize the chains that had loops built, and collect the chains
# into the new model.
for chain_id, chain in prot_chains.items():
    need_minimization = False
    # Missing loop in the chain
    if not chain.is_continuous():
        loop_builder = ChainLoopBuilder(chain)
        # Coordinates of the missing residues will be copied from
        # Alphafold structures
        # only build the loop not the termini
        loop_builder.build_from_alphafold(include_terminal = False)
        chain = loop_builder.get_chain()
        # Overwriting the value of an existing key does not change the
        # dict size, so it is safe while iterating
        prot_chains[chain_id] = chain
        need_minimization = True
    topo.generate(
        chain,
        first_patch=prot_first_patch,
        last_patch=prot_last_patch,
        # if the first residue is PRO or GLY, special PROP or GLYP patches will be applied respectively
        auto_correct_first_patch=True,
        # Coerce any modified residue to canonical residue that it is based on
        coerce=True
    )
    fix_chain(chain)

    if need_minimization:
        # load into CHARMM to minimize the structure
        pcm_interface.load_topology(topo)
        prev_level = pcm_set_verbosity(charmm_verbosity_level)
        try:
            minimize_chain(chain, sd_nstep, abnr_nstep)
        finally:
            # Restore the previous verbosity even if the minimization
            # fails, so CHARMM is not left silenced for the rest of the
            # session
            pcm_set_verbosity(prev_level)
    new_model.add(chain)



  
 CHARMM>     read rtf card -
 CHARMM>     name /tmp/tmpzkemjubv
 VOPEN> Attempting to open::/tmp/tmpzkemjubv::
 MAINIO> Residue topology file being read from unit  91.
 TITLE> *RTF LOADED FROM CRIMM
 TITLE> 36  2
 VCLOSE: Closing unit   91 with status "KEEP"
  
 CHARMM>     
  
  
 CHARMM>     read param card -
 CHARMM>     name /tmp/tmpjmvvd0jr -
 CHARMM>     flex
 VOPEN> Attempting to open::/tmp/tmpjmvvd0jr::

          PARAMETER FILE BEING READ FROM UNIT 91
 TITLE> *PRM LOADED FROM CRIMM
 TITLE> *>>>> CHARMM36 ALL-HYDROGEN PARAMETER FILE FOR PROTEINS <<<<<<<<<<
 TITLE> *>>>>> INCLUDES PHI, PSI CROSS TERM MAP (CMAP) CORRECTION <<<<<<<<
 TITLE> *>>>>>>>>>>>>>>>>>>>>>>>>>> JAN. 2016 <<<<<<<<<<<<<<<<<<<<<<<<<<<<
 TITLE> * ALL COMMENTS TO THE CHARMM WEB SITE: WWW.CHARMM.ORG
 TITLE> *             PARAMETER SET DISCUSSION FORUM
 TITLE> *
 PARMIO> NONBOND, HBOND lists and IMAGE atoms cleared.
 VCLOSE: Closing unit   91 with status "KEEP"
  
 CHARMM>     
  
 ***** Message from SEQRDR ***



  
 CHARMM>     read rtf card -
 CHARMM>     name /tmp/tmpw_q91fb4
 VOPEN> Attempting to open::/tmp/tmpw_q91fb4::
 MAINIO> Residue topology file being read from unit  91.
 TITLE> *RTF LOADED FROM CRIMM
 TITLE> 36  2
 VCLOSE: Closing unit   91 with status "KEEP"
  
 CHARMM>     
  
  
 CHARMM>     read param card -
 CHARMM>     name /tmp/tmp6bvkb8e2 -
 CHARMM>     flex
 VOPEN> Attempting to open::/tmp/tmp6bvkb8e2::

          PARAMETER FILE BEING READ FROM UNIT 91
 TITLE> *PRM LOADED FROM CRIMM
 TITLE> *>>>> CHARMM36 ALL-HYDROGEN PARAMETER FILE FOR PROTEINS <<<<<<<<<<
 TITLE> *>>>>> INCLUDES PHI, PSI CROSS TERM MAP (CMAP) CORRECTION <<<<<<<<
 TITLE> *>>>>>>>>>>>>>>>>>>>>>>>>>> JAN. 2016 <<<<<<<<<<<<<<<<<<<<<<<<<<<<
 TITLE> * ALL COMMENTS TO THE CHARMM WEB SITE: WWW.CHARMM.ORG
 TITLE> *             PARAMETER SET DISCUSSION FORUM
 TITLE> *
 PARMIO> NONBOND, HBOND lists and IMAGE atoms cleared.
 VCLOSE: Closing unit   91 with status "KEEP"
  
 CHARMM>     
  
 ***** Message from SEQRDR ***

### RNA Chains
DNA chains are not yet supported but will be implemented soon

In [9]:
# Generate the topology for every nucleic acid chain and collect the
# chains into the new model.
for chain_id, chain in na_chains.items():
    # A missing loop is very unlikely in nucleotide chains on PDB,
    # but if one exists, an error is raised
    chain_has_gap = not chain.is_continuous()
    if chain_has_gap:
        raise ValueError(
            f'Nucleotide chain {chain.id} is not continuous, '
            'topology cannot be generated.'
        )

    topo.generate(
        chain, first_patch=na_first_patch, last_patch=na_last_patch, coerce=True
    )
    fix_chain(chain)
    new_model.add(chain)

Finally, replace the model with the new model in the structure

In [10]:
# Copy the connect records of the original model over to the new model;
# this has to happen before the original model is detached
new_model.set_connect(structure.models[0].connect_dict)
# Remove the original model, then put the new one in its place
structure.detach_child(model_id)
structure.add(new_model)



The loops are built, and now it is ready for protonation state calculation.

In [11]:
# Display the structure, which now holds the new model
structure

NGLWidget()

<Structure id=1A8I Models=1>
│
├───<Model id=1 Chains=2>
	│
	├───<Polypeptide(L) id=A Residues=829>
	├──────Description: GLYCOGEN PHOSPHORYLASE B
	│
	├───<Polypeptide(L) id=C Residues=829>
	├──────Description: GLYCOGEN PHOSPHORYLASE B


## Load the Topology Definition Files in pyCHARMM

Since the `TopologyGenerator` keeps track of which types of macromolecules have been generated, `load_topology` will automatically load the corresponding topology (rtf) and parameter (prm) files into CHARMM.

In [12]:
# Load the topology definitions tracked by `topo` into CHARMM
pcm_interface.load_topology(topo)

  
 CHARMM>     read rtf card -
 CHARMM>     name /tmp/tmph9q3fhht
 VOPEN> Attempting to open::/tmp/tmph9q3fhht::
 MAINIO> Residue topology file being read from unit  91.
 TITLE> *RTF LOADED FROM CRIMM
 TITLE> 36  2
 VCLOSE: Closing unit   91 with status "KEEP"
  
 CHARMM>     
  
  
 CHARMM>     read param card -
 CHARMM>     name /tmp/tmp6lqtfr1o -
 CHARMM>     flex
 VOPEN> Attempting to open::/tmp/tmp6lqtfr1o::

          PARAMETER FILE BEING READ FROM UNIT 91
 TITLE> *PRM LOADED FROM CRIMM
 TITLE> *>>>> CHARMM36 ALL-HYDROGEN PARAMETER FILE FOR PROTEINS <<<<<<<<<<
 TITLE> *>>>>> INCLUDES PHI, PSI CROSS TERM MAP (CMAP) CORRECTION <<<<<<<<
 TITLE> *>>>>>>>>>>>>>>>>>>>>>>>>>> JAN. 2016 <<<<<<<<<<<<<<<<<<<<<<<<<<<<
 TITLE> * ALL COMMENTS TO THE CHARMM WEB SITE: WWW.CHARMM.ORG
 TITLE> *             PARAMETER SET DISCUSSION FORUM
 TITLE> *
 PARMIO> NONBOND, HBOND lists and IMAGE atoms cleared.
 VCLOSE: Closing unit   91 with status "KEEP"
  
 CHARMM>     
  


## Get Protonation State from the specified pH Value
Note that the protonator accepts a Model-level entity, not the Structure itself.

In [13]:
# Run the protonator on the new model at the requested pH
protonator = PropKaProtonator(topo, pH = pH)
protonator.load_model(new_model)
protonator.apply_patches()

# Clear any atoms currently loaded in CHARMM before loading the chains
natom_loaded = pcm_get_natom()
if natom_loaded > 0:
    pcm_del_atoms()

for chain in new_model:
    # Only chains that received at least one protonation patch are
    # passed through fix_chain again
    chain_patches = protonator.patches.get(chain.id)
    if chain_patches:
        built_atoms = fix_chain(chain)
    # Every chain is loaded into CHARMM, patched or not
    pcm_interface.load_chain(chain)

Unexpected number (12) of atoms in residue GLN   7 A   in conformation 1A
Unexpected number (9) of atoms in residue PRO 835 A   in conformation 1A
Unexpected number (12) of atoms in residue GLN   7 C   in conformation 1A
Unexpected number (9) of atoms in residue PRO 835 C   in conformation 1A
Missing atoms or failed protonation for GLU 177 A (BBN) -- please check the structure
Group (BBN) for  2731-   N   177-GLU (A) [  25.187   40.109   39.638] N
Expected 2 interaction atoms for acids, found:
              2731-   N   177-GLU (A) [  25.187   40.109   39.638] N
Expected 2 interaction atoms for bases, found:
              2731-   N   177-GLU (A) [  25.187   40.109   39.638] N
Missing atoms or failed protonation for GLU 177 A (COO) -- please check the structure
Group (COO) for  2737-  CD   177-GLU (A) [  28.225   39.585   38.305] C
Expected 2 interaction atoms for acids, found:
              2739- OE2   177-GLU (A) [  28.766   39.448   37.176] O
              2738- OE1   177-GLU (A) [  2


 Message from MAPIC: Atom numbers are changed.

 Message from MAPIC:        829 residues deleted.

 Message from MAPIC:          1 segments deleted.
 DELTIC:     13622 bonds deleted
 DELTIC:     24586 angles deleted
 DELTIC:     36025 dihedrals deleted
 DELTIC:      2393 improper dihedrals deleted
 DELTIC:       829 crossterm maps deleted
 DELTIC:      1542 donors deleted
 DELTIC:      1249 acceptors deleted
  
 CHARMM>     read sequence pdb -
 CHARMM>     name /tmp/tmpxj6j4ptd
 VOPEN> Attempting to open::/tmp/tmpxj6j4ptd::
 MAINIO> Sequence information being read from unit  91.
 TITLE>  *

          RESIDUE SEQUENCE --   829 RESIDUES
          GLN GLU LYS ARG LYS GLN ILE SER VAL ARG GLY LEU ALA GLY VAL GLU ASN VAL THR GLU 
          LEU LYS LYS ASN PHE ASN ARG HSD LEU HSD PHE THR LEU VAL LYS ASP ARG ASN VAL ALA 
          THR PRO ARG ASP TYR TYR PHE ALA LEU ALA HSD THR VAL ARG ASP HSD LEU VAL GLY ARG 
          TRP ILE ARG THR GLN GLN HSD TYR TYR GLU LYS ASP PRO LYS ARG ILE TYR TYR L

These protonation patches are identified but not yet loaded into `CHARMM`

In [14]:
# Patches found by the protonator: chain id -> {residue number: patch name}
protonator.patches

{'A': {123: 'GLUP',
  177: 'GLUP',
  180: 'ASPP',
  191: 'LSN',
  296: 'GLUP',
  538: 'LSN',
  608: 'LSN',
  617: 'LSN',
  664: 'GLUP'},
 'C': {123: 'GLUP',
  177: 'GLUP',
  180: 'ASPP',
  191: 'LSN',
  296: 'GLUP',
  538: 'LSN',
  608: 'LSN',
  617: 'LSN',
  664: 'GLUP'}}

## Update CHARMM Residues with pyCHARMM patch Command

In [15]:
# Apply every protonation patch to the matching residue in CHARMM
for chain_id, patch_dict in protonator.patches.items():
    # The CHARMM segment id is 'PRO' followed by the chain id (e.g. PROA)
    segid = f'PRO{chain_id}'
    for resid, patch_name in patch_dict.items():
        patch_site = f'{segid} {resid}'
        pcm_patch(patch_name, patch_site)

  
 CHARMM>     patch GLUP PROA 123
 ATOM  PROA GLU  123  HE2  ADDED.

 Message from MAPIC: Atom numbers are changed.
 AUTGEN: Autogenerating specified angles and dihedrals.
 AUTOGEN: 49172 angles are removed before regeneration for selected atoms.
 AUTOGEN: 72050 dihedrals are removed before regeneration for selected atoms.
 PATCH: Check angles and dihedrals autogenerated.
 PSFSUM> PSF modified: NONBOND lists and IMAGE atoms cleared.
 PSFSUM> Summary of the structure file counters :
         Number of segments      =        2   Number of residues   =     1658
         Number of atoms         =    26941   Number of groups     =     8052
         Number of bonds         =    27245   Number of angles     =    49173
         Number of dihedrals     =    72052   Number of impropers  =     4786
         Number of cross-terms   =     1658   Number of autogens   =        0
         Number of HB acceptors  =     2498   Number of HB donors  =     3085
         Number of NB exclusions =        0

Check the generated bonds, angles, dihedrals, etc. They should match between CHARMM and crimm.

In [16]:
# Show the topology element counts of the first chain for comparison with
# the counts reported by CHARMM
new_model.chains[0].topo_elements

<TopologyElementContainer for <Polypeptide(L) id=A Residues=829> with bonds=13623, angles=24579, dihedrals=36023, impropers=2393, cmap=0>

## Patch Disulfide Bond
If any disulfide bond exists in the structure, we will patch them in the `CHARMM` structure. However, disulfide bonds have not been fully implemented in `crimm`.

In [17]:
# Patch every disulfide bond listed in the model's connect records.
# NOTE(review): 'Excuting' in the printed message is misspelled; it is kept
# as-is so the output does not change.
connect_records = structure.models[0].connect_dict
if 'disulf' in connect_records:
    for res1, res2 in connect_records['disulf']:
        # Each patch site is '<segid> <resseq>', with segid = 'PRO' + chain id
        patch_arg = ' '.join(
            f"PRO{res['chain']} {res['resseq']}" for res in (res1, res2)
        )
        print('[Excuting CHARMM Command] patch DISU', patch_arg)
        pcm_patch('DISU', patch_arg)

## Save the structure as PDB and PSF files

In [18]:
from pycharmm import write

In [19]:
# Write the coordinates (PDB) and the PSF from CHARMM to files named
# after the PDB ID, in the current working directory
write.coor_pdb(f'{pdb_id}.pdb')
write.psf_card(f'{pdb_id}.psf')

  
 CHARMM>     write name 1a8i.pdb -
 CHARMM>     coor pdb
 VOPEN> Attempting to open::1a8i.pdb::
 RDTITL>  
 RDTITL> No title read.
  Write CHARMM-pdb format
 VCLOSE: Closing unit   91 with status "KEEP"
  
 CHARMM>     
  
  
 CHARMM>     write name 1a8i.psf -
 CHARMM>     psf card
 VOPEN> Attempting to open::1a8i.psf::
 RDTITL>  
 RDTITL> No title read.
 VCLOSE: Closing unit   91 with status "KEEP"
 VCLOSE: Closing unit   91 with status "KEEP"
  
 CHARMM>     
  


In [21]:
# Display the final crimm structure
structure

NGLWidget()

<Structure id=1A8I Models=1>
│
├───<Model id=1 Chains=2>
	│
	├───<Polypeptide(L) id=A Residues=829>
	├──────Description: GLYCOGEN PHOSPHORYLASE B
	│
	├───<Polypeptide(L) id=C Residues=829>
	├──────Description: GLYCOGEN PHOSPHORYLASE B
