In [1]:
from pycharmm.settings import set_verbosity

from crimm.Adaptors.pyCHARMMAdaptors import load_chain, load_topology
from crimm.Adaptors.pyCHARMMAdaptors import empty_charmm, minimize, sync_coords
from crimm.Modeller.LoopBuilder import ChainLoopBuilder
from crimm.Modeller import TopologyGenerator
from crimm.Modeller.TopoFixer import fix_chain
from crimm.Fetchers import (
    fetch_rcsb, fetch_alphafold, uniprot_id_query, fetch_swiss_model
)



## Fetch a Structure with Missing Loops from RCSB

In [2]:
# Fetch PDB entry 4BBY from RCSB.
struct = fetch_rcsb('4BBY')
# Select chain 'A'; the leading key 1 is presumably the model id -- TODO confirm.
chain = struct[1]['A']

The canonical sequence (`can_seq`) contains the residue identities for the entire chain, including residues that are missing from the structure.

In [3]:
len(chain.can_seq)

658

In [4]:
chain

NGLWidget()

<Polypeptide(L) id=A Residues=555>
  Description: ALKYLDIHYDROXYACETONEPHOSPHATE SYNTHASE, PEROXISOMAL


The masked sequence (`masked_seq`) shows the full sequence with the missing residues highlighted in red, so both their positions and identities are visible.

In [5]:
chain.masked_seq.show()

[91mM[0m[91mA[0m[91mE[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mG[0m[91mE[0m[91mT[0m[91mS[0m[91mA[0m[91mS[0m[91mS[0m[91mG[0m[91mS[0m[91mA[0m[91mA[0m[91mE[0m[91mR[0m[91mD[0m[91mP[0m[91mD[0m[91mQ[0m[91mD[0m[91mR[0m[91mA[0m[91mG[0m[91mR[0m[91mR[0m[91mL[0m[91mR[0m[91mV[0m[91mL[0m[91mS[0m[91mG[0m[91mH[0m[91mL[0m[91mL[0m[91mG[0m[91mR[0m[91mP[0m[91mQ[0m[91mE[0m[91mA[0m[91mL[0m[91mS[0m[91mT[0m[91mN[0m[91mE[0m[91mC[0m[91mK[0m[91mA[0m[91mR[0m[91mR[0m[91mA[0m[91mA[0m[91mS[0m[91mA[0m[91mA[0m[91mT[0m[91mA[0m[91mA[0m[91mP[0m[91mT[0m[91mA[0m[91mT[0m[91mP[0m[91mA[0m[91mA[0m[91mP[0m[91mE[0m[91mS[0mGIIPKKRQELMKWNGWGYNDSKFFLNKKGQLELTGKRYPLSGVALPTFKDWIQNTFGINLDHKT[91mT[0m[91mS[0m[91mK[0m[91mA[0m[91mS[0m[91mL[0m[91mN[0m[91mP[0m[91mS[0mDTPPSIVNEDFLHELKKTNISYSQEADDRVFRAHGHCLHEIFLLRE

Alternatively, the `missing_res` attribute lists the sequence number and residue name of each missing residue.

In [6]:
chain.missing_res

[(1, 'MET'),
 (2, 'ALA'),
 (3, 'GLU'),
 (4, 'ALA'),
 (5, 'ALA'),
 (6, 'ALA'),
 (7, 'ALA'),
 (8, 'ALA'),
 (9, 'ALA'),
 (10, 'ALA'),
 (11, 'ALA'),
 (12, 'ALA'),
 (13, 'ALA'),
 (14, 'ALA'),
 (15, 'GLY'),
 (16, 'GLU'),
 (17, 'THR'),
 (18, 'SER'),
 (19, 'ALA'),
 (20, 'SER'),
 (21, 'SER'),
 (22, 'GLY'),
 (23, 'SER'),
 (24, 'ALA'),
 (25, 'ALA'),
 (26, 'GLU'),
 (27, 'ARG'),
 (28, 'ASP'),
 (29, 'PRO'),
 (30, 'ASP'),
 (31, 'GLN'),
 (32, 'ASP'),
 (33, 'ARG'),
 (34, 'ALA'),
 (35, 'GLY'),
 (36, 'ARG'),
 (37, 'ARG'),
 (38, 'LEU'),
 (39, 'ARG'),
 (40, 'VAL'),
 (41, 'LEU'),
 (42, 'SER'),
 (43, 'GLY'),
 (44, 'HIS'),
 (45, 'LEU'),
 (46, 'LEU'),
 (47, 'GLY'),
 (48, 'ARG'),
 (49, 'PRO'),
 (50, 'GLN'),
 (51, 'GLU'),
 (52, 'ALA'),
 (53, 'LEU'),
 (54, 'SER'),
 (55, 'THR'),
 (56, 'ASN'),
 (57, 'GLU'),
 (58, 'CYS'),
 (59, 'LYS'),
 (60, 'ALA'),
 (61, 'ARG'),
 (62, 'ARG'),
 (63, 'ALA'),
 (64, 'ALA'),
 (65, 'SER'),
 (66, 'ALA'),
 (67, 'ALA'),
 (68, 'THR'),
 (69, 'ALA'),
 (70, 'ALA'),
 (71, 'PRO'),
 (72, 'THR'),
 

`ChainLoopBuilder` is the class that facilitates loop building using homology models or AlphaFold structures. It queries RCSB with a sequence search, using an identity score cutoff, to find homology models. If any model contains the missing loop, it is superimposed onto the chain being fixed and the loop residues are transferred from it.

If no homology models are available for fixing the chain, the AlphaFold structure can be used instead.

In [7]:
# Wrap the chain in a loop builder and try to fill its gaps from homologous
# structures: consider at most 10 sequence matches with identity >= 0.95.
looper = ChainLoopBuilder(chain)
looper.build_from_homology(max_num_match=10, identity_score_cutoff=0.95)

In [8]:
# Retrieve the chain held by the loop builder after the homology-based build.
final_chain = looper.get_chain()
# Show the masked sequence again to see which residues are still missing.
final_chain.masked_seq.show()

[91mM[0m[91mA[0m[91mE[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mG[0m[91mE[0m[91mT[0m[91mS[0m[91mA[0m[91mS[0m[91mS[0m[91mG[0m[91mS[0m[91mA[0m[91mA[0m[91mE[0m[91mR[0m[91mD[0m[91mP[0m[91mD[0m[91mQ[0m[91mD[0m[91mR[0m[91mA[0m[91mG[0m[91mR[0m[91mR[0m[91mL[0m[91mR[0m[91mV[0m[91mL[0m[91mS[0m[91mG[0m[91mH[0m[91mL[0m[91mL[0m[91mG[0m[91mR[0m[91mP[0m[91mQ[0m[91mE[0m[91mA[0m[91mL[0m[91mS[0m[91mT[0m[91mN[0m[91mE[0m[91mC[0m[91mK[0m[91mA[0m[91mR[0m[91mR[0m[91mA[0m[91mA[0m[91mS[0m[91mA[0m[91mA[0m[91mT[0m[91mA[0m[91mA[0m[91mP[0m[91mT[0m[91mA[0m[91mT[0m[91mP[0m[91mA[0m[91mA[0m[91mP[0m[91mE[0m[91mS[0mGIIPKKRQELMKWNGWGYNDSKFFLNKKGQLELTGKRYPLSGVALPTFKDWIQNTFGINLDHKTTSKASLNPSDTPPSIVNEDFLHELKKTNISYSQEADDRVFRAHGHCLHEIFLLREGMFERIPDIVLWPTCHDDVVKIVNLACKYNLCIIPIGGGTSVSYGLMCPADETRTIISLDTSQMNRILWVDENNLTAHVEA

In [9]:
# A new builder is created on `chain`, so this run does not reuse the
# `looper` object from the homology step.
looper = ChainLoopBuilder(chain)
# Missing terminals will also be built if `include_terminal=True`.
looper.build_from_alphafold(include_terminal=False)

In [10]:
# Retrieve the chain held by the loop builder after the AlphaFold-based build.
final_chain = looper.get_chain()
# Show the masked sequence; residues still missing remain highlighted.
final_chain.masked_seq.show()

[91mM[0m[91mA[0m[91mE[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mA[0m[91mG[0m[91mE[0m[91mT[0m[91mS[0m[91mA[0m[91mS[0m[91mS[0m[91mG[0m[91mS[0m[91mA[0m[91mA[0m[91mE[0m[91mR[0m[91mD[0m[91mP[0m[91mD[0m[91mQ[0m[91mD[0m[91mR[0m[91mA[0m[91mG[0m[91mR[0m[91mR[0m[91mL[0m[91mR[0m[91mV[0m[91mL[0m[91mS[0m[91mG[0m[91mH[0m[91mL[0m[91mL[0m[91mG[0m[91mR[0m[91mP[0m[91mQ[0m[91mE[0m[91mA[0m[91mL[0m[91mS[0m[91mT[0m[91mN[0m[91mE[0m[91mC[0m[91mK[0m[91mA[0m[91mR[0m[91mR[0m[91mA[0m[91mA[0m[91mS[0m[91mA[0m[91mA[0m[91mT[0m[91mA[0m[91mA[0m[91mP[0m[91mT[0m[91mA[0m[91mT[0m[91mP[0m[91mA[0m[91mA[0m[91mP[0m[91mE[0m[91mS[0mGIIPKKRQELMKWNGWGYNDSKFFLNKKGQLELTGKRYPLSGVALPTFKDWIQNTFGINLDHKTTSKASLNPSDTPPSIVNEDFLHELKKTNISYSQEADDRVFRAHGHCLHEIFLLREGMFERIPDIVLWPTCHDDVVKIVNLACKYNLCIIPIGGGTSVSYGLMCPADETRTIISLDTSQMNRILWVDENNLTAHVEA

If for any reason you want to remove the missing terminal residues from the chain's sequence information and from its reported missing residues, you can use
`chain.truncate_missing_terminal()`

In [11]:
# Drop the unbuilt terminal residues from the chain's sequence records.
# The call is made for its effect on `final_chain`; no return value is used.
final_chain.truncate_missing_terminal()
# The masked sequence should now start at the first residue present in the chain.
final_chain.masked_seq.show()

GIIPKKRQELMKWNGWGYNDSKFFLNKKGQLELTGKRYPLSGVALPTFKDWIQNTFGINLDHKTTSKASLNPSDTPPSIVNEDFLHELKKTNISYSQEADDRVFRAHGHCLHEIFLLREGMFERIPDIVLWPTCHDDVVKIVNLACKYNLCIIPIGGGTSVSYGLMCPADETRTIISLDTSQMNRILWVDENNLTAHVEAGITGQELERQLKESGYCTGHEPDSLEFSTVGGWISTRASGMKKNIYGNIEDLVVHMKVVTPRGVIEKSCQGPRMSTGPDIHHFIMGSEGTLGVITEATIKIRPTPEYQKYGSVAFPNFEQGVACLREIAKQRCAPASIRLMDNQQFQFGHALKPQVSSIFTSFLDGLKKFYITKFKGFDPNQLSVATLLFEGDREKVLQHEKQVYDIAAKFGGLAAGEDNGQRGYLLTYVIAYMRDLGLEYYIIGESFETSAPWDRVVDLCRNVKERIRRECKEKGVQFPPLSTCRVTQTYDAGACIYFYFAFNYRGISDPLAVFEQTEAAAREEILANGGSLSHHHGVGKLRKQWLKESISDVGFGMLKSVKDYVDPTNIFGNRNL


The `highlight_repaired_gaps()` method visualizes the repaired loops. However, as you can see in the example below, the loops are not properly connected to the chain, so structure minimization is needed. For this, we will use pyCHARMM's minimization routine.

In [12]:
looper.highlight_repaired_gaps()

View()

## Generate Topology and Load Parameter for the Chain

Before we load any chain into pyCHARMM, the topology and parameters for the chain need to be generated.

In [13]:
# Generate the topology for the repaired chain, patching the N-terminus with
# 'ACE' and the C-terminus with 'CT3'.
# NOTE(review): after truncation this chain starts with GLY, and the
# multi-structure loop later in this notebook selects 'GLYP' for chains that
# start with GLY. Confirm that 'ACE' is really intended here.
topo = TopologyGenerator()
topo.generate(final_chain, first_patch='ACE', last_patch='CT3', coerce=True)

# Presumably builds whatever atoms the topology expects but the structure
# lacks; the return value is kept in `built_atoms` -- TODO confirm.
built_atoms = fix_chain(final_chain)



In [14]:
# Load the generated topology (rtf) and parameter files into pyCHARMM as well,
# so the CHARMM side uses the same definitions as the crimm side.
load_topology(topo)

  
 CHARMM>     read rtf card -
 CHARMM>     name /tmp/tmpv6ov_0xp
 VOPEN> Attempting to open::/tmp/tmpv6ov_0xp::
 MAINIO> Residue topology file being read from unit  91.
 TITLE> *RTF LOADED FROM CRIMM
 TITLE> 36  2
 VCLOSE: Closing unit   91 with status "KEEP"
  
 CHARMM>     
  
  
 CHARMM>     read param card -
 CHARMM>     name /tmp/tmpzfn_86w1 -
 CHARMM>     flex
 VOPEN> Attempting to open::/tmp/tmpzfn_86w1::

          PARAMETER FILE BEING READ FROM UNIT 91
 TITLE> *PRM LOADED FROM CRIMM
 TITLE> *>>>> CHARMM36 ALL-HYDROGEN PARAMETER FILE FOR PROTEINS <<<<<<<<<<
 TITLE> *>>>>> INCLUDES PHI, PSI CROSS TERM MAP (CMAP) CORRECTION <<<<<<<<
 TITLE> *>>>>>>>>>>>>>>>>>>>>>>>>>> JAN. 2016 <<<<<<<<<<<<<<<<<<<<<<<<<<<<
 TITLE> * ALL COMMENTS TO THE CHARMM WEB SITE: WWW.CHARMM.ORG
 TITLE> *             PARAMETER SET DISCUSSION FORUM
 TITLE> *
 PARMIO> NONBOND, HBOND lists and IMAGE atoms cleared.
 VCLOSE: Closing unit   91 with status "KEEP"
  
 CHARMM>     
  


## Load Chain and Minimize in pyCHARMM

In [15]:
# Read the repaired chain into pyCHARMM (topology and parameters were loaded
# in the previous step).
load_chain(final_chain)

  
 CHARMM>     read sequence pdb -
 CHARMM>     name /tmp/tmp_a495ysc
 VOPEN> Attempting to open::/tmp/tmp_a495ysc::
 MAINIO> Sequence information being read from unit  91.
 TITLE>  *

          RESIDUE SEQUENCE --   578 RESIDUES
          GLY ILE ILE PRO LYS LYS ARG GLN GLU LEU MET LYS TRP ASN GLY TRP GLY TYR ASN ASP 
          SER LYS PHE PHE LEU ASN LYS LYS GLY GLN LEU GLU LEU THR GLY LYS ARG TYR PRO LEU 
          SER GLY VAL ALA LEU PRO THR PHE LYS ASP TRP ILE GLN ASN THR PHE GLY ILE ASN LEU 
          ASP HSD LYS THR THR SER LYS ALA SER LEU ASN PRO SER ASP THR PRO PRO SER ILE VAL 
          ASN GLU ASP PHE LEU HSD GLU LEU LYS LYS THR ASN ILE SER TYR SER GLN GLU ALA ASP 
          ASP ARG VAL PHE ARG ALA HSD GLY HSD CYS LEU HSD GLU ILE PHE LEU LEU ARG GLU GLY 
          MET PHE GLU ARG ILE PRO ASP ILE VAL LEU TRP PRO THR CYS HSD ASP ASP VAL VAL LYS 
          ILE VAL ASN LEU ALA CYS LYS TYR ASN LEU CYS ILE ILE PRO ILE GLY GLY GLY THR SER 
          VAL SER TYR GLY LEU MET CYS PRO

In [16]:
# Set verbosity to zero to reduce output from the minimization steps. The
# value returned by set_verbosity is kept so the previous level can be put
# back afterwards.
prev_level = set_verbosity(0)
try:
    # 300 steps via `sd_nstep` (presumably steepest descent); `abnr_nstep=0`
    # skips the Adopted Basis Newton-Raphson stage.
    minimize(sd_nstep=300, abnr_nstep=0)
finally:
    # Restore the verbosity even if minimization fails, so later cells are
    # not left silenced.
    set_verbosity(prev_level)


far from minimum for;
    IPHI= 1018  with deltaPHI=  93.6707 MIN=   0.0000 ATOMS: 5717 5715 5719 5718

far from minimum for;
    IPHI= 1018  with deltaPHI=  92.1273 MIN=   0.0000 ATOMS: 5717 5715 5719 5718

far from minimum for;
    IPHI= 1014  with deltaPHI= 122.0244 MIN=   0.0000 ATOMS: 5678 5676 5680 5679

far from minimum for;
    IPHI= 1014  with deltaPHI=  91.4986 MIN=   0.0000 ATOMS: 5678 5676 5680 5679
Adopted Basis Newton-Raphson minimization not performed


0

Since the new coordinates were calculated in pyCHARMM, we need to copy them from pyCHARMM back to crimm. The provided `sync_coords` function synchronizes the structure on both sides.

In [17]:
# Copy the minimized coordinates from pyCHARMM back onto the crimm chain.
sync_coords(final_chain)

Synchronized: <Polypeptide(L) id=A Residues=578>


We can visualize the chain again to see if the structure looks reasonable.

In [18]:
looper.highlight_repaired_gaps()

View()

## Working on Multiple Structures

Below is an example of working on multiple structures with missing loop regions.

In [19]:
# PDB entries used as examples of structures with missing loop regions.
test_cases = ['3Q4K', '5IEV', '4BBY', '6FGQ', '7M0C', '6UDL', '5VQA', '6SSV']
# Fetch every entry from RCSB without solvent.
structures = [
    fetch_rcsb(pdb_code, include_solvent=False)
    for pdb_code in test_cases
]



In [20]:
# Collect every L-polypeptide chain from the first model of each structure
# that has at least one gap (i.e. is not continuous).
# A comprehension is used so that no loop variable leaks into the notebook
# namespace; a plain `for chain in ...` loop would overwrite the `chain`
# object selected earlier for the single-structure example.
broken_chains = [
    candidate
    for entry in structures
    for candidate in entry.models[0]
    if candidate.chain_type == 'Polypeptide(L)' and not candidate.is_continuous()
]
broken_chains

[<Polypeptide(L) id=A Residues=356>,
 <Polypeptide(L) id=B Residues=360>,
 <Polypeptide(L) id=A Residues=284>,
 <Polypeptide(L) id=A Residues=555>,
 <Polypeptide(L) id=B Residues=543>,
 <Polypeptide(L) id=A Residues=229>,
 <Polypeptide(L) id=A Residues=332>,
 <Polypeptide(L) id=A Residues=469>,
 <Polypeptide(L) id=A Residues=368>,
 <Polypeptide(L) id=A Residues=365>]

In [22]:
# Repair every broken chain and sort the results into two nested dicts keyed
# as {pdb id: {chain id: repaired chain}}:
#   complete_chains   -- chains that are continuous after repair
#   incomplete_chains -- chains that still contain gaps
complete_chains = {}
incomplete_chains = {}
# The loop uses its own names (`broken_chain`, `chain_looper`) so the `chain`
# and `looper` objects from the single-structure example are not overwritten.
for broken_chain in broken_chains:
    # chain -> parent -> parent; presumably chain -> model -> structure, whose
    # id is the PDB id -- TODO confirm.
    pdbid = broken_chain.parent.parent.id
    print(pdbid, broken_chain)

    # First try to fill the gaps from homology models.
    chain_looper = ChainLoopBuilder(broken_chain)
    chain_looper.build_from_homology(
        max_num_match=10, identity_score_cutoff=0.95
    )
    # Fall back to the AlphaFold structure if gaps remain.
    if not chain_looper.model_chain.is_continuous():
        print('Building from AlphaFold model')
        chain_looper.build_from_alphafold(include_terminal=False)

    fixed_chain = chain_looper.get_chain()
    if fixed_chain.is_continuous():
        destination = complete_chains
    else:
        destination = incomplete_chains
    # setdefault creates the per-entry dict on first use.
    destination.setdefault(pdbid, {})[broken_chain.id] = fixed_chain

3Q4K <Polypeptide(L) id=A Residues=356>
3Q4K <Polypeptide(L) id=B Residues=360>
5IEV <Polypeptide(L) id=A Residues=284>




4BBY <Polypeptide(L) id=A Residues=555>
4BBY <Polypeptide(L) id=B Residues=543>
6FGQ <Polypeptide(L) id=A Residues=229>




7M0C <Polypeptide(L) id=A Residues=332>




6UDL <Polypeptide(L) id=A Residues=469>
5VQA <Polypeptide(L) id=A Residues=368>
Building from AlphaFold model




6SSV <Polypeptide(L) id=A Residues=365>
Building from AlphaFold model




In [23]:
complete_chains

{'3Q4K': {'A': <Polypeptide(L) id=A Residues=366>,
  'B': <Polypeptide(L) id=B Residues=366>},
 '5IEV': {'A': <Polypeptide(L) id=A Residues=298>},
 '4BBY': {'A': <Polypeptide(L) id=A Residues=578>,
  'B': <Polypeptide(L) id=B Residues=578>},
 '6FGQ': {'A': <Polypeptide(L) id=A Residues=245>},
 '7M0C': {'A': <Polypeptide(L) id=A Residues=340>},
 '6UDL': {'A': <Polypeptide(L) id=A Residues=477>},
 '5VQA': {'A': <Polypeptide(L) id=A Residues=412>},
 '6SSV': {'A': <Polypeptide(L) id=A Residues=383>}}

In [24]:
# PRO and GLY need special treatment when patched at the N-terminus; every
# other residue gets the 'ACE' patch.
n_terminal_patches = {'PRO': 'PROP', 'GLY': 'GLYP'}

for pdbid, chain_dict in complete_chains.items():
    for chain_id, chain in chain_dict.items():
        first_resname = chain.residues[0].resname
        first_patch = n_terminal_patches.get(first_resname, 'ACE')
        # Generate the topology with the chosen N-terminal patch and the
        # 'CT3' C-terminal patch.
        topo.generate(
            chain,
            first_patch=first_patch,
            last_patch='CT3',
            coerce=True,
        )
        # Only the result for the last chain remains in `built_atoms`.
        built_atoms = fix_chain(chain)



In [25]:
complete_chains['4BBY']['A']

NGLWidget()

<Polypeptide(L) id=A Residues=578>
  Description: ALKYLDIHYDROXYACETONEPHOSPHATE SYNTHASE, PEROXISOMAL


In [26]:
# Minimize every repaired chain in pyCHARMM and copy the coordinates back.
# Verbosity is lowered for the whole loop and restored in `finally`, so a
# failure on any chain does not leave pyCHARMM silenced for later cells.
prev_level = set_verbosity(0)
try:
    for pdbid, chain_dict in complete_chains.items():
        for chain_id, chain in chain_dict.items():
            print(pdbid, chain_id)
            # Clear the currently loaded system before loading the next chain.
            empty_charmm()
            load_chain(chain)
            # 300 steps via `sd_nstep` (presumably steepest descent);
            # `abnr_nstep=0` skips the ABNR stage.
            minimize(sd_nstep=300, abnr_nstep=0)
            # Copy the minimized coordinates back onto the crimm chain.
            sync_coords(chain)
            print('---'*20)
finally:
    set_verbosity(prev_level)

3Q4K A
 ***** Message from SEQRDR ***** THE SYSTEM CONTAINS 74 TITRATABLE GROUPS
 THE USER MUST PREDETERMINE THE PROTONATION STATE THROUGH THE SEQUENCE AND RTF
 HIS -  0  HSD -  7  HSE -  0  HSP -  0  ASP - 19  GLU - 29  LYS - 12  TYR -  7
Adopted Basis Newton-Raphson minimization not performed
Synchronized: <Polypeptide(L) id=A Residues=366>
------------------------------------------------------------
3Q4K B
 ***** Message from SEQRDR ***** THE SYSTEM CONTAINS 74 TITRATABLE GROUPS
 THE USER MUST PREDETERMINE THE PROTONATION STATE THROUGH THE SEQUENCE AND RTF
 HIS -  0  HSD -  7  HSE -  0  HSP -  0  ASP - 19  GLU - 29  LYS - 12  TYR -  7

far from minimum for;
    IPHI=  563  with deltaPHI=-129.2985 MIN=   0.0000 ATOMS: 3199 3200 3201 3195
Adopted Basis Newton-Raphson minimization not performed
Synchronized: <Polypeptide(L) id=B Residues=366>
------------------------------------------------------------
5IEV A
 ***** Message from SEQRDR ***** THE SYSTEM CONTAINS 74 TITRATABLE GROUPS
 TH

0

## Important Notes

This method **does not guarantee** a correct final structure! Always inspect the complete chain visually and identify any artifacts. For example, the loop in chain A of 5VQA shown below actually passes through the alpha helix, and no amount of minimization will fix this issue. You will need to either fix it manually or use the SwissModel or AlphaFold structure directly.

In [32]:
complete_chains['5VQA']['A']

NGLWidget()

<Polypeptide(L) id=A Residues=412>
  Description: Pachytene checkpoint protein 2 homolog


## Fetch Swiss Model or AlphaFold structure for Problematic Chains

In [33]:
# Look up the UniProt id for the problematic chain (5VQA, chain A) through
# its entity id, then fetch the corresponding SwissModel structure.
problem_chain = complete_chains['5VQA']['A']
entity_id = problem_chain.entity_id
uniprot_id = uniprot_id_query('5VQA', entity_id)
sw_struct = fetch_swiss_model(uniprot_id)

In [34]:
sw_struct

NGLWidget()

<Structure id=Q15645-SwissModel Models=1>
│
├───<Model id=0 Chains=1>
	│
	├───<Polypeptide(L) id=A Residues=412>
	├──────Description: Pachytene checkpoint protein 2 homolog


In [35]:
# Fetch the AlphaFold structure for the same UniProt id used for the
# SwissModel query.
af_struct = fetch_alphafold(uniprot_id)

In [36]:
af_struct

NGLWidget()

<Structure id=AF-Q15645-F1 Models=1>
│
├───<Model id=1 Chains=1>
	│
	├───<Polypeptide(L) id=A Residues=432>
	├──────Description: Pachytene checkpoint protein 2 homolog
