In [1]:
import os
import warnings

from crimm import fetch_rcsb
from crimm.StructEntities.OrganizedModel import OrganizedModel
from crimm.Modeller.Solvator import Solvator

from crimm.Fetchers import fetch_rcsb
from crimm.Modeller import TopologyGenerator
from crimm.Modeller.CoordManipulator import CoordManipulator
from crimm.Modeller.LoopBuilder import ChainLoopBuilder
from crimm.Adaptors.PropKaAdaptors import PropKaProtonator
from crimm.Utils.StructureUtils import get_coords

# #NEW: PSF/CRD native I/O - read and write CHARMM files without pyCHARMM
from crimm.IO import read_psf, write_psf, write_crd, CRDParser

import pycharmm
from pycharmm.settings import set_verbosity as pcm_set_verbosity
from pycharmm import write as pcm_write
from pycharmm import NonBondedScript

from crimm.Adaptors.pyCHARMMAdaptors import (
    load_model,  # NEW: Unified loading function (recommended)
    load_chain, load_topology, load_water, load_ions, load_ligands,
    create_water_hs_from_charmm, fetch_coords_from_charmm, patch_disu_from_model,
    sd_minimize
)

import pycharmm.minimize as minimize
import pycharmm.energy as energy
from pycharmm import coor, crystal, image, cons_harm, cons_fix, generate



  from pkg_resources import resource_filename


# Structure Preparation with crimm and pyCHARMM

This tutorial demonstrates how to prepare a molecular structure for CHARMM simulations using `crimm` and `pyCHARMM`. 

## Workflow Overview

The tutorial covers two main pathways:

### Pathway 1: Start from PDB/mmCIF (Default)
1. **Fetch structure** from RCSB PDB
2. **Organize** into `OrganizedModel` (protein, ligand, solvent, ions)
3. **Build missing loops** using AlphaFold templates
4. **Generate topology** with CHARMM force field parameters
5. **Apply protonation** patches based on pH
6. **Load into pyCHARMM** for minimization and solvation
7. **Write output** as PSF/CRD files

### Pathway 2: Start from Existing PSF/CRD (#NEW)
If you already have PSF/CRD files from a previous session, you can load them directly using crimm's native readers.

## Requirements
- `crimm` with topology generation support
- `pyCHARMM` for CHARMM integration
- (Optional) CGenFF executable for ligand parameterization

In [None]:
# Path to the CGenFF executable; it is used later in topology generation.
# Set the CGENFF_PATH environment variable to override the site-specific
# default below without editing the notebook.
CGENFF_PATH = (
    os.environ.get("CGENFF_PATH")
    or "/export/app/cgenff/silcsbio.2024.1/cgenff/cgenff"
)
# Identifier used as the prefix of the PSF/CRD files written at the end of the
# notebook. Other entries listed by the author are kept in the trailing comment.
PDBID = '4E0J' #'5iev'#'1bg8' #'3q4k' #'4pti' #'2HZI'

## Fetch from RCSB

`fetch_rcsb` now accepts an `organize` argument. When it is `True`, the structure is organized by chain type and an `OrganizedModel` is returned instead of the unorganized structure entity. Here we pass `organize=False` and build the `OrganizedModel` explicitly in a later cell.

In [None]:
# Fetch the entry named by PDBID so the prepared structure matches the
# PDBID-prefixed PSF/CRD files written at the end of the notebook.
structure = fetch_rcsb(
    PDBID,
    include_solvent=True,  # we want to include crystallographic water
    use_bio_assembly=True,
    organize=False,  # the OrganizedModel is built explicitly in a later cell
    first_model_only=False
)

## Alternative: Load from Existing PSF/CRD Files (#NEW)

If you already have PSF and CRD files from a previous simulation or another source, you can skip the 
fetch/build steps and load directly using crimm's native readers. This is useful when:
- Resuming work from a previous session
- Loading structures prepared with standalone CHARMM
- Working with pre-parameterized systems

**Note**: The `read_psf` function returns a `PSFData` container with atoms, bonds, angles, etc.
The `CRDParser` returns coordinate data that can be applied to your structure.

In [None]:
# #NEW: Example of loading from existing PSF/CRD files
# This cell is intentionally inert: every statement below is commented out.
# Uncomment the lines and adjust the paths to use this alternative workflow.

# PSF_FILE = 'your_system.psf'
# CRD_FILE = 'your_system.crd'

# # Read PSF file - returns PSFData container with topology information
# psf_data = read_psf(PSF_FILE)
# print(f"Loaded {len(psf_data.atoms)} atoms from PSF")
# print(f"Bonds: {len(psf_data.bonds)}, Angles: {len(psf_data.angles)}, Dihedrals: {len(psf_data.dihedrals)}")

# # Read CRD file - returns coordinate information
# crd_parser = CRDParser(CRD_FILE)
# coords = crd_parser.parse()
# print(f"Loaded coordinates for {len(coords)} atoms")

In [None]:
## OrganizedModel has been extended with more features and APIs and has become
## the main object for modeling and for interfacing with pyCHARMM.
## A separate notebook will showcase more about OrganizedModel.

model = OrganizedModel(structure)
model

In [None]:
## Place the model center at (0, 0, 0) and align the principal axis with the x-axis
coord_man = CoordManipulator()
coord_man.load_entity(model)
coord_man.orient_coords()

In [None]:
# Fill in missing loops for every protein chain that has gaps
for chain in model.protein:
    if chain.is_continuous():
        # gap-free chain: nothing to build
        continue
    # `inplace=True` lets the builder work on this chain in place
    looper = ChainLoopBuilder(chain, inplace=True)
    # Alternative template source:
    # looper.build_from_homology(max_num_match=10, identity_score_cutoff=0.95)
    # `include_terminal=True` would also build missing terminals
    looper.build_from_alphafold(include_terminal=False)

In [None]:
# Report continuity for every protein chain. Iterating explicitly avoids relying
# on the `chain` variable leaked by the loop in the previous cell, which refers
# only to the last chain and is undefined when model.protein is empty.
[chain.is_continuous() for chain in model.protein]

## Generate Topology

Topology generation is simplified by using the organized model. If `cgenff_excutable_path` is specified, topologies are also generated for the ligands.

In [None]:
# IPython help query: show the docstring of TopologyGenerator
TopologyGenerator?

In [None]:
# Create the topology generator; the CGenFF executable and output directory are
# passed so that ligands can be handled as well.
topo = TopologyGenerator(
    cgenff_excutable_path=CGENFF_PATH,
    cgenff_output_path='./cgenff/'
)
# Generate topology for the whole organized model, passing 'ACE' and 'CT3' as
# the first and last patches for protein chains.
# NOTE(review): `coerce=True` presumably maps modified residues onto standard
# ones - confirm against the TopologyGenerator docstring.
topo.generate_model(
    model,
    prot_first_patch='ACE',
    prot_last_patch='CT3',
    coerce=True
)

### Printing out the TOPPAR and their Versions Being Used and Loaded

In [None]:
# One line per residue-definition type with the TOPPAR version of its loader
for definition_type, loader in topo.res_def_dict.items():
    print(definition_type, 'toppar version:', loader.rtf_version)

In [None]:
# NOTE(review): this help query repeats an earlier cell - consider removing it
TopologyGenerator?

In [None]:
# Display the organized model after topology generation
model

Modified residues create breaks in the chain after coercion.

In [None]:
# Protonation: set up the PropKa-based protonator at pH 6 and attach the model
protonator = PropKaProtonator(topo, pH = 6)
protonator.load_model(model)
# If any patching is applied in crimm, the CHARMM PATCH command will be run
# automatically when the protein chains are loaded into CHARMM
protonator.apply_patches()

## Load Model into pyCHARMM

### Option 1: Unified Loading with `load_model()` (Recommended)

**New in crimm 2026.1**: The `load_model()` function provides a simplified one-call approach to load your entire model into pyCHARMM. This uses the native PSF/CRD format internally, which is more reliable and preserves all topology information including disulfide bonds.

```python
load_model(model)  # Loads topology params + entire model via PSF/CRD
```

This single call:
- Loads all topology parameters (RTF/PRM files) automatically
- Loads protein chains, ligands, water, and ions via PSF/CRD format
- Preserves disulfide bonds without needing separate `patch_disu_from_model()` call
- Handles lone pairs for CGENFF ligands automatically

In [None]:
# NEW: Load entire model in one call (recommended approach)
# This loads topology params AND structure via PSF/CRD format.
# The call runs by default. It is skipped only when USE_LEGACY_LOADING = True
# has been defined in an earlier cell, i.e. when the sequential (legacy) loading
# path is chosen instead, so the model is never loaded through both paths.
if not globals().get("USE_LEGACY_LOADING", False):
    load_model(model)

### Option 2: Sequential Loading (Legacy/Advanced)

**Note**: The following sequential loading approach is preserved for backwards compatibility and for users who need fine-grained control over the loading process. In previous versions, this was the only way to load structures into pyCHARMM.

If you used `load_model()` above, **skip this section** and proceed to "Minimize the Protein Chain First".

The sequential approach requires:
1. Loading topology parameters separately
2. Loading each component type individually
3. Manually patching disulfide bonds

In [None]:
# LEGACY APPROACH: Load topology separately
# This step is opt-in so that "Restart & Run All" does not repeat work already
# done by load_model() above. To use it, define USE_LEGACY_LOADING = True in an
# earlier cell and make sure load_model() is not run as well.

## All the topology definitions and parameters generated for the model are
## organized in model.topology_loader. load_topology() takes care of the
## loading sequence and only loads what is needed for the model
if globals().get("USE_LEGACY_LOADING", False):
    load_topology(model.topology_loader)

In [None]:
# LEGACY APPROACH: Load protein chains sequentially
# This step is opt-in so that "Restart & Run All" does not load the chains a
# second time after load_model() above. To use it, define
# USE_LEGACY_LOADING = True in an earlier cell and make sure load_model() is
# not run as well.
if globals().get("USE_LEGACY_LOADING", False):
    for chain in model.protein:
        load_chain(chain)
    # In legacy mode, we need to patch disulfide bonds in CHARMM manually
    # (load_model() handles this automatically via PSF)
    patch_disu_from_model(model)

In [None]:
# LEGACY APPROACH: Load ligands separately
# This step is opt-in so that "Restart & Run All" does not load the ligands a
# second time after load_model() above. To use it, define
# USE_LEGACY_LOADING = True in an earlier cell and make sure load_model() is
# not run as well.
if globals().get("USE_LEGACY_LOADING", False):
    # model.ligand+model.phos_ligand+model.co_solvent is the concatenated list of entities
    load_ligands(model.ligand+model.phos_ligand+model.co_solvent)

## Minimize the Protein Chain First

In [None]:
# Nonbonded options for the first minimization; this only sets the script up.
# Equivalent CHARMM scripting command:
# nbonds cutnb 18 ctonnb 13 ctofnb 17 cdie eps 1 atom vatom fswitch vfswitch
nonbonded_options = dict(
    cutnb=18.0,
    ctonnb=13.0,
    ctofnb=17.0,
    eps=1.0,
    cdie=True,
    atom=True,
    vatom=True,
    fswitch=True,
    vfswitch=True,
)
non_bonded_script = NonBondedScript(**nonbonded_options)

# C-alpha atoms are the ones selected for harmonic restraints
cons_harm_atoms = pycharmm.SelectAtoms(atom_type='CA')
ener_dict = sd_minimize(
    300, non_bonded_script, cons_harm_selection=cons_harm_atoms
)

## Sync Coord with pyCHARMM
After minimization, the coordinates of the crimm entities need to be updated from pyCHARMM.

In [None]:
## fetch_coords_from_charmm is the new crimm API for syncing coordinates with CHARMM.
## The old sync_coord only works in a limited number of situations and is DEPRECATED.
fetch_coords_from_charmm(
    model.protein + model.ligand + model.phos_ligand + model.co_solvent
)

In [None]:
# Display the model after the coordinates were fetched back from CHARMM
model

## Solvation

In [None]:
solvator = Solvator(model)
# remove_existing_water=False keeps the crystallographic water
added_water = solvator.solvate(
    cutoff=8.0,
    solvcut=2.1,
    remove_existing_water=False,
    orient_coords=False,
)
# Add 150 mM KCl with the new add_ions() method,
# which uses SLTCAP/SPLIT methods for accurate ion concentration
ion_chain = solvator.add_ions(
    concentration=0.15,  # 150 mM
    cation='POT',  # K+
    anion='CLA',  # Cl-
    method='auto',  # auto-selects best method based on system charge
)

## Doc Strings for Solvator

The `add_ions()` method supports three ionization methods:
- **SPLIT**: Best when system has moderate charge
- **SLTCAP**: More accurate for highly charged systems  
- **Add-Neutralize**: Simple approach (may overestimate concentration)

Use `method='auto'` to let crimm select the best method based on your system.

In [None]:
# IPython help query: show the docstring of the Solvator class
Solvator?

In [None]:
# IPython help query: show the docstring of Solvator.solvate
Solvator.solvate?

In [None]:
# IPython help query: show the docstring of Solvator.add_ions
Solvator.add_ions?

## Model after Solvation

After solvation, the model includes added water and ions. With 150 mM KCl, both K+ (POT) and Cl- (CLA) 
ions are added to achieve the target salt concentration while neutralizing the system charge.
The water box may be split into multiple chains due to the PDB residue number limit of **9999**.

In [None]:
# Display the model after solvation and ion placement
model

## Load Solvated Entities into CHARMM

After solvation, we need to load the newly added water and ions into pyCHARMM. This step is required regardless of whether you used `load_model()` or the legacy sequential approach earlier, because the `Solvator` adds new entities to the model after the initial loading.

We use `append=True` to add these new entities to the existing CHARMM PSF structure.

In [None]:
# Display the solvent entities held by the model
model.solvent

In [None]:
# Load ions and water into pyCHARMM (required even if you used load_model earlier,
# because solvation adds new entities after the initial load)
# NOTE(review): load_model() is described above as loading water and ions too -
# confirm that crystallographic water is not loaded a second time here.
load_ions(model.ion, append=True)
# This loads both crystallographic water and the water box generated by Solvator
load_water(model.solvent, append=True)
# crimm now builds hydrogens for crystallographic water automatically during solvation
# but we can also use CHARMM to rebuild them if needed
create_water_hs_from_charmm(model)

In [None]:
# Visualize the first solvent entity (the crystallographic water) to check
# that the hydrogens have been added
model.solvent[0]

## Set up PBC and Minimize Water

In [None]:
# Gather the segids of everything that is neither solvent nor ion, plus the
# residue names of all ions; both are needed for image and cons_fix setup
non_solvent_segids = set()
all_ion_types = set()
for chain in model:
    kind = chain.chain_type
    if kind == 'Solvent':
        continue
    if kind == 'Ion':
        all_ion_types.update(res.resname for res in chain)
    else:
        non_solvent_segids.update(res.segid for res in chain)

In [None]:
# anything but solvent or ions in the model
non_solvent_segids

In [None]:
# all types of ions loaded in pyCHARMM by crimm
all_ion_types

In [None]:
# Define a cubic crystal whose edge length is the solvation box dimension.
# CHARMM scripting: crystal define cubic @boxsize @boxsize @boxsize 90 90 90
crystal.define_cubic(solvator.box_dim)
# Build the crystal with half of the box dimension as the cutoff.
# CHARMM scripting: crystal build cutoff @boxhalf noper 0
crystal.build(solvator.box_dim/2)

In [None]:
# Image centering at the origin: by segment for protein, by residue for
# solvent and ions
box_center = (0.0, 0.0, 0.0)
# CHARMM scripting: image byseg xcen 0 ycen 0 zcen 0 select segid SEGID end
for segid in non_solvent_segids:
    image.setup_segment(*box_center, segid)
# CHARMM scripting: image byres xcen 0 ycen 0 zcen 0 select resname tip3 end
# followed by the same command for the resname of each ion type
for resname in ('TIP3', *all_ion_types):
    image.setup_residue(*box_center, resname)

In [None]:
# Nonbonded cutoffs for the solvated box: at most 12, and never more than half
# of the box dimension
cutnb = min(solvator.box_dim / 2, 12)
cutim = cutnb
ctofnb = cutnb - 1.0
ctonnb = cutnb - 3.0

# Another nbonds example
# CHARMM scripting: nbonds cutnb @cutnb cutim @cutim ctofnb @ctofnb ctonnb @ctonnb -
#        inbfrq -1 imgfrq -1
pbc_nonbonded_options = dict(
    cutnb=cutnb,
    cutim=cutim,
    ctonnb=ctonnb,
    ctofnb=ctofnb,
    eps=1.0,
    cdie=True,
    atom=True,
    vatom=True,
    fswitch=True,
    vfswitch=True,
    inbfrq=-1,
    imgfrq=-1,
)
non_bonded_script = pycharmm.NonBondedScript(**pbc_nonbonded_options)

In [None]:
# We want to fix the protein and ligands and minimize the solvent to "fit".
# Build the fixed selection by OR-ing one selection per non-solvent segid,
# i.e. everything but solvent and ions
cons_fix_atoms = pycharmm.SelectAtoms()
for segid in non_solvent_segids:
    cons_fix_atoms |= pycharmm.SelectAtoms(seg_id=segid)

# Minimize the solvent positions with periodic boundary conditions using steepest descents
ener_dict = sd_minimize(200, non_bonded_script, cons_fix_selection=cons_fix_atoms)

In [None]:
# Fetch the coordinates from CHARMM for the whole model, then display it
fetch_coords_from_charmm(model)
model

In [None]:
# Write the coordinate (.crd) and PSF (.psf) files from pyCHARMM, named after PDBID
pcm_write.coor_card(f'{PDBID}.crd')
pcm_write.psf_card(f'{PDBID}.psf')

## Alternative: Write PSF/CRD with Native crimm Writers (#NEW)

In addition to pyCHARMM's `write.psf_card()` and `write.coor_card()`, crimm provides native Python 
writers that don't require pyCHARMM. This is useful when:
- You need to write files without an active pyCHARMM session
- Working in environments where pyCHARMM isn't available
- Need programmatic control over the output format

**Important**: The native writers extract topology and coordinate information directly from the 
`OrganizedModel` and its associated `TopologyLoader`, ensuring consistency with the structure 
you've been working with in crimm.

In [None]:
# #NEW: Write PSF and CRD files using crimm's native writers
# These work directly with the OrganizedModel without requiring pyCHARMM

# PSF file - topology is extracted from model.topology_loader
psf_path = f'{PDBID}_crimm.psf'
write_psf(model, psf_path)
print(f"Written {PDBID}_crimm.psf")

# CRD file - coordinates are extracted from model
crd_path = f'{PDBID}_crimm.crd'
write_crd(model, crd_path)
print(f"Written {PDBID}_crimm.crd")

## Verify Output Files (#NEW)

You can verify that the native crimm writers produce equivalent output to pyCHARMM by comparing the files. 
The PSF files should contain identical atom counts, bonds, angles, and other topology information.
Minor differences in formatting or floating-point precision are expected but shouldn't affect simulations.

In [None]:
# #NEW: Quick verification - compare atom and bond counts between the PSF
# written by pyCHARMM and the one written by crimm
psf_charmm = read_psf(f'{PDBID}.psf')
psf_crimm = read_psf(f'{PDBID}_crimm.psf')

# (atom count, bond count) for each file
charmm_counts = (len(psf_charmm.atoms), len(psf_charmm.bonds))
crimm_counts = (len(psf_crimm.atoms), len(psf_crimm.bonds))

print("=== PSF Comparison ===")
print(f"pyCHARMM PSF: {charmm_counts[0]} atoms, {charmm_counts[1]} bonds")
print(f"crimm PSF:    {crimm_counts[0]} atoms, {crimm_counts[1]} bonds")
print(f"Match: {charmm_counts == crimm_counts}")