In [1]:
####################
## Initial stuff
####################

# Load functions, modules and global variables required in our pipeline
from simulate_structures_functions import *
from htmd.builder.charmm import _recoverProtonations

# Are we running curated structures??
curated = True

# PDB codes of the GPCRs to be simulated. 
# If no codes are provided, all avalible structures in GPCRdb will be used (except the ones already simulated)
noncomplex = {"3C9L","3C9M","6ZDR","6ZDV","6TPK", "6S0Q", "6V9S", "6WJC", "6W25", "6OBA", "6LW5", "6KP6", "6LUQ", "6LI1", "6LI0", "6LI2", "6KPC", "6LRY", "6KNM", "6TOT", "6TOS", "6TO7", "6TP6", "6TQ7", "6TOD", "6TP3", "6TQ4", "6TQ9", "6TP4", "6TQ6", "6TPN", "6TPG", "6TPJ", "6OL9", "6RZ6", "6RZ9", "6RZ7", "6RZ8", "6PT2", "6PT3", "6KUX", "6KUY", "6KUW", "6IQL", "6PS7", "6PS0", "6PS3", "6PS5", "6PS1", "6PS4", "6PRZ", "6PS2", "6PS6", "6KK1", "6KK7", "6KJV", "6PS8", "6JZH", "6RZ4", "6RZ5", "6KQI", "6K1Q", "6GT3", "6MH8", "6ME2", "6ME3", "6ME4", "6ME5", "6ME6", "6ME7", "6ME8", "6ME9", "6J21", "6J20", "6A94", "6A93", "5ZTY", "6HLP", "6HLO", "6HLL", "6GPX", "6GPS", "6IIV", "6IIU", "6E59", "6M9T", "6AK3", "5ZHP", "5ZKC", "5ZK3", "5YC8", "5ZK8", "5ZKB", "6IGK", "6IGL", "6FJ3", "6AKX", "6AKY", "6D27", "6D26", "6DRX", "6DS0", "6DRY", "6DRZ", "6BD4", "5ZKQ", "5ZKP", "6C1R", "6C1Q", "6D32", "6D35", "5KW2", "5ZBH", "5ZBQ", "6FK7", "6FKA",  "6FK6", "6FK9", "6FKC", "6FK8", "6FKB", "6CM4", "6FFI", "6FFH", "5WF5", "5WF6", "5V54", "5OLH", "5OLZ", "5OLO", "5OM1", "5OLG", "5OLV", "5OM4", "5YQZ", "6AQF", "5O9H", "5X33", "5VRA", "5WS3", "5WQC", "5WIV", "5WIU", "5NM4", "5NM2", "5NLX", "5X7D", "5X93", "5XPR", "5XSZ", "5N2S", "5MZP", "5MZJ", "5N2R", "5XRA", "5XR8", "5UIW", "5NX2", "5TZY", "5TZR", "5JTB", "5VBL", "5UVI", "5VEW", "5V56", "5V57", "5VEX", "5NDD", "5NDZ", "5UNH", "5UNF", "5UNG", "5TE3", "5TE5", "5UEN", "5UIG", "5TVN", "5T04", "5T1A", "5U09", "5TGZ", "5K2C", "5K2B", "5K2A", "5K2D", "5GLI", "5GLH", "5D6L", "5DYS", "5L7D", "5L7I", "5IU7", "5IUB", "5IU8", "5IU4", "5IUA", "4Z9G", "5EE7", "5DSG", "5CXV", "4ZJ8", "4ZJC", "5F8U", "5DHG", "5DHH", "4ZUD", "5A8E", "5CGC", "5CGD", "4XEE", "4XES", "4Z35", "4Z34", "4Z36", "4YAY", "4UHR", "4XNW", "4XNV", "4XT3", "4RWS", "4RWD", "4S0V", "4U15", "4U16", "4QIM", "4QIN", "4PHU", "4OO9", "4PXZ", "4PY0", "4BVN", "4NTJ", "4OR2", "4O9R", "4BUO", "4N4W", "4N6H", "4NC3", "4MBS", "4L6R", "4K5Y", "4JKV", "3ZPR", "3ZPQ", "4IAQ", "4IAR", "4IB4", "4GPO", "3VW7", "4GBR", "4GRV", "4EIY", "4AMJ", "4AMI", "4EJ4", "4EA3", "3UZA", "3UZC", "4DJH", "4DKL", "3V2Y", "3UON", "3REY", "3RFM", "3RZE", "2YCZ", "2YCW", "2YDV", "2YDO", "3QAK", "2Y04", "2Y00", "2Y03", "2Y02", "3PDS", "3PBL", "3ODU", "3OE0", "3NY9", "3NY8", "3NYA", "3D4S", "2Z73", "2RH1", "1U19", "1GZM"}
noncomplex_nonGPCRmd = {"3C9L","3C9M","6ZDR","6ZDV",'6RZ8', '6FFI', '6KJV', '6TQ6', '6DRY', '5WIV', '5V57', '6LI2', '6MH8', '5MZJ', '5KW2', '5X33', '5ZHP', '6PS6', '6FJ3', '6D35', '4NTJ', '6A94', '6AK3', '5UVI', '6FFH', '5WIU', '6KQI', '5XR8', '6RZ9', '5VEX', '6LW5', '6FKA', '6PS7', '6M9T', '6KNM', '6ME6', '6GPX', '5XSZ', '5K2A', '5YQZ', '6PS0', '5K2C', '6ME7', '2Z73', '6D32', '6RZ4', '5YC8', '5T1A', '6AKY', '4N4W', '5WS3', '6LRY', '6J21', '5TE5', '4O9R', '6FK6', '1GZM', '6DRX', '6TQ7', '5NDZ', '6IGL', '5VRA', '5ZBQ', '6TPN', '5ZTY', '6DS0', '5NLX', '5X93', '6HLP', '5OLG', '5OM4', '5O9H', '5XPR', '6A93', '5GLI', '6OBA', '6D27', '6HLO', '6RZ5', '5TE3', '6IIV', '6KP6', '6TQ4', '6TPJ', '5V56', '5UNG', '6FK9', '6PS8', '2YCZ', '6JZH', '5XRA', '5ZKB', '4JKV', '5MZP', '5K2D', '5OLV', '5X7D', '5T04', '5OLO', '5DYS', '6FK7', '6J20', '6K1Q', '5EE7', '6D26', '6PS1', '6DRZ', '4GBR', '6FKC', '6TP6', '6LUQ', '5ZKQ', '5UNH', '6PRZ', '5UNF', '5NX2', '6LI0', '5NM2', '5D6L', '6AQF', '6KK1', '6C1Q', '4Z9G', '6LI1', '5WF5', '6FKB',  '5OLH', '4XT3', '5ZK3', '5VEW', '4XES', '6HLL', '6TOT', '5N2R', '5WF6', '5K2B', '6TPG', '6PS4', '5F8U', '6ME4', '6GT3', '6RZ6', '6ME8', '6ME2', '5VBL', '5N2S', '5ZBH', '6PS3', '6IIU', '4NC3', '5JTB', '5ZKP', '6TQ9', '6GPS', '6TP3', '6RZ7', '6ME3', '5ZK8', '6C1R', '5V54', '5UIG', '5TVN', '5OM1', '6AKX', '5ZKC', '5NM4', '5TZY', '6TOD', '4GPO', '6TO7', '6CM4', '6TOS', '6PS5', '6ME9', '5NDD', '6KPC', '6KK7', '4BUO', '6OL9', '6PS2', '6FK8', '6ME5', '6TP4', '4QIM', '6IGK', '5WQC', '5UIW', '5TZR', '5OLZ', '4EJ4', '6E59'}
noncomplex_nonGPCRmd_agonist = {'5T04', '6ME9', '4BUO', '6ME6', '5VBL', '5TZR', '6D35', '6LW5', '5YQZ', '6ME3', '5WF6', '6D32', '5TE5', '4XES', '6AK3', '5DYS', '5KW2', '5WF5', '6FK6', '6ME7', '4NC3', '6ME2', '6FKA', '6DRY', '6FK7', '5TVN', '6KPC', '6KQI', '6LRY', '6IGL', '5NX2', '6M9T', '6FKC', '6FKB', '6FJ3', '6IGK', '6KNM', '6FK9', '6ME8', '6ME5', '6ME4', '5XRA', '5N2S', '5TZY', '6FK8', '6LI0', '5XR8'}
modres_set = {'5NX2', '5UIW', '5VBL', '5TE5', '6LW5', '5X7D', '4XT3'}
directedevo_set = {'6FJ3', '6NBF'}
pdb_set = directedevo_set

# Our main path
basepath = '/gpcr/users/daranda/doctorat/directed_evolution/'
# Other Paths
strucpath = basepath + 'data_structures/'
resultspath = basepath + 'simulation_output/'
membranepdb = basepath + 'membrane/popc36_box_renumbered.pdb'
topparpath = basepath + 'toppar/TOP_PARAMS_ACE3/'#toppar= topology+parameters
ligandsdict_path = basepath + 'ligands.json'
modres_path = basepath + 'modified_residues.json'
slurmpath = basepath+'fake_slurm/'
# Path to slurm queing system binaries
# In our case, Ismael designed a bunch of small bash scripts (fake_slurm) which do ssh to Hydra and execute slurm there
path= os.environ['PATH']

# Modify path to include fake slurm
%env PATH=$path:$slurmpath

# Load topology, parameter and stream files with our current basepath
topos = [os.path.join(topparpath,file) for file in toposfilenames] 
params = [os.path.join(topparpath,file) for file in paramsfilenames]
streams = [os.path.join(topparpath,file) for file in streamsfilenames]

# Open and load mutations from mutations file
mutdict = json_dict(basepath+"mutations.json")





Please cite HTMD: Doerr et al.(2016)JCTC,12,1845. https://dx.doi.org/10.1021/acs.jctc.6b00049

HTMD Documentation at: https://www.htmd.org/docs/latest/



2021-06-10 08:26:08,842 - binstar - INFO - Using Anaconda API: https://api.anaconda.org


New stable HTMD version (1.24.8 python[3.7,<3.8.0a0,3.6,<3.7.0a0]) is available. You are currently on (1.22.1).There are several methods to update:    - Create a new conda env. using `conda create -n htmd1.24.8 htmd=1.24.8 -c acellera -c psi4 -c conda-forge`    - Create a brand new conda installation and run `conda install htmd -c acellera -c psi4 -c conda-forge`    - Run: `conda update htmd -c acellera -c psi4 -c conda-forge` (NOT RECOMMENDED)





env: PATH=/home/david/bin:/home/david/.local/bin:/home/david/miniconda3/bin:/home/david/miniconda3/condabin:/home/david/bin:/home/david/.local/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/david/bin:/home/david/bin:/gpcr/users/daranda/doctorat/directed_evolution/fake_slurm/


In [2]:
################################################
# Part 1: Download data and prepare dictionaries
################################################

if not bool(pdb_set):
    #Get not yet simulated PDB codes from GPCRdb
    pdb_set = get_GPCRdb_nonsimulated(gpcrdb_dict)

# Download and store structures from GPCRdb
download_GPCRdb_structures(pdb_set, strucpath)

#Create or moidfy the ligands dictionary
(ligandsdict, ligandsset, modresdict) = ligand_dictionary(pdb_set, ligandsdict_path, modres_path, blacklist)

# Make list of aminergic ligands
aminergic_lig = find_aminergic(ligandsdict, gpcrdb_dict, blacklist, strucpath)

# Download ligand structures from system
download_ligands(ligandsset, basepath, aminergic_lig)

# Get topology-parameter files for ligands
get_lig_toppar(ligandsdict, basepath, username, password)

# Get topology-parameter files for modified residues
get_modres_toppar(modresdict, basepath, username, password)

Downloading 6FJ3 structure (1/2)
Structure for 6FJ3 already present. Skipping...
Downloading 6NBF structure (2/2)
Structure for 6NBF already present. Skipping...


In [42]:
###########################
## Part 2: Build the models 
###########################
# Iterate by GPCRdb structures to simulate
pdbs_number = len(pdb_set)
i = 0
pdb_set = {'6NBF'}
for pdbcode in pdb_set:
#     try:
        for apo in [False]:
#         for apo in [True, False]:
            
            #Starting simulation
            start_time = time.time()        
            i += 1
            sysname = pdbcode+'_apo' if apo else pdbcode
            mystrucpath = strucpath+pdbcode+'/'

            # Skip if there is already a model build for this
            if os.path.exists(resultspath+'build/'+sysname+'/structure.pdb'):
                print('Build model for '+sysname+' already exists. Skipping...')
#                 continue

            # Check if simulation is aminergic
            aminergic = gpcrdb_dict[pdbcode]['family'].startswith('001_001')
            adenosine = gpcrdb_dict[pdbcode]['family'].startswith('001_006_001')

            ## Load curated structures
            mystrucfile = mystrucpath+pdbcode+"_curated.pdb"
            gpcrdb_mol = Molecule(mystrucfile)
                
            # If the pipeline is running in 'apoform mode', remove any non-protein, non-ion, non-water thing on the system      
            # Delete also sod2x50 (we don't want them here)
            if apo:
                gpcrdb_mol.remove('not (protein or water or ion) or element Na')
                
            # Remove unnecessary ligand molecules: mostly crystalization detergents, quelants, buffers,
            # or post-traductional glicosilations
            gpcrdb_mol.remove('resname '+' '.join(blacklist))
            
            # Remove 2x50Sodium from non-A-class GPCRs
            if not gpcrdb_dict[pdbcode]['family'].startswith('001'):
                gpcrdb_mol.remove('element NA')

            # Get aligned OPM structure
            thickness,opm_mol = get_opm(pdbcode)
            
            # If there's any, parameterize and rename covalent-bound ligands
            if not apo:
                (gpcrdb_mol, covligs) = covalent_ligands(gpcrdb_mol, pdbcode, ligandsdict)
            else:
                covligs = []

            # Ismael's function to add labels (segid) for 'ligand' and 'protein' parts of the system
            gpcrdb_mol_fixed,prot_segids = fix_and_prepare_input(gpcrdb_mol,pdbcode,modresdict,new_pdb_chain)
            
            # write file to remember to which chain in the original structure belongs each segment
            segchain_json(gpcrdb_mol_fixed, pdbcode, basepath, prot_segids)
            
            # Align structrues using sequences, and take first one
            alignment_results = sequenceStructureAlignment(gpcrdb_mol_fixed, opm_mol, maxalignments = 1)
            mol_aligned = alignment_results[0] 
            
            #Center to receptor XY
            center = np.mean(mol_aligned.get('coords',sel='chain P'),axis=0)
            mol_aligned.moveBy([-center[0],-center[1],0])

            # Prepare protein: asign titration states, flipping side chains of HIS, ASN and GLN; rotate some sidechains, optimize waters, etc.
            # Most of this is done with a HTMD function called proteinPrepare()
            # Skip step if we are working with curators structures
            prepared_mol = mol_aligned if curated else prepare_system(mol_aligned, pdbcode, thickness, sod2x50, aminergic, adenosine)
            
            #Add membrane
            print('Adding membrane...')
            membranemol = Molecule(membranepdb)
            mol_membraned, membrane_resnames, membrane_segids, xreps, yreps = add_membrane(prepared_mol, membranemol,prot_segids,membrane_distance)
            
            # Needed later for equilibration
            with open(mystrucpath+"const_sel.txt",'w') as out: 
                const_sel = 'segid '+' '.join(prot_segids)+' and name C CA N O or not (segid ' + \
                  ' '.join(prot_segids)+' or resname '+' '.join(membrane_resnames) + \
                  ' or water or ions ) and noh or segid ION WAT and noh'
                out.write(const_sel)

            #Solvate
            print('Solvating...')
            mol_solvated = solvate_pdbmol(mol_membraned,membrane_segids,water_thickness,water_margin,buffer=buffer,coldist=coldist,prefix='WT')

            # Check if system has lone-pair hallogen atoms. If it does, use legacy CGenFF parameters
            (cgenff_par,cgenff_top, has_halo) =cgenff_params(gpcrdb_mol_fixed, topparpath)
            
            #Obtain extra parameters for ligands and modified residues 
            ligstreams=extra_parameters(pdbcode, ligandsdict, modresdict, blacklist, covligs, basepath, has_halo)
            
            # Assignign terminology for cap atoms of protein chain, depending if it is the receptor protein or not
            caps = get_caps(prot_segids, mol_solvated)
            #{'P0': ['first ACE', 'last CT3'], 'P1': ['first ACE', 'last CT3']}
            
            #Pre-build model
            print('Pre-build...')
            prebuildmol = charmm.build(mol_solvated, 
                                       topo=topos+cgenff_top, 
                                       param=params+cgenff_par,
                                       stream=streams+ligstreams,
                                       caps=caps,
                                       outdir=resultspath+'/pre-build/'+sysname,
                                       ionize=False)
            
            # Save prebuild model topologies in files, and  store prebuild model in molecule object
            prebuild_psffile = prebuildmol.topoloc
            prebuild_pdbfile = os.path.splitext(prebuildmol.topoloc)[0]+'.pdb'
            prebuildmol = Molecule(prebuild_pdbfile)
            _recoverProtonations(prebuildmol)

            # Checking of aromatic insertions (takes quite a lot fo time)
            print('Checking aromatic insertions...')
            mol_removed,removed_indexes = remove_aromatic_insertions(prebuildmol,prot_segids, outpdb=resultspath+'/pre-build/'+sysname+'/aromatic_check.pdb')

            # Checking of water/lipid ratio
            lipid_num = len(set(prebuildmol.get('resid',sel='segid '+membrane_lipid_segid)))
            solv_num = len(mol_removed.get('index',sel='resname TIP3 and name OH2'))
            if float(solv_num) / lipid_num < 35:
                raise ValueError('Water/lipid ratio lower than 35.')

            #Renumber residues
            print('Renumbering...')
            mol_renumbered = renumber_resid_vmd(mol_removed,'segid '+' '.join(membrane_segids),by=2)

            # Ionizing system
            print('Ionizing...')
            molbuilt = charmm.build(mol_removed,
                                    topo=topos+cgenff_top, 
                                    param=params+cgenff_par,
                                    stream=streams+ligstreams,                        
                                    outdir=resultspath+'/ionize/'+sysname,
                                    saltconc=0.15,
                                    caps=caps)
            build_psffile = molbuilt.topoloc
            build_pdbfile = os.path.splitext(molbuilt.topoloc)[0]+'.pdb'
            molbuilt = Molecule(build_pdbfile)
            _recoverProtonations(molbuilt)

            #Building system
            print('Building...')
            molbuilt = renumber_resid_vmd(molbuilt,'segid "WT.*" or segid I',by=2)
            molbuilt = charmm.build(molbuilt, 
                                    topo=topos+cgenff_top, 
                                    param=params+cgenff_par,
                                    stream=streams+ligstreams,                        
                                    outdir=resultspath+'/build/'+sysname,
                                    caps=caps,ionize=False)

            print('End of %s after %s seconds\n' % (sysname, time.time() - start_time))

#     except Exception as e:
#         print("model "+pdbcode+" could not be build because ",e)

Build model for 6NBF already exists. Skipping...


2021-04-26 12:46:58,696 - moleculekit.molecule - INFO - Removed 0 atoms. 5988 atoms remaining in the molecule.
2021-04-26 12:46:58,797 - moleculekit.molecule - INFO - Removed 0 atoms. 5988 atoms remaining in the molecule.
2021-04-26 12:47:01,410 - moleculekit.molecule - INFO - Removed 0 atoms. 5988 atoms remaining in the molecule.
2021-04-26 12:47:01,511 - moleculekit.molecule - INFO - Removed 4 atoms. 5984 atoms remaining in the molecule.
2021-04-26 12:47:01,807 - moleculekit.molecule - INFO - Removed 0 atoms. 5984 atoms remaining in the molecule.
2021-04-26 12:47:13,193 - moleculekit.molecule - INFO - Removed 0 atoms. 5984 atoms remaining in the molecule.
2021-04-26 12:47:13,782 - moleculekit.tools.sequencestructuralalignment - INFO - No segment was specified by the user for `mol` and multiple segments (['L', 'P']) were detected. Alignment will be done on all protein segments.
2021-04-26 12:47:13,913 - moleculekit.tools.sequencestructuralalignment - INFO - No segment was specified by

Adding membrane...


2021-04-26 12:47:17,287 - htmd.builder.builder - INFO - Replicating Membrane 3x3
Replicating Membrane: 100%|██████████| 9/9 [00:06<00:00,  1.36it/s]
2021-04-26 12:50:52,989 - moleculekit.molecule - INFO - Removed 2593 atoms. 47636 atoms remaining in the molecule.
2021-04-26 12:50:59,665 - moleculekit.molecule - INFO - Removed 123 residues from appended Molecule due to collisions.


Solvating...
wataerbox Max and min:  [54.916504 53.56571  27.948414] [-49.010494 -50.375294 -27.846586]


2021-04-26 12:51:01,675 - htmd.builder.solvate - INFO - Using water pdb file at: /soft/system/easybuild/software/Miniconda3/4.7.10/lib/python3.6/site-packages/htmd/share/solvate/wat.pdb
2021-04-26 12:51:03,637 - htmd.builder.solvate - INFO - Replicating 8 water segments, 2 by 2 by 2
Solvating: 100%|██████████| 8/8 [00:24<00:00,  3.01s/it]
2021-04-26 12:51:32,425 - htmd.builder.solvate - INFO - 14448 water molecules were added to the system.
2021-04-26 12:51:50,501 - moleculekit.molecule - INFO - Removed 927 atoms. 90035 atoms remaining in the molecule.


Pre-build...


2021-04-26 12:51:57,393 - htmd.builder.charmm - INFO - Writing out segments.
2021-04-26 12:52:13,910 - htmd.builder.builder - INFO - One disulfide bond was added


Disulfide Bond between: UniqueResidueID<resname: 'CYS', chain: 'P', resid: 281, insertion: '', segid: 'P'>
                   and: UniqueResidueID<resname: 'CYS', chain: 'P', resid: 351, insertion: '', segid: 'P'>



2021-04-26 12:52:15,019 - htmd.builder.charmm - INFO - Starting the build.
2021-04-26 12:52:16,939 - htmd.builder.charmm - INFO - Finished building.


Checking aromatic insertions...


KeyboardInterrupt: 

In [43]:
#########################
## Part 3: Equillibration
#########################

#for pdbcode in pdb_set:
for pdbcode in ['6FJ3','6NBF']:
    for apo in [False]:
#     for apo in [True, False]:
        try:

            # If simulation for this PDB has already been run
            modelname = pdbcode+'_apo' if apo else pdbcode
            equildir='%sequil/%s/' % (resultspath, modelname)
            if os.path.exists(equildir+'output.xtc') or os.path.exists(equildir+'simrunning'):
                print("Structure %s already has been equillibrated" %(pdbcode))
                continue

            # Preparing scripts to run equillibration
            if not os.path.exists(equildir):
                os.makedirs(equildir)

            # Taking vmd selection line
            with open(strucpath+pdbcode+'/const_sel.txt', 'r') as outfile:
                const_sel = outfile.readlines()[0]

            md = define_equilibration(const_sel)
            md.write(resultspath+'build/'+modelname,equildir)

            #Substitute run.sh generated by HTMD by a different one, adapted to the specified path of ACEMD
            with open(equildir + 'run.sh', 'w') as f:
                f.write('#!/bin/bash\n%s > %slog.txt 2>&1' % (acemd_path, equildir))

            #Prepare slurm job  
            sq = SlurmQueue()
            sq.envvars = acemd_license
            sq.jobname = 'eql_evo'
            sq.datadir = None
            sq.partition = 'gpcr_gpu'
            sq.priority = '1'
            sq.ngpu = 1
            sq.ncpu = 1
            sq.memory = 2000
            sq.prerun = job_commands(equildir, '/home/daranda/%s_eq/'%(modelname))
            sq.exclude = ['aragorn','arwen']
#                 sq.nodelist = ['aragorn','arwen']

            # Submit
            sq.submit(equildir)

        except Exception as e:
            print("model "+modelname+" could not be send to equilibrate because of ",e)

Structure 6FJ3 already has been equillibrated
Structure 6NBF already has been equillibrated


In [11]:
#######################################
## Part 3.5: Mutate and re-equillibrate
#######################################

#Iterate over pdbcodes
for pdbcode in ['6NBF']:
    equildir = '%sequil/%s/' % (resultspath, pdbcode)
    
    # Taking vmd selection line
    with open(strucpath+pdbcode+'/const_sel.txt', 'r') as outfile:
        const_sel = outfile.readlines()[0]
    
    # Write a (wrapped) PDB from last frame of equillibrated system
    equilwrap_structure(equildir)
    
    # Load equillibrated structure into molecule
    eqmol = Molecule(equildir+'equillibrated.pdb')
    
    # Iterate over mutated clones 
    for mutant in ['ML643-73','ML633-57']:
#     for mutant in mutdict:

        #Prepare directories for equillibration of mutated structure
        mutdir = '%s/mutate/%s/%s/' % (resultspath, pdbcode, mutant)
        os.makedirs(mutdir, exist_ok=True)
        equilmut = '%s/equilmut/%s/%s/' % (resultspath, pdbcode, mutant)
        os.makedirs(equilmut, exist_ok=True)
        
        # Create mutant of pdbcode in mutdir with the mutations indicated
        mol = eqmol.copy()
        mutate(mol, pdbcode, equildir, mutdir, mutdict[mutant], basepath, topparpath)

        # Start equillibration
        md = define_equilibration(const_sel, simtime=6, minimize = 1000)
        md.write(mutdir,equilmut)
        
        # Copy files of system equillibration into the mutant folder
        for sufix in ['vel','xsc']:
            shutil.copyfile(equildir+'output.'+sufix, equilmut+'input.'+sufix)
        
        # Append lines in the "input" file to use previous equillibration results
        with open(equilmut+'input', 'a') as infile:
            infile.write("extendedsystem          input.xsc\n")
        
        #Substitute run.sh generated by HTMD by a different one, adapted to the specified path of ACEMD
        with open(equilmut + 'run.sh', 'w') as f:
            f.write('#!/bin/bash\n%s > %slog.txt 2>&1' % (acemd_path, equilmut))

        #Prepare slurm job  
        sq = SlurmQueue()
        sq.envvars = acemd_license
        sq.jobname = 'mut_evo'
        sq.datadir = None
        sq.partition = 'gpcr_gpu'
        sq.priority = '1'
        sq.ngpu = 1
        sq.ncpu = 1
        sq.memory = 2000
        sq.prerun = job_commands(equilmut, '/home/daranda/%s_eq_%s/'%(pdbcode,mutant))
        sq.exclude = ['aragorn','arwen','gimli','bifur']
#                 sq.nodelist = ['aragorn','arwen']
        # Submit
        sq.submit(equilmut)
        


2021-05-03 14:25:12,303 - htmd.protocols.equilibration_v2 - INFO - Using user-provided restraints and ignoring constraints and fb_potential
2021-05-03 14:25:14,647 - jobqueues.slurmqueue - INFO - Queueing /gpcr/users/daranda/doctorat/directed_evolution/simulation_output//equilmut/6NBF/ML643-73/
2021-05-03 14:25:54,618 - htmd.protocols.equilibration_v2 - INFO - Using user-provided restraints and ignoring constraints and fb_potential
2021-05-03 14:25:56,379 - jobqueues.slurmqueue - INFO - Queueing /gpcr/users/daranda/doctorat/directed_evolution/simulation_output//equilmut/6NBF/ML633-57/


In [31]:
#####################
## Part 4: Production
#####################

# Production protocol
md = define_production(timestep, trajperiod)

# Open and load mutations from mutations file
mutdict = json_dict(basepath+"mutations.json")

# For each PDB 
for pdbcode in ['6NBF']:
#for pdbcode in pdb_set:
    for mutant in mutdict:
#     for mutant in ["ML643-73"]:
        # must match with equildir in equilibration launcher code and contain input and output of equilibration.
        modelname = pdbcode
        equildir = '%s/equil/%s/' % (resultspath, modelname)
        for rep in range(1,repnum+1):
            try:
                #Prepare directories for equillibration of mutated structure
                proddir = '%s/production/%s/%s/rep_%d/' % (resultspath, pdbcode, mutant, rep)
                os.makedirs(proddir, exist_ok=True)
                equilmut = '%s/equilmut/%s/%s/' % (resultspath, pdbcode, mutant)
                os.makedirs(equilmut, exist_ok=True)

                # If simulation for this PDB has already been run
                if os.path.exists(proddir+'/output.xtc') or os.path.exists(proddir+'simrunning'):
                    print("mutant %s replicate %d of structure %s has already been simulated" %(mutant, rep, pdbcode))
                    continue

                print('submitting mutant %s replicate %d of %s' % (mutant, rep, pdbcode))
                # directory copy output of equilibration to production input (initial working directory for run_prod.sh).
                md.write(equilmut,proddir)

                sq = SlurmQueue()
                sq.envvars = acemd_license
                sq.jobname = mutant
                sq.datadir = None
                sq.partition = 'gpcr_gpu'
                sq.prerun = job_commands(proddir, '/home/daranda/%s_pr_%d/'%(modelname,rep))
                sq.ngpu = 1
                sq.ncpu = 2
                sq.exclude = ['arwen','aragorn','bifur','gimli']            

                #Substitute run.sh generated by HTMD by a different one, adapted to the specified path of ACEMD
                with open(proddir + 'run.sh', 'w') as f:
                    f.write('#!/bin/bash\n%s > %slog.txt 2>&1' % (acemd_path, proddir))

                sq.submit(proddir)
            except Exception as e:
                print("model "+modelname+" could not be send to production because of ",e)            
                

mutant ML644-1 replicate 1 of structure 6NBF has already been simulated
mutant ML644-1 replicate 2 of structure 6NBF has already been simulated
mutant ML644-1 replicate 3 of structure 6NBF has already been simulated
mutant ML643-31 replicate 1 of structure 6NBF has already been simulated
mutant ML643-31 replicate 2 of structure 6NBF has already been simulated
mutant ML643-31 replicate 3 of structure 6NBF has already been simulated
mutant ML643-27 replicate 1 of structure 6NBF has already been simulated
mutant ML643-27 replicate 2 of structure 6NBF has already been simulated
mutant ML643-27 replicate 3 of structure 6NBF has already been simulated
mutant ML643-12 replicate 1 of structure 6NBF has already been simulated
mutant ML643-12 replicate 2 of structure 6NBF has already been simulated
mutant ML643-12 replicate 3 of structure 6NBF has already been simulated
mutant ML643-36 replicate 1 of structure 6NBF has already been simulated
mutant ML643-36 replicate 2 of structure 6NBF has alre

2021-06-08 12:24:38,175 - jobqueues.slurmqueue - INFO - Queueing /gpcr/users/daranda/doctorat/directed_evolution/simulation_output//production/6NBF/ML643-73/rep_1/


mutant ML643-73 replicate 2 of structure 6NBF has already been simulated
submitting mutant ML643-73 replicate 3 of 6NBF


2021-06-08 12:25:04,008 - jobqueues.slurmqueue - INFO - Queueing /gpcr/users/daranda/doctorat/directed_evolution/simulation_output//production/6NBF/ML643-73/rep_3/


mutant ML644-7 replicate 1 of structure 6NBF has already been simulated
mutant ML644-7 replicate 2 of structure 6NBF has already been simulated
mutant ML644-7 replicate 3 of structure 6NBF has already been simulated
mutant ML634-10 replicate 1 of structure 6NBF has already been simulated
mutant ML634-10 replicate 2 of structure 6NBF has already been simulated
mutant ML634-10 replicate 3 of structure 6NBF has already been simulated


In [None]:
# WARNING!!!: run me to KILL simulations that are still running
for modelname_rep in sqs_p:
    sqs_p[modelname_rep].stop()

In [13]:
# reset tracking for all
sqs_p = {}

In [30]:
##########################
## Part 5: Wrap Structures
##########################

# Things to wrap around
gpcr_sel = "protein and chain "+new_pdb_chain

# For each PDB 
for pdbcode in ['6NBF']:
#for pdbcode in pdb_set:
    for mutant in ['ML644-7','ML643-36','ML643-73']:
#     for mutant in ["ML643-73"]:
        # must match with equildir in equilibration launcher code and contain input and output of equilibration.
        modelname = pdbcode
        mutdir = '%sproduction/%s/%s/' % (resultspath, modelname, mutant)
        pdbname = glob(mutdir+'/rep_*/structure.pdb')[0]
        mymol_pdb = Molecule(pdbname)
        for rep in range(2,repnum+1):
        #for rep in [3]:
            start_time = time.time()        
            print('wrapping replicate %d of %s-%s' % (rep, modelname, mutant))
            proddir='%s/rep_%d/' % (mutdir, rep)
            rep = str(rep)

            # To avoid repeating wrapping in Trajectories already wrapped, check the existance of this file
            outname = proddir+'output_wrapped.xtc'
            if os.path.exists(outname):
                print('replicate already wrapped. Skipping...')
                continue

            # Skip if traj not avalible
            trajname = proddir+'output.xtc'
            if not os.path.exists(trajname):
                print("trajectory %s of mutant %s not avalible. Skipping..."%(rep,mutant))
                continue
                
            # Open a vmd viewer, and load molecule inside 
            mymol = Molecule(proddir+'structure.psf')
            mymol.read(trajname)
            mymol.wrap(gpcr_sel)

            # Align frames
            mymol.align('all', refmol=mymol_pdb)
            mymol.write(outname)

            print('End of %s after %s seconds\n' % (pdbcode, time.time() - start_time))


wrapping replicate 2 of 6NBF-ML644-7
replicate already wrapped. Skipping...
wrapping replicate 3 of 6NBF-ML644-7
replicate already wrapped. Skipping...
wrapping replicate 2 of 6NBF-ML643-36
trajectory 2 of mutant ML643-36 not avalible. Skipping...
wrapping replicate 3 of 6NBF-ML643-36
trajectory 3 of mutant ML643-36 not avalible. Skipping...
wrapping replicate 2 of 6NBF-ML643-73
End of 6NBF after 161.52763557434082 seconds

wrapping replicate 3 of 6NBF-ML643-73


ValueError: Number of atoms in file (89618) mismatch with number of atoms in the molecule (89602)

In [118]:
###################################
## Part 5: Upload results to GPCRmd
###################################

#### Functions

mainurl = 'http://localhost:8000' 
# mainurl = 'https://submission.gpcrmd.org'

def resp_to_dict(resp):
    # Convert a json reponse into a dictionary
    return eval(resp.content.decode('UTF-8').replace('true', 'True').replace('false','False'))

def check_chains(pdbcode, mymol):
    """
    Check how many chains from the original PDB structure remain in mymol structure
    And to which Segments of our structure they correspond
    """
    # Load blosum score matrix to align proteins
    blosum62 = substitution_matrices.load("BLOSUM62")

    # Obtain sequences for original PDB file chains, and classifying them by chainID
    # Also check which segment corresponds to what chain
    pdbmol = Molecule(pdbcode)
    chainseg = {}
    chainset = set(pdbmol.get('chain', sel='protein'))
    pdbmol_segseqs = pdbmol.sequence()
    pdbmol_chainseqs = {}
    for chain in chainset:
        segid = np.unique(pdbmol.get('segid', sel='chain '+chain))[0]
        pdbmol_chainseqs[chain] = pdbmol_segseqs[segid]

    # Merging all protein chains in our systems into a single megachain
    mymol_megachain = ''
    for seg,chain in mymol.sequence().items():
        mymol_megachain = mymol_megachain + chain

    # Aligning sequences from original PDB to simulated PDB megasequence
    # To know which chains from the original PDB are preserved
    chain_present = {}
    segtochain = {}
    # For each chain in the original PDB file
    for chain,seq in pdbmol_chainseqs.items():
        chain_present[chain] = False
        # For each segment in our molecule
        for seg,myseq in mymol.sequence().items():
            mylen = len(myseq)
            # Align our molecule segments to the pdb chains
            aligs = pairwise2.align.localms(seq, myseq, 5,-1, -1.5, -1)
            # Check if any of the alignments has more than 4 score by position
            is_present = any([ (alig.score/mylen) > 4 for alig in aligs ]) 
            if is_present:
                chain_present[chain] = is_present
                segtochain[seg] = chain

    return (chain_present, segtochain)

def get_pdb_info(pdbcode, mymol, ligandsdict):
    """
    Get informarion of the system specified in the pdbcode from the RCSB-PDB webpage.
    Mainly uniprot sequences, chainIDs and uniprot codes for the chains
    """
    
    # Make string for selecting lignads in rcsb's api
    ligs = '"'+'","'.join(ligandsdict[pdbcode].keys())+'"'
    
    #Check which chains from the original PDB structure are preserved in mymol structure
    (chain_present, segtochain) = check_chains(pdbcode, mymol)
    
    # Get ligand molecules present in our system (basically anything that is not a protein)
    ligset = set(mymol.get('resname', sel='not protein'))
    
    # Extract information from pdb webpage using api
    ligdict = dict()
    protdict = dict()
    datadict = dict()
    
    # Get information from PDB api
    pdbdict = requests.get('https://data.rcsb.org/graphql?query={\
        entry(entry_id: "'+pdbcode+'") {\
            exptl {method}\
            polymer_entities {\
                entity_poly{pdbx_strand_id}\
                rcsb_polymer_entity{pdbx_description}\
                rcsb_polymer_entity_container_identifiers{uniprot_ids}\
                entity_src_gen{gene_src_common_name, pdbx_gene_src_scientific_name}\
            }\
        }\
        chem_comps(comp_ids: ['+ligs+']) {\
            rcsb_chem_comp_descriptor {InChIKey}\
            chem_comp{id,name}\
        }\
    }').json()['data']
    
    # Get ligand information and classify it (in case the system actually has ligands)
    if len(pdbdict['chem_comps'])>0:
        for lig in pdbdict['chem_comps']:
            ligandInchi = lig['rcsb_chem_comp_descriptor']['InChIKey']
            ligandResname = lig['chem_comp']['id']
            ligandName = lig['chem_comp']['name']
    
        # Exclude ligands not present in our simulated system
        if ligandResname in ligset:
            ligdict[ligandResname] = (ligandName,ligandInchi)

    # Extract protein chains information
    for poly in pdbdict['entry']['polymer_entities']:
        uniprot = poly['rcsb_polymer_entity_container_identifiers']['uniprot_ids'][0] if poly['rcsb_polymer_entity_container_identifiers']['uniprot_ids'] else ''
        uniname = poly['rcsb_polymer_entity']['pdbx_description'].lower()
        chainId = poly['entity_poly']['pdbx_strand_id'].split(',')[0]
        if poly['entity_src_gen']:
            species = "%s (%s)"%(str(poly['entity_src_gen'][0]['gene_src_common_name']).upper(), poly['entity_src_gen'][0]['pdbx_gene_src_scientific_name'])
            species.replace('NONE', '')
        else:
            species = ""
            
        # Determine if this polymer (chain) is a GPCR
        isgpcr = True if (('receptor' in uniname) or ('rhodopsin' in uniname)) else False

        # If this chain is present in our mymol structure
        if chain_present[chainId]:
                
            # Get which segment(s) this chain is assigned to 
            segs = [ seg for seg,chain in segtochain.items() if chain == chainId ]
            protdict[chainId] = (uniprot, isgpcr, segs, uniname, species)

        # Little exception for my favorite mutants
        if pdbcode in ('6FJ3', '6NBF') and chainId == 'P':
            protdict[chainId] = ('Q27IM2', isgpcr, ['L'], uniname, species)
            segtochain['L'] = 'P'
            
    # Determine experimental method used, and use the corresponding id in GPCRmd database
    method = pdbdict['entry']['exptl'][0]['method'].lower()
    if 'x-ray' in method:
        method_id = 0
    elif 'nmr' in method:
        method_id = 1
    elif 'electron microscopy' == method:
        method_id = 4
    else:
        method_id = 5 # Other method
    
    return (protdict,ligdict,segtochain,method_id)

def login(s):
    headers = {
        'Cookie': 'csrftoken=cuGA6CSGmXfMbLwqlPoGjLLN7QkO6rZ7',
        'Referer': mainurl+'/accounts/login/',
    }
    datalogin = {
        'username': 'david',
        'password': 'Ameboid',
        'next' : '/accounts/memberpage/',
        'csrfmiddlewaretoken' : 'cuGA6CSGmXfMbLwqlPoGjLLN7QkO6rZ7'
    }
    logo = s.post(mainurl+'/accounts/login/', 
               data=datalogin,
               headers=headers)
    return s

def submission_step1(subm_id,s,protdict,mymol):
    """
    Do step 1 of GPCRmd submission protocol
    That is, submit information about the protein chains contained in the system
    """
    
    sessionid = str(s.cookies['sessionid'])
    csrftoken = str(s.cookies['csrftoken'])

    headers = {
        'Referer' : mainurl+'/dynadb/protein/'+subm_id+'/',
        'Cookie' : 'csrftoken=%s; sessionid=%s' %(csrftoken,sessionid),
        'X-CSRFToken' : csrftoken
    }
    print('initiating step 1: protein data')
    i = 0
    protdict_2 = dict()
    step1_data = {'csrfmiddlewaretoken': csrftoken}
    rec_name = None
    # For each chain in this system (using the chainids of the original PDB)
    for chainid,(uniprot,isgpcr,seglist,name,species) in protdict.items():
        h = str(i)
        
        # Retrieve uniprot data
        if uniprot:
            data = {'uniprotkbac': uniprot}
            resp = s.post(mainurl+'/dynadb/protein/get_data_upkb/',
                  data = data,
                  headers = headers)
            unidict = resp_to_dict(resp)
        else:
            unidict = {
                'Entry' : None,
                'Isoform' : '',
                'Name' : name,
                'Organism' : species,
                'Aliases' : None,
                'Sequence' : ''
            }
        
        # Align wild type chain and the chan of our molecule
        myseq = ''
        for seg in seglist:
            myseq += mymol.sequence()[seg]
        wtseq = unidict['Sequence']
        resp = s.post(mainurl+'/dynadb/protein/'+subm_id+'/alignment/',
                      data = {'wtseq' : wtseq, 'mutant': myseq},
                      headers = headers
        )
        alignment = resp_to_dict(resp)['alignment']

        # Get mutations from alignment
        resp = s.post(mainurl+'/dynadb/protein/get_mutations/',
                      data = {'alignment' : alignment, 'sequence': wtseq},
                      headers = headers
        )
        mutations = resp_to_dict(resp)['mutations']
        # Filter mutations (to avoid taking as mutations the cut of the N and C terminal loops)
        realmutations = mutations
        #realmutations = [ mut for mut in mutations if (mut['from'] != '-') and (mut['to'] != '-') ]

        # Store mutations into post data
        if len(realmutations):
            step1_data['form-'+h+'-msequence'] = myseq
            step1_data['form-'+h+'-is_mutated'] = 0
            step1_data['form-'+h+'-alignment'] = alignment
            u = 0
            for mut in realmutations:
                v = str(u)
                step1_data['form-'+h+'-resid-'+v] = mut['resid']
                step1_data['form-'+h+'-resletter_from-'+v] = mut['from']
                step1_data['form-'+h+'-resletter_to-'+v] = mut['to']
                u += 1
        
        # Store retrieved data into post data dictionary
        step1_data['form-'+h+'-is_not_uniprot'] = '' if unidict['Entry'] else 'on'
        step1_data['form-'+h+'-sequence'] = wtseq
        step1_data['form-'+h+'-uniprotkbac'] = unidict['Entry']
        step1_data['form-'+h+'-isoform'] = unidict['Isoform']
        step1_data['form-'+h+'-name'] = unidict['Name']
        step1_data['form-'+h+'-id_species_autocomplete'] = unidict['Organism']
        step1_data['form-'+h+'-other_names'] = unidict['Aliases']
        step1_data['form-'+h+'-sequence'] = unidict['Sequence']
        if isgpcr:
            rec_name = unidict['Name']
            print(rec_name+' identified as GPCR')
            step1_data['form-'+h+'-receptor'] = 0
        else:
            step1_data['form-'+h+'-receptor'] = 1
        i+=1

        # Put uniprot code in list
        protdict_2[chainid] = {'uniprot': unidict['Entry'],
                                 'position' : i,
                                 'name' : unidict['Name']
                                }
    
    # Sent step 1 data
    step1_response = s.post(mainurl+'/dynadb/protein/'+subm_id+'/',
                     data = step1_data,
                     headers = headers)
        
    # We'll need to remember the order in which the system's chains have been submited
    # Also the name of the receptor for step3
    return (protdict_2, rec_name)

def smol_submission(s, i, subm_id, sdfpath, smol, smol_key, smol_dict, mol):
    
    # Prepare headers, data and files
    h = str(i)
    sessionid = str(s.cookies['sessionid'])
    csrftoken = str(s.cookies['csrftoken'])
    print('submitting small molecule '+smol)
    data = {
        'csrfmiddlewaretoken': csrftoken,
        'molpostkey': 'form-'+h+'-molsdf',
    }
    headers = {
        'Referer' : mainurl+'/dynadb/molecule/'+subm_id+'/',
        'Cookie' : 'csrftoken=%s; sessionid=%s' %(csrftoken,sessionid),
        'Origin': 'http://localhost:8000',
        'X-CSRFToken' : csrftoken
    }
    files = { 'form-'+h+'-molsdf' : open(sdfpath, 'r') }

    # UPLOAD sdf
    data_upload = {
        'csrfmiddlewaretoken': csrftoken,
        'form-'+h+'-is_present': 'on',
        'form-'+h+'-description': 'Standard form',
        'form-'+h+'-search_by_pubchem': 'sinchi',
        'form-'+h+'-retrieve_type_pubchem': 'parent',
        'form-'+h+'-neutralize_pubchem': '1',
        'form-'+h+'-search_by_chembl': 'smiles',
        'form-'+h+'-similarity': '100',
        'form-'+h+'-retrieve_type_chembl': 'parent',
        'form-'+h+'-neutralize_chembl': '1',
        'molpostkey': 'form-'+h+'-molsdf',
        'pngsize': '300'
    }
    resp = s.post(mainurl+'/dynadb/molecule/'+subm_id+'/generate_properties/',
                 headers = headers,
                 files = files,
                 data = data_upload)
    
    # Get pubchem info of compound
    resp = requests.get('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/'+smol_key+'/property/CanonicalSMILES,Charge,InChI,IUPACName,/JSON')
    pub_dict = eval(resp.content.decode('UTF-8').replace('null', 'None'))['PropertyTable']['Properties'][0]

    # Get names of compound
    resp = requests.get('https://pubchem.ncbi.nlm.nih.gov/rest/pug/compound/inchikey/'+smol_key+'/synonyms/TXT')
    synonyms = resp.text.replace("\n", '; ')

    # Get official name of compound
    resp = requests.get('https://pubchem.ncbi.nlm.nih.gov/compound/'+str(pub_dict['CID']))
    soup = BeautifulSoup(resp.text,'html')
    smol_name = soup.find('meta',attrs={'property' : 'og:title'}).get('content')
    
    # Get chemblid
    try:
        resp = requests.get('https://www.ebi.ac.uk/chembl/api/data/molecule/'+smol_key)
        tree = ET.fromstring(resp.text)
        chemblid = tree.find('molecule_chembl_id').text.replace('CHEMBL','')
    except Exception as e:
        print("Chemblid not avalible for molecule "+smol_name)
        chemblid = ""
    
    # Use obtained data to submit small molecule
    submit_data = {
        'csrfmiddlewaretoken': csrftoken,
        'form-'+h+'-molsdf': '',
        'form-'+h+'-upload_button': '', 
        'form-'+h+'-is_present': 'on',
        'form-'+h+'-inchi': pub_dict['InChI'],
        'form-'+h+'-sinchikey': smol_key,
        'form-'+h+'-net_charge': str(pub_dict['Charge']),
        'form-'+h+'-inchikey': smol_key,
        'form-'+h+'-smiles': pub_dict['CanonicalSMILES'],
        'form-'+h+'-description': '', 
        'form-'+h+'-get_mol_info': '',
        'form-'+h+'-is_not_in_databases': 'on',
        'form-'+h+'-search_by_pubchem': 'sinchi',
        'form-'+h+'-retrieve_type_pubchem': 'original',
        'form-'+h+'-neutralize_pubchem': '1',
        'form-'+h+'-search_by_chembl': 'smiles',
        'form-'+h+'-similarity': '100',
        'form-'+h+'-retrieve_type_chembl': 'original',
        'form-'+h+'-neutralize_chembl': '1',
        'form-'+h+'-name': smol_name,
        'form-'+h+'-iupac_name': pub_dict['IUPACName'],
        'form-'+h+'-pubchem_cid': str(pub_dict['CID']),
        'form-'+h+'-update_from_pubchem': '', 
        'form-'+h+'-chemblid': chemblid,
        'form-'+h+'-update_from_chembl': '',
        'form-'+h+'-other_names': synonyms,
        'form-'+h+'-passMoleculePOST': 'passMoleculePOST',
        'form-'+h+'-add_molecule': '+ Add molecule',
        'form-'+h+'-del_molecule': '- Remove molecule',
        'form-'+h+'-reset': '',
    }

    # Dictionary of crystalized components (useful in the future)
    smol_dict[smol] = {
                      'name' : smol_name,
                      'num_mol' : len(np.unique(mol.get('resid', sel='resname '+smol))),
                      'order_mol' : str(i+1)
    }  

    # Add bulk or co-crystalized properties
    ligname = None
    if smol == 'TIP3':
        submit_data['form-'+h+'-bulk_type'] = '0'
        submit_data['form-'+h+'-type'] = '6'
        smol_dict[smol]['crystalized'] = 0
        smol_dict[smol]['type'] = 'Water'
    elif smol == 'POPC':
        submit_data['form-'+h+'-bulk_type'] = '0'
        submit_data['form-'+h+'-type'] = '7'
        smol_dict[smol]['crystalized'] = 0
        smol_dict[smol]['type'] = 'Lipid'
    elif smol == 'CLR':
        submit_data['form-'+h+'-type'] = '3'
        smol_dict[smol]['crystalized'] = 1
        smol_dict[smol]['type'] = 'Lipid'
        ligname = 'CLR'
    elif (smol == 'CLA') or (smol == 'SOD'):
        submit_data['form-'+h+'-bulk_type'] = '0'
        submit_data['form-'+h+'-type'] = '8'
        smol_dict[smol]['crystalized'] = 0
        smol_dict[smol]['type'] = 'Ions'        
    else:#Else make it co-cristalized orthosteric ligand. TODO: recognize orthosteric ligand
        submit_data['form-'+h+'-type'] = '0'
        smol_dict[smol]['crystalized'] = 1
        smol_dict[smol]['type'] = 'Ligand'
        ligname = smol_name
        
    #Send small molecule
    resp = s.post(mainurl+'/dynadb/molecule/'+subm_id+'/',
                  data = submit_data,
                  headers = headers)
    
    # Return smol_dict with a new entry, new molecule marker (i) and the small molecule name if it is the ligand
    i += 1
    if ligname:
        return (i, smol_dict, smol_name)
    else:
        return (i, smol_dict)

def submission_step3(s, subm_id, pdbpath, recname, lignames, protdict, smol_dict, segtochain, method_id, sufix_sysname = "" ):
    """
    Perform step3 of GPCRmd submission for specified system
    """
    
    # Data needed for the submission
    sessionid = str(s.cookies['sessionid'])
    csrftoken = str(s.cookies['csrftoken'])
    headers = {
        'Referer' : mainurl+'/dynadb/model/'+subm_id+'/',
        'Cookie' : 'csrftoken=%s; sessionid=%s' %(csrftoken,sessionid)
    }
    data_submit = {
        'csrfmiddlewaretoken' : csrftoken,
        'prtnam' : [],
        'prtnum' : [],
        'uniprot' : []
    }
    print('initiating step 3: crystalized components')

    # Determine name of the complex
    ligname = ','.join(lignames)
    sysname = recname + ' in complex with ' +ligname if ligname else recname
    sysname = sysname+sufix_sysname
    print('sysname: ', sysname)
    #Cut excessively large names
    if len(sysname) > 100:
        sysname = sysname[0:99]
    
    # Part A: General information
    data_submit['name'] = sysname
    data_submit['type'] = 1 # TODO: check which number corresponds to apoform and which to complex
    data_submit['pdbid'] = pdbcode
    data_submit['description'] = "" # No description. Sorry
    data_submit['source_type'] = method_id

    # Upload model file
    upl_resp = s.post(mainurl+'/dynadb/model/'+subm_id+'/upload_model_pdb/',
          headers = headers,
          data = data_submit,
          files = {'file_source' : open(pdbpath)})
    
    # Part B1: Submitted proteins summary
    for chain in protdict:
        data_submit['prtnam'].append(protdict[chain]['name'])
        data_submit['prtnum'].append(protdict[chain]['position'])
        data_submit['uniprot'].append(protdict[chain]['uniprot'])

    # Part B2: Curated protein data: protein segments
    prot_segs = set(mymol.get('segid', sel="protein"))
    iprot = 0
    for segid in prot_segs:
        # Get segment information (uniprot code, chainid, and starting-ending residues)
        auldchain = segtochain[segid]
        prot_num = protdict[auldchain]['position']
        chainid = np.unique(mymol.get('chain', sel='segid '+segid))[0]
        segres = set(mymol.get('resid', sel='segid '+segid))
        from_res = min(segres)
        to_res = max(segres)
        iprot_s = str(iprot)

        # Add obtained info to data submit
        data_submit['formps-'+iprot_s+'-prot'] = prot_num
        data_submit['formps-'+iprot_s+'-chain'] = chainid
        data_submit['formps-'+iprot_s+'-segid'] = segid
        data_submit['formps-'+iprot_s+'-resid_from'] = from_res
        data_submit['formps-'+iprot_s+'-resid_to'] = to_res
        data_submit['formps-'+iprot_s+'-seq_resid_from'] = from_res # Are you sure they are always equivalents?
        data_submit['formps-'+iprot_s+'-seq_resid_to'] = to_res
        data_submit['formps-'+iprot_s+'-pdbidps'] = pdbcode
        data_submit['formps-'+iprot_s+'-source_typeps'] = 1 # I dont care about this part. Is meant to be removed anyways
        data_submit['formps-'+iprot_s+'-bonded_to_id_modeled_residues'] = None
        iprot += 1
    
    # Part C: Cocrystalized small molecules
    typesid={# This data submit uses numerical values for small_molecule types
        'Ions' : '0',
        'Ligand' : '1',
        'Lipid' : '2',
        'Water' : '3',
        'Other' : '4'
    }
    # Get submission form page, and extract required information from there
    rep = s.get(mainurl+'/dynadb/model/'+subm_id+'/', headers = headers)
    soup = BeautifulSoup(rep.text, 'html.parser')
    # For every crystalized small molecule, add an entry in data_submit
    # Many things are wrong here, but since this part is not going to be used anywhere nobody cares.
    for lig in smol_dict:
        if smol_dict[lig]['crystalized']:
            ordmol = smol_dict[lig]['order_mol']
            id_i = soup.find('input', attrs = {'id': re.compile(r'id_formmc-\d+-molecule'), 'value':ordmol}).get('id')
            i = re.findall("\d+", id_i)[0]
            data_submit['formmc-'+i+'-resname'] = lig
            data_submit['formmc-'+i+'-numberofmol'] = smol_dict[lig]['num_mol']
            data_submit['formmc-'+i+'-molecule'] = smol_dict[lig]['order_mol']
            data_submit['formmc-'+i+'-id_molecule'] = soup.find('input',attrs = {'id': 'id_formmc-'+i+'-id_molecule'}).get('value')
            data_submit['formmc-'+i+'-namemc'] = smol_dict[lig]['name']
            data_submit['formmc-'+i+'-typemc'] = typesid[smol_dict[lig]['type']]
    print(data_submit)
    
    # Submit step3
    resp = s.post(mainurl+'/dynadb/model/'+subm_id+'/',
          headers = headers,
          data = data_submit)

def submission_step4(s, subm_id, modelname, protdict, smol_dict, timestep, trajperiod, files_path):
    
    # For topology, coordinates, and patameters files I use replicate 1 files as I could use 
    #any other ones. Replicates only differ in trajectory files
    #Nonetheless the represnetation has to actually exist
    for rep in range(1,repnum+1):
        files_path_rep = "%srep_%d/" % (files_path, rep)
        if os.path.exists(files_path_rep+"structure.pdb"):
            files_path_rep1 = files_path_rep
    
    # Requests session and headers
    sessionid = str(s.cookies['sessionid'])
    csrftoken = str(s.cookies['csrftoken'])
    data_submit = {
        'csrfmiddlewaretoken' : csrftoken,
        'prtnam' : [],
        'prtnum' : [],
        'uniprot' : []
    }
    headers_submit = {
        'Referer' : mainurl+'/dynadb/dynamics/'+subm_id,
        'Cookie' : 'csrftoken=%s; sessionid=%s' %(csrftoken,sessionid),
        'X-CSRFToken' : csrftoken,
    }
    print('initiating step 4: simulation information')

    # Part A: Upload simulation files
    referer_path = mainurl+'/dynadb/dynamics/'+subm_id+'/upload_files/?file_type='
    
    # Coordinate file
    headers = {
        'Referer' : referer_path+'coor',
        'Cookie' : 'csrftoken=%s; sessionid=%s' %(csrftoken,sessionid),
        'X-CSRFToken' : csrftoken,
        'Connection' : 'keep-alive',
    }
    data = {
        'csrfmiddlewaretoken' : csrftoken,
        'file_type' : 'coor',
        'filekey' : 'coor',
    }
    files = {
        'coor' : open(files_path_rep1+'structure.pdb')
    }

    resp = s.post(mainurl+'/dynadb/dynamics/'+subm_id+'/upload_files/',
          data = data,
          files = files,
          headers = headers)
    
    # Topology file
    headers['Referer'] = referer_path+'top'
    data['file_type'] = 'top'
    data['filekey'] = 'top'
    files = { 'top' : open(files_path_rep1+'structure.psf')}
    resp = s.post(mainurl+'/dynadb/dynamics/'+subm_id+'/upload_files/',
          data = data,
          files = files,
          headers = headers)
    
    # Trajectory files
    headers['Referer'] = referer_path+'traj'
    data['file_type'] = 'traj'
    data['filekey'] = 'traj'
    files = []
    print('uploading trajs')
    # Include trajectory files that actually exist
    """for rep in range(1,repnum+1):
        traj_path = files_path+'/rep_%d/output_wrapped.xtc'%rep
        if os.path.exists(traj_path):
            files.append(('traj', open(traj_path, 'rb')))
    
    resp = s.post(mainurl+'/dynadb/dynamics/'+subm_id+'/upload_files/traj/',
          data = data,
          files = files,
          headers = headers)
    """
    
    # Parameters files (compress and upload)
    with tarfile.open(files_path_rep1+'parameters.tar.gz', "w:gz") as tar:
        tar.add(files_path_rep1+'parameters', arcname='parameters')
    headers['Referer'] = referer_path+'parm'
    data['file_type'] = 'parm'
    data['filekey'] = 'parm'
    files = { 'parm' : open(files_path_rep1+'parameters.tar.gz','rb')}
    resp = s.post(mainurl+'/dynadb/dynamics/'+subm_id+'/upload_files/',
          data = data,
          files = files,
          headers = headers)
    
    # Part B1: Submitted proteins summary
    for chain in protdict:
        data_submit['prtnam'].append(protdict[chain]['name'])
        data_submit['prtnum'].append(protdict[chain]['position'])
        data_submit['uniprot'].append(protdict[chain]['uniprot'])

    # Part B2: Resubmit (third time...) the ligand elements
    # Take page 4 of form to extract smalmol info
    rep = s.get(mainurl+'/dynadb/dynamics/'+subm_id+'/', headers = headers)
    soup = BeautifulSoup(rep.text, 'html.parser')
    for smol in smol_dict:

        # Take the 'h' (number assigned to the ids of the inputs of this molecule in the web)
        idmol = smol_dict[smol]['order_mol']
        id_obj = soup.find('input', {'name' : re.compile(r"formc-\d+-molecule"), "value" : [idmol,' '+idmol+' ']}).get('id')
        h = re.search('-(\d+)-', id_obj).group(1)

        # Molecule data to submit 
        data_submit['formc-'+h+'-resname'] = smol
        data_submit['formc-'+h+'-molecule'] = smol_dict[smol]['order_mol']
        data_submit['formc-'+h+'-id_molecule'] = soup.find('input',attrs = {'id': 'id_formc-'+h+'-id_molecule'}).get('value')
        data_submit['formc-'+h+'-name'] = smol_dict[smol]['name']
        data_submit['formc-'+h+'-numberofmol'] = smol_dict[smol]['num_mol']
        data_submit['formc-'+h+'-typemc'] = smol_dict[smol]['type']
        data_submit['formc-'+h+'-type_int'] = soup.find('input',attrs = {'id': 'id_formc-'+h+'-type_int'}).get('value')

    # Part C: Simulation specs
    data_submit['id_dynamics_methods']= '1' #Molecular mechanics
    data_submit['software']= 'ACEMD3'
    data_submit['sversion']= 'GPUGRID' # TODO: Is that correct??
    data_submit['ff']= 'CHARMM'
    data_submit['ffversion']= '36m Feb 2016'
    data_submit['id_assay_types']= 1 # Orthosteric (un)/binding
    data_submit['id_dynamics_membrane_types']= 2 # Homogeneus membrane
    data_submit['id_dynamics_solvent_types']= 2 # TIP3P solvent
    data_submit['solvent_num']= len(mymol.get('resid', sel='resname TIP3'))
    data_submit['atom_num']= len(mymol.get('name'))
    data_submit['timestep']= timestep
    data_submit['delta'] = (trajperiod*timestep)/10e6
    data_submit['description'] = 'autosubmission' # time/frames

    # Submit step 4
    rep = s.post(mainurl+'/dynadb/dynamics/'+subm_id+'/',
            headers = headers,
            data = data_submit)
    

In [18]:
#### Actual Simulation submission

# For each of the currently-working-with systems defined in Part 1
#for pdbcode in pdb_set:
#for pdbcode in testset:
for pdbcode in ['6NBF']:
    for apo in [False]:
        for mutant in ['ML644-7', 'ML643-36', 'ML643-73']:
        #for apo in [True, False]:
            
            # Load initial PDB
            pdbpath = glob('%sproduction/%s/%s/rep_*/structure.pdb'%(resultspath,pdbcode,mutant))[0]
            prodpath = '%sproduction/%s/%s/'%(resultspath,pdbcode,mutant)
            modelname = os.path.basename(os.path.splitext(pdbpath)[0])
            modelname = pdbcode
            mymol = Molecule(pdbpath)
            print('Submitting '+modelname+' simulation...')

            ## Step -2: Get information of protein chains and ligand molecules from PDB web
            (protdict,ligdict,segtochain,method_id) = get_pdb_info(pdbcode, mymol, ligandsdict)

            ## Step -1: Login into GPCRmd
            with requests.Session() as s:
                login(s)

            ## Step 0: New submission (temporaly commented to avoid saturating GPCRmd with trashy new submissions)
#             response_new = s.get(mainurl + '/dynadb/db_inputform/')
            soup = BeautifulSoup(response_new.text, 'html.parser')
            step1_link = soup.find('a',attrs={'id' : 'selection-button'}).get('href')
            subm_id = step1_link.split('/')[-2]
            print(subm_id)
            
            ## Step 1: Protein information
            (newprotdict,recname) = submission_step1(subm_id,s,protdict, mymol)
            
            ## Step 2: Introduce small molecules 
            print('initiating step 2: small molecule data')

            # Introduce common small molecules (waters, lipids and ions)
            i = 0
            smol_dict = dict()
            common_mols = [
                ('TIP3', 'XLYOFNOQVPJJNP-UHFFFAOYSA-N'),
                ('POPC', 'WTJKGGKOPKCXLL-VYOBOKEXSA-N'),
                ('SOD', 'FKNQFGJONOIPTF-UHFFFAOYSA-N'),
                ('CLA', 'VEXZGXHMUGYJMC-UHFFFAOYSA-M')
            ]
            for smol,smol_key in common_mols:
                sdfpath = basepath+'smalmol_sdfs/'+smol+'.sdf'
                (i, smol_dict) = smol_submission(s, i, subm_id, sdfpath, smol, smol_key, smol_dict, mymol)

            # Introduce ligands: all will be defined as orthosteric ligands
            lignames = []
            for lig in ligdict:
                # Skip this if molecule is apoform
                if apo:
                    break
                # Avoid blacklisted molecules or cholesterol or ion
                if (lig in detergent_blacklist) or (lig in glucids_blacklist) or (len(lig) == 2):
                    continue
                # Download ligand, store it into temporary file
                response = requests.get('https://files.rcsb.org/ligands/view/'+lig+'_ideal.sdf')
                with open('tmpfile.sdf','wb') as tmpout:
                    tmpout.write(response.content)

                # Send molecule
                smol_key = ligdict[lig][1]
                (i, smol_dict, ligname) = smol_submission(s, i, subm_id, 'tmpfile.sdf', lig, smol_key, smol_dict, mymol)
                if lig != 'CLR':
                    lignames.append(ligname)
                os.remove('tmpfile.sdf')

            # Check for peptide ligands
            for lig in gpcrdb_dict[pdbcode]['ligands']:
                if (lig['type'] in {'protein', 'peptide'}) and not apo:
                    lignames.append(lig['name'])
        
            ## Step 3: Crystalized components information
            submission_step3(s, subm_id, pdbpath, recname, lignames, newprotdict, smol_dict, segtochain, method_id, ", mutant "+mutant)
            
            ## Step 4: Dynamics information
            submission_step4(s, subm_id, modelname, newprotdict, smol_dict, timestep, trajperiod, prodpath)


2021-06-10 11:57:08,152 - moleculekit.readers - INFO - Attempting PDB query for 6NBF


Submitting 6NBF simulation...




141
initiating step 1: protein data




Parathyroid hormone/parathyroid hormone-related peptide receptor identified as GPCR




initiating step 2: small molecule data
submitting small molecule TIP3
submitting small molecule POPC
Chemblid not avalible for molecule 1-Palmitoyl-2-oleoyl-sn-glycero-3-phosphocholine
submitting small molecule SOD
Chemblid not avalible for molecule Sodium ion
submitting small molecule CLA
Chemblid not avalible for molecule Chloride ion
submitting small molecule CLR
initiating step 3: crystalized components
sysname:  Parathyroid hormone/parathyroid hormone-related peptide receptor in complex with Long-acting parathyroid hormone analog, mutant ML644-7
initiating step 4: simulation information
uploading trajs


AttributeError: 'NoneType' object has no attribute 'get'