In [1]:
import sys

sys.path.append('/projects/metalsitenn/pdbx')

from metalsitenn.placer_modules.cifutils import CIFParser, mutate_chain

from metalsitenn.utils import visualize_metal_site_3d, visualize_chain_3d
from metalsitenn.dataloading import MetalSiteDataset

from metalsitenn.featurizer import MetalSiteFeaturizer
from metalsitenn.utils import visualize_featurized_metal_site_3d
import pandas as pd
import numpy as np
import torch

## CIF parser for getting metal sites

In [2]:
# Instantiate the project's CIF parser; the same object is used below for both
# parsing the file and extracting metal sites.
parser = CIFParser()

In [3]:
# Parse the mmCIF file for entry 6fpw; the result is unpacked in the next cell.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
parsed_data = parser.parse('/datasets/alphafold_data/data_v2/pdb_mmcif/mmcif_files/6fpw.cif')

In [4]:
# Unpack the four components returned by parser.parse().
chains, assemblies, covalent_bonds, metadata = parsed_data

In [5]:
# List the chain identifiers found in the parsed entry.
chains.keys()

dict_keys(['A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'AA'])

In [6]:
# Inspect chain A's residues: a dict keyed by residue-number string, each value
# a Residue holding its atoms.
# NOTE(review): the atoms of residue '1' print with zeroed xyz/occ/bfac —
# presumably not resolved in the structure; TODO confirm.
chains['A'].residues

{'1': Residue(name='LEU', atoms={'N': Atom(name='N', xyz=[0.0, 0.0, 0.0], occ=0.0, bfac=0.0, leaving=False, leaving_group=['H2'], parent='CA', element=7, metal=False, charge=1, hyb=3, nhyd=3, hvydeg=1, align=1, hetero=False), 'CA': Atom(name='CA', xyz=[0.0, 0.0, 0.0], occ=0.0, bfac=0.0, leaving=False, leaving_group=[], parent='CB', element=6, metal=False, charge=0, hyb=3, nhyd=1, hvydeg=3, align=1, hetero=False), 'C': Atom(name='C', xyz=[0.0, 0.0, 0.0], occ=0.0, bfac=0.0, leaving=False, leaving_group=['HXT', 'OXT'], parent='OXT', element=6, metal=False, charge=0, hyb=2, nhyd=0, hvydeg=3, align=1, hetero=False), 'O': Atom(name='O', xyz=[0.0, 0.0, 0.0], occ=0.0, bfac=0.0, leaving=False, leaving_group=[], parent='C', element=8, metal=False, charge=0, hyb=2, nhyd=0, hvydeg=1, align=1, hetero=False), 'CB': Atom(name='CB', xyz=[0.0, 0.0, 0.0], occ=0.0, bfac=0.0, leaving=False, leaving_group=[], parent='CG', element=6, metal=False, charge=0, hyb=3, nhyd=2, hvydeg=2, align=1, hetero=False), 'C

In [7]:
# Extract the metal-binding sites from the parsed entry.
# NOTE(review): the distance-like thresholds are presumably in Å — confirm
# against CIFParser.get_metal_sites.
sites = parser.get_metal_sites(
    parsed_data,
    max_atoms_per_site=500,
    max_water_bfactor=15,
    merge_threshold=6,
    cutoff_distance=6,
    backbone_treatment='free',
)

In [8]:
# Use the site at index 1 as the worked example for the cells below.
site = sites[1]

In [9]:
# Each site is a mapping; 'site_chain' holds the site as a single chain object
# whose residues / atoms / planars / chirals are inspected below.
site_chain = site['site_chain']

In [10]:
# Residues of the site, keyed by residue-number string. Unlike the raw chain
# above, atom names here are (chain, residue number, residue name, atom name) tuples.
site_chain.residues

{'1': Residue(name='GLY', atoms={'N': Atom(name=('A', '1', 'GLY', 'N'), xyz=[-19.936, 7.408, 17.572], occ=1.0, bfac=8.54, leaving=False, leaving_group=['H2'], parent='CA', element=7, metal=False, charge=0, hyb=2, nhyd=1, hvydeg=1, align=1, hetero=False), 'CA': Atom(name=('A', '1', 'GLY', 'CA'), xyz=[-19.452, 6.122, 17.104], occ=1.0, bfac=9.08, leaving=False, leaving_group=[], parent='C', element=6, metal=False, charge=0, hyb=3, nhyd=2, hvydeg=2, align=1, hetero=False), 'C': Atom(name=('A', '1', 'GLY', 'C'), xyz=[-18.415, 5.592, 18.061], occ=1.0, bfac=7.63, leaving=False, leaving_group=['HXT', 'OXT'], parent='OXT', element=6, metal=False, charge=0, hyb=2, nhyd=0, hvydeg=3, align=1, hetero=False), 'O': Atom(name=('A', '1', 'GLY', 'O'), xyz=[-17.736, 6.358, 18.754], occ=1.0, bfac=8.77, leaving=False, leaving_group=[], parent='C', element=8, metal=False, charge=0, hyb=2, nhyd=0, hvydeg=1, align=1, hetero=False)}, bonds=[Bond(a=('A', '1', 'GLY', 'N'), b=('A', '1', 'GLY', 'CA'), aromatic=Fal

In [11]:
# Flat atom lookup for the site, keyed by
# (chain, residue number, residue name, atom name) tuples.
site_chain.atoms

{('A',
  '1',
  'GLY',
  'N'): Atom(name=('A', '1', 'GLY', 'N'), xyz=[-19.936, 7.408, 17.572], occ=1.0, bfac=8.54, leaving=False, leaving_group=['H2'], parent='CA', element=7, metal=False, charge=0, hyb=2, nhyd=1, hvydeg=1, align=1, hetero=False),
 ('A',
  '1',
  'GLY',
  'CA'): Atom(name=('A', '1', 'GLY', 'CA'), xyz=[-19.452, 6.122, 17.104], occ=1.0, bfac=9.08, leaving=False, leaving_group=[], parent='C', element=6, metal=False, charge=0, hyb=3, nhyd=2, hvydeg=2, align=1, hetero=False),
 ('A',
  '1',
  'GLY',
  'C'): Atom(name=('A', '1', 'GLY', 'C'), xyz=[-18.415, 5.592, 18.061], occ=1.0, bfac=7.63, leaving=False, leaving_group=['HXT', 'OXT'], parent='OXT', element=6, metal=False, charge=0, hyb=2, nhyd=0, hvydeg=3, align=1, hetero=False),
 ('A',
  '1',
  'GLY',
  'O'): Atom(name=('A', '1', 'GLY', 'O'), xyz=[-17.736, 6.358, 18.754], occ=1.0, bfac=8.77, leaving=False, leaving_group=[], parent='C', element=8, metal=False, charge=0, hyb=2, nhyd=0, hvydeg=1, align=1, hetero=False),
 ('A',


In [12]:
# Groups of four atom keys each; judging by the attribute name these describe
# planar centres — TODO confirm exact semantics.
site_chain.planars

[[('A', '6', 'ASN', 'CG'),
  ('A', '6', 'ASN', 'CB'),
  ('A', '6', 'ASN', 'OD1'),
  ('A', '6', 'ASN', 'ND2')],
 [('A', '8', 'TRP', 'CG'),
  ('A', '8', 'TRP', 'CB'),
  ('A', '8', 'TRP', 'CD1'),
  ('A', '8', 'TRP', 'CD2')],
 [('A', '8', 'TRP', 'CD2'),
  ('A', '8', 'TRP', 'CG'),
  ('A', '8', 'TRP', 'CE2'),
  ('A', '8', 'TRP', 'CE3')],
 [('A', '8', 'TRP', 'CE2'),
  ('A', '8', 'TRP', 'CD2'),
  ('A', '8', 'TRP', 'NE1'),
  ('A', '8', 'TRP', 'CZ2')],
 [('A', '9', 'PHE', 'CG'),
  ('A', '9', 'PHE', 'CB'),
  ('A', '9', 'PHE', 'CD1'),
  ('A', '9', 'PHE', 'CD2')],
 [('A', '19', 'F3S', 'FE1'),
  ('A', '19', 'F3S', 'S1'),
  ('A', '19', 'F3S', 'S2'),
  ('A', '19', 'F3S', 'S3')],
 [('A', '19', 'F3S', 'FE3'),
  ('A', '19', 'F3S', 'S1'),
  ('A', '19', 'F3S', 'S3'),
  ('A', '19', 'F3S', 'S4')],
 [('A', '19', 'F3S', 'FE4'),
  ('A', '19', 'F3S', 'S2'),
  ('A', '19', 'F3S', 'S3'),
  ('A', '19', 'F3S', 'S4')]]

In [13]:
# Groups of four atom keys each; judging by the attribute name these describe
# chiral centres — TODO confirm exact semantics and atom ordering.
site_chain.chirals

[[('A', '2', 'CYS', 'CA'),
  ('A', '2', 'CYS', 'N'),
  ('A', '2', 'CYS', 'CB'),
  ('A', '2', 'CYS', 'C')],
 [('A', '3', 'PRO', 'CA'),
  ('A', '3', 'PRO', 'N'),
  ('A', '3', 'PRO', 'CB'),
  ('A', '3', 'PRO', 'C')],
 [('A', '4', 'ILE', 'CA'),
  ('A', '4', 'ILE', 'N'),
  ('A', '4', 'ILE', 'CB'),
  ('A', '4', 'ILE', 'C')],
 [('A', '4', 'ILE', 'CB'),
  ('A', '4', 'ILE', 'CA'),
  ('A', '4', 'ILE', 'CG2'),
  ('A', '4', 'ILE', 'CG1')],
 [('A', '5', 'THR', 'CA'),
  ('A', '5', 'THR', 'N'),
  ('A', '5', 'THR', 'CB'),
  ('A', '5', 'THR', 'C')],
 [('A', '5', 'THR', 'CB'),
  ('A', '5', 'THR', 'CA'),
  ('A', '5', 'THR', 'CG2'),
  ('A', '5', 'THR', 'OG1')],
 [('A', '6', 'ASN', 'CA'),
  ('A', '6', 'ASN', 'N'),
  ('A', '6', 'ASN', 'CB'),
  ('A', '6', 'ASN', 'C')],
 [('A', '7', 'CYS', 'CA'),
  ('A', '7', 'CYS', 'N'),
  ('A', '7', 'CYS', 'CB'),
  ('A', '7', 'CYS', 'C')],
 [('A', '8', 'TRP', 'CA'),
  ('A', '8', 'TRP', 'N'),
  ('A', '8', 'TRP', 'CB'),
  ('A', '8', 'TRP', 'C')],
 [('A', '9', 'PHE', 'CA'),
  

In [14]:

# Build a 3D viewer for the selected site (takes the full site dict, not just
# the chain) and display it.
viewer = visualize_metal_site_3d(site)
viewer.show()

## Dataset

Note: the dataset parser was run beforehand as a batch job.
See stage 1.1_parse_sites_metadata.

Here we are loading from the cache

In [15]:
# Open the dataset from the on-disk cache written by stage 1.1; no filters are
# applied at this point.
ds = MetalSiteDataset(cache_folder='../data/1/1.1_parse_sites_metadata')

In [16]:
# Metadata table covering every cached site, before any filtering.
df = ds.get_all_metadata()
df

Unnamed: 0,pdb_code,site_name,site_idx,n_entities,n_atoms,n_bonds,metal,n_metals,n_waters,n_organic_ligands,n_metal_ligands,n_amino_acids,n_coordinating_amino_acids,n_nucleotides,non_residue_non_metal_names,n_non_residue_non_metal,coordination_distance
0,6fpw,6fpw_0,0,19,158,150,Fe,4,0,0,1,18,4,0,,0,2.9
1,6fpw,6fpw_1,1,22,138,124,Fe,3,3,0,1,18,3,0,,0,2.9
2,6fpw,6fpw_2,2,29,203,184,Fe,4,2,0,1,26,6,0,,0,2.9
3,6fpw,6fpw_3,3,20,141,123,"Fe,Ni",2,0,0,1,19,4,0,,0,2.9
4,6fpw,6fpw_4,4,19,101,83,Mg,1,6,0,1,12,3,0,,0,2.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131137,1fyu,1fyu_0,0,15,103,93,Mn,1,1,0,2,12,4,0,,0,2.9
131138,1fyu,1fyu_1,1,18,130,117,Ca,1,1,1,2,14,4,0,GAL,1,2.9
131139,1fyu,1fyu_2,2,16,104,93,Mn,1,2,0,2,12,4,0,,0,2.9
131140,1fyu,1fyu_3,3,18,123,110,Ca,1,2,1,2,13,4,0,GAL,1,2.9


In [17]:
# Indexing the dataset yields a (site name, site chain) pair.
# Bind the name to a descriptive variable rather than `_`: in IPython `_` is
# the last-output shortcut, and assigning to it shadows that behaviour for the
# rest of the session.
site_name, site_chain = ds[1]
print(site_name)

6fpw_1


In [18]:
# Show the cached chain's repr. Its coordinates match the site extracted
# directly with the parser earlier in the notebook (site index 1 of 6fpw).
site_chain

Chain(id='A', type='metal_binding_site', sequence='', residues={'1': Residue(name='GLY', atoms={'N': Atom(name=('A', '1', 'GLY', 'N'), xyz=[-19.936, 7.408, 17.572], occ=1.0, bfac=8.54, leaving=False, leaving_group=['H2'], parent='CA', element=7, metal=False, charge=0, hyb=2, nhyd=1, hvydeg=1, align=1, hetero=False), 'CA': Atom(name=('A', '1', 'GLY', 'CA'), xyz=[-19.452, 6.122, 17.104], occ=1.0, bfac=9.08, leaving=False, leaving_group=[], parent='C', element=6, metal=False, charge=0, hyb=3, nhyd=2, hvydeg=2, align=1, hetero=False), 'C': Atom(name=('A', '1', 'GLY', 'C'), xyz=[-18.415, 5.592, 18.061], occ=1.0, bfac=7.63, leaving=False, leaving_group=['OXT', 'HXT'], parent='OXT', element=6, metal=False, charge=0, hyb=2, nhyd=0, hvydeg=3, align=1, hetero=False), 'O': Atom(name=('A', '1', 'GLY', 'O'), xyz=[-17.736, 6.358, 18.754], occ=1.0, bfac=8.77, leaving=False, leaving_group=[], parent='C', element=8, metal=False, charge=0, hyb=2, nhyd=0, hvydeg=1, align=1, hetero=False)}, bonds=[Bond(a=

In [19]:
# Render the cached chain in 3D; the helper returns a py3Dmol view, which the
# notebook displays as the cell result.
visualize_chain_3d(site_chain)

<py3Dmol.view at 0x7f6a4791ee20>

### Try filtering sites ...

Only one metal per site, only one site per PDB

In [27]:
# Re-open the same cache with filters. This rebinds `ds`, so every later cell
# sees the filtered dataset rather than the full one.
ds = MetalSiteDataset(
    cache_folder='../data/1/1.1_parse_sites_metadata',
    max_sites_per_pdb=1,  # keep a single site per PDB entry
    max_metals=1,  # keep only sites containing one metal
)

In [28]:
# Metadata restricted to the sites that pass the filters configured on `ds`.
# This rebinds `df`, replacing the unfiltered table from earlier.
df = ds.get_filtered_metadata()
df

Unnamed: 0,pdb_code,site_name,site_idx,n_entities,n_atoms,n_bonds,metal,n_metals,n_waters,n_organic_ligands,n_metal_ligands,n_amino_acids,n_coordinating_amino_acids,n_nucleotides,non_residue_non_metal_names,n_non_residue_non_metal,coordination_distance
15,1foi,1foi_0,0,13,85,72,Zn,1,0,0,1,12,4,0,,0,2.9
23,2c2j,2c2j_0,0,11,71,60,Mg,1,0,0,1,10,3,0,,0,2.9
26,6bpv,6bpv_0,0,13,106,99,Fe,1,0,1,1,11,4,0,F2Y,1,2.9
39,6tgt,6tgt_0,0,11,71,60,Ca,1,0,0,1,10,3,0,,0,2.9
45,1rrk,1rrk_0,0,13,91,79,Co,1,0,0,1,12,3,0,,0,2.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131075,2ghc,2ghc_0,0,15,99,87,Na,1,2,0,1,12,3,0,,0,2.9
131094,4b0a,4b0a_0,0,9,69,62,Ca,1,0,0,1,8,3,0,,0,2.9
131100,4rvf,4rvf_0,0,13,92,81,Zn,1,0,0,1,12,4,0,,0,2.9
131103,1tqs,1tqs_0,0,14,116,108,Zn,1,2,1,1,10,4,0,SSO,1,2.9


In [29]:
# Re-display the filtered metadata (same frame as the previous cell).
df

Unnamed: 0,pdb_code,site_name,site_idx,n_entities,n_atoms,n_bonds,metal,n_metals,n_waters,n_organic_ligands,n_metal_ligands,n_amino_acids,n_coordinating_amino_acids,n_nucleotides,non_residue_non_metal_names,n_non_residue_non_metal,coordination_distance
15,1foi,1foi_0,0,13,85,72,Zn,1,0,0,1,12,4,0,,0,2.9
23,2c2j,2c2j_0,0,11,71,60,Mg,1,0,0,1,10,3,0,,0,2.9
26,6bpv,6bpv_0,0,13,106,99,Fe,1,0,1,1,11,4,0,F2Y,1,2.9
39,6tgt,6tgt_0,0,11,71,60,Ca,1,0,0,1,10,3,0,,0,2.9
45,1rrk,1rrk_0,0,13,91,79,Co,1,0,0,1,12,3,0,,0,2.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
131075,2ghc,2ghc_0,0,15,99,87,Na,1,2,0,1,12,3,0,,0,2.9
131094,4b0a,4b0a_0,0,9,69,62,Ca,1,0,0,1,8,3,0,,0,2.9
131100,4rvf,4rvf_0,0,13,92,81,Zn,1,0,0,1,12,4,0,,0,2.9
131103,1tqs,1tqs_0,0,14,116,108,Zn,1,2,1,1,10,4,0,SSO,1,2.9


In [23]:
# Visualize the last site in the dataset; ds[i] is a (site name, chain) pair,
# so [1] selects the chain.
# NOTE(review): this cell's execution count (23) precedes the filter cells
# above (27-29), so the stored output may come from an earlier `ds`; re-run the
# notebook top to bottom to confirm.
visualize_chain_3d(ds[-1][1])

<py3Dmol.view at 0x7f776ce94520>

Also only cobalt and nickel, no organic ligands

In [24]:
# Tighten the filters further: single-metal cobalt/nickel sites with no organic
# ligands. Rebinds both `ds` and `df`.
ds = MetalSiteDataset(
    cache_folder='../data/1/1.1_parse_sites_metadata',
    max_sites_per_pdb=1,  # keep a single site per PDB entry
    max_metals=1,  # keep only sites containing one metal
    max_organic_ligands=0,  # exclude sites with any organic ligand
    valid_metals=['Co', 'Ni'],  # restrict to cobalt and nickel
)
df = ds.get_filtered_metadata()
df

Unnamed: 0,pdb_code,site_name,site_idx,n_entities,n_atoms,n_bonds,metal,n_metals,n_waters,n_organic_ligands,n_metal_ligands,n_amino_acids,n_coordinating_amino_acids,n_nucleotides,non_residue_non_metal_names,n_non_residue_non_metal,coordination_distance
45,1rrk,1rrk_0,0,13,91,79,Co,1,0,0,1,12,3,0,,0,2.9
239,4fca,4fca_0,0,7,50,45,Ni,1,0,0,1,6,3,0,,0,2.9
1626,7b9b,7b9b_0,0,8,65,62,Ni,1,0,0,1,7,4,0,,0,2.9
1817,2ga0,2ga0_0,0,6,44,39,Ni,1,0,0,1,5,3,0,,0,2.9
2077,2w3t,2w3t_0,0,15,89,77,Ni,1,3,0,1,11,3,0,,0,2.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
127654,5x57,5x57_0,0,12,94,84,Ni,1,0,0,1,11,4,0,,0,2.9
129364,4q3y,4q3y_0,0,40,295,263,Co,1,3,0,1,36,3,0,,0,2.9
129504,1xu2,1xu2_0,0,8,67,62,Ni,1,0,0,1,7,3,0,,0,2.9
129765,6h1x,6h1x_0,0,5,37,35,Co,1,0,0,1,4,3,0,,0,2.9


In [25]:
# Visualize the first site of the Co/Ni-filtered dataset; ds[i] is a
# (site name, chain) pair, so [1] selects the chain.
visualize_chain_3d(ds[0][1])

<py3Dmol.view at 0x7f77aea26670>