In [1]:
%load_ext autoreload
%autoreload 2

In [6]:
from molfeat.store import ModelInfo
from molfeat.store import ModelStore

In [3]:
store = ModelStore()


In [20]:
avalon = {'name': 'avalon',
 'inputs': 'smiles',
 'type': 'hashed',
 'version': 0,
 'group': 'rdkit',
 'submitter': 'Datamol',
 'description': 'Similar to Daylight fingerprints, Avalon uses a fingerprint generator that enumerates certain paths and feature classes of the molecular graph.  The fingerprint bit positions are hashed from the description of the feature; however, the hash codes for all the path-style features are computed implicitly while they are enumerated.',
 'representation': 'vector',
 'require_3D': False,
 'tags': ['avalon', 'hashed', '2D', 'binary', 'rdkit', 'folded'],
 'authors': ['Peter Gedeck', 'Bernhard Rohde', 'Christian Bartels'],
 'reference': 'https://doi.org/10.1021/ci050413p'
}

ecfp = {'name': 'ecfp',
 'inputs': 'smiles',
 'type': 'hashed',
 'version': 0,
 'group': 'rdkit',
 'submitter': 'Datamol',
 'description': 'Extended-connectivity fingerprints (ECFPs) are a family of circular fingerprints that are commonly used for the measure of molecular similarity. They are based on the connectivity of atoms in molecular graphs.',
 'representation': 'vector',
 'require_3D': False,
 'tags': ['fixed', 'morgan', '2D', 'binary', 'rdkit','ecfp', 'folded'],
 'authors': ['David Rogers', 'Mathew Hahn'],
 'reference': 'https://doi.org/10.1021/ci100050t',
}

fcfp = {'name': 'fcfp',
 'inputs': 'smiles',
 'type': 'hashed',
 'version': 0,
 'group': 'rdkit',
 'submitter': 'Datamol',
 'description': 'Functional-class fingerprints (FCFPs) are an extension of ECFPs which incorporate information about the functional classes of atoms in a molecule. FCFPs are intended to capture more abstract property-based substructural features and leverage atomic characteristics that relate more to pharmacophoric features (e.g. hydrogen donor/acceptor, polarity, aromaticity, etc.).',
 'representation': 'vector',
 'require_3D': False,
 'tags': ['functional', 'fcfp', '2D', 'binary', 'rdkit','pharmacophore', 'folded'],
 'authors': ['David Rogers', 'Mathew Hahn'],
 'reference': 'https://doi.org/10.1021/ci100050t',
}

topological = {'name': 'topological',
 'inputs': 'smiles',
 'type': 'hashed',
 'version': 0,
 'group': 'rdkit',
 'submitter': 'Datamol',
 'description': 'Topological torsion fingerprints are a type of molecular fingerprint that represents the topological features of a molecule based on its graph representation. They are generated by computing the frequencies of all possible molecular torsions in a molecule and then encoding them as a binary vector.',
 'representation': 'vector',
 'require_3D': False,
 'tags': ['graph', 'topological', 'torsion', 'rdkit', 'binary', 'folded'],
 'authors': ['Ramaswamy Nilakantan', 'Norman Bauman', 'J. Scott Dixon', 'R. Venkataraghavan'],
 'reference': 'https://doi.org/10.1021/ci00054a008',
}

atompair = {'name': 'topological',
 'inputs': 'smiles',
 'type': 'hashed',
 'version': 0,
 'group': 'rdkit',
 'submitter': 'Datamol',
 'description': 'Topological torsion fingerprints are a type of molecular fingerprint that represents the topological features of a molecule based on its graph representation. They are generated by computing the frequencies of all possible molecular torsions in a molecule and then encoding them as a binary vector.',
 'representation': 'vector',
 'require_3D': False,
 'tags': ['graph', 'topological', 'torsion', 'rdkit', 'binary', 'folded'],
 'authors': ['Ramaswamy Nilakantan', 'Norman Bauman', 'J. Scott Dixon', 'R. Venkataraghavan'],
 'reference': 'https://doi.org/10.1021/ci00054a008',
}

rdkit = {'name': 'rdkit',
 'inputs': 'smiles',
 'type': 'hashed',
 'version': 0,
 'group': 'rdkit',
 'submitter': 'Datamol',
 'description': 'This is an RDKit-specific fingerprint that is inspired by (though it differs significantly from) public descriptions of the Daylight fingerprint. The fingerprinting algorithm identifies all subgraphs in the molecule within a particular range of sizes, hashes each subgraph to generate a raw bit ID, that is then folded into the requested fingerprint size as binary vectors. Options are available to generate count-based forms of the fingerprint or “non-folded” forms (using a sparse representation).',
 'representation': 'vector',
 'require_3D': False,
 'tags': ['fingerprints', 'rdkit', 'binary', 'folded', 'daylight'],
 'authors': ['RDKit'],
 'reference': 'https://www.rdkit.org/docs/RDKit_Book.html#rdkit-fingerprints'
}


pattern = {'name': 'pattern',
 'inputs': 'smiles',
 'type': 'hashed',
 'version': 0,
 'group': 'rdkit',
 'submitter': 'Datamol',
 'description': 'Pattern fingerprints were designed to be used in substructure screening. The algorithm identifies features in the molecule by doing substructure searches using a small number of very generic SMARTS patterns and then hashing each occurrence of a pattern based on the atom and bond types involved. The fact that a particular pattern matched the molecule at all is also stored by hashing the pattern ID and size.',
 'representation': 'vector',
 'require_3D': False,
 'tags': ['pattern', 'fingerprints', 'rdkit', 'binary', 'predefined', 'substructures'],
 'authors': ['RDKit'],
 'reference': 'https://www.rdkit.org/docs/RDKit_Book.html#pattern-fingerprints',
}

layered = {'name': 'pattern',
 'inputs': 'smiles',
 'type': 'hashed',
 'version': 0,
 'group': 'rdkit',
 'submitter': 'Datamol',
 'description': 'Pattern fingerprints were designed to be used in substructure screening. The algorithm identifies features in the molecule by doing substructure searches using a small number of very generic SMARTS patterns and then hashing each occurrence of a pattern based on the atom and bond types involved. The fact that a particular pattern matched the molecule at all is also stored by hashing the pattern ID and size.',
 'representation': 'vector',
 'require_3D': False,
 'tags': ['pattern', 'fingerprints', 'rdkit', 'binary', 'predefined', 'substructures'],
 'authors': ['RDKit'],
 'reference': 'https://www.rdkit.org/docs/RDKit_Book.html#pattern-fingerprints',
}

map4 = {'name': 'map4',
 'inputs': 'smiles',
 'type': 'hashed',
 'version': 0,
 'group': 'fp',
 'submitter': 'Datamol',
 'description': 'MinHashed atom-pair fingerprint up to a diameter of four bonds (MAP4) is suitable for both small and large molecules by combining substructure and atom-pair concepts. In this fingerprint the circular substructures with radii of r = 1 and r = 2 bonds around each atom in an atom-pair are written as two pairs of SMILES, each pair being combined with the topological distance separating the two central atoms. These so-called atom-pair molecular shingles are hashed, and the resulting set of hashes is MinHashed to form the MAP4 fingerprint.',
 'representation': 'vector',
 'require_3D': False,
 'tags': ['minhashed', 'map4', 'atompair', 'substructure', 'morgan'],
 'authors': ['Alice Capecchi', 'Daniel Probst', 'Jean-Louis Reymond'],
 'reference': 'https://doi.org/10.1186/s13321-020-00445-4'
}

secfp = {'name': 'secfp',
 'inputs': 'smiles',
 'type': 'hashed',
 'version': 0,
 'group': 'fp',
 'submitter': 'Datamol',
 'description': 'SMILES extended connectivity fingerprint (SECFP), is a fingerprint variant on MinHash fingerprints (MHFPs) SMILES-based circular substructure hashing scheme, folded by the same modulo 𝑛 operation that is used by ECFP.',
 'representation': 'vector',
 'require_3D': False,
 'tags': ['minhashed', 'smiles', 'ecfp', 'secfp', 'mhfp', 'mhfp6'],
 'authors': ['Daniel Probst', 'Jean-Louis Reymond'],
 'reference': 'https://doi.org/10.1186/s13321-018-0321-8',
}


erg = {'name': 'erg',
 'inputs': 'smiles',
 'type': 'hand-crafted',
 'version': 0,
 'group': 'rdkit',
 'submitter': 'Datamol',
 'description': 'Extended Reduced Graph approach (ErG) describes a molecular structure by defining its pharmacophoric points and the topological distance between them. It uses a pairwise combination of pharmacophores and their distance to set a corresponding bit in a vector. The ErG fingerprint implements fuzzy incrementation, which favours retrieval of actives with different core structures (scaffold hopping).',
 'representation': 'vector',
 'require_3D': False,
 'tags': ['2D', 'pharmacophore', 'erg', 'graph', 'rdkit'],
 'authors': ['Nikolaus Stiefl', 'Ian A Watson', 'Knut Baumann', 'Andrea Zaliani'],
 'reference': 'https://doi.org/10.1021/ci050457y',
}

estate = {'name': 'estate',
 'inputs': 'smiles',
 'type': 'hand-crafted',
 'version': 0,
 'group': 'rdkit',
 'submitter': 'Datamol',
 'description': 'Electrotopological state (Estate) indices are numerical values computed for each atom in a molecule, and which encode information about both the topological environment of that atom and the electronic interactions due to all other atoms in the molecule.',
 'representation': 'vector',
 'require_3D': False,
 'tags': ['electrotopological', 'electronic', 'interactions', 'estate', 'rdkit'],
 'authors': ['Lemont B. Kier', 'Lowell H. Hall'],
 'reference': 'https://doi.org/10.1023/A:1015952613760',
}

avalon_count = {'name': 'estate',
 'inputs': 'smiles',
 'type': 'hand-crafted',
 'version': 0,
 'group': 'rdkit',
 'submitter': 'Datamol',
 'description': 'Electrotopological state (Estate) indices are numerical values computed for each atom in a molecule, and which encode information about both the topological environment of that atom and the electronic interactions due to all other atoms in the molecule.',
 'representation': 'vector',
 'require_3D': False,
 'tags': ['electrotopological', 'electronic', 'interactions', 'estate', 'rdkit'],
 'authors': ['Lemont B. Kier', 'Lowell H. Hall'],
 'reference': 'https://doi.org/10.1023/A:1015952613760',
}


rdkit_count = {'name': 'estate',
 'inputs': 'smiles',
 'type': 'hand-crafted',
 'version': 0,
 'group': 'rdkit',
 'submitter': 'Datamol',
 'description': 'Electrotopological state (Estate) indices are numerical values computed for each atom in a molecule, and which encode information about both the topological environment of that atom and the electronic interactions due to all other atoms in the molecule.',
 'representation': 'vector',
 'require_3D': False,
 'tags': ['electrotopological', 'electronic', 'interactions', 'estate', 'rdkit'],
 'authors': ['Lemont B. Kier', 'Lowell H. Hall'],
 'reference': 'https://doi.org/10.1023/A:1015952613760',
}


ecfp_count = {'name': 'ecfp-count',
 'inputs': 'smiles',
 'type': 'count',
 'version': 0,
 'group': 'rdkit',
 'submitter': 'Datamol',
 'description': 'The ECFP-Count (Extended Connectivity Fingerprints-Coun is essentially the same as the ECFP. However, instead of being hashed into a binary vector, there is no hashing process and simply a count vector is returned',
 'representation': 'vector',
 'require_3D': False,
 'tags': ['fixed', 'morgan', '2D', 'rdkit', 'ecfpcount', 'vector'],
 'authors': ['David Rogers', 'Mathew Hahn'],
 'reference': 'https://doi.org/10.1021/ci100050t',
}

fcfp_count = {'name': 'fcfp-count',
 'inputs': 'smiles',
 'type': 'count',
 'version': 0,
 'group': 'rdkit',
 'submitter': 'Datamol',
 'description': 'The FCFP-Count (Functional Class Fingerprints-Count) is essentially the same as the FCFP. However, instead of being hashed into a binary vector, there is no hashing process and simply a count vector is returned',
 'representation': 'vector',
 'require_3D': False,
 'tags': ['functional', 'fcfpcount', '2D', 'rdkit', 'pharmacophore'],
 'authors': ['David Rogers', 'Mathew Hahn'],
 'reference': 'https://doi.org/10.1021/ci100050t',
}

topological_count = {'name': 'topological-count',
 'inputs': 'smiles',
 'type': 'count',
 'version': 0,
 'group': 'rdkit',
 'submitter': 'Datamol',
 'description': 'The Topological-Count fingerprint is essentially the same as the Topological fingerprint. However, instead of being hashed into a binary vector, there is no hashing process and simply a count vector is returned',
 'representation': 'vector',
 'require_3D': False,
 'tags': ['graph', 'topologicalcount', 'torsion', 'rdkit', 'vector'],
 'authors': ['Ramaswamy Nilakantan', 'Norman Bauman', 'J. Scott Dixon', 'R. Venkataraghavan'],
 'reference': 'https://doi.org/10.1021/ci00054a008',
}

atom_pair_count = {'name': 'atompair-count',
 'inputs': 'smiles',
 'type': 'count',
 'version': 0,
 'group': 'rdkit',
 'submitter': 'Datamol',
 'description': 'The Atompair-Count fingerprint is essentially the same as the atompair fingerprint. However, instead of being hashed into a binary vector, there is no hashing process and simply a count vector is returned',
 'representation': 'vector',
 'require_3D': False,
 'tags': ['atompaircount', 'interactions', 'frequency', 'rdkit', 'vector'],
 'authors': ['Raymond E. Carhart', 'Dennis H. Smith', 'R. Venkataraghavan'],
 'reference': 'https://doi.org/10.1021/ci00046a002',
}



In [21]:
model_infos = [avalon, ecfp, fcfp, topological, atompair, rdkit, pattern, layered, map4, secfp, erg, estate, avalon_count, rdkit_count, ecfp_count, fcfp_count, topological_count, atom_pair_count]

In [22]:
for model_info in model_infos:
    card = ModelInfo(**model_info)
    store.register(card, model=None, force=True)

  0%|          | 0.00/4.00 [00:00<?, ?B/s]

2023-02-16 10:28:46.948 | INFO     | molfeat.store.modelstore:register:126 - Successfuly registered model avalon !


  0%|          | 0.00/4.00 [00:00<?, ?B/s]

2023-02-16 10:28:50.103 | INFO     | molfeat.store.modelstore:register:126 - Successfuly registered model ecfp !


  0%|          | 0.00/4.00 [00:00<?, ?B/s]

2023-02-16 10:28:53.506 | INFO     | molfeat.store.modelstore:register:126 - Successfuly registered model fcfp !


  0%|          | 0.00/4.00 [00:00<?, ?B/s]

2023-02-16 10:28:56.505 | INFO     | molfeat.store.modelstore:register:126 - Successfuly registered model topological !


  0%|          | 0.00/4.00 [00:00<?, ?B/s]

2023-02-16 10:28:59.608 | INFO     | molfeat.store.modelstore:register:126 - Successfuly registered model topological !


  0%|          | 0.00/4.00 [00:00<?, ?B/s]

2023-02-16 10:29:02.642 | INFO     | molfeat.store.modelstore:register:126 - Successfuly registered model rdkit !


  0%|          | 0.00/4.00 [00:00<?, ?B/s]

2023-02-16 10:29:05.565 | INFO     | molfeat.store.modelstore:register:126 - Successfuly registered model pattern !


  0%|          | 0.00/4.00 [00:00<?, ?B/s]

2023-02-16 10:29:08.538 | INFO     | molfeat.store.modelstore:register:126 - Successfuly registered model pattern !


  0%|          | 0.00/4.00 [00:00<?, ?B/s]

2023-02-16 10:29:11.463 | INFO     | molfeat.store.modelstore:register:126 - Successfuly registered model map4 !


  0%|          | 0.00/4.00 [00:00<?, ?B/s]

2023-02-16 10:29:14.306 | INFO     | molfeat.store.modelstore:register:126 - Successfuly registered model secfp !


  0%|          | 0.00/4.00 [00:00<?, ?B/s]

2023-02-16 10:29:17.374 | INFO     | molfeat.store.modelstore:register:126 - Successfuly registered model erg !


  0%|          | 0.00/4.00 [00:00<?, ?B/s]

2023-02-16 10:29:21.309 | INFO     | molfeat.store.modelstore:register:126 - Successfuly registered model estate !


  0%|          | 0.00/4.00 [00:00<?, ?B/s]

2023-02-16 10:29:24.835 | INFO     | molfeat.store.modelstore:register:126 - Successfuly registered model estate !


  0%|          | 0.00/4.00 [00:00<?, ?B/s]

2023-02-16 10:29:27.786 | INFO     | molfeat.store.modelstore:register:126 - Successfuly registered model estate !


  0%|          | 0.00/4.00 [00:00<?, ?B/s]

2023-02-16 10:29:30.813 | INFO     | molfeat.store.modelstore:register:126 - Successfuly registered model ecfp-count !


  0%|          | 0.00/4.00 [00:00<?, ?B/s]

2023-02-16 10:29:33.827 | INFO     | molfeat.store.modelstore:register:126 - Successfuly registered model fcfp-count !


  0%|          | 0.00/4.00 [00:00<?, ?B/s]

2023-02-16 10:29:36.952 | INFO     | molfeat.store.modelstore:register:126 - Successfuly registered model topological-count !


  0%|          | 0.00/4.00 [00:00<?, ?B/s]

2023-02-16 10:29:40.075 | INFO     | molfeat.store.modelstore:register:126 - Successfuly registered model atompair-count !
