# Building Blocks

> Building block assembly related functions

In [None]:
#| default_exp building_blocks

In [None]:
#| hide
from nbdev.showdoc import *
%load_ext autoreload
%autoreload 2

In [None]:
#| export
import chem_templates
from chem_templates.imports import *
from chem_templates.utils import *
from chem_templates.chem import Molecule, to_mol, canon_smile, to_smile
from rdkit.Chem import rdChemReactions
from rdkit import Chem

# Directory (inside the installed `chem_templates` package) that holds the
# building-block schema JSON files loaded further down in this module.
BB_PATH = chem_templates.__path__[0] + '/building_block_schemas'

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
#| export

def check_mapped_valence(smiles: str, 
                         mapping_start: str) -> bool:
    """Check that every atom-mapped atom in `smiles` reaches its required total valence.

    The required valence is looked up per mapped atom:

    * sulfur atoms use the key ``"S:<map number>"`` (only ``S:10`` and ``S:20`` are defined),
    * otherwise, if `mapping_start` contains ``'+'``, the ``"N+"`` requirement is used,
    * otherwise the atom's element symbol is used.

    Returns `False` if `smiles` cannot be parsed by `to_mol` or if any mapped atom
    falls short of its requirement, `True` otherwise. A mapped atom whose lookup
    key is missing from the table raises `KeyError`.
    """
    required_valence = {"C": 4, "N": 3, "N+": 4, "O": 2, "S:10": 6, "S:20": 2}

    mol = to_mol(smiles)
    if mol is None:
        return False

    charged_start = '+' in mapping_start

    for atom in mol.GetAtoms():
        map_num = atom.GetAtomMapNum()
        if map_num == 0:
            # only atoms carrying an atom-map number are checked
            continue

        symbol = atom.GetSymbol()
        if symbol == 'S':
            key = f'{symbol}:{map_num}'
        elif charged_start:
            key = 'N+'
        else:
            key = symbol

        if atom.GetTotalValence() < required_valence[key]:
            return False

    return True


def add_mapping_label(smiles: str, 
                mapping_start: str, 
                mapping_end: str) -> Union[str, None]:
    """Replace `mapping_start` in `smiles` with a bracketed, atom-mapped label built from `mapping_end`.

    `mapping_end` has the form ``'<atom>:<map number>'``; an explicit hydrogen
    count is inserted before the ``':'`` and the result is wrapped in ``[...]``.
    Three cases are handled, checked in this order:

    * `mapping_start` contains ``'S'``: a single ``H`` is inserted and the result
      is canonicalised without any validity check.
    * `mapping_start` contains both ``'+'`` and ``'H'``: hydrogen counts 1..4 are
      tried in order (no charge is written, no valence check) and the last
      candidate that still parses is returned.
    * otherwise: hydrogen counts 1..4 are tried (keeping a ``'+'`` if
      `mapping_start` had one) and the first candidate that parses and passes
      `check_mapped_valence` is returned.

    Returns the canonical SMILES of the chosen candidate, or `None` when no
    candidate is acceptable.
    """
    if 'S' in mapping_start:
        label = '[' + mapping_end.replace(':', 'H:') + ']'
        new_smile = smiles.replace(mapping_start, label)
        return canon_smile(new_smile)

    if ('+' in mapping_start) and ('H' in mapping_start):
        # N+ special case, no valence check: keep adding hydrogens until the
        # molecule stops parsing and keep the last one that did.
        best_smile = None
        for n_h in range(1, 5):
            label = '[' + mapping_end.replace(':', f"H{n_h}:") + ']'
            new_smile = smiles.replace(mapping_start, label)
            if not to_mol(new_smile):
                break
            best_smile = new_smile

        # Nothing parsed at all -> no label can be assigned. When every
        # candidate parsed, the highest hydrogen count is still returned
        # instead of being dropped.
        if best_smile is None:
            return None
        return canon_smile(best_smile)

    charge_term = '+' if '+' in mapping_start else ''
    for n_h in range(1, 5):
        label = '[' + mapping_end.replace(':', f"H{n_h}{charge_term}:") + ']'
        new_smile = smiles.replace(mapping_start, label)
        if to_mol(new_smile) and check_mapped_valence(new_smile, mapping_start):
            return canon_smile(new_smile)

    return None

In [None]:
# Test fixture: each entry is a [mapping_start, mapping_end] pair that is fed to
# `add_mapping_label` below. Entries line up index-by-index with `targets`.
all_mappings = [['*N', 'N:20'],
 ['*[15N]', '15N:20'],
 ['*C', 'C:10'],
 ['*[13C]', '13C:10'],
 ['*[13CH]', '13C:10'],
 ['*c', 'c:20'],
 ['*[13c]', '13c:20'],
 ['*C', 'C:70'],
 ['*[13C]', '13C:70'],
 ['*[13CH]', '13C:70'],
 ['*O', 'O:20'],
 ['*c', 'c:70'],
 ['*[13c]', '13c:70'],
 ['*C', 'C:20'],
 ['*[13C]', '13C:20'],
 ['*[13CH]', '13C:20'],
 ['*[15NH]', '15N:20'],
 ['*N', 'N:40'],
 ['*[15NH]', '15N:40'],
 ['*N', 'N:11'],
 ['*[15NH]', '15N:11'],
 ['*[15N]', '15N:11'],
 ['*C', 'C:60'],
 ['*[13C]', '13C:60'],
 ['*[13CH]', '13C:60'],
 ['*c', 'c:60'],
 ['*[13c]', '13c:60'],
 ['*S', 'S:20'],
 ['*c', 'c:10'],
 ['*[13c]', '13c:10'],
 ['*C', 'C:30'],
 ['*[13C]', '13C:30'],
 ['*[13CH]', '13C:30'],
 ['*C', 'C:21'],
 ['*[13C]', '13C:21'],
 ['*[13CH]', '13C:21'],
 ['*c', 'c:21'],
 ['*[13c]', '13c:21'],
 ['*S', 'S:10'],
 ['*C', 'C:50'],
 ['*C', 'C:40'],
 ['*[13C]', '13C:40'],
 ['*[13CH]', '13C:40'],
 ['*[N+]', 'N:20'],
 ['*[NH+]', 'N:20'],
 ['*n', 'n:20'],
 ['*[15n]', '15n:20']]

# Expected `add_mapping_label` output for each entry of `all_mappings`, in the
# same order (compared as a whole list by the assert at the end of this cell).
targets = ['[NH3:20]',
 '[15NH3:20]',
 '[CH4:10]',
 '[13CH4:10]',
 '[13CH4:10]',
 'c1cc[cH:20]cc1',
 'c1cc[13cH:20]cc1',
 '[CH4:70]',
 '[13CH4:70]',
 '[13CH4:70]',
 '[OH2:20]',
 'c1cc[cH:70]cc1',
 'c1cc[13cH:70]cc1',
 '[CH4:20]',
 '[13CH4:20]',
 '[13CH4:20]',
 '[15NH3:20]',
 '[NH3:40]',
 '[15NH3:40]',
 '[NH3:11]',
 '[15NH3:11]',
 '[15NH3:11]',
 '[CH4:60]',
 '[13CH4:60]',
 '[13CH4:60]',
 'c1cc[cH:60]cc1',
 'c1cc[13cH:60]cc1',
 '[SH:20]',
 'c1cc[cH:10]cc1',
 'c1cc[13cH:10]cc1',
 '[CH4:30]',
 '[13CH4:30]',
 '[13CH4:30]',
 '[CH4:21]',
 '[13CH4:21]',
 '[13CH4:21]',
 'c1cc[cH:21]cc1',
 'c1cc[13cH:21]cc1',
 '[SH:10]',
 '[CH4:50]',
 '[CH4:40]',
 '[13CH4:40]',
 '[13CH4:40]',
 '[NH4+:20]',
 '[NH3:20]',
 'c1cc[nH:20]c1',
 'c1cc[15nH:20]c1']

# Run every mapping through `add_mapping_label`. Lower-case (aromatic) start
# patterns get ring-closing atoms appended so the test input is a complete ring.
outputs = []
for mapping_start, mapping_end in all_mappings:
    ring_suffix = ''
    if 'c' in mapping_start:
        ring_suffix += '1ccccc1'
    if 'n' in mapping_start + ring_suffix:
        ring_suffix += '1cccc1'

    labeled = add_mapping_label(mapping_start + ring_suffix, mapping_start, mapping_end)
    outputs.append(labeled)

assert outputs == targets

In [None]:
#| export

class Mapping():
    """A single relabeling rule: occurrences of `mapping_start` in a SMILES
    string are rewritten to an atom-mapped label derived from `mapping_end`."""

    def __init__(self, mapping_start, mapping_end):
        self.mapping_start, self.mapping_end = mapping_start, mapping_end

    def map_smile(self, smile):
        """Apply the rule to `smile`.

        Returns `smile` unchanged when `mapping_start` does not occur in it;
        otherwise returns whatever `add_mapping_label` produces (which may be `None`).
        """
        if self.mapping_start not in smile:
            return smile
        return add_mapping_label(smile, self.mapping_start, self.mapping_end)

    def dump(self):
        """Serialise the rule to a plain dict accepted by `from_dict`."""
        serialized = {}
        serialized['mapping_start'] = self.mapping_start
        serialized['mapping_end'] = self.mapping_end
        return serialized

    @classmethod
    def from_dict(cls, input_dict):
        """Rebuild a `Mapping` from the dict produced by `dump`."""
        start = input_dict['mapping_start']
        end = input_dict['mapping_end']
        return cls(start, end)

    def __repr__(self):
        start, end = self.mapping_start, self.mapping_end
        return f'Mapping: {start} -> {end}'
    
def add_multiple_mappings(smile, mappings):
    """Apply `mappings` repeatedly to `smile` until no ``'*'`` placeholder is left.

    Works through a stack of partially labeled SMILES: each one is passed to
    every mapping's `map_smile`; results that changed and still contain ``'*'``
    go back on the stack, results that changed and contain no ``'*'`` are
    collected. Returns the de-duplicated collected results, or ``[smile]`` when
    nothing was collected.
    """
    finished = []
    pending = [smile]

    while pending:
        current = pending.pop()
        for mapping in mappings:
            candidate = mapping.map_smile(current)
            if candidate is None or candidate == current:
                # mapping failed or did not apply to this SMILES
                continue

            destination = pending if '*' in candidate else finished
            destination.append(candidate)

    if not finished:
        return [smile]
    return deduplicate_list(finished)

In [None]:
mappings = [
    Mapping(start, end)
    for start, end in (('*N', 'N:20'), ('*[15NH]', '15N:20'))
]

# input SMILES -> expected labeled output when both mappings are available
expected_with_mappings = {
    '*N': ['[NH3:20]'],
    '*[15NH]': ['[15NH3:20]'],
    '*NC([15NH]*)C': ['CC([NH2:20])[15NH2:20]'],
}
for raw_smile, expected in expected_with_mappings.items():
    assert add_multiple_mappings(raw_smile, mappings) == expected

# with no mappings the input comes back untouched
assert add_multiple_mappings('*NC([15NH]*)C', []) == ['*NC([15NH]*)C']

In [None]:
#| export

class MappedReaction():
    """An RDKit reaction (built from reaction SMARTS) whose products are
    post-processed with a list of `Mapping` rules.

    When `is_deprotection` is true the products are returned as-is and the
    mappings are not applied.
    """

    def __init__(self, rxn_smarts, mappings, is_deprotection=False):
        self.rxn_smarts = rxn_smarts
        self.mappings = mappings
        self.is_deprotection = is_deprotection

        reaction = rdChemReactions.ReactionFromSmarts(rxn_smarts)
        reaction.Initialize()
        self.rxn = reaction

    def match(self, mol):
        """Return whether `mol` is a reactant of the underlying reaction."""
        return self.rxn.IsMoleculeReactant(mol)

    def map_product(self, smile):
        """Label one product SMILES; always returns a list of SMILES."""
        if self.is_deprotection:
            return [smile]
        return add_multiple_mappings(smile, self.mappings)

    def react(self, smile):
        """Run the reaction on `smile` and return the de-duplicated, labeled product SMILES."""
        mol = to_mol(smile)
        raw_products = flatten_list(self.rxn.RunReactants((mol,)))
        product_smiles = [to_smile(product) for product in raw_products]
        labeled = flatten_list([self.map_product(product) for product in product_smiles])
        return deduplicate_list(labeled)

    def dump(self):
        """Serialise to a plain dict accepted by `from_dict`."""
        return {
            'rxn_smarts' : self.rxn_smarts,
            'is_deprotection' : self.is_deprotection,
            'mappings' : [mapping.dump() for mapping in self.mappings],
        }

    @classmethod
    def from_dict(cls, input_dict):
        """Rebuild a `MappedReaction` from the dict produced by `dump`."""
        restored = [Mapping.from_dict(entry) for entry in input_dict['mappings']]
        return cls(input_dict['rxn_smarts'], restored, input_dict['is_deprotection'])

    def __repr__(self):
        if self.is_deprotection:
            return 'Reaction: deprotection'
        return f'Reaction: {len(self.mappings)} mappings'

In [None]:
# Mapping rules applied to the reaction products below.
mappings = [
    Mapping('*N', 'N:20'),
    Mapping('*[15NH]', '15N:20'),
]

# Reaction SMARTS whose product side attaches a `*` placeholder to the matched
# nitrogen; the mappings above then turn `*N` into an atom-mapped label.
smarts = '[NH,NH2;!$(NC(=O)OC([CD1])([CD1])[CD1]);!$(N[2H]):1]>>*[N;+0:1]'

rxn = MappedReaction(smarts, mappings)
# NOTE: `smile` is reused by later cells in this notebook.
smile = 'CC(C)(C)OC(=O)N1CCN(c2cc(Br)nc3c2NCCC3)CC1'
reacted = rxn.react(smile)
# Exactly one labeled product is expected.
assert len(reacted)==1
assert reacted[0] == 'CC(C)(C)OC(=O)N1CCN(c2cc(Br)nc3c2[NH:20]CCC3)CC1'
# A dump/from_dict round trip must behave identically.
assert MappedReaction.from_dict(rxn.dump()).react(smile) == reacted

In [None]:
#| export

class TransformBlock():
    """A set of `MappedReaction`s that are all tried against the same input."""

    def __init__(self, rxns):
        self.rxns = rxns

    def transform_smile(self, smile, include_input=False):
        """Return the concatenated products of every reaction that matches `smile`.

        NOTE: `include_input` is accepted but is not used by the current
        implementation.
        """
        mol = to_mol(smile)
        products = []

        matching = (rxn for rxn in self.rxns if rxn.match(mol))
        for rxn in matching:
            products += rxn.react(smile)

        return products

    def transform(self, smiles, include_input=False, flatten=True):
        """Apply `transform_smile` to each SMILES in `smiles`.

        With `flatten` true the per-input results are flattened into a single
        de-duplicated list; otherwise one result list per input is returned.
        """
        per_input = []
        for smile in smiles:
            per_input.append(self.transform_smile(smile, include_input=include_input))

        if not flatten:
            return per_input
        return deduplicate_list(flatten_list(per_input))

    def dump(self):
        """Serialise to a plain dict accepted by `from_dict`."""
        return {'rxns' : [rxn.dump() for rxn in self.rxns]}

    @classmethod
    def from_dict(cls, input_dict):
        """Rebuild a `TransformBlock` from the dict produced by `dump`."""
        return cls([MappedReaction.from_dict(entry) for entry in input_dict['rxns']])

    def __repr__(self):
        described = ', '.join([rxn.__repr__() for rxn in self.rxns])
        return f"TransformBlock: {described}"

In [None]:
# Stage 1: two labeling reactions, both mapping the matched nitrogen to map number 20.
tfm_dict1 = {
    'rxns' : [
        {
            'rxn_smarts' : '[NH,NH2;!$(NC(=O)OC([CD1])([CD1])[CD1]);!$(N[2H]):1]>>*[N;+0:1]',
            'is_deprotection' : False,
            'mappings' : [
                {'mapping_start': '*N', 'mapping_end': 'N:20'},
                {'mapping_start': '*[15NH]', 'mapping_end': '15N:20'}
            ]
        },
        {
            'rxn_smarts': '[NH2;$([NH2][c,CX4]);!$(NC(=O)OC([CD1])([CD1])[CD1]);!$(N[2H]):1]>>*[N;+0:1]',
            'is_deprotection': False,
            'mappings': [
                {'mapping_start': '*N', 'mapping_end': 'N:20'},
                {'mapping_start': '*[15NH]', 'mapping_end': '15N:20'}]
        }
    ]
}

# Stage 2: a deprotection reaction; its (empty) mappings are never applied
# because `is_deprotection` is True.
tfm_dict2 = {
    'rxns' : [
        {
            'rxn_smarts' : '[N;$(NC(=O)OC([CD1])([CD1])[CD1]):1][C;$(C(=O)OC([CD1])([CD1])[CD1]):2]>>[N:1]',
            'is_deprotection' : True,
            'mappings' : [
                {'mapping_start' : '', 'mapping_end' : ''}
            ]
        }
    ]
}

# Stage 3: labeling reactions run on the stage-2 products; the second reaction
# uses map number 40 instead of 20.
tfm_dict3 = {
    'rxns' : [
        {
            'rxn_smarts' : '[NH,NH2;!$(NC=[N,S,P,C]);!$(N!@C=O);!$(N[CH2]c1[cH][cH][cH][cH][cH]1);!$(N[2H]):1]>>*[N;+0:1]',
            'is_deprotection' : False,
            'mappings' : [
                {'mapping_start' : '*N', 'mapping_end' : 'N:20'},
                {'mapping_start' : '*[15NH]', 'mapping_end' : '15N:20'}
            ]
        },
        {
            'rxn_smarts' : '[NH2;!$(NC=[N,S,P,C]);!$(N!@C=O);$([NH2][c,CX4]);!$(N[2H]):1]>>*[N;+0:1]',
            'is_deprotection' : False,
            'mappings' : [
                {'mapping_start' : '*N', 'mapping_end' : 'N:40'},
                {'mapping_start' : '*[15NH]', 'mapping_end' : '15N:40'}
            ]
        }
    ]
}

# NOTE: tfm1/tfm2/tfm3 and `smile` are reused by the schema test cell further down.
tfm1 = TransformBlock.from_dict(tfm_dict1)
tfm2 = TransformBlock.from_dict(tfm_dict2)
tfm3 = TransformBlock.from_dict(tfm_dict3)

smile = 'CC(C)(C)OC(=O)N1CCN(c2cc(Br)nc3c2NCCC3)CC1'

# Chain the three stages: each stage consumes the previous stage's products.
p1 = tfm1.transform([smile])
p2 = tfm2.transform(p1)
p3 = tfm3.transform(p2)

# Order of the final products is not asserted, only the set of SMILES.
assert set(p3) == set(['Brc1cc(N2CCNCC2)c2c(n1)CCC[NH:20]2', 'Brc1cc(N2CC[NH:20]CC2)c2c(n1)CCC[NH:20]2'])


In [None]:
#| export

class BuildingBlockSchema():
    """A named substructure filter plus the transform pipeline applied to matches.

    A molecule matches when it hits at least one `smarts_match_any` pattern,
    every `smarts_match_all` pattern and no `smarts_match_none` pattern.
    `n_func` and `n_pg` are stored and serialised but not otherwise used here.
    """

    def __init__(self,
                 name,
                 smarts_match_any,
                 smarts_match_all,
                 smarts_match_none,
                 n_func,
                 n_pg,
                 transforms
                ):

        self.name = name
        self.n_func = n_func
        self.n_pg = n_pg
        self.transforms = transforms

        # keep the raw SMARTS (for `dump`) next to the compiled query molecules
        self.smarts_match_any = smarts_match_any
        self.smarts_match_all = smarts_match_all
        self.smarts_match_none = smarts_match_none

        self.smarts_match_any_mols = [Chem.MolFromSmarts(pattern) for pattern in smarts_match_any]
        self.smarts_match_all_mols = [Chem.MolFromSmarts(pattern) for pattern in smarts_match_all]
        self.smarts_match_none_mols = [Chem.MolFromSmarts(pattern) for pattern in smarts_match_none]

    def _check_match_any(self, mol: Chem.Mol) -> bool:
        """True if `mol` hits at least one "any" pattern (False when there are none)."""
        return any(mol.HasSubstructMatch(query) for query in self.smarts_match_any_mols)

    def _check_match_all(self, mol: Chem.Mol) -> bool:
        """True if `mol` hits every "all" pattern (True when there are none)."""
        return all(mol.HasSubstructMatch(query) for query in self.smarts_match_all_mols)

    def _check_match_none(self, mol: Chem.Mol) -> bool:
        """True if `mol` hits no "none" pattern (True when there are none)."""
        return not any(mol.HasSubstructMatch(query) for query in self.smarts_match_none_mols)

    def match(self, mol: Chem.Mol) -> bool:
        """Return whether `mol` satisfies the any / all / none filters, checked in that order."""
        if not self._check_match_any(mol):
            return False
        if not self._check_match_all(mol):
            return False
        return self._check_match_none(mol)

    def transform_smile(self, smile, return_sequence=False):
        """Feed `smile` through every transform in order.

        Returns the final list of SMILES, or ``(final, sequence)`` when
        `return_sequence` is true, where `sequence` holds the input list
        followed by the output of each transform.
        """
        current = [smile]
        sequence = [current]

        for tfm in self.transforms:
            current = tfm.transform(current, flatten=True)
            sequence.append(current)

        if return_sequence:
            return (current, sequence)
        return current

    def dump(self):
        """Serialise to a plain dict accepted by `from_dict`."""
        return {
            'schema_name' : self.name,
            'smarts_match_any' : self.smarts_match_any,
            'smarts_match_all' : self.smarts_match_all,
            'smarts_match_none' : self.smarts_match_none,
            'n_func' : self.n_func,
            'n_pg' : self.n_pg,
            'transforms' : [tfm.dump() for tfm in self.transforms],
        }

    @classmethod
    def from_dict(cls, input_dict):
        """Rebuild a schema from the dict produced by `dump`."""
        transforms = [TransformBlock.from_dict(entry) for entry in input_dict['transforms']]
        return cls(
            input_dict['schema_name'],
            input_dict['smarts_match_any'],
            input_dict['smarts_match_all'],
            input_dict['smarts_match_none'],
            input_dict['n_func'],
            input_dict['n_pg'],
            transforms,
        )

In [None]:
# Schema definition in the same dict layout that `BuildingBlockSchema.dump` produces.
schema_dict = {
    'schema_name': 'NbocDi_Amines',
    # a molecule must hit at least one of these patterns
    'smarts_match_any': [
        '[N;+0]C(=O)OC([CD1])([CD1])[CD1].[NX3;+0;$([NH2][c,CX4]),$([NH]([c,CX4])[CX4])]',
        '[NX3;+0;$([NHR0]([CX4,c])!@[ND3R]),$([NH2R0]!@[ND3R]),$([NR]@[NHR]@[CX4,c]),$([NH]([CX4])[OD2]),$([NH2][OD2])].[OD1;$(O=C(N)OC([CH3])([CH3])[CH3])]'
    ],
     'smarts_match_all': [],
     # a molecule hitting any of these patterns is rejected
     'smarts_match_none': [
         '[Cl,F,Br][C,P,S]=[O,S,N]',
         '[CH2]1[CH2][CH2][CH]([CH2][CH2]1)[NH][CH]1[CH2][CH2][CH2][CH2][CH2]1.*',
         'NC(=O)O[CH2][CH]1c2ccccc2-c3ccccc13',
         '[N;$([NH]);!$(NC=[C,N,O,P,S])!$(N=[C,N,O,S])!$(N[P,S])!$(NC#N)].[N;$([NH]);!$(NC=[C,N,O,P,S])!$(N=[C,N,O,S])!$(N[P,S])!$(NC#N)]',
         'NC(=O)OC([CD1])([CD1])[CD1].NC(=O)OC([CD1])([CD1])[CD1]',
         '[CX4][Cl,Br,I]',
         'O=C([#6])O',
         '[Sn,B]',
         '[O,S]=[C;$(C([#6])[#6])]',
         '[O,S]=[CH;$(C[#6])]',
         '[O,S]=[CH2]',
         '[SH,S-]',
         'N[OH]',
         '*=[NH]',
         '[Or3]',
         'N=C=[O,S]',
         '[Si,P,S,B][Cl,F,Br,I]',
         '[SiH,SiH2,SiH3,SiH4,PH,PH2,PH3,SH,SH2,BH,BH2,BH3,BH4]',
         '[Mg,Li,Zn][#6]'
     ],
     'n_func' : 2,
     'n_pg' : 1,
     # reuses the three transform blocks built in the previous test cell
     'transforms' : [i.dump() for i in [tfm1, tfm2, tfm3]]
}

schema = BuildingBlockSchema.from_dict(schema_dict)

# `smile` is the test molecule defined in the previous test cell.
assert schema.match(to_mol(smile))

# Running the whole schema pipeline must give the same products as chaining the
# three transform blocks by hand (order is not asserted).
assert set(schema.transform_smile(smile)) == set(['Brc1cc(N2CC[NH:20]CC2)c2c(n1)CCC[NH:20]2', 
                                                  'Brc1cc(N2CCNCC2)c2c(n1)CCC[NH:20]2'])

In [None]:
# Load the bundled acid schemas; the `with` block closes the file handle
# instead of leaving it to the garbage collector.
with open(f'{BB_PATH}/Acid.json') as f:
    acid_schemas = json.load(f)
schema = BuildingBlockSchema.from_dict(acid_schemas['schemas'][0]) # aromatic acids

acid_mol = Chem.MolFromSmiles('[OH]C(=O)c1ccccc1')
non_acid_mol = Chem.MolFromSmiles('CC(=O)c1ccccc1')

# the schema must accept the carboxylic acid and reject the ketone
assert schema.match(acid_mol)
assert not schema.match(non_acid_mol)

In [None]:
#| export

class BuildingBlockClass():
    """A named group of `BuildingBlockSchema`s.

    `schemas` keeps the schemas in insertion order; `schema_dict` indexes the
    same schemas by their `name`.
    """

    def __init__(self, name, schemas):
        self.name = name
        self.schemas = schemas
        self.schema_dict = {i.name:i for i in self.schemas}
        
    def match(self, mol):
        """Return ``{class name: {schema name: schema}}`` for the schemas that
        match `mol`, or an empty dict when none match."""
        matched = {schema.name: schema for schema in self.schemas if schema.match(mol)}
        return {self.name : matched} if matched else {}
    
    def add_schema(self, schema):
        """Register an additional schema with this class."""
        self.schemas.append(schema)
        # keep the name index in sync with the list; previously a schema added
        # here was missing from `schema_dict`
        self.schema_dict[schema.name] = schema
        
    def dump(self):
        """Serialise to a plain dict accepted by `from_dict`."""
        return {'class_name' : self.name, 'schemas' : [i.dump() for i in self.schemas]}
    
    @classmethod
    def from_dict(cls, input_dict):
        """Rebuild a class from the dict produced by `dump`."""
        schemas = [BuildingBlockSchema.from_dict(i) for i in input_dict['schemas']]
        return cls(input_dict['class_name'], schemas)
    
    @classmethod
    def from_file(cls, filename):
        """Load a class from a JSON file holding the `dump` layout."""
        with open(filename, 'r') as f:
            input_dict = json.load(f)
        return cls.from_dict(input_dict)

In [None]:
#| export

# Names of the bundled building-block classes. Each name corresponds to a
# `<name>.json` file under `BB_PATH`.
BB_CLASS_NAMES = [
    'SecondaryAmines',
    'Acylhalides',
    'SulfonesSulfinates',
    'Reagents',
    'Bifunctional',
    'ArylHalide',
    'Anhydrides',
    'Boronics',
    'Alkenes',
    'Acetylenes',
    'AlkylHalides',
    'Aldehyde',
    'Ketones',
    'Amides',
    'TertiaryAmines',
    'ElementOrganics',
    'SulfonylHalides',
    'PrimaryAmines',
    'Trifunctional',
    'Alcohols',
    'nHAzoles',
    'Azides',
    'Esters',
    'Aminoacids',
    'Acid',
    'ReagentsForOlefination',
]

# Every class is loaded from its JSON file when this module is imported,
# keyed by class name in the order listed above.
BB_CLASSES = {
    class_name: BuildingBlockClass.from_file(f'{BB_PATH}/{class_name}.json')
    for class_name in BB_CLASS_NAMES
}

In [None]:
# One probe molecule: expected to be classified as an aryl halide but not as a
# secondary amine.
probe_mol = Chem.MolFromSmiles('CC(C)(C)OC(=O)NNS(=O)(=O)c1ccc(F)cc1')
assert BB_CLASSES['ArylHalide'].match(probe_mol)
assert not BB_CLASSES['SecondaryAmines'].match(probe_mol)

In [None]:
#| export
class ReactionUniverse():
    """A named collection of `BuildingBlockClass`es that a molecule can be
    matched against in one call."""

    def __init__(self, 
                 universe_name: str, 
                 building_block_classes: list[BuildingBlockClass]):
        self.universe_name = universe_name
        # Copy into a list: callers may pass any iterable (e.g. `dict.values()`),
        # and `add_class` needs `.append` without mutating the caller's container.
        self.building_block_classes = list(building_block_classes)
        self.building_block_class_dict = {i.name : i for i in self.building_block_classes}
        
    def add_class(self, bb_class: BuildingBlockClass):
        """Register an additional building-block class under its `name`."""
        self.building_block_classes.append(bb_class)
        self.building_block_class_dict[bb_class.name] = bb_class
        
    def match(self, mol: Chem.Mol) -> dict:
        """Merge the `match` results of every class into one dict.

        Classes are queried in registration order; a later class overwrites an
        earlier entry that uses the same key.
        """
        merged = {}
        for bb_class in self.building_block_classes:
            merged.update(bb_class.match(mol))
        return merged
        
    def dump(self):
        """Serialise to a plain dict accepted by `from_dict`."""
        return {'universe_name' : self.universe_name, 
                'building_block_classes' : [i.dump() for i in self.building_block_classes]}
        
        
    @classmethod
    def from_dict(cls, input_dict: dict):
        """Rebuild a universe from the dict produced by `dump`."""
        name = input_dict['universe_name']
        building_block_classes = [BuildingBlockClass.from_dict(i) for i in input_dict['building_block_classes']]
        return cls(name, building_block_classes)
        
    @classmethod
    def from_file(cls, filename):
        """Load a universe from a JSON file holding the `dump` layout."""
        with open(filename, 'r') as f:
            input_dict = json.load(f)
        return cls.from_dict(input_dict)
        


In [None]:
# Pass a real list (the constructor is annotated as taking a list) rather than
# a live `dict.values()` view of BB_CLASSES.
# NOTE: `rxn_universe` is reused by a later test cell.
rxn_universe = ReactionUniverse('all_rxns', list(BB_CLASSES.values()))
match_names = list(rxn_universe.match(Chem.MolFromSmiles('CC(NC1CCN(C(=O)OC(C)(C)C)CC1)c1cccc(Br)c1')).keys())
# matches come back in the order the classes were registered
assert match_names == ['SecondaryAmines', 'Bifunctional', 'ArylHalide']

In [None]:
#| export
class BuildingBlock(Molecule):
    """A `Molecule` that can be matched against schemas, building-block classes
    and whole reaction universes.

    Relies on `self.mol` and `self.add_data`, which are presumably provided by
    the `Molecule` base class (not visible in this file).
    """

    def __init__(self, 
                 smile: str, 
                 data:  Optional[dict]=None):
        super().__init__(smile, data)

    def match_schema(self, schema: BuildingBlockSchema) -> bool:
        """Return whether this molecule matches `schema`."""
        mol = self.mol
        return schema.match(mol)

    def match_class(self, bb_class: BuildingBlockClass) -> dict:
        """Return the `bb_class.match` result for this molecule."""
        mol = self.mol
        return bb_class.match(mol)

    def match_universe(self, rxn_universe: ReactionUniverse) -> dict:
        """Return the `rxn_universe.match` result for this molecule."""
        mol = self.mol
        return rxn_universe.match(mol)

    def classify(self, rxn_universe: ReactionUniverse) -> dict:
        """Match against `rxn_universe`, store the result via `add_data` under
        the universe's name, and return it."""
        matches = self.match_universe(rxn_universe)
        record = {rxn_universe.universe_name : matches}
        self.add_data(record)
        return matches

In [None]:
# Classifying a building block must store the matches under the universe's name.
bb = BuildingBlock('CC(NC1CCN(C(=O)OC(C)(C)C)CC1)c1cccc(Br)c1')
_ = bb.classify(rxn_universe)

stored_matches = bb.data[rxn_universe.universe_name]
expected_classes = ['SecondaryAmines', 'Bifunctional', 'ArylHalide']
assert list(stored_matches.keys()) == expected_classes

In [None]:
#| hide
# Run nbdev's export step (the `#| export` cells above are the ones marked for it).
import nbdev; nbdev.nbdev_export()