In [1]:
from rdkit import Chem
from rdkit.Chem.Draw import IPythonConsole
from rdkit.Chem import Draw
import networkx as nx
import pandas as pd
from tqdm import tqdm
from SmilesPE.pretokenizer import atomwise_tokenizer

In [2]:
def mol_to_nx(mol):
    """Convert an RDKit molecule into an undirected networkx graph.

    Each atom becomes a node keyed by its RDKit atom index and annotated with
    its atomic number, formal charge, chiral tag, hybridization, explicit
    hydrogen count and aromaticity flag. Each bond becomes an edge annotated
    with its bond type.
    """
    graph = nx.Graph()

    for atom in mol.GetAtoms():
        node_attrs = {
            'atomic_num': atom.GetAtomicNum(),
            'formal_charge': atom.GetFormalCharge(),
            'chiral_tag': atom.GetChiralTag(),
            'hybridization': atom.GetHybridization(),
            'num_explicit_hs': atom.GetNumExplicitHs(),
            'is_aromatic': atom.GetIsAromatic(),
        }
        graph.add_node(atom.GetIdx(), **node_attrs)

    for bond in mol.GetBonds():
        begin, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        graph.add_edge(begin, end, bond_type=bond.GetBondType())

    return graph

def nx_to_mol(G):
    """Rebuild an RDKit molecule from a graph annotated like ``mol_to_nx`` output.

    Every node must carry the attributes 'atomic_num', 'chiral_tag',
    'formal_charge', 'is_aromatic', 'hybridization' and 'num_explicit_hs';
    every edge must carry 'bond_type'. The assembled molecule is sanitized
    before being returned (as the editable ``RWMol`` that was built).
    """
    node_attr_names = (
        'atomic_num',
        'chiral_tag',
        'formal_charge',
        'is_aromatic',
        'hybridization',
        'num_explicit_hs',
    )
    # One {node: value} lookup table per attribute.
    attrs = {name: nx.get_node_attributes(G, name) for name in node_attr_names}

    rw_mol = Chem.RWMol()
    node_to_idx = {}
    for node in G.nodes():
        atom = Chem.Atom(attrs['atomic_num'][node])
        atom.SetChiralTag(attrs['chiral_tag'][node])
        atom.SetFormalCharge(attrs['formal_charge'][node])
        atom.SetIsAromatic(attrs['is_aromatic'][node])
        atom.SetHybridization(attrs['hybridization'][node])
        atom.SetNumExplicitHs(attrs['num_explicit_hs'][node])
        # AddAtom returns the index assigned inside the new molecule.
        node_to_idx[node] = rw_mol.AddAtom(atom)

    bond_types = nx.get_edge_attributes(G, 'bond_type')
    for begin, end in G.edges():
        rw_mol.AddBond(node_to_idx[begin], node_to_idx[end], bond_types[begin, end])

    Chem.SanitizeMol(rw_mol)
    return rw_mol

In [3]:
class MyNX:
    """Hand-rolled graph traversals that record the depth of every visited node.

    ``graph`` may be any object where ``graph[node]`` iterates over the
    neighbours of ``node`` (for example an adjacency dict or a networkx graph).
    """

    def dfs(self, visited, graph, node, depth, arr=None):
        """Depth-first traversal starting at ``node``.

        Args:
            visited: set of already-seen nodes; it is updated in place.
            graph: adjacency structure, indexed by node.
            node: node to visit.
            depth: depth to record for ``node``.
            arr: optional list to append ``(node, depth)`` pairs to. A fresh
                list is created when omitted, so separate calls never share
                results.

        Returns:
            The list of ``(node, depth)`` pairs in visiting order.
        """
        # A literal [] default would be created once and shared by every call.
        if arr is None:
            arr = []
        if node not in visited:
            visited.add(node)
            arr.append((node, depth))
            for neighbour in graph[node]:
                self.dfs(visited, graph, neighbour, depth + 1, arr)
        return arr

    def bfs(self, visited, graph, node, arr=None, queue=None):
        """Breadth-first traversal starting at ``node`` (recorded at depth 0).

        Args:
            visited: set of already-seen nodes; it is updated in place.
            graph: adjacency structure, indexed by node.
            node: start node.
            arr: optional list to append ``(node, depth)`` pairs to. A fresh
                list is created when omitted.
            queue: optional work list of pending ``(node, depth)`` pairs. A
                fresh list is created when omitted.

        Returns:
            The list of ``(node, depth)`` pairs in visiting order.
        """
        # Same shared-default pitfall as in dfs: build per-call containers.
        if arr is None:
            arr = []
        if queue is None:
            queue = []

        visited |= {node}
        queue.append((node, 0))

        while queue:
            m, d = queue.pop(0)
            arr.append((m, d))

            for neighbour in graph[m]:
                if neighbour not in visited:
                    visited |= {neighbour}
                    queue.append((neighbour, d + 1))
        return arr


my_nx = MyNX()

In [4]:
# SMILES tokens that do not introduce a new atom: brackets/branches, bond
# symbols, the dot separator, chirality marks and ring-closure labels. The
# token loop further down gives such a token the position of the preceding atom.
# Two-digit ring closures ('%10'..'%99') and the ':', '~', '$' bond symbols are
# included so they are never miscounted as atoms.
grammar = (
    {'[', ']', '(', ')', '=', '.', '/', '\\', '-', '#', '@'}
    | {':', '~', '$'}
    | {str(i) for i in range(10)}
    | {f'%{i:02d}' for i in range(100)}
)

In [5]:
# Load the processed reaction table and pull the reactant / product SMILES
# columns out as plain Python lists (one list of SMILES per reaction).
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle('processed.pickle')
reactants, products = (df[col].to_list() for col in ('reactants_mol', 'products_mol'))
r_tokens, p_tokens = [], []

In [6]:
# Preview the loaded reaction table (pandas elides the middle rows).
df

Unnamed: 0,reactants_mol,products_mol,reaction_type,set,num_reacts,num_prods,ratio
0,"[CS(=O)(=O)OC[C@H]1CCC(=O)O1, Fc1ccc(Nc2ncnc3c...",[O=C1CC[C@H](CN2CCN(CCOc3cc4ncnc(Nc5ccc(F)c(Cl...,<RX_1>,train,2,1,1.121951
1,[COC(=O)c1cc(CCCc2cc3c(=O)[nH]c(N)nc3[nH]2)cs1],[Nc1nc2[nH]c(CCCc3csc(C(=O)O)c3)cc2c(=O)[nH]1],<RX_6>,train,1,1,1.045455
2,"[CC1(C)OB(B2OC(C)(C)C(C)(C)O2)OC1(C)C, FC(F)(F...",[CC1(C)OB(c2cccc(Nc3nccc(C(F)(F)F)n3)c2)OC1(C)C],<RX_9>,train,2,1,1.384615
3,[CC(C)(C)OC(=O)NCC(=O)CCC(=O)OCCCC(=O)OCc1ccccc1],[CC(C)(C)OC(=O)NCC(=O)CCC(=O)OCCCC(=O)O],<RX_6>,train,1,1,1.318182
4,"[Fc1cc2c(Cl)ncnc2cn1, NC1CCCCCC1]",[Fc1cc2c(NC3CCCCCC3)ncnc2cn1],<RX_1>,train,2,1,1.052632
...,...,...,...,...,...,...,...
50032,"[Cc1cc([N+](=O)[O-])ccc1O, Nc1cc(Cl)ccn1]",[Cc1cc([N+](=O)[O-])ccc1Oc1ccnc(N)c1],<RX_1>,test,2,1,1.055556
50033,[COC(=O)c1[nH]c2cc(Cl)cc3c2c1C(CC(=O)OC(C)(C)C...,[COC(=O)c1[nH]c2cc(Cl)cc3c2c1C(CC(=O)O)CC3],<RX_6>,test,1,1,1.190476
50034,[COc1cc(C(F)(F)F)cc(SC)c1C(=O)NC1(c2ccccc2)CC(...,[COc1cc(C(F)(F)F)cc(SC)c1C(=O)NC1(c2ccccc2)CC(...,<RX_7>,test,1,1,1.000000
50035,"[C=C(C)Cn1nc(C)c(Br)c1-c1ccc(F)cc1, OO]",[Cc1nn(CC(C)CO)c(-c2ccc(F)cc2)c1Br],<RX_10>,test,2,1,1.052632


In [7]:
# Per-token traversal features, one column per (molecule role, feature) pair.
# Every column is nested as: reaction -> molecule -> token.
final_dict = {
    'reactant_token': [],
    'reactant_bfs_id': [],
    'reactant_bfs_depth': [],
    'reactant_dfs_id': [],
    'reactant_dfs_depth': [],
    'product_token': [],
    'product_bfs_id': [],
    'product_bfs_depth': [],
    'product_dfs_id': [],
    'product_dfs_depth': [],
}

for i, (r_smi, p_smi) in enumerate(tqdm(zip(reactants, products), total=len(reactants))):
    # Open an empty per-reaction slot in every column.
    for k in final_dict.keys():
        final_dict[k].append([])
    for molecule_set, molecule_type in zip([r_smi, p_smi], ['reactant', 'product']):
        # Molecules are processed longest SMILES first, so their order in the
        # output can differ from their order in the source column.
        for smi in sorted(molecule_set, key=len, reverse=True):
            # Open an empty per-molecule slot in the columns of this role only.
            for k in final_dict.keys():
                if molecule_type in k:
                    final_dict[k][i].append([])

            mol = Chem.MolFromSmiles(smi)
            if mol is None:
                # RDKit returns None for SMILES it cannot parse; fail with the
                # offending input instead of an AttributeError further down.
                raise ValueError(
                    f"Reaction {i}: could not parse {molecule_type} SMILES {smi!r}"
                )

            g = mol_to_nx(mol)
            # Depth tables indexed by atom index; atom 0 is the traversal root.
            bfs_depth = [0] + [None for _ in range(len(g)-1)]
            dfs_depth = [0] + [None for _ in range(len(g)-1)]

            # Iterative deepening: an atom's depth is the smallest depth limit
            # at which the depth-limited traversal from atom 0 reaches it.
            # NOTE(review): for DFS this is not necessarily the atom's depth in
            # the unrestricted DFS tree (ring atoms can be reached from either
            # side of the ring) -- confirm this is the intended definition.
            for max_depth in range(1, len(g)):
                for atom_id in nx.bfs_tree(g, source=0, depth_limit=max_depth):
                    if bfs_depth[atom_id] is None:
                        bfs_depth[atom_id] = max_depth
                for atom_id in nx.dfs_tree(g, source=0, depth_limit=max_depth):
                    if dfs_depth[atom_id] is None:
                        dfs_depth[atom_id] = max_depth
                if None not in bfs_depth and None not in dfs_depth:
                    break

            # Atom indices in unrestricted BFS / DFS visiting order.
            bfs_ids = list(nx.bfs_tree(g, source=0))
            dfs_ids = list(nx.dfs_tree(g, source=0))

            # The token walk below pairs the n-th atom token with atom index n,
            # which requires the DFS order from atom 0 to equal the atom order.
            assert dfs_ids == [atom.GetIdx() for atom in mol.GetAtoms()], "DFS: Not the same :("

            toks = atomwise_tokenizer(smi)

            positions = []
            start = 0  # number of atom tokens consumed so far
            for tok in toks:
                final_dict[f'{molecule_type}_token'][i][-1].append(tok)
                if tok in grammar:
                    # Non-atom tokens inherit the position of the preceding atom.
                    positions.append(positions[-1])
                else:
                    # NOTE(review): bfs_ids[start] is the atom visited at BFS
                    # step `start`, whereas bfs_depth[start] is the depth stored
                    # for atom index `start`; the two describe the same atom
                    # only when BFS order equals atom order -- confirm intent.
                    positions.append(((bfs_ids[start], bfs_depth[start]), (dfs_ids[start], dfs_depth[start])))
                    start += 1

            assert start == mol.GetNumAtoms(), "Mismatch in number of atoms"
            assert len({(t, p[0][0], p[1][0]) for t, p in zip(toks, positions)}) == len(toks), "Duplicate tokens"

            for (bi, bd), (di, dd) in positions:
                final_dict[f'{molecule_type}_bfs_id'][i][-1].append(bi)
                final_dict[f'{molecule_type}_bfs_depth'][i][-1].append(bd)
                final_dict[f'{molecule_type}_dfs_id'][i][-1].append(di)
                final_dict[f'{molecule_type}_dfs_depth'][i][-1].append(dd)

100%|██████████| 50037/50037 [04:10<00:00, 199.93it/s]


In [8]:
# Sanity check: per column, print the number of reactions, the number of
# molecules in the first reaction and the number of tokens in its first molecule.
for column, reactions in final_dict.items():
    first_reaction = reactions[0]
    print(column, len(reactions), len(first_reaction), len(first_reaction[0]))

reactant_token 50037 2 50
reactant_bfs_id 50037 2 50
reactant_bfs_depth 50037 2 50
reactant_dfs_id 50037 2 50
reactant_dfs_depth 50037 2 50
product_token 50037 1 64
product_bfs_id 50037 1 64
product_bfs_depth 50037 1 64
product_dfs_id 50037 1 64
product_dfs_depth 50037 1 64


In [12]:
# One row of token features per reaction. Reuse df's index so that a later
# column-wise concat with df pairs every feature row with its source reaction
# even when df's index is not the default 0..n-1 range.
df2 = pd.DataFrame(final_dict, index=df.index)

In [13]:
# Preview the per-reaction token feature table.
df2

Unnamed: 0,reactant_token,reactant_bfs_id,reactant_bfs_depth,reactant_dfs_id,reactant_dfs_depth,product_token,product_bfs_id,product_bfs_depth,product_dfs_id,product_dfs_depth
0,"[[F, c, 1, c, c, c, (, N, c, 2, n, c, n, c, 3,...","[[0, 1, 1, 2, 32, 3, 3, 31, 33, 33, 4, 5, 6, 7...","[[0, 1, 1, 2, 3, 4, 4, 5, 6, 6, 7, 8, 9, 8, 8,...","[[0, 1, 1, 2, 3, 4, 4, 5, 6, 6, 7, 8, 9, 10, 1...","[[0, 1, 1, 2, 3, 4, 4, 5, 6, 6, 7, 8, 9, 8, 8,...","[[O, =, C, 1, C, C, [C@H], (, C, N, 2, C, C, N...","[[0, 0, 1, 1, 2, 40, 3, 3, 4, 5, 5, 6, 7, 39, ...","[[0, 0, 1, 1, 2, 3, 3, 3, 4, 5, 5, 6, 7, 8, 8,...","[[0, 0, 1, 1, 2, 3, 4, 4, 5, 6, 6, 7, 8, 9, 9,...","[[0, 0, 1, 1, 2, 3, 3, 3, 5, 6, 6, 7, 8, 9, 9,..."
1,"[[C, O, C, (, =, O, ), c, 1, c, c, (, C, C, C,...","[[0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 22, 22, 6, 21,...","[[0, 1, 2, 2, 2, 3, 3, 3, 3, 4, 5, 5, 6, 7, 8,...","[[0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6, 6, 7, 8, 9,...","[[0, 1, 2, 2, 2, 3, 3, 3, 3, 4, 5, 5, 6, 7, 8,...","[[N, c, 1, n, c, 2, [nH], c, (, C, C, C, c, 3,...","[[0, 1, 1, 2, 21, 21, 3, 19, 19, 4, 18, 20, 5,...","[[0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 8, 9, 9, 10...","[[0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 8, 9, 9, 10...","[[0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 8, 9, 9, 10..."
2,"[[C, C, 1, (, C, ), O, B, (, B, 2, O, C, (, C,...","[[0, 1, 1, 1, 2, 2, 3, 15, 15, 4, 4, 14, 16, 1...","[[0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6, 6, 7,...","[[0, 1, 1, 1, 2, 2, 3, 4, 4, 5, 5, 6, 7, 7, 8,...","[[0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6, 6, 7,...","[[C, C, 1, (, C, ), O, B, (, c, 2, c, c, c, c,...","[[0, 1, 1, 1, 2, 2, 3, 23, 23, 4, 4, 22, 24, 2...","[[0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6, 7, 6,...","[[0, 1, 1, 1, 2, 2, 3, 4, 4, 5, 5, 6, 7, 8, 9,...","[[0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6, 7, 6,..."
3,"[[C, C, (, C, ), (, C, ), O, C, (, =, O, ), N,...","[[0, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, 5, 6, 6, 7,...","[[0, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4,...","[[0, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, 5, 6, 6, 7,...","[[0, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4,...","[[C, C, (, C, ), (, C, ), O, C, (, =, O, ), N,...","[[0, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, 5, 6, 6, 7,...","[[0, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4,...","[[0, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, 5, 6, 6, 7,...","[[0, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4,..."
4,"[[F, c, 1, c, c, 2, c, (, Cl, ), n, c, n, c, 2...","[[0, 1, 1, 2, 11, 11, 3, 3, 10, 10, 4, 9, 5, 6...","[[0, 1, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 5, 4, 4,...","[[0, 1, 1, 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 9, 9,...","[[0, 1, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 5, 4, 4,...","[[F, c, 1, c, c, 2, c, (, N, C, 3, C, C, C, C,...","[[0, 1, 1, 2, 18, 18, 3, 3, 17, 4, 4, 16, 5, 1...","[[0, 1, 1, 2, 3, 3, 4, 4, 5, 6, 6, 7, 8, 9, 9,...","[[0, 1, 1, 2, 3, 3, 4, 4, 5, 6, 6, 7, 8, 9, 10...","[[0, 1, 1, 2, 3, 3, 4, 4, 5, 6, 6, 7, 8, 9, 9,..."
...,...,...,...,...,...,...,...,...,...,...
50032,"[[C, c, 1, c, c, (, [N+], (, =, O, ), [O-], ),...","[[0, 1, 1, 2, 9, 9, 3, 3, 3, 8, 8, 10, 10, 4, ...","[[0, 1, 1, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 4, 3,...","[[0, 1, 1, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 7, 8,...","[[0, 1, 1, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 4, 3,...","[[C, c, 1, c, c, (, [N+], (, =, O, ), [O-], ),...","[[0, 1, 1, 2, 9, 9, 3, 3, 3, 8, 8, 10, 10, 4, ...","[[0, 1, 1, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 4, 3,...","[[0, 1, 1, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 7, 8,...","[[0, 1, 1, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 4, 3,..."
50033,"[[C, O, C, (, =, O, ), c, 1, [nH], c, 2, c, c,...","[[0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 13, 13, 6, 12,...","[[0, 1, 2, 2, 2, 3, 3, 3, 3, 4, 5, 5, 6, 7, 7,...","[[0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6, 6, 7, 8, 8,...","[[0, 1, 2, 2, 2, 3, 3, 3, 3, 4, 5, 5, 6, 7, 7,...","[[C, O, C, (, =, O, ), c, 1, [nH], c, 2, c, c,...","[[0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 13, 13, 6, 12,...","[[0, 1, 2, 2, 2, 3, 3, 3, 3, 4, 5, 5, 6, 7, 7,...","[[0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6, 6, 7, 8, 8,...","[[0, 1, 2, 2, 2, 3, 3, 3, 3, 4, 5, 5, 6, 7, 7,..."
50034,"[[C, O, c, 1, c, c, (, C, (, F, ), (, F, ), F,...","[[0, 1, 2, 2, 3, 13, 13, 4, 4, 10, 10, 10, 14,...","[[0, 1, 2, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6,...","[[0, 1, 2, 2, 3, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8,...","[[0, 1, 2, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6,...","[[C, O, c, 1, c, c, (, C, (, F, ), (, F, ), F,...","[[0, 1, 2, 2, 3, 13, 13, 4, 4, 10, 10, 10, 14,...","[[0, 1, 2, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6,...","[[0, 1, 2, 2, 3, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8,...","[[0, 1, 2, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6,..."
50035,"[[C, =, C, (, C, ), C, n, 1, n, c, (, C, ), c,...","[[0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 10, 10, 6, 6, ...","[[0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, 6, 6, 5,...","[[0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8,...","[[0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, 6, 6, 5,...","[[C, c, 1, n, n, (, C, C, (, C, ), C, O, ), c,...","[[0, 1, 1, 2, 17, 17, 3, 9, 9, 18, 18, 4, 10, ...","[[0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 6, 7, 7, 3,...","[[0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9,...","[[0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 6, 7, 7, 3,..."


In [14]:
# Append the token feature columns to the reaction table. concat(axis=1) aligns
# rows by index label, so df and df2 must share the same index to line up.
df_col_merged = pd.concat([df, df2], axis=1)

In [15]:
# Preview the merged table (original columns followed by the token features).
df_col_merged

Unnamed: 0,reactants_mol,products_mol,reaction_type,set,num_reacts,num_prods,ratio,reactant_token,reactant_bfs_id,reactant_bfs_depth,reactant_dfs_id,reactant_dfs_depth,product_token,product_bfs_id,product_bfs_depth,product_dfs_id,product_dfs_depth
0,"[CS(=O)(=O)OC[C@H]1CCC(=O)O1, Fc1ccc(Nc2ncnc3c...",[O=C1CC[C@H](CN2CCN(CCOc3cc4ncnc(Nc5ccc(F)c(Cl...,<RX_1>,train,2,1,1.121951,"[[F, c, 1, c, c, c, (, N, c, 2, n, c, n, c, 3,...","[[0, 1, 1, 2, 32, 3, 3, 31, 33, 33, 4, 5, 6, 7...","[[0, 1, 1, 2, 3, 4, 4, 5, 6, 6, 7, 8, 9, 8, 8,...","[[0, 1, 1, 2, 3, 4, 4, 5, 6, 6, 7, 8, 9, 10, 1...","[[0, 1, 1, 2, 3, 4, 4, 5, 6, 6, 7, 8, 9, 8, 8,...","[[O, =, C, 1, C, C, [C@H], (, C, N, 2, C, C, N...","[[0, 0, 1, 1, 2, 40, 3, 3, 4, 5, 5, 6, 7, 39, ...","[[0, 0, 1, 1, 2, 3, 3, 3, 4, 5, 5, 6, 7, 8, 8,...","[[0, 0, 1, 1, 2, 3, 4, 4, 5, 6, 6, 7, 8, 9, 9,...","[[0, 0, 1, 1, 2, 3, 3, 3, 5, 6, 6, 7, 8, 9, 9,..."
1,[COC(=O)c1cc(CCCc2cc3c(=O)[nH]c(N)nc3[nH]2)cs1],[Nc1nc2[nH]c(CCCc3csc(C(=O)O)c3)cc2c(=O)[nH]1],<RX_6>,train,1,1,1.045455,"[[C, O, C, (, =, O, ), c, 1, c, c, (, C, C, C,...","[[0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 22, 22, 6, 21,...","[[0, 1, 2, 2, 2, 3, 3, 3, 3, 4, 5, 5, 6, 7, 8,...","[[0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6, 6, 7, 8, 9,...","[[0, 1, 2, 2, 2, 3, 3, 3, 3, 4, 5, 5, 6, 7, 8,...","[[N, c, 1, n, c, 2, [nH], c, (, C, C, C, c, 3,...","[[0, 1, 1, 2, 21, 21, 3, 19, 19, 4, 18, 20, 5,...","[[0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 8, 9, 9, 10...","[[0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 8, 9, 9, 10...","[[0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 7, 8, 9, 9, 10..."
2,"[CC1(C)OB(B2OC(C)(C)C(C)(C)O2)OC1(C)C, FC(F)(F...",[CC1(C)OB(c2cccc(Nc3nccc(C(F)(F)F)n3)c2)OC1(C)C],<RX_9>,train,2,1,1.384615,"[[C, C, 1, (, C, ), O, B, (, B, 2, O, C, (, C,...","[[0, 1, 1, 1, 2, 2, 3, 15, 15, 4, 4, 14, 16, 1...","[[0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6, 6, 7,...","[[0, 1, 1, 1, 2, 2, 3, 4, 4, 5, 5, 6, 7, 7, 8,...","[[0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6, 6, 7,...","[[C, C, 1, (, C, ), O, B, (, c, 2, c, c, c, c,...","[[0, 1, 1, 1, 2, 2, 3, 23, 23, 4, 4, 22, 24, 2...","[[0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6, 7, 6,...","[[0, 1, 1, 1, 2, 2, 3, 4, 4, 5, 5, 6, 7, 8, 9,...","[[0, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6, 7, 6,..."
3,[CC(C)(C)OC(=O)NCC(=O)CCC(=O)OCCCC(=O)OCc1ccccc1],[CC(C)(C)OC(=O)NCC(=O)CCC(=O)OCCCC(=O)O],<RX_6>,train,1,1,1.318182,"[[C, C, (, C, ), (, C, ), O, C, (, =, O, ), N,...","[[0, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, 5, 6, 6, 7,...","[[0, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4,...","[[0, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, 5, 6, 6, 7,...","[[0, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4,...","[[C, C, (, C, ), (, C, ), O, C, (, =, O, ), N,...","[[0, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, 5, 6, 6, 7,...","[[0, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4,...","[[0, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, 5, 6, 6, 7,...","[[0, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3, 3, 4, 4, 4,..."
4,"[Fc1cc2c(Cl)ncnc2cn1, NC1CCCCCC1]",[Fc1cc2c(NC3CCCCCC3)ncnc2cn1],<RX_1>,train,2,1,1.052632,"[[F, c, 1, c, c, 2, c, (, Cl, ), n, c, n, c, 2...","[[0, 1, 1, 2, 11, 11, 3, 3, 10, 10, 4, 9, 5, 6...","[[0, 1, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 5, 4, 4,...","[[0, 1, 1, 2, 3, 3, 4, 4, 5, 5, 6, 7, 8, 9, 9,...","[[0, 1, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 5, 4, 4,...","[[F, c, 1, c, c, 2, c, (, N, C, 3, C, C, C, C,...","[[0, 1, 1, 2, 18, 18, 3, 3, 17, 4, 4, 16, 5, 1...","[[0, 1, 1, 2, 3, 3, 4, 4, 5, 6, 6, 7, 8, 9, 9,...","[[0, 1, 1, 2, 3, 3, 4, 4, 5, 6, 6, 7, 8, 9, 10...","[[0, 1, 1, 2, 3, 3, 4, 4, 5, 6, 6, 7, 8, 9, 9,..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
50032,"[Cc1cc([N+](=O)[O-])ccc1O, Nc1cc(Cl)ccn1]",[Cc1cc([N+](=O)[O-])ccc1Oc1ccnc(N)c1],<RX_1>,test,2,1,1.055556,"[[C, c, 1, c, c, (, [N+], (, =, O, ), [O-], ),...","[[0, 1, 1, 2, 9, 9, 3, 3, 3, 8, 8, 10, 10, 4, ...","[[0, 1, 1, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 4, 3,...","[[0, 1, 1, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 7, 8,...","[[0, 1, 1, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 4, 3,...","[[C, c, 1, c, c, (, [N+], (, =, O, ), [O-], ),...","[[0, 1, 1, 2, 9, 9, 3, 3, 3, 8, 8, 10, 10, 4, ...","[[0, 1, 1, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 4, 3,...","[[0, 1, 1, 2, 3, 3, 4, 4, 4, 5, 5, 6, 6, 7, 8,...","[[0, 1, 1, 2, 3, 3, 4, 4, 4, 5, 5, 5, 5, 4, 3,..."
50033,[COC(=O)c1[nH]c2cc(Cl)cc3c2c1C(CC(=O)OC(C)(C)C...,[COC(=O)c1[nH]c2cc(Cl)cc3c2c1C(CC(=O)O)CC3],<RX_6>,test,1,1,1.190476,"[[C, O, C, (, =, O, ), c, 1, [nH], c, 2, c, c,...","[[0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 13, 13, 6, 12,...","[[0, 1, 2, 2, 2, 3, 3, 3, 3, 4, 5, 5, 6, 7, 7,...","[[0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6, 6, 7, 8, 8,...","[[0, 1, 2, 2, 2, 3, 3, 3, 3, 4, 5, 5, 6, 7, 7,...","[[C, O, C, (, =, O, ), c, 1, [nH], c, 2, c, c,...","[[0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 13, 13, 6, 12,...","[[0, 1, 2, 2, 2, 3, 3, 3, 3, 4, 5, 5, 6, 7, 7,...","[[0, 1, 2, 2, 2, 3, 3, 4, 4, 5, 6, 6, 7, 8, 8,...","[[0, 1, 2, 2, 2, 3, 3, 3, 3, 4, 5, 5, 6, 7, 7,..."
50034,[COc1cc(C(F)(F)F)cc(SC)c1C(=O)NC1(c2ccccc2)CC(...,[COc1cc(C(F)(F)F)cc(SC)c1C(=O)NC1(c2ccccc2)CC(...,<RX_7>,test,1,1,1.000000,"[[C, O, c, 1, c, c, (, C, (, F, ), (, F, ), F,...","[[0, 1, 2, 2, 3, 13, 13, 4, 4, 10, 10, 10, 14,...","[[0, 1, 2, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6,...","[[0, 1, 2, 2, 3, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8,...","[[0, 1, 2, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6,...","[[C, O, c, 1, c, c, (, C, (, F, ), (, F, ), F,...","[[0, 1, 2, 2, 3, 13, 13, 4, 4, 10, 10, 10, 14,...","[[0, 1, 2, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6,...","[[0, 1, 2, 2, 3, 4, 4, 5, 5, 6, 6, 6, 7, 7, 8,...","[[0, 1, 2, 2, 3, 4, 4, 5, 5, 6, 6, 6, 6, 6, 6,..."
50035,"[C=C(C)Cn1nc(C)c(Br)c1-c1ccc(F)cc1, OO]",[Cc1nn(CC(C)CO)c(-c2ccc(F)cc2)c1Br],<RX_10>,test,2,1,1.052632,"[[C, =, C, (, C, ), C, n, 1, n, c, (, C, ), c,...","[[0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 10, 10, 6, 6, ...","[[0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, 6, 6, 5,...","[[0, 0, 1, 1, 2, 2, 3, 4, 4, 5, 6, 6, 7, 7, 8,...","[[0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 5, 5, 6, 6, 5,...","[[C, c, 1, n, n, (, C, C, (, C, ), C, O, ), c,...","[[0, 1, 1, 2, 17, 17, 3, 9, 9, 18, 18, 4, 10, ...","[[0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 6, 7, 7, 3,...","[[0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 7, 8, 8, 9,...","[[0, 1, 1, 2, 3, 3, 4, 5, 5, 6, 6, 6, 7, 7, 3,..."


In [16]:
# Persist the merged table; it is read back from this file further down.
df_col_merged.to_pickle('final_data.pickle')

**TODO:** A molecule ID still needs to be added.

In [2]:
# Reload the saved merged table. Note this rebinds `df`, which earlier in the
# notebook referred to the table read from 'processed.pickle'.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
df = pd.read_pickle('final_data.pickle')

In [5]:
# List the distinct values of the 'set' column.
df['set'].unique()

array(['train', 'valid', 'test'], dtype=object)