In [47]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [48]:
# Relative path of the training-set CSV that the next cell loads.
canonical_file = "../dataset/canonical_trainset.csv"

In [49]:
# Load the SMILES/label table from the CSV path configured above.
canonical_smiles_df = pd.read_csv(canonical_file)

In [50]:
# Preview the first rows of the loaded table.
canonical_smiles_df.head()

Unnamed: 0,SMILES,Label
0,CN(C)C(=N)N=C(N)N,Negative
1,COC(=O)C=CC(=O)O,Negative
2,O=C(O)C=Cc1ccccc1,Negative
3,Cc1ccc(C(C)C)cc1O,Negative
4,COc1ccc(C(=O)O)cc1,Negative


In [51]:
def find_unique_elements_and_atomic_numbers(smiles_series):
    """Collect the elements occurring in a collection of SMILES strings.

    Every entry is parsed with ``Chem.MolFromSmiles``; explicit hydrogens are
    added with ``Chem.AddHs`` before the atoms are scanned, so hydrogen atoms
    are included in the scan. Entries for which parsing yields a falsy result
    are skipped.

    Args:
        smiles_series: Iterable of SMILES strings.

    Returns:
        dict: element symbol -> atomic number, keyed in order of first
        appearance while walking the input.
    """
    symbol_to_number = {}

    for entry in smiles_series:
        parsed = Chem.MolFromSmiles(entry)
        if parsed:
            # Keep a named reference to the hydrogenated molecule while its
            # atoms are being iterated.
            hydrogenated = Chem.AddHs(parsed)
            symbol_to_number.update(
                (atom.GetSymbol(), atom.GetAtomicNum())
                for atom in hydrogenated.GetAtoms()
            )

    return symbol_to_number

In [52]:
def one_hot_encode(val, categories):
    """Build a 0/1 indicator list for ``val`` against ``categories``.

    Args:
        val: Value to encode.
        categories: Iterable of candidate values, compared with ``==``.

    Returns:
        list[int]: one entry per category, 1 where the category equals
        ``val`` and 0 otherwise. A value matching no category gives all zeros.
    """
    indicator = []
    for candidate in categories:
        indicator.append(int(val == candidate))
    return indicator

In [53]:
def get_atom_number(smiles):
    """List the atomic numbers of the elements found in ``smiles``.

    Args:
        smiles: Iterable of SMILES strings; it is forwarded unchanged to
            ``find_unique_elements_and_atomic_numbers``, which iterates over
            it (so a single string would be walked character by character).

    Returns:
        list: the atomic numbers from that mapping, in the mapping's own order.
    """
    return list(find_unique_elements_and_atomic_numbers(smiles).values())

# create 3D conformation of molecules

In [54]:
def generate_3d_coordinates(smiles, random_seed=None):
    """Build a hydrogen-complete molecule with an embedded 3D conformer.

    The SMILES is parsed, explicit hydrogens are added, a conformer is
    embedded with ETKDG parameters and then relaxed with
    ``AllChem.UFFOptimizeMolecule``.

    Args:
        smiles: SMILES string of the molecule.
        random_seed: Optional integer seed for the embedding. ``None``
            (default) keeps the library's default seeding behaviour; pass a
            fixed value to get reproducible coordinates.

    Returns:
        The molecule (with hydrogens and one conformer), or ``None`` when the
        SMILES cannot be parsed or no conformer could be embedded.
    """
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return None
    mol_with_h = Chem.AddHs(mol)

    params = AllChem.ETKDG()
    if random_seed is not None:
        params.randomSeed = random_seed

    # EmbedMolecule returns the id of the new conformer, or -1 when embedding
    # fails. Without this check the optimisation below (and any later
    # GetConformer() call) would fail on a molecule that has no conformer.
    conformer_id = AllChem.EmbedMolecule(mol_with_h, params)
    if conformer_id < 0:
        return None

    AllChem.UFFOptimizeMolecule(mol_with_h)
    return mol_with_h

In [55]:
def get_atom_features(atom, smiles):
    """Encode one atom as a flat float32 feature vector.

    Layout of the returned vector:
        [degree, implicit valence, aromatic flag, total hydrogen count]
        + one-hot of the atomic number over the element vocabulary of ``smiles``
        + one-hot of the hybridization over (SP, SP2, SP3)

    Args:
        atom: RDKit atom to describe.
        smiles: Iterable of SMILES strings that defines the element
            vocabulary (via ``get_atom_number``). The vector length therefore
            depends on which elements occur in this collection.

    Returns:
        np.ndarray: 1-D array of dtype float32.
    """
    atom_type = atom.GetAtomicNum()
    atom_degree = atom.GetDegree()
    atom_implicit_valence = atom.GetImplicitValence()
    atom_aromatic = int(atom.GetIsAromatic())
    atom_hybridization = atom.GetHybridization()
    total_num_hs = atom.GetTotalNumHs()

    # Building the vocabulary means parsing every SMILES in the collection.
    # Doing that once per atom made the cost grow with (atoms x dataset size),
    # so the last result is memoised. The memo is keyed on the *contents* of
    # ``smiles`` (not its identity), so it yields exactly what a fresh
    # computation would for the same strings.
    smiles_key = tuple(smiles)
    cached = getattr(get_atom_features, "_atom_number_cache", None)
    if cached is None or cached[0] != smiles_key:
        cached = (smiles_key, get_atom_number(smiles_key))
        get_atom_features._atom_number_cache = cached
    atom_number = cached[1]

    atom_type_encoded = one_hot_encode(atom_type, atom_number)
    hybridization_types = [
        Chem.rdchem.HybridizationType.SP,
        Chem.rdchem.HybridizationType.SP2,
        Chem.rdchem.HybridizationType.SP3,
    ]
    hybridization_encoded = one_hot_encode(atom_hybridization, hybridization_types)

    return np.array(
        [atom_degree, atom_implicit_valence, atom_aromatic, total_num_hs]
        + atom_type_encoded
        + hybridization_encoded,
        dtype=np.float32,
    )


In [56]:
def get_bond_features(bond):
    """Encode one bond as a flat float32 feature vector.

    Layout of the returned vector:
        [conjugated flag, in-ring flag]
        + one-hot of the bond type over (SINGLE, DOUBLE, TRIPLE, AROMATIC)

    Args:
        bond: RDKit bond to describe.

    Returns:
        np.ndarray: 1-D array of dtype float32 with six entries.
    """
    bond_type_enum = Chem.rdchem.BondType
    known_types = [
        bond_type_enum.SINGLE,
        bond_type_enum.DOUBLE,
        bond_type_enum.TRIPLE,
        bond_type_enum.AROMATIC,
    ]

    type_one_hot = one_hot_encode(bond.GetBondType(), known_types)
    flags = [int(bond.GetIsConjugated()), int(bond.IsInRing())]

    return np.array(flags + type_one_hot, dtype=np.float32)

# compute distance matrix

In [57]:
def compute_distance_matrix(mol_with_h):
    """Compute the symmetric matrix of pairwise atom distances.

    Distances are taken between the atom positions of the molecule's
    conformer (``mol_with_h.GetConformer()``).

    Args:
        mol_with_h: Molecule exposing ``GetConformer()`` and ``GetNumAtoms()``.

    Returns:
        np.ndarray: (num_atoms, num_atoms) array with zeros on the diagonal
        and the distance between atoms i and j at [i, j] and [j, i].
    """
    conformer = mol_with_h.GetConformer()
    atom_count = mol_with_h.GetNumAtoms()
    pairwise = np.zeros((atom_count, atom_count))

    for row in range(atom_count):
        origin = conformer.GetAtomPosition(row)
        # Only the upper triangle is computed; each value is mirrored.
        for col in range(row + 1, atom_count):
            separation = origin.Distance(conformer.GetAtomPosition(col))
            pairwise[row, col] = separation
            pairwise[col, row] = separation

    return pairwise

# voxelization

In [58]:
def voxelization(coords, resolution=10, grid_size=20):
    """Rasterise 3D points into a cubic occupancy grid centred on the origin.

    Each voxel has edge length ``1.0 / resolution`` (in the units of
    ``coords``). A coordinate ``c`` maps to index
    ``floor(c / voxel_size + grid_size / 2)`` along its axis, so the grid
    covers ``grid_size`` voxels per axis around the origin. Points falling
    outside the grid are ignored.

    Args:
        coords: Sequence/array of (x, y, z) points, shape (n, 3). May be empty.
        resolution: Number of voxels per coordinate unit.
        grid_size: Number of voxels along each axis.

    Returns:
        np.ndarray: (grid_size, grid_size, grid_size) float array holding 1
        in every voxel containing at least one point and 0 elsewhere.

    Raises:
        ValueError: if ``coords`` is non-empty and not of shape (n, 3).
    """
    voxels = np.zeros((grid_size, grid_size, grid_size))
    voxel_size = 1.0 / resolution
    center = grid_size / 2

    points = np.asarray(coords, dtype=float)
    if points.size == 0:
        return voxels
    if points.ndim != 2 or points.shape[1] != 3:
        raise ValueError(
            f"coords must have shape (n, 3), got {points.shape}"
        )

    # floor(), not int(): int() truncates toward zero, which would fold
    # points lying up to one voxel *below* the grid into index 0 instead of
    # rejecting them.
    scaled = np.floor(points / voxel_size + center)

    # Bounds are tested on the float values before casting, so non-finite
    # coordinates (NaN/inf compare False or out of range) are treated as
    # outside the grid.
    inside = np.all((scaled >= 0) & (scaled < grid_size), axis=1)
    ix, iy, iz = scaled[inside].astype(int).T
    voxels[ix, iy, iz] = 1
    return voxels


# Main function

In [59]:
def preprocess_smiles_with_labels_3d(smiles_list, labels):
    """Turn labelled SMILES strings into graph/3D feature records.

    For every (SMILES, label) pair a 3D molecule is generated; pairs for
    which ``generate_3d_coordinates`` returns ``None`` are skipped, so the
    result may be shorter than the input.

    Args:
        smiles_list: Collection of SMILES strings. The whole collection is
            also handed to ``get_atom_features`` for every atom, where it
            defines the element vocabulary of the atom-type one-hot.
        labels: Labels aligned with ``smiles_list``; each must be
            ``'Negative'`` or ``'Positive'`` (anything else raises KeyError).

    Returns:
        list[dict]: one record per processed molecule with the keys
        ``nodes_features``, ``edges_features`` (object array of
        (begin index, end index, bond feature vector) entries),
        ``adjacency_matrix``, ``distance_matrix``, ``coords``, ``voxels``
        and ``label`` (0 for Negative, 1 for Positive).
    """
    label_to_int = {'Negative': 0, 'Positive': 1}
    records = []

    for smiles, label in zip(smiles_list, labels):
        mol = generate_3d_coordinates(smiles)
        if mol is None:
            continue

        node_rows = [get_atom_features(atom, smiles_list) for atom in mol.GetAtoms()]
        atom_count = len(node_rows)

        # Undirected graph: every bond marks both [i, j] and [j, i].
        adjacency = np.zeros((atom_count, atom_count), dtype=np.float32)
        edge_rows = []
        for bond in mol.GetBonds():
            begin = bond.GetBeginAtomIdx()
            end = bond.GetEndAtomIdx()
            edge_rows.append((begin, end, get_bond_features(bond)))
            adjacency[begin, end] = 1
            adjacency[end, begin] = 1

        distances = compute_distance_matrix(mol)

        conformer = mol.GetConformer()
        positions = np.array(
            [conformer.GetAtomPosition(atom.GetIdx()) for atom in mol.GetAtoms()]
        )

        records.append({
            'nodes_features': np.array(node_rows),
            'edges_features': np.array(edge_rows, dtype=object),
            'adjacency_matrix': adjacency,
            'distance_matrix': distances,
            'coords': positions,
            'voxels': voxelization(positions),
            'label': label_to_int[label]
        })

    return records

# test

In [76]:
# Labels of the first ten rows, used for a quick end-to-end check.
labels = canonical_smiles_df['Label'][:10].values

In [77]:
# Run the full preprocessing pipeline on the first ten SMILES strings.
graph_data = preprocess_smiles_with_labels_3d(
    canonical_smiles_df["SMILES"][:10].values,
    labels,
)

In [78]:
# Print one line per molecule with the shape of each array in its record.
print("nodes_features  edges_features  adjacency_matrix  distance_matrix  coords       voxels")
for item in graph_data:
    nodes_features, edges_features, adjacency_matrix, distance_matrix, coords, voxels = (
        item[field]
        for field in (
            'nodes_features',
            'edges_features',
            'adjacency_matrix',
            'distance_matrix',
            'coords',
            'voxels',
        )
    )
    print(f"{nodes_features.shape}         {edges_features.shape}          {adjacency_matrix.shape}           {distance_matrix.shape}     {coords.shape}    {voxels.shape}")

nodes_features  edges_features  adjacency_matrix  distance_matrix  coords       voxels
(20, 12)         (19, 3)          (20, 20)           (20, 20)     (20, 3)    (20, 20, 20)
(15, 12)         (14, 3)          (15, 15)           (15, 15)     (15, 3)    (20, 20, 20)
(19, 12)         (19, 3)          (19, 19)           (19, 19)     (19, 3)    (20, 20, 20)
(25, 12)         (25, 3)          (25, 25)           (25, 25)     (25, 3)    (20, 20, 20)
(19, 12)         (19, 3)          (19, 19)           (19, 19)     (19, 3)    (20, 20, 20)
(29, 12)         (28, 3)          (29, 29)           (29, 29)     (29, 3)    (20, 20, 20)
(31, 12)         (30, 3)          (31, 31)           (31, 31)     (31, 3)    (20, 20, 20)
(23, 12)         (24, 3)          (23, 23)           (23, 23)     (23, 3)    (20, 20, 20)
(22, 12)         (22, 3)          (22, 22)           (22, 22)     (22, 3)    (20, 20, 20)
(19, 12)         (18, 3)          (19, 19)           (19, 19)     (19, 3)    (20, 20, 20)


In [79]:
# Display the complete record produced for the first processed molecule.
graph_data[0]

{'nodes_features': array([[4., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.],
        [3., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
        [4., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.],
        [3., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.],
        [2., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
        [2., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
        [3., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.],
        [3., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
        [3., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
        [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0