In [25]:
from rdkit import Chem
from rdkit.Chem import Draw
from rdkit.Chem import AllChem
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [26]:
canonical_file="../dataset/canonical_trainset.csv"

In [27]:
canonical_smiles_df=pd.read_csv(canonical_file)

In [28]:
canonical_smiles_df.head()

Unnamed: 0,SMILES,Label
0,CN(C)C(=N)N=C(N)N,Negative
1,COC(=O)C=CC(=O)O,Negative
2,O=C(O)C=Cc1ccccc1,Negative
3,Cc1ccc(C(C)C)cc1O,Negative
4,COc1ccc(C(=O)O)cc1,Negative


In [29]:
def find_unique_elements_and_atomic_numbers(smiles_series):
    atomic_numbers = {}

    for smiles in smiles_series:
        mol = Chem.MolFromSmiles(smiles)
        if not mol: 
            continue
        
        mol_with_h = Chem.AddHs(mol)
        
        for atom in mol_with_h.GetAtoms():
            element = atom.GetSymbol()
            atomic_numbers[element] = atom.GetAtomicNum()

    return atomic_numbers

In [30]:
def one_hot_encode(val, categories):
    return [int(val == category) for category in categories]

In [31]:
atomic_numbers = find_unique_elements_and_atomic_numbers(canonical_smiles_df["SMILES"])
atom_number = list(atomic_numbers.values())

# create 3D conformation of molecules

In [32]:
def generate_3d_coordinates(smiles):
    mol = Chem.MolFromSmiles(smiles)
    if not mol:
        return None
    mol_with_h = Chem.AddHs(mol)
    AllChem.EmbedMolecule(mol_with_h, AllChem.ETKDG())
    AllChem.UFFOptimizeMolecule(mol_with_h)
    return mol_with_h

In [33]:
def get_atom_features(atom):
    atom_type = atom.GetAtomicNum()
    atom_degree = atom.GetDegree()
    atom_implicit_valence = atom.GetImplicitValence()
    atom_aromatic = int(atom.GetIsAromatic())
    atom_hybridization = atom.GetHybridization()
    total_num_hs = atom.GetTotalNumHs()   
    
    atom_type_encoded = one_hot_encode(atom_type,atom_number)  
    hybridization_types = [Chem.rdchem.HybridizationType.SP, Chem.rdchem.HybridizationType.SP2, Chem.rdchem.HybridizationType.SP3]
    hybridization_encoded = one_hot_encode(atom_hybridization, hybridization_types)
    
    return np.array([atom_degree, atom_implicit_valence, atom_aromatic, total_num_hs] + atom_type_encoded + hybridization_encoded, dtype=np.float32)


In [34]:
def get_bond_features(bond):
    bond_type = bond.GetBondType()
    bond_conjugated = int(bond.GetIsConjugated())
    bond_in_ring = int(bond.IsInRing())
    
    bond_types = [Chem.rdchem.BondType.SINGLE, Chem.rdchem.BondType.DOUBLE, Chem.rdchem.BondType.TRIPLE, Chem.rdchem.BondType.AROMATIC]
    bond_type_encoded = one_hot_encode(bond_type, bond_types)
    
    return np.array([bond_conjugated, bond_in_ring] + bond_type_encoded, dtype=np.float32)

# compute distance matrix

In [35]:
def compute_distance_matrix(mol_with_h):
    conf = mol_with_h.GetConformer()
    num_atoms = mol_with_h.GetNumAtoms()
    distance_matrix = np.zeros((num_atoms, num_atoms))
    for i in range(num_atoms):
        for j in range(i+1, num_atoms):
            distance = conf.GetAtomPosition(i).Distance(conf.GetAtomPosition(j))
            distance_matrix[i, j] = distance_matrix[j, i] = distance
    return distance_matrix

# voxelization

In [36]:
def voxelization(coords, resolution=10, grid_size=20):
    voxels = np.zeros((grid_size, grid_size, grid_size))
    for coord in coords:
        x, y, z = coord
        ix, iy, iz = int(x * resolution), int(y * resolution), int(z * resolution)
        if 0 <= ix < grid_size and 0 <= iy < grid_size and 0 <= iz < grid_size:
            voxels[ix, iy, iz] = 1
    return voxels

# Mian function

In [37]:
def preprocess_smiles(smiles_list):
    nodes_features = []
    edges_features = []
    adjacency_matrices = []

    for smiles in smiles_list:
        mol_with_h = generate_3d_coordinates(smiles)
        if mol_with_h:

            atoms_features = [get_atom_features(atom) for atom in mol_with_h.GetAtoms()]
            nodes_features.append(np.array(atoms_features))

            num_atoms = len(atoms_features)
            adjacency_matrix = np.zeros((num_atoms, num_atoms), dtype=np.float32)
            bonds_features = []

            for bond in mol_with_h.GetBonds():
                start, end = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
                bond_features = get_bond_features(bond)
                bonds_features.append((start, end, bond_features))
                adjacency_matrix[start, end] = adjacency_matrix[end, start] = 1

            edges_features.append(np.array(bonds_features, dtype=object))
            adjacency_matrices.append(adjacency_matrix)

            distance_matrix = compute_distance_matrix(mol_with_h)
            coords = np.array([mol_with_h.GetConformer().GetAtomPosition(atom.GetIdx()) for atom in mol_with_h.GetAtoms()])
            voxels = voxelization(coords)
            
            return nodes_features,edges_features,adjacency_matrices,distance_matrix,coords,voxels
        

In [38]:
smiles_list=["C=CCSS(=O)CC=C"]
nodes_features,edges_features,adjacency_matrices,distance_matrix,coords,voxels=preprocess_smiles(smiles_list)


print("Atom Features for GNN:", nodes_features)
print("Edge Features for GNN:", edges_features)
print("Adjacency Matrix for GNN:", adjacency_matrices)
print("Distance Matrix for 3D-CNN:", distance_matrix)
print("Coordinates for 3D-CNN:", coords)
print("Voxels for 3D-CNN:",voxels)

Atom Features for GNN: [array([[3., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0.],
       [3., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0.],
       [4., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1.],
       [2., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        1.],
       [3., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        1.],
       [1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 1.,
        0.],
       [4., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        1.],
       [3., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0.],
       [3., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.,
        0.],
       [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0.],
       [1., 0., 0., 0., 0., 0., 1., 0., 0