In [16]:
from rdkit import Chem
from rdkit.Chem import Draw
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [17]:
# Relative path to the CSV holding the training molecules; the preview below
# shows it carries a SMILES column and a Negative/Positive Label column.
canonical_file="../dataset/canonical_trainset.csv"

In [18]:
# Load the whole training table into a DataFrame with pandas' default CSV settings.
canonical_smiles_df=pd.read_csv(canonical_file)

In [19]:
# Preview the first rows to confirm the expected SMILES / Label columns are present.
canonical_smiles_df.head()

Unnamed: 0,SMILES,Label
0,CN(C)C(=N)N=C(N)N,Negative
1,COC(=O)C=CC(=O)O,Negative
2,O=C(O)C=Cc1ccccc1,Negative
3,Cc1ccc(C(C)C)cc1O,Negative
4,COc1ccc(C(=O)O)cc1,Negative


# Find the unique elements and their atomic numbers across all SMILES

In [20]:
def find_unique_elements_and_atomic_numbers(smiles_series):
    """Collect every element that occurs in a collection of SMILES strings.

    Each SMILES is parsed with RDKit and hydrogens are added via ``Chem.AddHs``
    before its atoms are scanned. Entries RDKit cannot parse are skipped.

    Args:
        smiles_series: Iterable of SMILES strings.

    Returns:
        dict mapping element symbol to atomic number, with keys in order of
        first appearance.
    """
    symbol_to_number = {}

    for smi in smiles_series:
        parsed = Chem.MolFromSmiles(smi)
        if parsed:
            hydrogenated = Chem.AddHs(parsed)
            # update() keeps the position of symbols already seen, so the key
            # order is still "first appearance".
            symbol_to_number.update(
                (atom.GetSymbol(), atom.GetAtomicNum())
                for atom in hydrogenated.GetAtoms()
            )

    return symbol_to_number


# One-hot encoding helper

In [21]:
def one_hot_encode(val, categories):
    """Return a 0/1 list marking which entries of ``categories`` equal ``val``.

    The result has one entry per category, in the same order. If ``val``
    matches no category the list is all zeros.
    """
    encoded = []
    for candidate in categories:
        encoded.append(int(val == candidate))
    return encoded

# Get all unique atomic numbers

In [22]:
def get_atom_number(smiles):
    """Return the atomic numbers of all elements found in ``smiles``.

    ``smiles`` is the iterable of SMILES strings handed to
    ``find_unique_elements_and_atomic_numbers``; the list follows the order
    of that function's returned mapping.
    """
    return list(find_unique_elements_and_atomic_numbers(smiles).values())

# get atom features

In [23]:
# Most recent SMILES collection and the atomic-number vocabulary derived from
# it. Lets get_atom_features avoid re-parsing the whole collection per atom.
_ATOM_NUMBER_MEMO = {"source": None, "atom_number": None}


def get_atom_features(atom, smiles, atom_number=None):
    """Build the feature vector for a single atom.

    Layout of the returned vector:
    ``[degree, implicit valence, aromatic flag, total H count]``
    + one-hot of the atomic number over ``atom_number``
    + one-hot of the hybridization over (SP, SP2, SP3).
    A hybridization outside those three yields an all-zero hybridization part.

    Args:
        atom: RDKit atom to describe.
        smiles: Collection of SMILES strings that defines the atom-type
            vocabulary (via ``get_atom_number``).
        atom_number: Optional precomputed list of atomic numbers to use as the
            one-hot categories. When given, ``smiles`` is not scanned at all.

    Returns:
        1-D ``np.float32`` array of length ``4 + len(atom_number) + 3``.

    Note:
        Deriving the vocabulary parses every SMILES in ``smiles``, so it is
        computed once per collection *object* and reused for later calls that
        pass the same object. If the collection is mutated in place between
        calls, pass ``atom_number`` explicitly to avoid a stale vocabulary.
    """
    if atom_number is None:
        memo_is_stale = (
            _ATOM_NUMBER_MEMO["atom_number"] is None
            or _ATOM_NUMBER_MEMO["source"] is not smiles
        )
        if memo_is_stale:
            # Compute first so a failure leaves the previous memo untouched.
            computed = get_atom_number(smiles)
            _ATOM_NUMBER_MEMO["source"] = smiles
            _ATOM_NUMBER_MEMO["atom_number"] = computed
        atom_number = _ATOM_NUMBER_MEMO["atom_number"]

    atom_type = atom.GetAtomicNum()
    atom_degree = atom.GetDegree()
    atom_implicit_valence = atom.GetImplicitValence()
    atom_aromatic = int(atom.GetIsAromatic())
    atom_hybridization = atom.GetHybridization()
    total_num_hs = atom.GetTotalNumHs()

    atom_type_encoded = one_hot_encode(atom_type, atom_number)
    hybridization_types = [
        Chem.rdchem.HybridizationType.SP,
        Chem.rdchem.HybridizationType.SP2,
        Chem.rdchem.HybridizationType.SP3,
    ]
    hybridization_encoded = one_hot_encode(atom_hybridization, hybridization_types)

    scalar_features = [atom_degree, atom_implicit_valence, atom_aromatic, total_num_hs]
    return np.array(
        scalar_features + atom_type_encoded + hybridization_encoded,
        dtype=np.float32,
    )


# get bond features

In [24]:
def get_bond_features(bond):
    """Build the feature vector for a single bond.

    Layout: ``[conjugated flag, in-ring flag]`` followed by a one-hot of the
    bond type over (SINGLE, DOUBLE, TRIPLE, AROMATIC). Any other bond type
    produces an all-zero one-hot part.

    Returns:
        1-D ``np.float32`` array of length 6.
    """
    BondType = Chem.rdchem.BondType
    known_types = [BondType.SINGLE, BondType.DOUBLE, BondType.TRIPLE, BondType.AROMATIC]

    type_one_hot = one_hot_encode(bond.GetBondType(), known_types)
    flags = [int(bond.GetIsConjugated()), int(bond.IsInRing())]

    return np.array(flags + type_one_hot, dtype=np.float32)

# Main function

In [25]:
def preprocess_smiles_with_labels(smiles_list, labels):
    """Convert labelled SMILES strings into per-molecule graph dictionaries.

    Each SMILES is parsed with RDKit and hydrogens are added via
    ``Chem.AddHs``. SMILES that RDKit cannot parse are skipped, so the output
    may be shorter than the input.

    Args:
        smiles_list: Collection of SMILES strings. It is also passed whole to
            ``get_atom_features`` to define the atom-type vocabulary.
        labels: Labels paired with ``smiles_list``; each must be
            ``'Negative'`` or ``'Positive'``, otherwise the lookup raises
            ``KeyError``.

    Returns:
        list of dicts with keys:
            ``'nodes_features'``: array of per-atom feature vectors;
            ``'edges_features'``: object array of
                ``(begin_idx, end_idx, bond_feature_vector)`` entries;
            ``'adjacency_matrix'``: symmetric float32 0/1 matrix;
            ``'label'``: 0 for Negative, 1 for Positive.
    """
    label_to_int = {'Negative': 0, 'Positive': 1}
    graphs = []

    for smi, raw_label in zip(smiles_list, labels):
        parsed = Chem.MolFromSmiles(smi)
        if not parsed:
            continue

        full_mol = Chem.AddHs(parsed)

        node_rows = []
        for atom in full_mol.GetAtoms():
            node_rows.append(get_atom_features(atom, smiles_list))

        n_nodes = len(node_rows)
        adjacency = np.zeros((n_nodes, n_nodes), dtype=np.float32)

        edge_rows = []
        for bond in full_mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            edge_rows.append((i, j, get_bond_features(bond)))
            # Bonds are undirected, so mark both directions.
            adjacency[i, j] = 1
            adjacency[j, i] = 1

        graph = {
            'nodes_features': np.array(node_rows),
            'edges_features': np.array(edge_rows, dtype=object),
            'adjacency_matrix': adjacency,
            'label': label_to_int[raw_label],
        }
        graphs.append(graph)

    return graphs

# test

In [26]:
# Smoke test on a small sample: take the labels of the first 10 rows only.
labels = canonical_smiles_df['Label'][:10].values

In [27]:
# Build graphs for the same first 10 rows so SMILES and labels stay paired.
graph_data = preprocess_smiles_with_labels(canonical_smiles_df["SMILES"][:10].values, labels)

In [34]:
# Print one row per molecule with the shapes of its three arrays, as a quick
# sanity check that node count matches the adjacency matrix dimensions.
print("nodes_features  edges_features  adjacency_matrix")
for item in graph_data:
    node_feature=item['nodes_features']
    edges_features=item['edges_features']
    adjacency_matrix=item['adjacency_matrix']
    print(f"{node_feature.shape}         {edges_features.shape}           {adjacency_matrix.shape}")

nodes_features  edges_features  adjacency_matrix
(20, 12)         (19, 3)           (20, 20)
(15, 12)         (14, 3)           (15, 15)
(19, 12)         (19, 3)           (19, 19)
(25, 12)         (25, 3)           (25, 25)
(19, 12)         (19, 3)           (19, 19)
(29, 12)         (28, 3)           (29, 29)
(31, 12)         (30, 3)           (31, 31)
(23, 12)         (24, 3)           (23, 23)
(22, 12)         (22, 3)           (22, 22)
(19, 12)         (18, 3)           (19, 19)


In [35]:
# Inspect the full graph dictionary of the first molecule.
graph_data[0]

{'nodes_features': array([[4., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.],
        [3., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
        [4., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1.],
        [3., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.],
        [2., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
        [2., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
        [3., 0., 0., 0., 1., 0., 0., 0., 0., 0., 1., 0.],
        [3., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
        [3., 0., 0., 0., 0., 1., 0., 0., 0., 0., 1., 0.],
        [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
        [1., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0