In [None]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import torch
import dgl
import rdkit.Chem.Descriptors as dsc
from rdkit import Chem
from mendeleev.fetch import fetch_table
import traceback

In [2]:
df = pd.read_csv(r'C:\Programming\Github\EGCN\data\molproperty_mp.csv')
df

Unnamed: 0,smiles,melting_point
0,C=O,-92.0
1,CNN,-52.4
2,C(=O)O,8.3
3,CCl,-97.6
4,CN,-93.4
...,...,...
118,COP(=O)(C)OC,-50.0
119,CC(=O)NC1=CC=C(C=C1)O,168.0
120,CC(=O)OC1=CC=CC=C1C(=O)O,135.0
121,C[N+]1=CC=C(C=C1)C2=CC=[N+](C=C2)C.[Cl-].[Cl-],300.0


In [7]:

def zscore(X):
    if len(X.shape) == 1:
        means = np.mean(X)
        stds = np.std(X)

        for i in range(0, X.shape[0]):
            if stds == 0:
                X[i] = 0
            else:
                X[i] = (X[i] - means) / stds
    else:
        means = np.mean(X, axis=0)
        stds = np.std(X, axis=0)

        for i in range(0, X.shape[0]):
            for j in range(0, X.shape[1]):
                if stds[j] == 0:
                    X[i, j] = 0
                else:
                    X[i, j] = (X[i, j] - means[j]) / stds[j]

    return X


def replace_nan(X, replace_val):
    X[np.isnan(X)] = replace_val

    return X


def adj_mat_to_edges(adj_mat):
    edges = []

    for i in range(0, adj_mat.shape[0]):
        for j in range(0, adj_mat.shape[1]):
            if adj_mat[i, j] == 1:
                edges.append((i, j))

    return edges


def get_one_hot_vector(label, num_classes):
    one_hot_vector = np.zeros(num_classes)
    one_hot_vector[label-1] = 1

    return one_hot_vector


In [8]:


device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
sel_prop_names = ['atomic_weight',
                'atomic_radius',
                'atomic_volume',
                'dipole_polarizability',
                'fusion_heat',
                'thermal_conductivity',
                'vdw_radius',
                'en_pauling']
dim_atomic_feat = len(sel_prop_names)
dim_self_feat = 4


class molDGLGraph(dgl.DGLGraph):
    def __init__(self, smiles, adj_mat, feat_mat, mol):
        super(molDGLGraph, self).__init__()
        self.smiles = smiles
        self.adj_mat = adj_mat
        self.feat_mat = feat_mat
        self.atomic_nodes = []
        self.neighbors = {}

        node_id = 0
        for atom in mol.GetAtoms():
            self.atomic_nodes.append(atom.GetSymbol())
            self.neighbors[node_id] = atoms_to_symbols(atom.GetNeighbors())
            node_id += 1


def read_atom_prop():
#    tb_atomic_props = get_table('elements')
    tb_atomic_props = fetch_table('elements')
#    arr_atomic_nums = np.array(tb_atomic_props['atomic_number'], dtype=np.int)
    arr_atomic_nums = np.array(tb_atomic_props['atomic_number'], dtype=int)
#    arr_atomic_props = np.nan_to_num(np.array(tb_atomic_props[sel_prop_names], dtype=np.float32))
    arr_atomic_props = np.nan_to_num(np.array(tb_atomic_props[sel_prop_names], dtype=float))
    arr_atomic_props = zscore(arr_atomic_props)
    atomic_props_mat = {arr_atomic_nums[i]: arr_atomic_props[i, :] for i in range(0, arr_atomic_nums.shape[0])}

    return atomic_props_mat


def construct_mol_graph(smiles, mol, adj_mat, feat_mat):
    molGraph = molDGLGraph(smiles, adj_mat, feat_mat, mol).to(device)
    edges = adj_mat_to_edges(adj_mat)
    src, dst = tuple(zip(*edges))

    molGraph.add_nodes(adj_mat.shape[0])
    molGraph.add_edges(src, dst)
    molGraph.ndata['feat'] = torch.tensor(feat_mat, dtype=torch.float32).to(device)

    return molGraph


def smiles_to_mol_graph(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)
        adj_mat = Chem.GetAdjacencyMatrix(mol)
        node_feat_mat = np.empty([mol.GetNumAtoms(), atomic_props.get(1).shape[0]])

        ind = 0
        for atom in mol.GetAtoms():
            node_feat_mat[ind, :] = atomic_props.get(atom.GetAtomicNum())
            ind = ind + 1

        return mol, construct_mol_graph(smiles, mol, adj_mat, node_feat_mat)
    except Exception as e:
        print(f"Error processing SMILES: {smiles}")
        print(traceback.format_exc())  # 예외 정보를 출력
        return None, None


def atoms_to_symbols(atoms):
    symbols = []

    for atom in atoms:
        symbols.append(atom.GetSymbol())

    return symbols


def normalize_self_feat(mol_graphs, self_feat_name):
    self_feats = []

    for mol_graph in mol_graphs:
        self_feats.append(getattr(mol_graph, self_feat_name))

    mean_self_feat = np.mean(self_feats)
    std_self_feat = np.std(self_feats)

    for mol_graph in mol_graphs:
        if std_self_feat == 0:
            setattr(mol_graph, self_feat_name, 0)
        else:
            setattr(mol_graph, self_feat_name, (getattr(mol_graph, self_feat_name) - mean_self_feat) / std_self_feat)

def read_dataset(file_name):
    samples = []
    mol_graphs = []
    data_mat = np.array(pd.read_csv(file_name))
    smiles = data_mat[:, 0]
#    target = np.array(data_mat[:, 1:3], dtype=np.float)
    target = np.array(data_mat[:, 1:3], dtype=float)

    for i in range(0, data_mat.shape[0]):
        mol, mol_graph = smiles_to_mol_graph(smiles[i])

        if mol is not None and mol_graph is not None:
            ####################################################
            # 1
            mol_graph.NOCount = dsc.NOCount(mol)
            mol_graph.SlogP_VSA1 = dsc.SlogP_VSA1(mol)
            mol_graph.LabuteASA = dsc.LabuteASA(mol)
            mol_graph.Chi1 = dsc.Chi1(mol)
            ####################################################

            samples.append((mol_graph, target[i]))
            mol_graphs.append(mol_graph)

    ####################################################
    # 1
    normalize_self_feat(mol_graphs, 'NOCount')
    normalize_self_feat(mol_graphs, 'SlogP_VSA1')
    normalize_self_feat(mol_graphs, 'LabuteASA')
    normalize_self_feat(mol_graphs, 'Chi1')
    return samples

atomic_props = read_atom_prop()

In [9]:
import os
import random
SEED = 100

os.environ['PYTHONHASHSEED'] = str(SEED)
os.environ['TF_DETERMINISTIC_OPS'] = '1'
os.environ['KMP_DUPLICATE_LIB_OK']='True'

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
dgl.random.seed(SEED)
#tf.random.set_seed(SEED)

torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False


# check GPU availability
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# experiment parameters
dataset_name = 'molproperty_mp'
batch_size = 32
max_epochs = 300
k = 5


cuda


In [13]:
print('Data loading...')
dataset = read_dataset('../data/' + dataset_name + '.csv')
random.shuffle(dataset)
train_dataset, test_dataset = train_test_split(dataset, test_size = 0.2, random_state = SEED)


Data loading...
Error processing SMILES: [P-3].[P-3].[Zn+2].[Zn+2].[Zn+2]
Traceback (most recent call last):
  File "C:\Users\user\AppData\Local\Temp\ipykernel_4124\760318173.py", line 66, in smiles_to_mol_graph
    return mol, construct_mol_graph(smiles, mol, adj_mat, node_feat_mat)
  File "C:\Users\user\AppData\Local\Temp\ipykernel_4124\760318173.py", line 46, in construct_mol_graph
    src, dst = tuple(zip(*edges))
ValueError: not enough values to unpack (expected 2, got 0)

Error processing SMILES: [Na]
Traceback (most recent call last):
  File "C:\Users\user\AppData\Local\Temp\ipykernel_4124\760318173.py", line 66, in smiles_to_mol_graph
    return mol, construct_mol_graph(smiles, mol, adj_mat, node_feat_mat)
  File "C:\Users\user\AppData\Local\Temp\ipykernel_4124\760318173.py", line 46, in construct_mol_graph
    src, dst = tuple(zip(*edges))
ValueError: not enough values to unpack (expected 2, got 0)

Error processing SMILES: FH
Traceback (most recent call last):
  File "C:\User

[14:26:28] SMILES Parse Error: syntax error while parsing: FH
[14:26:28] SMILES Parse Error: Failed parsing SMILES 'FH' for input: 'FH'
[14:26:28] Explicit valence for atom # 1 Br, 5, is greater than permitted


Error processing SMILES: N
Traceback (most recent call last):
  File "C:\Users\user\AppData\Local\Temp\ipykernel_4124\760318173.py", line 66, in smiles_to_mol_graph
    return mol, construct_mol_graph(smiles, mol, adj_mat, node_feat_mat)
  File "C:\Users\user\AppData\Local\Temp\ipykernel_4124\760318173.py", line 46, in construct_mol_graph
    src, dst = tuple(zip(*edges))
ValueError: not enough values to unpack (expected 2, got 0)

Error processing SMILES: P
Traceback (most recent call last):
  File "C:\Users\user\AppData\Local\Temp\ipykernel_4124\760318173.py", line 66, in smiles_to_mol_graph
    return mol, construct_mol_graph(smiles, mol, adj_mat, node_feat_mat)
  File "C:\Users\user\AppData\Local\Temp\ipykernel_4124\760318173.py", line 46, in construct_mol_graph
    src, dst = tuple(zip(*edges))
ValueError: not enough values to unpack (expected 2, got 0)

Error processing SMILES: Cl
Traceback (most recent call last):
  File "C:\Users\user\AppData\Local\Temp\ipykernel_4124\760318173

In [17]:
# 3. target만 추출
y_train = np.array([target for (_, target) in train_dataset])
y_test = np.array([target for (_, target) in test_dataset])

scaler = StandardScaler()
y_train_scaling = scaler.fit_transform(y_train)
y_test_scaling = scaler.transform(y_test)

# 5. 스케일된 target을 다시 덮어씌우기
train_dataset = [(g, y) for (g, _), y in zip(train_dataset, y_train_scaling)]
test_dataset = [(g, y) for (g, _), y in zip(test_dataset, y_test_scaling)]

In [None]:

# 4. StandardScaler 정의 및 적용
scaler = StandardScaler()
y_train_scaled = scaler.fit_transform(y_train)   # ✅ train에 fit
y_test_scaled = scaler.transform(y_test)         # ✅ test에는 transform만

# 5. 스케일된 target을 다시 덮어씌우기
train_dataset = [(g, y) for (g, _), y in zip(train_dataset, y_train_scaled)]
test_dataset = [(g, y) for (g, _), y in zip(test_dataset, y_test_scaled)]

In [18]:
[y for (x, y) in train_dataset]

[array([-0.53858922]),
 array([0.26976525]),
 array([-0.4068083]),
 array([-0.26251187]),
 array([-0.21097743]),
 array([0.81234927]),
 array([-0.10054649]),
 array([-0.61368226]),
 array([-0.65329015]),
 array([0.12620503]),
 array([0.1571257]),
 array([-0.43772896]),
 array([-0.61589087]),
 array([0.31467384]),
 array([0.3084897]),
 array([-0.92391957]),
 array([1.20253858]),
 array([-0.748408]),
 array([-0.56067541]),
 array([-0.74104594]),
 array([1.603771]),
 array([-0.80951312]),
 array([-0.6895115]),
 array([0.50984212]),
 array([-0.00483968]),
 array([-0.75724248]),
 array([-1.03700085]),
 array([-0.15944299]),
 array([-0.40614571]),
 array([-0.77049419]),
 array([-0.72632181]),
 array([-0.52062578]),
 array([0.53995295]),
 array([1.4160384]),
 array([-0.22570156]),
 array([0.29553247]),
 array([-0.50987717]),
 array([0.21786271]),
 array([-0.58128918]),
 array([0.20939634]),
 array([-0.50987717]),
 array([2.03445166]),
 array([-1.30070994]),
 array([-0.76313213]),
 array([4.35