In [1]:
import deepchem as dc
from deepchem.feat import MolGraphConvFeaturizer
from deepchem.data import CSVLoader
from deepchem.splits import RandomSplitter
import numpy as np
import pandas as pd
import joblib as jb
import torch
import torch.nn as nn
import torch.nn.functional as F

from deepchem.models.layers import GraphConv, GraphPool, GraphGather


No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
No normalization for NumAmideBonds. Feature removed!
No normalization for NumAtomStereoCenters. Feature removed!
No normalization for NumBridgeheadAtoms. Feature removed!
No normalization for NumHeterocycles. Feature removed!
No normalization for NumSpiroAtoms. Feature removed!
No normalization for NumUnspecifiedAtomStereoCenters. Feature removed!
No normalization for Phi. Feature removed!
Skipped loading modules with transformers dependency. No module named 'transformers'
cannot import name 'HuggingFaceModel' from 'deepchem.models.torch_models' (C:\Users\yyyyx\miniconda3\envs\deepchem\lib\site-packages\deepchem\models\torch_models\__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [2]:
# Load the pre-featurized dataset from disk.
# joblib.load unpickles the file, which can execute arbitrary code: only point
# this at files you created yourself or otherwise trust.
dataset_file = './Aquasol_dataset_MolGraphConv.pkl'
new_dataset = jb.load(dataset_file)

In [3]:
# Display the dataset's summary repr (shapes of X / y / w and the task names)
# to confirm what was loaded.
new_dataset

<NumpyDataset X.shape: (9943,), y.shape: (9943,), w.shape: (9943,), task_names: [0]>

In [4]:
# Split the dataset into train / valid / test partitions.
# A fixed seed keeps the random split -- and therefore every metric computed
# downstream -- reproducible across kernel restarts. The fractions are spelled
# out explicitly (80 / 10 / 10) so the split is visible to the reader.
splitter = RandomSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(
    new_dataset,
    frac_train=0.8,
    frac_valid=0.1,
    frac_test=0.1,
    seed=42,
)

In [5]:
# Sanity check: report the size of every partition ...
print(f'Original dataset size: {len(new_dataset)}')
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(valid_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

# ... and fail loudly if the partitions do not account for every sample, so a
# broken split cannot silently feed the training and evaluation cells.
assert (
    len(train_dataset) + len(valid_dataset) + len(test_dataset) == len(new_dataset)
), "train/valid/test partitions do not add up to the full dataset"

Original dataset size: 9943
Train dataset size: 7954
Validation dataset size: 994
Test dataset size: 995


In [6]:
# Peek at the featurized training inputs: an object array holding one
# GraphData (node features + edge index) per molecule.
train_dataset.X

array([GraphData(node_features=[14, 30], edge_index=[2, 32], edge_features=None),
       GraphData(node_features=[15, 30], edge_index=[2, 32], edge_features=None),
       GraphData(node_features=[17, 30], edge_index=[2, 34], edge_features=None),
       ...,
       GraphData(node_features=[40, 30], edge_index=[2, 80], edge_features=None),
       GraphData(node_features=[12, 30], edge_index=[2, 24], edge_features=None),
       GraphData(node_features=[12, 30], edge_index=[2, 24], edge_features=None)],
      dtype=object)

In [26]:
###############################################################################
# 2. PyTorch GCN model definition
###############################################################################
class GCNModel(nn.Module):
    """Small graph-convolution regressor implemented purely in PyTorch.

    The graph layers in ``deepchem.models.layers`` (``GraphConv``,
    ``GraphPool``, ``GraphGather``) are Keras/TensorFlow layers, so they cannot
    be used as sub-modules of an ``nn.Module``: their weights are never
    registered with PyTorch and they do not accept torch tensors. This class
    therefore builds the same architecture (two graph convolutions, a sum
    readout, a dense head) from ``nn.Linear`` layers and dense, padded tensors.

    Args:
        hidden_dim: Width of both graph-convolution layers.
        output_dim: Number of values predicted per graph (1 for single-task
            regression).
        input_dim: Number of features per node. Defaults to 30, the node
            feature width of the GraphData objects used in this notebook.
    """

    def __init__(self, hidden_dim=64, output_dim=1, input_dim=30):
        super().__init__()
        # Each graph convolution is "average over the neighbourhood, then apply
        # a shared linear map"; the averaging happens in forward().
        self.gc1 = nn.Linear(input_dim, hidden_dim)
        self.gc2 = nn.Linear(hidden_dim, hidden_dim)

        # Final prediction layer applied to the pooled graph embedding.
        self.dense = nn.Linear(hidden_dim, output_dim)

    @staticmethod
    def _normalized_adjacency(adjacency_list, node_mask):
        """Build a row-normalised dense adjacency (with self-loops) per graph.

        Args:
            adjacency_list: Integer tensor of shape (batch_size, max_bonds, 2)
                holding directed (source, target) atom index pairs. Rows with
                a negative or out-of-range index are treated as padding.
            node_mask: Float tensor of shape (batch_size, max_atoms) that is 1
                for real atoms and 0 for padded slots.

        Returns:
            Tensor of shape (batch_size, max_atoms, max_atoms) where row ``i``
            averages over atom ``i`` and its neighbours. Rows and columns of
            padded atoms are all zero.
        """
        batch_size, max_atoms = node_mask.shape
        adjacency = node_mask.new_zeros(batch_size, max_atoms, max_atoms)

        edges = torch.as_tensor(adjacency_list, device=node_mask.device).long()
        if edges.numel() > 0:
            src, dst = edges[..., 0], edges[..., 1]
            in_range = (src >= 0) & (dst >= 0) & (src < max_atoms) & (dst < max_atoms)
            graph_id = (
                torch.arange(batch_size, device=edges.device)
                .unsqueeze(1)
                .expand_as(src)
            )
            # Messages flow source -> target, so the target row picks up the source.
            adjacency[graph_id[in_range], dst[in_range], src[in_range]] = 1.0

        # Self-loops let every atom keep its own features when aggregating.
        diagonal = torch.arange(max_atoms, device=node_mask.device)
        adjacency[:, diagonal, diagonal] = 1.0

        # Drop anything that touches a padded atom (including its self-loop).
        adjacency = adjacency * node_mask.unsqueeze(1) * node_mask.unsqueeze(2)

        # Mean aggregation; the clamp avoids dividing by zero on padded rows.
        degree = adjacency.sum(dim=-1, keepdim=True).clamp(min=1.0)
        return adjacency / degree

    def forward(self, inputs):
        """Predict one output vector per graph.

        Args:
            inputs: Tuple ``(node_features, adjacency_list, num_nodes)`` with
                - node_features: float tensor (batch_size, max_atoms, input_dim)
                - adjacency_list: integer tensor (batch_size, max_bonds, 2) of
                  directed (source, target) pairs; pad unused rows with -1
                - num_nodes: integer tensor (batch_size,) giving the number of
                  real atoms in each graph

        Returns:
            Tensor of shape (batch_size, output_dim).
        """
        node_features, adjacency_list, num_nodes = inputs
        max_atoms = node_features.shape[1]
        device = node_features.device

        # 1 for real atoms, 0 for padding: shape (batch_size, max_atoms).
        num_nodes = torch.as_tensor(num_nodes, device=device)
        atom_index = torch.arange(max_atoms, device=device)
        node_mask = (atom_index.unsqueeze(0) < num_nodes.unsqueeze(1)).to(
            node_features.dtype
        )
        keep = node_mask.unsqueeze(-1)

        propagate = self._normalized_adjacency(adjacency_list, node_mask)

        # Two graph-convolution layers. Re-masking after each layer stops the
        # bias/ReLU from leaking non-zero activations into padded slots.
        h = node_features * keep
        h = F.relu(self.gc1(torch.bmm(propagate, h))) * keep
        h = F.relu(self.gc2(torch.bmm(propagate, h))) * keep

        # Readout: sum over atoms -> (batch_size, hidden_dim).
        h = h.sum(dim=1)

        # Final dense layer for regression -> (batch_size, output_dim).
        return self.dense(h)

In [11]:
# Baseline attempt with DeepChem's GraphConvModel as a single-task regressor.
# NOTE(review): the recorded output of this cell is an InternalError raised by
# cudaGetDevice(). GraphConvModel presumably also expects ConvMol-featurized
# inputs rather than the GraphData objects stored in this dataset -- confirm
# before reviving this cell.
graphconv_settings = dict(
    n_tasks=1,            # number of targets to predict (1 for a single regression target)
    mode='regression',    # regression vs. classification
    batch_size=32,
    learning_rate=0.001,
)
model = dc.models.GraphConvModel(**graphconv_settings)

model.fit(train_dataset, nb_epoch=10)

InternalError: cudaGetDevice() failed. Status: cudaGetErrorString symbol not found.

In [7]:
# Initialise DeepChem's built-in GCNModel for single-task regression.
# Note: dc.models.GCNModel is DeepChem's own class, not the GCNModel nn.Module
# defined elsewhere in this notebook.
gcn_settings = {
    "n_tasks": 1,
    "graph_conv_layers": [64, 64],
    "dense_layer_size": 128,
    "dropout": 0.2,
    "mode": 'regression',
    "learning_rate": 1e-3,
    "batch_size": 128,
}
model = dc.models.GCNModel(**gcn_settings)

In [8]:
%%time

# Train `model` on the training partition for 10 epochs; %%time reports the
# CPU and wall-clock cost of the whole cell.
# The fit() call is the cell's last expression, so its return value is shown
# as the cell output (presumably a training-loss figure -- confirm against the
# DeepChem documentation).
model.fit(train_dataset, nb_epoch=10)

CPU times: total: 3min 3s
Wall time: 45.6 s


1.3576590220133464

In [27]:
###############################################################################
# 3. Wrap in DeepChem's TorchModel & train
###############################################################################
# The network is defined as a plain PyTorch module, so it is wrapped in
# DeepChem's TorchModel for training.
# It is bound to its own name rather than `model`, so that running this
# experiment does not overwrite the DeepChem GCNModel that the evaluation cell
# reports on.
# NOTE(review): the recorded output of this cell is a TypeError ("can't convert
# np.ndarray of type numpy.object_"). Presumably TorchModel's default batching
# tries to turn dataset.X -- an object array of GraphData -- directly into a
# tensor. The graphs would need to be converted into padded tensors (or the
# batch preparation customised) before fit() can succeed -- TODO confirm.
torch_gcn_model = dc.models.TorchModel(
    model=GCNModel(hidden_dim=64, output_dim=1),  # the PyTorch module defined above
    loss=dc.models.losses.L2Loss(),               # regression, so use an L2 loss
    output_types=["prediction"]                   # the single output is the prediction
)

# Train the model.
# Tune nb_epoch, batch_size, etc. as appropriate.
torch_gcn_model.fit(train_dataset, nb_epoch=100)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint64, uint32, uint16, uint8, and bool.

In [9]:
###############################################################################
# 4. Model evaluation
###############################################################################
# Evaluation metrics.
# NOTE(review): the lines below are labelled "R^2", but the metric is DeepChem's
# pearson_r2_score (a Pearson-correlation-based score, going by its name);
# confirm against the DeepChem docs whether it equals the coefficient of
# determination before reporting it as R^2.
metric_r2 = dc.metrics.Metric(dc.metrics.pearson_r2_score)
metric_mae = dc.metrics.Metric(dc.metrics.mean_absolute_error)

# Labels carry their own padding so the printed lines match the original
# "Train ...", "Valid ...", "Test  ..." layout.
eval_splits = (
    ("Train", train_dataset),
    ("Valid", valid_dataset),
    ("Test ", test_dataset),
)

# Evaluate every split once with both metrics, instead of running a separate
# prediction pass over each split for each metric.
split_scores = {
    split_label: model.evaluate(split_data, [metric_r2, metric_mae])
    for split_label, split_data in eval_splits
}

# Report all R^2 lines first, then all MAE lines, one single-entry dict per line.
for metric_label, metric in (("R^2", metric_r2), ("MAE", metric_mae)):
    for split_label, _ in eval_splits:
        print(
            f"{split_label} {metric_label}: ",
            {metric.name: split_scores[split_label][metric.name]},
        )


Train R^2:  {'pearson_r2_score': 0.776267570646285}
Valid R^2:  {'pearson_r2_score': 0.7438408257849691}
Test  R^2:  {'pearson_r2_score': 0.7343115565439987}
Train MAE:  {'mean_absolute_error': 0.8807286419332556}
Valid MAE:  {'mean_absolute_error': 0.9010264118836565}
Test  MAE:  {'mean_absolute_error': 0.9475775182251481}
