In [15]:
import deepchem as dc
from deepchem.feat.graph_data import GraphData
from deepchem.feat import MolGraphConvFeaturizer
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('./filtered_data_pIC50.csv')

In [3]:
df

Unnamed: 0,Molecule ChEMBL ID,Smiles,Molecule Name,Molecule Max Phase,Molecular Weight,#RO5 Violations,AlogP,Standard Value,pIC50,pChEMBL Value,...,Target Type,Document ChEMBL ID,Source ID,Source Description,Document Journal,Document Year,Cell ChEMBL ID,Properties,Action Type,Standard Text Value
0,CHEMBL100034,CCCCNC(=O)N(CC(C)C)C[C@@H](O)[C@H](Cc1ccccc1)N...,,,604.75,1.0,2.76,126.00,6.899629,6.90,...,SINGLE PROTEIN,CHEMBL1126824,1.0,Scientific Literature,J Med Chem,1993.0,,,,
1,CHEMBL100039,CC(C)CCN(C[C@@H](O)[C@H](Cc1ccccc1)NC(=O)[C@H]...,,,597.76,1.0,3.10,13.00,7.886057,7.89,...,SINGLE PROTEIN,CHEMBL1126824,1.0,Scientific Literature,J Med Chem,1993.0,,,,
2,CHEMBL100040,CC(C)(C)NC(=O)[C@@H]1C[C@@H]2SCC[C@@H]2CN1C[C@...,,,730.85,1.0,2.85,0.20,9.698970,,...,SINGLE PROTEIN,CHEMBL1128092,1.0,Scientific Literature,Bioorg Med Chem Lett,1995.0,,,,
3,CHEMBL100048,CC(C)(C)NC(=O)[C@@H]1C[C@@H]2SCC[C@@H]2CN1C[C@...,,,599.84,1.0,2.53,1.70,8.769551,8.77,...,SINGLE PROTEIN,CHEMBL1128633,1.0,Scientific Literature,Bioorg Med Chem Lett,1995.0,,,,
4,CHEMBL100089,CC(C)(C)NC(=O)[C@@H]1C[C@@H]2CCCC[C@@H]2CN1C[C...,,,740.99,1.0,3.42,0.50,9.301030,9.30,...,SINGLE PROTEIN,CHEMBL1128092,1.0,Scientific Literature,Bioorg Med Chem Lett,1995.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2047,CHEMBL99774,CC(C)(C)NC(=O)[C@@H]1C[C@@H]2CCCC[C@@H]2CN1C[C...,,,676.92,1.0,3.72,24.00,7.619789,7.62,...,SINGLE PROTEIN,CHEMBL1128092,1.0,Scientific Literature,Bioorg Med Chem Lett,1995.0,,,,
2048,CHEMBL99875,CC(C)(C)NC(=O)[C@@H]1C[C@@H]2SCC[C@@H]2CN1C[C@...,,,539.74,1.0,4.00,6.60,8.180456,8.18,...,SINGLE PROTEIN,CHEMBL1128091,1.0,Scientific Literature,Bioorg Med Chem Lett,1995.0,,,,
2049,CHEMBL99878,CC(C)(C)NC(=O)[C@@H]1C[C@@H]2SCC[C@@H]2CN1C[C@...,,,762.91,1.0,3.40,10.00,8.000000,8.00,...,SINGLE PROTEIN,CHEMBL1128092,1.0,Scientific Literature,Bioorg Med Chem Lett,1995.0,,,,
2050,CHEMBL99934,CCCNC(=O)N(CC(C)C)C[C@@H](O)[C@H](Cc1ccccc1)NC...,,,569.70,1.0,2.32,518.00,6.285670,6.29,...,SINGLE PROTEIN,CHEMBL1126824,1.0,Scientific Literature,J Med Chem,1993.0,,,,


In [6]:
# 1. SMILES에서 그래프 형식으로 변환
featurizer = MolGraphConvFeaturizer()
features = featurizer.featurize(df["Smiles"])  # Smiles 컬럼에서 특징 추출

In [8]:
features

array([GraphData(node_features=[44, 30], edge_index=[2, 92], edge_features=None),
       GraphData(node_features=[43, 30], edge_index=[2, 88], edge_features=None),
       GraphData(node_features=[49, 30], edge_index=[2, 104], edge_features=None),
       ...,
       GraphData(node_features=[50, 30], edge_index=[2, 106], edge_features=None),
       GraphData(node_features=[41, 30], edge_index=[2, 84], edge_features=None),
       GraphData(node_features=[41, 30], edge_index=[2, 88], edge_features=None)],
      dtype=object)

In [7]:
# 2. 레이블 설정 (pIC50)
labels = df["pIC50"].values

In [9]:
# 3. 가중치 (필요시 기본값으로 1 설정)
weights = None  # 기본적으로 None으로 설정. 커스텀 가중치가 있으면 지정.

# 4. 데이터셋 생성
dataset = dc.data.NumpyDataset(X=features, y=labels, w=weights)

In [13]:
# 데이터셋 확인
print(f"Number of samples in dataset: {len(dataset)}")
print(f"Feature shape: {dataset.X[0].node_features.shape[0]} nodes, {dataset.X[0].edge_index.shape[1]} edges")
print(f"First label: {dataset.y[0]}")

Number of samples in dataset: 2052
Feature shape: 44 nodes, 92 edges
First label: 6.899629455


In [16]:
# 데이터셋을 Train/Validation/Test로 나누기
splitter = dc.splits.RandomSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset)

# 확인
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(valid_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 1641
Validation dataset size: 205
Test dataset size: 206


In [20]:
model = dc.models.GCNModel(n_tasks=1, mode='regression', dropout=0.2,batch_normalize=False)

In [21]:
model.fit(train_dataset, nb_epoch=100)

0.6419707489013672

In [24]:
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print("Training set score:", model.evaluate(train_dataset, [metric]))
print("Test set score:", model.evaluate(test_dataset, [metric]))

Training set score: {'pearson_r2_score': 0.5662477620747329}
Test set score: {'pearson_r2_score': 0.49987485298065687}
