In [3]:
import deepchem as dc
from deepchem.feat import MolGraphConvFeaturizer
from deepchem.data import CSVLoader
from deepchem.splits import RandomSplitter
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

from deepchem.models.layers import GraphConv, GraphPool, GraphGather


In [4]:
# Load the AqsolDB table from a CSV file; the path is relative to the
# current working directory.
df = pd.read_csv('./AqsolDB.csv')

In [5]:
# Display the loaded frame; the notebook renders a truncated preview.
df

Unnamed: 0.1,Unnamed: 0,Drug_ID,Drug,Y
0,0,"N,N,N-trimethyloctadecan-1-aminium bromide",CCCCCCCCCCCCCCCCCC[N+](C)(C)C.[Br-],-3.616127
1,1,Benzo[cd]indol-2(1H)-one,O=C1Nc2cccc3cccc1c23,-3.254767
2,2,4-chlorobenzaldehyde,O=Cc1ccc(Cl)cc1,-2.177078
3,3,"zinc bis[2-hydroxy-3,5-bis(1-phenylethyl)benzo...",CC(c1ccccc1)c1cc(C(=O)[O-])c(O)c(C(C)c2ccccc2)...,-3.924409
4,4,4-({4-[bis(oxiran-2-ylmethyl)amino]phenyl}meth...,c1cc(N(CC2CO2)CC2CO2)ccc1Cc1ccc(N(CC2CO2)CC2CO...,-4.662065
...,...,...,...,...
9977,9977,tetracaine,CCCCNc1ccc(C(=O)OCCN(C)C)cc1,-3.010000
9978,9978,tetracycline,CN(C)[C@@H]1C(O)=C(C(N)=O)C(=O)[C@@]2(O)C(O)=C...,-2.930000
9979,9979,thymol,Cc1ccc(C(C)C)c(O)c1,-2.190000
9980,9980,verapamil,COc1ccc(CCN(C)CCCC(C#N)(c2ccc(OC)c(OC)c2)C(C)C...,-3.980000


In [6]:
from rdkit import Chem

# SMILES validity check
def is_valid_smiles(smiles):
    """Return True when RDKit can parse ``smiles`` into a molecule.

    Args:
        smiles: Candidate SMILES value, e.g. one cell of a DataFrame column.

    Returns:
        bool: False for non-string values (such as None or NaN), for blank
        strings, and for strings the parser rejects; True otherwise.
    """
    # Missing cells show up as None / float NaN, and a blank string carries
    # no structure at all; reject both up front instead of relying on the
    # parser to raise.
    if not isinstance(smiles, str) or not smiles.strip():
        return False
    try:
        return Chem.MolFromSmiles(smiles) is not None
    except Exception:
        # Narrower than a bare `except:` so KeyboardInterrupt and SystemExit
        # still propagate, while parser errors simply count as invalid.
        return False

# Flag every entry of the "Drug" column as parseable or not, then keep a
# view of the rows that passed.
df["is_valid"] = df["Drug"].map(is_valid_smiles)
valid_df = df.loc[df["is_valid"]]

# Report how many rows passed out of the total.
print(f"유효한 SMILES 개수: {len(valid_df)} / {len(df)}")


유효한 SMILES 개수: 9980 / 9982


In [16]:
# 1. Convert the SMILES strings into graph-style features.
# TODO: sort out this featurizer first; the run logged below reports
# failures for single-atom inputs such as [Mo].
featurizer = MolGraphConvFeaturizer()
# Featurize every SMILES string held in the "Drug" column.
features = featurizer.featurize(df["Drug"])

Failed to featurize datapoint 38, [Mo]. Appending empty array
Exception message: More than one atom should be present in the molecule for this featurizer to work.
Failed to featurize datapoint 143, [Mg+2]. Appending empty array
Exception message: More than one atom should be present in the molecule for this featurizer to work.
Failed to featurize datapoint 199, [Cd+2]. Appending empty array
Exception message: More than one atom should be present in the molecule for this featurizer to work.
Failed to featurize datapoint 296, [Re]. Appending empty array
Exception message: More than one atom should be present in the molecule for this featurizer to work.
Failed to featurize datapoint 419, N. Appending empty array
Exception message: More than one atom should be present in the molecule for this featurizer to work.
Failed to featurize datapoint 801, [As]. Appending empty array
Exception message: More than one atom should be present in the molecule for this featurizer to work.
Failed to featur

In [17]:
# Number of featurizer outputs; failed datapoints are still counted because
# the featurizer appends an empty array for them.
len(features)

9982

In [18]:
# 2. Labels are taken from the "Y" column.
# NOTE(review): the earlier note called these pIC50, but the source file is
# AqsolDB, so this is presumably a solubility value; confirm the quantity.
labels = df["Y"].to_numpy()

In [19]:
# 3. Sample weights. None leaves weighting to the dataset's default (the
#    original note says 1); assign an array here to use custom weights.
weights = None

# 4. Bundle features, labels, and weights into a dataset.
dataset = dc.data.NumpyDataset(
    X=features,
    y=labels,
    w=weights,
)

In [20]:
# Show the dataset summary (array shapes and task names).
dataset

<NumpyDataset X.shape: (9982,), y.shape: (9982,), w.shape: (9982,), task_names: [0]>

In [21]:
# Collect the positions whose feature entry is an empty ndarray, i.e. the
# placeholder left behind when featurization failed.
invalid_indices = [
    idx
    for idx, entry in enumerate(dataset.X)
    if isinstance(entry, np.ndarray) and len(entry) == 0
]

print("Invalid indices:", invalid_indices)


Invalid indices: [38, 143, 199, 296, 419, 801, 827, 885, 1148, 1171, 1245, 1290, 1440, 1533, 1668, 1987, 2055, 2415, 2631, 3041, 3053, 3082, 3385, 3399, 3425, 3433, 3491, 3504, 3507, 3522, 3558, 3567, 3592, 3640, 3648, 3650, 3755, 4792, 5043]


In [22]:
import deepchem as dc
import numpy as np

# Pick out the positions that were not flagged as failed featurizations.
# A set gives constant-time membership tests; looking each index up in the
# invalid_indices list rescanned that list for every row.
invalid_lookup = set(invalid_indices)
valid_indices = [
    i for i in range(len(dataset.X)) if i not in invalid_lookup
]

# Extract only the valid X, y, w, and ids entries.
filtered_X   = [dataset.X[i] for i in valid_indices]
filtered_y   = [dataset.y[i] for i in valid_indices]
filtered_w   = [dataset.w[i] for i in valid_indices]
filtered_ids = [dataset.ids[i] for i in valid_indices]

# Build a fresh NumpyDataset from the surviving rows.
new_dataset = dc.data.NumpyDataset(
    X=filtered_X,
    y=filtered_y,
    w=filtered_w,
    ids=filtered_ids
)

# From here on, use new_dataset in place of the unfiltered dataset.


In [23]:
# Show the filtered dataset summary to confirm the reduced row count.
new_dataset

<NumpyDataset X.shape: (9943,), y.shape: (9943,), w.shape: (9943,), task_names: [0]>

In [24]:
import joblib as jb

# Persist the filtered dataset to disk with joblib; the call's return value
# (the list of written file names) is what the cell displays.
jb.dump(new_dataset,'./Aquasol_dataset_MolGraphConv.pkl')

['./Aquasol_dataset_MolGraphConv.pkl']