In [1]:
import deepchem as dc
from deepchem.feat.graph_data import GraphData
from deepchem.feat import MolGraphConvFeaturizer
import numpy as np
import pandas as pd

No normalization for AvgIpc. Feature removed!
Skipped loading modules with transformers dependency. No module named 'transformers'
cannot import name 'HuggingFaceModel' from 'deepchem.models.torch_models' (C:\Users\yyyyx\miniconda3\envs\deepchem\lib\site-packages\deepchem\models\torch_models\__init__.py)
Skipped loading modules with pytorch-lightning dependency, missing a dependency. No module named 'lightning'
Skipped loading some Jax models, missing a dependency. No module named 'jax'


In [30]:
df = pd.read_csv('./Sollubility.csv')

In [31]:
df

Unnamed: 0,SMILES,Measured Log Solubility
0,CC(C)=CCCC(C)=CC(=O),0.390413
1,CCCC=C,0.090421
2,CCCCCCCCCCCCCC,-2.464346
3,CC(C)Cl,0.704920
4,CCC(C)CO,1.159746
...,...,...
1123,ClC4=C(Cl)C5(Cl)C3C1CC(C2OC12)C3C4(Cl)C5(Cl)Cl,-1.656304
1124,c1ccsc1,0.743629
1125,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43,-2.420799
1126,Cc1occc1C(=O)Nc2ccccc2,-0.209570


In [32]:
from rdkit import Chem
from rdkit.Chem import AllChem
# 수소(H)를 복원한 SMILES 컬럼 추가
def add_hydrogens(smiles):
    try:
        mol = Chem.MolFromSmiles(smiles)  # SMILES 문자열로부터 Mol 객체 생성
        mol_with_h = Chem.AddHs(mol)     # 수소 복원
        return Chem.MolToSmiles(mol_with_h)  # 수소가 추가된 SMILES로 변환
    except Exception as e:
        print(f"Error processing SMILES '{smiles}': {e}")
        return None

In [33]:
df['SMILES_with_H'] = df['SMILES'].apply(add_hydrogens)

In [38]:
df = df.drop([248],axis=0)

In [39]:
# 1. SMILES에서 그래프 형식으로 변환
featurizer = MolGraphConvFeaturizer()
features = featurizer.featurize(df["SMILES_with_H"])  # Smiles 컬럼에서 특징 추출

In [42]:
df.loc[[249]]

Unnamed: 0,SMILES,Measured Log Solubility,SMILES_with_H
249,CCC(C)C(=O)C,1.062975,[H]C([H])([H])C(=O)C([H])(C([H])([H])[H])C([H]...


In [43]:
features[248]

GraphData(node_features=[7, 30], edge_index=[2, 12], edge_features=None)

In [45]:
# 2. 레이블 설정 (pIC50)
labels = df["Measured Log Solubility"].values

In [46]:
# 3. 가중치 (필요시 기본값으로 1 설정)
weights = None  # 기본적으로 None으로 설정. 커스텀 가중치가 있으면 지정.

# 4. 데이터셋 생성
dataset = dc.data.NumpyDataset(X=features, y=labels, w=weights)

In [47]:
# 첫 번째 샘플 확인
print(type(dataset.X[0]))  # 각 샘플의 데이터 타입 확인
print(dataset.X[0])       # 샘플 내용 출력


<class 'deepchem.feat.graph_data.GraphData'>
GraphData(node_features=[11, 30], edge_index=[2, 20], edge_features=None)


In [48]:
# 데이터셋 확인
print(f"Number of samples in dataset: {len(dataset)}")
print(f"Feature shape: {dataset.X[0].node_features.shape[0]} nodes, {dataset.X[0].edge_index.shape[1]} edges")
print(f"First label: {dataset.y[0]}")

Number of samples in dataset: 1127
Feature shape: 11 nodes, 20 edges
First label: 0.3904129382012304


In [49]:
# 데이터셋을 Train/Validation/Test로 나누기
splitter = dc.splits.RandomSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset)

# 확인
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(valid_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 901
Validation dataset size: 113
Test dataset size: 113


In [50]:
model = dc.models.GCNModel(n_tasks=1, mode='regression', dropout=0.2,batch_normalize=False)

In [51]:
model.fit(train_dataset, nb_epoch=100)

0.0744758939743042

In [52]:
# 테스트 데이터셋에서 샘플 10개에 대해 예측 수행
predictions = model.predict(test_dataset)
print(predictions)

[[-0.8175308 ]
 [ 1.4575863 ]
 [ 1.211498  ]
 [ 0.44676998]
 [ 1.5643082 ]
 [-0.6749033 ]
 [ 0.69833094]
 [ 1.6169139 ]
 [-0.20249806]
 [-0.29253808]
 [ 0.4535683 ]
 [ 0.5499063 ]
 [ 0.7640685 ]
 [-0.28973213]
 [ 1.329316  ]
 [-1.454271  ]
 [ 0.9103646 ]
 [ 0.32965797]
 [ 0.07228334]
 [-0.07515994]
 [-1.1713682 ]
 [-0.26857477]
 [ 1.3532194 ]
 [-0.02872585]
 [-1.8437333 ]
 [ 0.7644342 ]
 [ 0.59173554]
 [ 1.7306337 ]
 [-0.01268139]
 [-1.2455543 ]
 [ 0.4813137 ]
 [-0.66662174]
 [ 0.72151387]
 [-0.71140426]
 [ 1.0657294 ]
 [-0.22526853]
 [ 0.64193416]
 [-0.43439463]
 [ 1.3025157 ]
 [-1.8484635 ]
 [ 1.0358499 ]
 [-1.8047053 ]
 [-0.5832558 ]
 [ 0.8983372 ]
 [-0.8023512 ]
 [-0.47986928]
 [-0.45163676]
 [ 0.7673647 ]
 [-0.70602876]
 [ 0.0617352 ]
 [-1.0806842 ]
 [-0.82666796]
 [-0.11523117]
 [-1.3774688 ]
 [-0.06718594]
 [-0.83442914]
 [ 0.2334642 ]
 [-0.28075346]
 [-0.24075209]
 [ 1.3631628 ]
 [ 1.1584446 ]
 [-2.168426  ]
 [-0.7651834 ]
 [ 1.4945848 ]
 [-0.434454  ]
 [-0.49663916]
 [ 0.13506

In [53]:
# 1. R2 스코어를 계산하기 위한 Metric 객체 준비
metric = dc.metrics.Metric(dc.metrics.r2_score, mode='regression')

# 2. Train, Validation, Test셋 각각에 대해 평가
train_score = model.evaluate(train_dataset, [metric])
valid_score = model.evaluate(valid_dataset, [metric])

# 3. 각 결과 출력
print("Train R2:", train_score["r2_score"])
print("Valid R2:", valid_score["r2_score"])

Train R2: 0.9178534986589926
Valid R2: 0.9011507357706644


In [55]:
test_score  = model.evaluate(test_dataset,  [metric])
print("Test  R2:", test_score["r2_score"])

Test  R2: 0.8550909977643876


In [59]:
# 모델 평가
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print("Training set score:", model.evaluate(train_dataset, [metric]))
print("Test set score:", model.evaluate(test_dataset, [metric]))

Training set score: {'pearson_r2_score': 0.9374751104686739}
Test set score: {'pearson_r2_score': 0.8861099303412784}


In [61]:
from sklearn.metrics import r2_score

# 1. Test 데이터셋에 대한 예측값 추론
y_pred = model.predict(test_dataset)

# 2. 실제 y값 (test_dataset.y) 과 비교하여 R2 스코어 계산
test_r2 = r2_score(test_dataset.y, y_pred)

print("Test R2 Score:", test_r2)


Test R2 Score: 0.8550909977643876


In [65]:
solubilities = model.predict_on_batch(test_dataset.X[:10])
for molecule, solubility, test_solubility in zip(test_dataset.ids, solubilities, test_dataset.y):
    print(solubility, test_solubility, molecule)

[-0.8175309] -0.3058580320634255 764
[1.457586] 1.343612251211218 224
[1.211498] 0.5404087691824975 190
[0.44676974] 0.564601645147218 366
[1.5643083] 1.5274781085430935 254
[-0.67490333] -0.2869875888109434 944
[0.69833094] 0.6516959986202118 449
[1.616914] 1.3774822775618267 47
[-0.2024978] -1.2256711762420989 989
[-0.29253808] -0.3827913776312366 909


In [71]:
# 모델 훈련 후
model.save_checkpoint(model_dir="./solubility_model")  # 원하는 경로 지정