In [53]:
import deepchem as dc
from deepchem.feat.graph_data import GraphData
from deepchem.feat import MolGraphConvFeaturizer
import numpy as np
import pandas as pd

In [54]:
df = pd.read_csv('./delaney-processed_smiles.csv')

In [55]:
df

Unnamed: 0,Compound ID,smiles,measured log solubility in mols per litre
0,Amigdalin,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...,-0.770
1,Fenfuram,Cc1occc1C(=O)Nc2ccccc2,-3.300
2,citral,CC(C)=CCCC(C)=CC(=O),-2.060
3,Picene,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43,-7.870
4,Thiophene,c1ccsc1,-1.330
...,...,...,...
1123,halothane,FC(F)(F)C(Cl)Br,-1.710
1124,Oxamyl,CNC(=O)ON=C(SC)C(=O)N(C)C,0.106
1125,Thiometon,CCSCCSP(=S)(OC)OC,-3.091
1126,2-Methylbutane,CCC(C)C,-3.180


In [56]:
df = df.drop([934],axis=0)

In [30]:
from sklearn.preprocessing import StandardScaler

# StandardScaler 객체 생성
scaler = StandardScaler()

# 표준화를 적용한 새로운 열 추가
df["scaled log solubility"] = scaler.fit_transform(df["measured log solubility in mols per litre"].values.reshape(-1, 1))


In [49]:
mean = scaler.mean_[0]
std = scaler.scale_[0]

In [50]:
mean

-3.0520097604259093

In [51]:
std

2.0954617919366605

In [32]:
df

Unnamed: 0,Compound ID,smiles,measured log solubility in mols per litre,scaled log solubility
0,Amigdalin,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...,-0.770,1.089025
1,Fenfuram,Cc1occc1C(=O)Nc2ccccc2,-3.300,-0.118346
2,citral,CC(C)=CCCC(C)=CC(=O),-2.060,0.473409
3,Picene,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43,-7.870,-2.299250
4,Thiophene,c1ccsc1,-1.330,0.821781
...,...,...,...,...
1123,halothane,FC(F)(F)C(Cl)Br,-1.710,0.640436
1124,Oxamyl,CNC(=O)ON=C(SC)C(=O)N(C)C,0.106,1.507071
1125,Thiometon,CCSCCSP(=S)(OC)OC,-3.091,-0.018607
1126,2-Methylbutane,CCC(C)C,-3.180,-0.061080


In [57]:
# 1. SMILES에서 그래프 형식으로 변환
featurizer = MolGraphConvFeaturizer()
features = featurizer.featurize(df["smiles"])  # Smiles 컬럼에서 특징 추출

In [58]:
# 2. 레이블 설정 (Solubility)
labels = df["measured log solubility in mols per litre"].values

In [59]:
# 3. 가중치 (필요시 기본값으로 1 설정)
weights = None  # 기본적으로 None으로 설정. 커스텀 가중치가 있으면 지정.

# 4. 데이터셋 생성
dataset = dc.data.NumpyDataset(X=features, y=labels, w=weights)

In [60]:
# 첫 번째 샘플 확인
print(type(dataset.X[0]))  # 각 샘플의 데이터 타입 확인
print(dataset.X[0])       # 샘플 내용 출력


<class 'deepchem.feat.graph_data.GraphData'>
GraphData(node_features=[32, 30], edge_index=[2, 68], edge_features=None)


In [74]:
# 데이터셋 확인
print(f"Number of samples in dataset: {len(dataset)}")
print(f"Feature shape: {dataset.X[0].node_features.shape[0]} nodes, {dataset.X[0].edge_index.shape[0]} edges")
print(f"First label: {dataset.y[0]}")

Number of samples in dataset: 1127
Feature shape: 32 nodes, 2 edges
First label: -0.77


In [62]:
# 데이터셋을 Train/Validation/Test로 나누기
splitter = dc.splits.RandomSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset)

# 확인
print(f"Train dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(valid_dataset)}")
print(f"Test dataset size: {len(test_dataset)}")

Train dataset size: 901
Validation dataset size: 113
Test dataset size: 113


In [63]:
model = dc.models.GCNModel(n_tasks=1, mode='regression', dropout=0.2,batch_normalize=False)

In [64]:
model.fit(train_dataset, nb_epoch=100)

0.3110897445678711

In [41]:
# 테스트 데이터셋에서 샘플 10개에 대해 예측 수행
predictions = model.predict(test_dataset)
print(predictions)

[[-1.4408021 ]
 [-3.365995  ]
 [-0.47653192]
 [-1.4512603 ]
 [-3.0889308 ]
 [ 1.6305952 ]
 [-2.1830678 ]
 [-0.7464059 ]
 [ 0.8463424 ]
 [ 0.81990397]
 [-2.1412947 ]
 [-1.656784  ]
 [-2.9333434 ]
 [-4.1325665 ]
 [-1.6679265 ]
 [-3.5985038 ]
 [-2.068902  ]
 [-3.5610318 ]
 [-2.5425725 ]
 [-3.6424148 ]
 [-0.60355556]
 [-3.7080097 ]
 [ 0.29981905]
 [-1.9709673 ]
 [-3.0740952 ]
 [-3.3587644 ]
 [-1.698504  ]
 [-0.5923021 ]
 [-3.8351088 ]
 [-2.351968  ]
 [-4.756142  ]
 [-4.0563974 ]
 [ 0.77189857]
 [-2.9925804 ]
 [-3.5510123 ]
 [-3.6951976 ]
 [-3.398741  ]
 [-1.6467323 ]
 [-3.4371114 ]
 [-5.161125  ]
 [-2.5711522 ]
 [-1.1355143 ]
 [-2.1991887 ]
 [-1.3717158 ]
 [-3.8159728 ]
 [ 1.131069  ]
 [-2.1646185 ]
 [-3.8127966 ]
 [-0.90491307]
 [-3.4156947 ]
 [-1.1550076 ]
 [-4.300931  ]
 [-1.1519315 ]
 [-3.3066435 ]
 [-6.1314983 ]
 [-1.4594936 ]
 [-1.948798  ]
 [-2.7824275 ]
 [-3.0493307 ]
 [-0.07070571]
 [-1.0477347 ]
 [-5.685768  ]
 [-4.6011915 ]
 [-2.8412447 ]
 [-3.444007  ]
 [-1.6205418 ]
 [-6.39240

In [65]:
# 1. R2 스코어를 계산하기 위한 Metric 객체 준비
metric = dc.metrics.Metric(dc.metrics.r2_score, mode='regression')

# 2. Train, Validation, Test셋 각각에 대해 평가
train_score = model.evaluate(train_dataset, [metric])
valid_score = model.evaluate(valid_dataset, [metric])

# 3. 각 결과 출력
print("Train R2:", train_score["r2_score"])
print("Valid R2:", valid_score["r2_score"])

Train R2: 0.9030530251076399
Valid R2: 0.8700333456255839


In [66]:
# 1. R2 스코어를 계산하기 위한 Metric 객체 준비
metric = dc.metrics.Metric(dc.metrics.r2_score, mode='regression')

# 2. Train, Validation, Test셋 각각에 대해 평가
train_score = model.evaluate(train_dataset, [metric])
valid_score = model.evaluate(valid_dataset, [metric])

# 3. 각 결과 출력
print("Train R2:", train_score["r2_score"])
print("Valid R2:", valid_score["r2_score"])

Train R2: 0.9030530251076399
Valid R2: 0.8700333456255839


In [67]:
test_score  = model.evaluate(test_dataset,  [metric])
print("Test  R2:", test_score["r2_score"])

Test  R2: 0.8882412442124726


In [68]:
test_score  = model.evaluate(test_dataset,  [metric])
print("Test  R2:", test_score["r2_score"])

Test  R2: 0.8882412442124726


In [69]:
# 모델 평가
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print("Training set score:", model.evaluate(train_dataset, [metric]))
print("Test set score:", model.evaluate(test_dataset, [metric]))

Training set score: {'pearson_r2_score': 0.9461282768041696}
Test set score: {'pearson_r2_score': 0.9270569148344294}


In [70]:
# 모델 평가
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)
print("Training set score:", model.evaluate(train_dataset, [metric]))
print("Test set score:", model.evaluate(test_dataset, [metric]))

Training set score: {'pearson_r2_score': 0.9461282768041696}
Test set score: {'pearson_r2_score': 0.9270569148344294}


In [71]:
from sklearn.metrics import r2_score

# 1. Test 데이터셋에 대한 예측값 추론
y_pred = model.predict(test_dataset)

# 2. 실제 y값 (test_dataset.y) 과 비교하여 R2 스코어 계산
test_r2 = r2_score(test_dataset.y, y_pred)

print("Test R2 Score:", test_r2)


Test R2 Score: 0.8882412442124726


In [72]:
solubilities = model.predict_on_batch(test_dataset.X[:10])
for molecule, solubility, test_solubility in zip(test_dataset.ids, solubilities, test_dataset.y):
    print(solubility, test_solubility, molecule)

[-3.9160612] -3.1 853
[-3.5385437] -4.23 959
[-9.274925] -7.92 661
[-2.0279822] -1.655 42
[-4.932832] -4.25 946
[-1.8950099] -1.25 457
[-6.0537405] -6.35 768
[-1.8887721] -1.34 1052
[-1.8433088] -1.24 152
[-8.160906] -8.23 321


In [73]:
# 모델 훈련 후
model.save_checkpoint(model_dir="./solubility_model_no_scaler")  # 원하는 경로 지정

In [48]:
model

<deepchem.models.torch_models.gcn.GCNModel at 0x1f3dae07430>

In [52]:
# 데이터 크기 확인
print(f"Feature shape: {features[0].node_features.shape}")


Feature shape: (32, 30)
