In [27]:
import deepchem as dc
from deepchem.feat.graph_data import GraphData
from deepchem.feat import MolGraphConvFeaturizer
import numpy as np
import pandas as pd

In [28]:
df = pd.read_csv('./HIV_protease.csv')

In [29]:
df

Unnamed: 0,Name,Structure,Solubility
0,CHEMBL729/LOPINAVIR,CC1=CC=CC(C)=C1OCC(=O)N[C@@H](CC1=CC=CC=C1)[C@...,Soluble
1,CHEMBL584/NELFINAVIR,CC1=C(O)C=CC=C1C(=O)N[C@@H](CSC1=CC=CC=C1)[C@H...,Moderate
2,CHEMBL57/NEVIRAPINE,CC1=CC=NC2=C1NC(=O)C1=CC=CN=C1N2C1CC1,Moderate
3,CHEMBL282042/SAQUINAVIR MESYLATE,CC(C)(C)NC(=O)[C@@H]1C[C@@H]2CCCC[C@@H]2CN1C[C...,
4,CHEMBL222559/TIPRANAVIR,CCC[C@@]1(CCC2=CC=CC=C2)CC(O)=C([C@H](CC)C2=CC...,Insoluble
...,...,...,...
108,CHEMBL100089,CC(C)(C)NC(=O)[C@@H]1C[C@@H]2CCCC[C@@H]2CN1C[C...,Moderate
109,CHEMBL100048,CC(C)(C)NC(=O)[C@@H]1C[C@@H]2SCC[C@@H]2CN1C[C@...,Soluble
110,CHEMBL100040,CC(C)(C)NC(=O)[C@@H]1C[C@@H]2SCC[C@@H]2CN1C[C@...,Moderate
111,CHEMBL100039,CC(C)CCN(C[C@@H](O)[C@H](CC1=CC=CC=C1)NC(=O)[C...,Soluble


In [30]:
model = dc.models.GCNModel(n_tasks=1, mode='regression', dropout=0.2,batch_normalize=False)

In [31]:
model.restore('./solubility_model_no_scaler/checkpoint1.pt')

In [32]:
model

<deepchem.models.torch_models.gcn.GCNModel at 0x22b3b9f79d0>

In [33]:
# 1. SMILES에서 그래프 형식으로 변환
featurizer = MolGraphConvFeaturizer()
features = featurizer.featurize(df["Structure"])  # Structure 컬럼에서 특징 추출

In [34]:
# 2. 레이블 설정 (pIC50)
labels = None

In [35]:
# 3. 가중치 (필요시 기본값으로 1 설정)
weights = None  # 기본적으로 None으로 설정. 커스텀 가중치가 있으면 지정.

# 4. 데이터셋 생성
dataset = dc.data.NumpyDataset(X=features, y=labels, w=weights)

In [36]:
# 데이터 크기 확인
print(f"Feature shape: {features[0].node_features.shape}")


Feature shape: (46, 30)


In [38]:
# 예측 수행
df['Prediction'] = model.predict(dataset)

In [40]:
df.to_csv('./HIV114.csv')

In [24]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

# StandardScaler를 사용해 Prediction 값을 원복
# mean = scaler.mean_[0]  # StandardScaler에서 사용된 평균
# std = scaler.scale_[0]  # StandardScaler에서 사용된 표준편차

# 원복 공식 적용
df["Original Prediction"] = df["Prediction"] * 2.0954617919366605 -3.0520097604259093

# 결과 확인
print(df.head())


                               Name  \
0               CHEMBL729/LOPINAVIR   
1              CHEMBL584/NELFINAVIR   
2               CHEMBL57/NEVIRAPINE   
3  CHEMBL282042/SAQUINAVIR MESYLATE   
4           CHEMBL222559/TIPRANAVIR   

                                           Structure Solubility  Prediction  \
0  CC1=CC=CC(C)=C1OCC(=O)N[C@@H](CC1=CC=CC=C1)[C@...    Soluble   -5.241626   
1  CC1=C(O)C=CC=C1C(=O)N[C@@H](CSC1=CC=CC=C1)[C@H...   Moderate   -5.525194   
2              CC1=CC=NC2=C1NC(=O)C1=CC=CN=C1N2C1CC1   Moderate   -3.309443   
3  CC(C)(C)NC(=O)[C@@H]1C[C@@H]2CCCC[C@@H]2CN1C[C...        NaN   -5.772190   
4  CCC[C@@]1(CCC2=CC=CC=C2)CC(O)=C([C@H](CC)C2=CC...  Insoluble   -7.282715   

   Original Prediction  
0           -14.035636  
1           -14.629843  
2            -9.986820  
3           -15.147413  
4           -18.312662  


In [23]:
mean = scaler.mean_[0]
std = scaler.scale_[0]

AttributeError: 'StandardScaler' object has no attribute 'mean_'

In [25]:
df

Unnamed: 0,Name,Structure,Solubility,Prediction,Original Prediction
0,CHEMBL729/LOPINAVIR,CC1=CC=CC(C)=C1OCC(=O)N[C@@H](CC1=CC=CC=C1)[C@...,Soluble,-5.241626,-14.035636
1,CHEMBL584/NELFINAVIR,CC1=C(O)C=CC=C1C(=O)N[C@@H](CSC1=CC=CC=C1)[C@H...,Moderate,-5.525194,-14.629843
2,CHEMBL57/NEVIRAPINE,CC1=CC=NC2=C1NC(=O)C1=CC=CN=C1N2C1CC1,Moderate,-3.309443,-9.986820
3,CHEMBL282042/SAQUINAVIR MESYLATE,CC(C)(C)NC(=O)[C@@H]1C[C@@H]2CCCC[C@@H]2CN1C[C...,,-5.772190,-15.147413
4,CHEMBL222559/TIPRANAVIR,CCC[C@@]1(CCC2=CC=CC=C2)CC(O)=C([C@H](CC)C2=CC...,Insoluble,-7.282715,-18.312662
...,...,...,...,...,...
108,CHEMBL100089,CC(C)(C)NC(=O)[C@@H]1C[C@@H]2CCCC[C@@H]2CN1C[C...,Moderate,-6.030370,-15.688419
109,CHEMBL100048,CC(C)(C)NC(=O)[C@@H]1C[C@@H]2SCC[C@@H]2CN1C[C@...,Soluble,-3.489471,-10.364064
110,CHEMBL100040,CC(C)(C)NC(=O)[C@@H]1C[C@@H]2SCC[C@@H]2CN1C[C@...,Moderate,-5.755380,-15.112189
111,CHEMBL100039,CC(C)CCN(C[C@@H](O)[C@H](CC1=CC=CC=C1)NC(=O)[C...,Soluble,-4.615815,-12.724273
