In [None]:
# -----------------------------------------------------------------------------------
# 파일명       : XGBoost_model.ipynb
# 설명         : 인체 내 약물 대사에 관여하는 CYP3A4 효소 저해 예측모델 개발          
# 작성자       : 이민하
# 작성일       : 2025-07-15
# 
# 사용 모듈    :
# - pandas                           # 데이터프레임 기반 데이터 처리
# - numpy                            # 수치 계산 및 배열 연산
# - deepchem                         # 분자 데이터 처리
# - xgboost                          # 고성능 머신러닝 모델
# - optuna                           # 하이퍼파라미터 최적화
# - sklearn.model_selection          # 학습 데이터 교차 검증
#
# -----------------------------------------------------------------------------------
# >> 주요 기능
# - CYP3A4 효소 저해 예측모델의 학습
#
# >> 업데이트 내역
# [2025-07-15] Descriptors + Fingerprints 기반 XGBoost 모델 학습
# [2025-07-16] 하이퍼파라미터 튜닝
# [2025-07-17] Feature별 중요도 시각화
# [2025-07-18] 중요도 낮은 Feature 제거 후 재학습 (Public Score 기준 0.771)
#
# >> 참고 Sota 논문
# - Accurate ADMET Prediction with XGBoost
# - https://paperswithcode.com/paper/accurate-admet-prediction-with-xgboost
# -----------------------------------------------------------------------------------


In [1]:
# 데이터프레임 기반 데이터 처리
import pandas as pd

# 수치 계산 및 배열 연산
import numpy as np

# 분자 데이터 처리
import deepchem as dc

# 고성능 머신러닝 모델
import xgboost as xgb

# 하이퍼파라미터 최적화
import optuna

# 학습 데이터 교차 검증
from sklearn.model_selection import cross_val_score

from sklearn.preprocessing import StandardScaler

  from .autonotebook import tqdm as notebook_tqdm
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'dgl'
Skipped loading some Jax models, missing a dependency. jax requires jaxlib to be installed. See https://github.com/google/jax#installation for installation instructions.


In [3]:
# 데이터 경로 설정
TRAIN_PATH = "./data/train.csv"
TEST_PATH = "./data/test.csv"
SAMPLE_PATH = "./data/sample_submission.csv"

In [4]:
# csv 파일 -> DataFrame 형태로 불러오기
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
sample_df = pd.read_csv(SAMPLE_PATH)

train_df

Unnamed: 0,ID,Canonical_Smiles,Inhibition
0,TRAIN_0000,Cl.OC1(Cc2cccc(Br)c2)CCNCC1,12.500000
1,TRAIN_0001,Brc1ccc2OCCc3ccnc1c23,4.450000
2,TRAIN_0002,CC1(CO)CC(=NO1)c2cc(c(F)cc2Cl)[N+](=O)[O-],4.920000
3,TRAIN_0003,Fc1ccc2nc(Nc3cccc(COc4cccc(c4)C(=O)N5CCOCC5)c3...,71.500000
4,TRAIN_0004,CC(C)CC(=O)C1=C(Nc2c(Cl)ccc(Cl)c2C1=O)S(=O)C,18.300000
...,...,...,...
1676,TRAIN_1676,Cc1cc2ncn(CC3CCN(CC3)S(=O)(=O)CCN4C(=O)CCCC4=O...,0.500000
1677,TRAIN_1677,O=C(CN1N=CC=CC1=O)N2Cc3cnc(nc3C2)N4CCOCC4,0.500000
1678,TRAIN_1678,COC1=COC(=CC1=O)C(=O)Nc2cccc3c2ccn3C,0.500000
1679,TRAIN_1679,CC1=CC(=O)N(CCNC(=O)c2nc3nc(C)cc(C)n3n2)C=N1,0.500000


In [5]:
# Smile 열 추출
smile_list = train_df["Canonical_Smiles"]

smile_list

0                             Cl.OC1(Cc2cccc(Br)c2)CCNCC1
1                                   Brc1ccc2OCCc3ccnc1c23
2              CC1(CO)CC(=NO1)c2cc(c(F)cc2Cl)[N+](=O)[O-]
3       Fc1ccc2nc(Nc3cccc(COc4cccc(c4)C(=O)N5CCOCC5)c3...
4            CC(C)CC(=O)C1=C(Nc2c(Cl)ccc(Cl)c2C1=O)S(=O)C
                              ...                        
1676    Cc1cc2ncn(CC3CCN(CC3)S(=O)(=O)CCN4C(=O)CCCC4=O...
1677            O=C(CN1N=CC=CC1=O)N2Cc3cnc(nc3C2)N4CCOCC4
1678                 COC1=COC(=CC1=O)C(=O)Nc2cccc3c2ccn3C
1679         CC1=CC(=O)N(CCNC(=O)c2nc3nc(C)cc(C)n3n2)C=N1
1680                CCc1ccc(\C=N\Nc2nn3cnnc3c4ccccc24)cc1
Name: Canonical_Smiles, Length: 1681, dtype: object

In [11]:
# 저해도 열 추출
target = np.array(train_df["Inhibition"]).reshape(-1, 1)
scaler = StandardScaler()
target = scaler.fit_transform(target)

target

array([[-0.78497806],
       [-1.0899257 ],
       [-1.0721213 ],
       ...,
       [-1.23955839],
       [-1.23955839],
       [ 0.32118249]])

In [14]:
# DeepChem 클래스 생성 
# (MACCS, ECFP, Mol2Vec, PubChem fingerprints, Mordred, RDKit descriptors)
maccskeys = dc.feat.MACCSKeysFingerprint()
circular = dc.feat.CircularFingerprint()
mol2vec = dc.feat.Mol2VecFingerprint()
mordred = dc.feat.MordredDescriptors(ignore_3D=True)
rdkit = dc.feat.RDKitDescriptors()
pubchem = dc.feat.PubChemFingerprint()

# Feature 추출 함수 - Smiles -> (, 5217)차원의 데이터로 변형
def extract_features(smile_list):
    # Feature 추출
    # maccs_features = maccskeys.featurize(smile_list)
    circular_features = circular.featurize(smile_list)
    mol2vec_features = mol2vec.featurize(smile_list)
    mordred_features = mordred.featurize(smile_list)
    rdkit_features = rdkit.featurize(smile_list)
    pubchem_features = pubchem.featurize(smile_list)

    # pubchem의 결과값이 없을 경우 None이 반환되므로, [0] * 881이 담긴 리스트로 변경
    pubchem_tmp = []

    for feat in pubchem_features:
        if len(feat) == 0:
            # 881 : pubchem result's shape
            pubchem_tmp.append(np.array([0] * 881))
        else:
            pubchem_tmp.append(feat)
    
    pubchem_features = np.array(pubchem_tmp)

    # Feature 차원 확인
    print(circular_features.shape, mol2vec_features.shape, mordred_features.shape, rdkit_features.shape, pubchem_features.shape)

    # Feature 결합
    combined = np.concatenate((circular_features, mol2vec_features, mordred_features, rdkit_features, pubchem_features), axis = 1)
    
    # 결측치 처리
    combined = np.nan_to_num(combined, nan = 0, posinf = 0)

    return combined


In [15]:
# Feature matrix 생성
X = extract_features(smile_list)

print("Feature shape:", X.shape)  # (1681, 5217 차원)

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
Failed to featurize datapoint 0, Cl.OC1(Cc2cccc(Br)c2)CCNCC1. Appending empty array
Exception message: Expecting value: line 1 column 1 (char 0)
Failed to featurize datapoint 1, Brc1ccc2c3c(ccnc13)CCO2. Appending empty array
Exception message: Expecting value: line 1 column 1 (char 0)
Failed to featurize datapoint 2, CC1(CO)CC(c2cc([N+](=O)[O-])c(F)cc2Cl)=NO1. Appending empty array
Exception message: Expecting value: line 1 column 1 (char 0)
Failed to featurize datapoint 3, O=C(c1cccc(OCc2cccc(Nc3nc4ccc(F)cc4[nH]3)c2)c1)N1CCOCC1. Appending empty array
Exception message: Expecting value: line 1 column 1 (char 0)
Failed to featurize datapoint 4, CC(C)CC(=O)c1c(S(C)=O)[nH]c2c(Cl)ccc(Cl)c2c1=O. Appending empty array
Exception message: Expecting value: line 1 column 1 (char 0)
Failed to featurize datapoint 5, O=C1/C(=C(/O)C2CCOCC2)OCCN1Cc1ccccc1. Appending empty array
Exception message: Expecting value: line 1 column 1 (char 0)
Fail

(1681, 2048) (1681, 300) (1681, 1613) (1681, 208) (1681, 881)
Feature shape: (1681, 5050)


In [16]:
# Optuna 활용을 위한 하이퍼파라미터 설정
def xgb_objective(trial):
    param = {
        "objective": "reg:absoluteerror",
        "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 7),
        "reg_alpha" : trial.suggest_int("reg_alpha", 0, 10),
        "reg_lambda" : trial.suggest_int("reg_lambda", 0, 10),
        "min_child_weight" : trial.suggest_int("min_child_weight", 1, 5),
        "eval_metric": "mae",
        "tree_method": "hist",        
        "device": "cuda"
    }

    # XGBRegressor 모델 생성
    xgb_model = xgb.XGBRegressor(**param)

    # 교차 검증 (평가 Score : Negetive MAE)
    scores = cross_val_score(xgb_model, X, target, cv=5, scoring="neg_mean_absolute_error")

    # MAE 계산
    mae = -scores.mean()

    return mae

In [17]:
# Optuna Log 설정
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Optuna를 통한 학습 시작
study = optuna.create_study(direction = "minimize")
study.optimize(xgb_objective, n_trials = 300, show_progress_bar = True)

# 최적 하이퍼파라미터 출력
print("Best hyperparameters:", study.best_params)
best_param = study.best_params

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.


Best trial: 85. Best value: 0.740539: 100%|██████████| 300/300 [3:34:25<00:00, 42.88s/it]  

Best hyperparameters: {'n_estimators': 673, 'colsample_bytree': 0.5971186186168796, 'subsample': 0.5136352469505031, 'learning_rate': 0.030781370564451554, 'max_depth': 4, 'reg_alpha': 7, 'reg_lambda': 8, 'min_child_weight': 1}





In [19]:
# Test 데이터 Smiles 추출 및 Feature 변형
test_smile_list = test_df["Canonical_Smiles"]

test_X = extract_features(test_smile_list)

Failed to featurize datapoint 0, O=C(c1ccc(=O)[nH]c1)N(CC1CCCO1)C1(C(=O)NC2CCCC2)CCCCC1. Appending empty array
Exception message: Expecting value: line 1 column 1 (char 0)
Failed to featurize datapoint 1, CC(C)c1cnc(Cn2ccnc2-c2cncc(Br)c2)o1. Appending empty array
Exception message: Expecting value: line 1 column 1 (char 0)
Failed to featurize datapoint 2, Cc1[nH]c(=O)[nH]c(=O)c1S(=O)(=O)NCC(Cc1ccccc1F)c1ccccc1. Appending empty array
Exception message: Expecting value: line 1 column 1 (char 0)
Failed to featurize datapoint 3, Cl.Cn1cc(-c2cc(CN3CCc4nc[nH]c4C34CCOCC4)on2)cn1. Appending empty array
Exception message: Expecting value: line 1 column 1 (char 0)
Failed to featurize datapoint 4, CC(C)OC(=O)c1ccc(N2CC(C(=O)OCc3ccc(Br)cc3)CC2=O)cc1. Appending empty array
Exception message: Expecting value: line 1 column 1 (char 0)
Failed to featurize datapoint 5, CN(CCn1nc2ccccn2c1=O)c1ccccc1. Appending empty array
Exception message: Expecting value: line 1 column 1 (char 0)
Failed to featurize d

(100, 2048) (100, 300) (100, 1613) (100, 208) (100, 881)


In [20]:
test_X.shape

(100, 5050)

In [29]:
# Optuna를 통해 알아낸 최적의 하이퍼파라미터로 모델 Fitting
xgb_final_model = xgb.XGBRegressor(**best_param, random_state = 7)

xgb_final_model.fit(X, target)

# Test 데이터 예측
y_pred = xgb_final_model.predict(test_X)


In [30]:
y_pred = scaler.inverse_transform(y_pred.reshape(-1, 1))

y_pred


array([[46.18877  ],
       [43.175323 ],
       [32.856003 ],
       [29.890623 ],
       [37.724716 ],
       [18.582447 ],
       [31.320349 ],
       [23.455908 ],
       [41.460674 ],
       [14.607241 ],
       [22.193247 ],
       [18.251143 ],
       [ 8.996847 ],
       [26.442142 ],
       [35.2265   ],
       [10.874701 ],
       [ 7.1688848],
       [44.484837 ],
       [43.701847 ],
       [36.242737 ],
       [31.895702 ],
       [28.64475  ],
       [56.040295 ],
       [29.644325 ],
       [50.43642  ],
       [44.48024  ],
       [54.907593 ],
       [27.423786 ],
       [35.670826 ],
       [30.181362 ],
       [25.27139  ],
       [20.65435  ],
       [27.640236 ],
       [57.920574 ],
       [22.020653 ],
       [17.548271 ],
       [61.058147 ],
       [38.431732 ],
       [35.124847 ],
       [27.419645 ],
       [42.13682  ],
       [ 9.603767 ],
       [27.034613 ],
       [29.844797 ],
       [46.019863 ],
       [30.194885 ],
       [25.21474  ],
       [38.12

In [31]:
# Test 데이터 예측 결과 -> CSV 파일로 저장
sample_df["Inhibition"] = y_pred

print(sample_df)

sample_df.to_csv('xgb_submission.csv', index = False)

          ID  Inhibition
0   TEST_000   46.188770
1   TEST_001   43.175323
2   TEST_002   32.856003
3   TEST_003   29.890623
4   TEST_004   37.724716
..       ...         ...
95  TEST_095   40.261505
96  TEST_096   52.063114
97  TEST_097   54.674820
98  TEST_098   39.347569
99  TEST_099   46.425415

[100 rows x 2 columns]


In [34]:
# Feature 중요도 출력
feature_imp = []

#    MACCS      Circular     mol2vec      mordred      rdkit      pubchem
# (1681, 167) (1681, 2048) (1681, 300) (1681, 1613) (1681, 208) (1681, 881)
feature_size = [2048, 300, 1613, 210, 881, 0]


feature_imp_cur = []
running_size = 0

for size in feature_size:
    feature_imp_cur.append(
        np.sum(xgb_final_model.feature_importances_[running_size: running_size + size])
    )
    running_size += size
    feature_imp.append(feature_imp_cur)

print(
    feature_imp,
    # f"maccskeys: {feature_imp[0]*100:.2f}% ",
    # f"circular: {feature_imp[1]*100:.2f}% ",
    # f"mol2vec: {feature_imp[2]*100:.2f}% ",
    # f"mordred: {feature_imp[3]*100:.2f}% ",
    # f"rdkit: {feature_imp[4]*100:.2f}% ",
)

[[0.032903854, 0.18264946, 0.7022285, 0.082218215, 0.0, 0.0], [0.032903854, 0.18264946, 0.7022285, 0.082218215, 0.0, 0.0], [0.032903854, 0.18264946, 0.7022285, 0.082218215, 0.0, 0.0], [0.032903854, 0.18264946, 0.7022285, 0.082218215, 0.0, 0.0], [0.032903854, 0.18264946, 0.7022285, 0.082218215, 0.0, 0.0], [0.032903854, 0.18264946, 0.7022285, 0.082218215, 0.0, 0.0]]
