In [1]:
# fingerprints(Morgan + MLP model and AttentiveFP) and descriptors(RDKit 2D + MLP model)
# sota 구현

In [2]:
# -----------------------------------------------------------------------------------
# 파일명       : test.ipynb
# 설명         : 인체 내 약물 대사에 관여하는 CYP3A4 효소 저해 예측모델 개발             
# 작성자       : 이민하
# 작성일       : 2025-07-15
# 
# 사용 모듈    :
# - os                               # 경로 관리
# -----------------------------------------------------------------------------------
# >> 주요 기능
# - 모델의 학습과 평가를 위한 모듈
#
# >> 업데이트 내역
# [2025-07-15] 
# -----------------------------------------------------------------------------------


In [3]:
import pandas as pd
import numpy as np

from rdkit import Chem
from rdkit.Chem import MACCSkeys
from rdkit.Chem import AllChem
from rdkit.Chem import Descriptors

from gensim.models import word2vec
from mol2vec.features import sentences2vec, mol2alt_sentence

from mordred import Calculator, descriptors

from tqdm import tqdm

import xgboost as xgb
import optuna

from sklearn.model_selection import cross_val_score, RandomizedSearchCV

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
TRAIN_PATH = "./data/train.csv"
TEST_PATH = "./data/test.csv"
SAMPLE_PATH = "./data/sample_submission.csv"

In [6]:
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
sample_df = pd.read_csv(SAMPLE_PATH)

train_df

Unnamed: 0,ID,Canonical_Smiles,Inhibition
0,TRAIN_0000,Cl.OC1(Cc2cccc(Br)c2)CCNCC1,12.500000
1,TRAIN_0001,Brc1ccc2OCCc3ccnc1c23,4.450000
2,TRAIN_0002,CC1(CO)CC(=NO1)c2cc(c(F)cc2Cl)[N+](=O)[O-],4.920000
3,TRAIN_0003,Fc1ccc2nc(Nc3cccc(COc4cccc(c4)C(=O)N5CCOCC5)c3...,71.500000
4,TRAIN_0004,CC(C)CC(=O)C1=C(Nc2c(Cl)ccc(Cl)c2C1=O)S(=O)C,18.300000
...,...,...,...
1676,TRAIN_1676,Cc1cc2ncn(CC3CCN(CC3)S(=O)(=O)CCN4C(=O)CCCC4=O...,0.500000
1677,TRAIN_1677,O=C(CN1N=CC=CC1=O)N2Cc3cnc(nc3C2)N4CCOCC4,0.500000
1678,TRAIN_1678,COC1=COC(=CC1=O)C(=O)Nc2cccc3c2ccn3C,0.500000
1679,TRAIN_1679,CC1=CC(=O)N(CCNC(=O)c2nc3nc(C)cc(C)n3n2)C=N1,0.500000


In [7]:
smile_list = train_df["Canonical_Smiles"]

smile_list

0                             Cl.OC1(Cc2cccc(Br)c2)CCNCC1
1                                   Brc1ccc2OCCc3ccnc1c23
2              CC1(CO)CC(=NO1)c2cc(c(F)cc2Cl)[N+](=O)[O-]
3       Fc1ccc2nc(Nc3cccc(COc4cccc(c4)C(=O)N5CCOCC5)c3...
4            CC(C)CC(=O)C1=C(Nc2c(Cl)ccc(Cl)c2C1=O)S(=O)C
                              ...                        
1676    Cc1cc2ncn(CC3CCN(CC3)S(=O)(=O)CCN4C(=O)CCCC4=O...
1677            O=C(CN1N=CC=CC1=O)N2Cc3cnc(nc3C2)N4CCOCC4
1678                 COC1=COC(=CC1=O)C(=O)Nc2cccc3c2ccn3C
1679         CC1=CC(=O)N(CCNC(=O)c2nc3nc(C)cc(C)n3n2)C=N1
1680                CCc1ccc(\C=N\Nc2nn3cnnc3c4ccccc24)cc1
Name: Canonical_Smiles, Length: 1681, dtype: object

In [8]:
target = train_df["Inhibition"]

target

0       12.500000
1        4.450000
2        4.920000
3       71.500000
4       18.300000
          ...    
1676     0.500000
1677     0.500000
1678     0.500000
1679     0.500000
1680    41.700398
Name: Inhibition, Length: 1681, dtype: float64

In [9]:
mol2vec_model = word2vec.Word2Vec.load("model_300dim.pkl")

In [None]:

# ------------------------
# Feature Extraction Utils
# ------------------------

# RDKit descriptor 리스트 정의
rdkit_desc_list = [
    Descriptors.MolWt,
    Descriptors.MolLogP,
    Descriptors.NumHDonors,
    Descriptors.NumHAcceptors,
    Descriptors.TPSA,
    Descriptors.HeavyAtomCount,
    Descriptors.FractionCSP3,
    Descriptors.NumRotatableBonds,
    Descriptors.RingCount
]

# 단일 분자 처리 함수
def extract_features(smiles):
    mol = Chem.MolFromSmiles(smiles)

    # MACCS (167차원 → 첫 번째 index 제거)
    maccs_fp = MACCSkeys.GenMACCSKeys(mol)
    maccs = np.array(list(maccs_fp))[1:]

    # Morgan fingerprint (2048-bit)
    morgan_fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius=2, nBits=2048)
    morgan = np.array(morgan_fp)

    # RDKit descriptors (~9개)
    rdkit_features = np.array([func(mol) for func in rdkit_desc_list])

    # Mordred (~1000~1600개)
    mordred_calc = Calculator(descriptors, ignore_3D=True)
    mordred_desc = mordred_calc(mol)
    mordred_desc = mordred_desc.fill_missing()
    mordred = np.array(mordred_desc)

    # mol2vec (300차원)
    sentence = mol2alt_sentence(mol, radius = 1)
    vecs = sentences2vec([sentence], mol2vec_model, unseen='UNK')
    mol2vec = vecs[0]  # 300 차원 벡터

    # 결합
    combined = np.concatenate([maccs, morgan, rdkit_features, mordred, mol2vec], axis=0)
    return combined

# ------------------------
# Batch 처리 함수
# ------------------------

def generate_feature_matrix(smiles_list):
    feature_list = []
    failed_list = []

    for smi in tqdm(smiles_list):
        features = extract_features(smi)

        if features is not None:
            feature_list.append(features)
        else:
            failed_list.append(smi)

    feature_matrix = np.array(feature_list)
    
    return feature_matrix, failed_list

In [13]:
# Feature matrix 생성
X, failed = generate_feature_matrix(smile_list)

print("Feature shape:", X.shape)  # 예: (3, 3800+ 차원)
print("Failed SMILES:", failed)

  0%|          | 0/1681 [00:00<?, ?it/s]

  return ufunc.reduce(obj, axis, dtype, out, **passkwargs)
100%|██████████| 1681/1681 [03:48<00:00,  7.35it/s]

Feature shape: (1681, 4136)
Failed SMILES: []





In [14]:
def xgb_objective(trial):
    param = {
        "objective": "reg:absoluteerror",
        "n_estimators": trial.suggest_int("n_estimators", 50, 1000),
        "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
        "subsample": trial.suggest_float("subsample", 0.5, 1.0),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "max_depth": trial.suggest_int("max_depth", 3, 7),
        "reg_alpha" : trial.suggest_int("reg_alpha", 0, 10),
        "reg_lambda" : trial.suggest_int("reg_lambda", 0, 10),
        "eval_metric": "mae",
        "tree_method": "gpu_hist",         # 핵심 설정
        "predictor": "gpu_predictor"
    }

    # XGBRegressor 모델 생성
    xgb_model = xgb.XGBRegressor(**param)

    # 교차 검증
    scores = cross_val_score(xgb_model, X, target, cv=5, scoring="neg_mean_absolute_error")

    # RMSE 계산
    rmse = (-scores.mean()) ** 0.5

    return rmse

In [15]:
optuna.logging.set_verbosity(optuna.logging.WARNING)


study = optuna.create_study(direction = "minimize")
study.optimize(xgb_objective, n_trials = 300, show_progress_bar = True, n_jobs = -1)

# 최적 하이퍼파라미터 출력
print("Best hyperparameters:", study.best_params)

# 최적 하이퍼파라미터로 최종 모델 학습
best_param = study.best_params

Best trial: 250. Best value: 4.42819: 100%|██████████| 300/300 [10:07:10<00:00, 121.43s/it]  

Best hyperparameters: {'n_estimators': 909, 'colsample_bytree': 0.6424576561481847, 'subsample': 0.587773151287105, 'learning_rate': 0.01403675186395834, 'max_depth': 5, 'reg_alpha': 9, 'reg_lambda': 8}





In [16]:
test_smile_list = test_df["Canonical_Smiles"]

test_X, failed = generate_feature_matrix(test_smile_list)

  0%|          | 0/100 [00:00<?, ?it/s]

100%|██████████| 100/100 [00:26<00:00,  3.75it/s]


In [17]:
xgb_final_model = xgb.XGBRegressor(**best_param, random_state = 7)

xgb_final_model.fit(X, target)

y_pred = xgb_final_model.predict(test_X)


In [18]:
sample_df["Inhibition"] = y_pred

print(sample_df)

sample_df.to_csv('xgb_submission.csv', index = False)

          ID  Inhibition
0   TEST_000   42.810360
1   TEST_001   46.974598
2   TEST_002   31.942062
3   TEST_003   33.070927
4   TEST_004   36.992577
..       ...         ...
95  TEST_095   41.001945
96  TEST_096   57.167942
97  TEST_097   55.506142
98  TEST_098   41.079956
99  TEST_099   46.902203

[100 rows x 2 columns]
