In [22]:
import catboost as cb

import optuna

from mordred import Calculator, descriptors

import numpy as np
import pandas as pd

from tqdm import tqdm

from MapLight import *

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

In [23]:
X_TRAIN_PATH = "./combined_features.csv"
TRAIN_PATH = "./data/train.csv"
TEST_PATH = "./data/test.csv"
SUBMISSION_PATH = "./data/sample_submission.csv"

In [24]:
X_train = pd.read_csv(X_TRAIN_PATH, header = None)
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
submission_df = pd.read_csv(SUBMISSION_PATH)

In [25]:
RDLogger.DisableLog('rdApp.*')

smiles = test_df["Canonical_Smiles"]

transformer = PretrainedDGLTransformer(kine = "gin_supervised_masking", dtype = float)
mordred_calc = Calculator(descriptors, ignore_3D=True)

mordred_list = []
avalon_list = []
morgan_list = []
# erg_list = []
# rdkit_list = []
gin_list = []

for smile in tqdm(smiles):
    mol = Chem.MolFromSmiles(smile)
    
    avalon_list.append(avalon_fingerprint(mol).ToList())
    morgan_list.append(morgan_fingerprint(mol).ToList())
    # erg_list.append(erg_fingerprint(mol).tolist())
    # rdkit_list.append(rdkit_features(mol))
    gin_list.append(gin_supervised_masking(transformer, mol).tolist())
    mordred_list.append(mordred_calc(mol))

avalon_list = np.array(avalon_list)
morgan_list = np.array(morgan_list)
# erg_list = np.array(erg_list)
# rdkit_list = np.array(rdkit_list)
gin_list = np.squeeze(np.array(gin_list), axis = 1)
mordred_list = np.array(mordred_list)

print(avalon_list.shape, morgan_list.shape, gin_list.shape, mordred_list.shape)

test_combined = pd.DataFrame(np.concatenate((avalon_list, morgan_list, gin_list, mordred_list), axis = 1), index = None)

np.savetxt("test_combined_features.csv", test_combined, delimiter=",")

print(test_combined.shape)

100%|██████████| 100/100 [00:13<00:00,  7.30it/s]


(100, 1024) (100, 1024) (100, 300) (100, 1613)
(100, 3961)


In [15]:
X_train = pd.read_csv("train_features.csv", header = None)
X_test = pd.read_csv("test_features.csv", header = None)

print(X_train.shape, X_test.shape)

y_scaler = scaler(log = False)
y_scaler.fit(train_df["Inhibition"].values)
train_df["Inhibition"] = y_scaler.transform(train_df["Inhibition"].values)

y_train = train_df["Inhibition"]

def map_objective(trial):
    param = {
        "loss_function": "RMSE",
        "iterations": trial.suggest_int("iterations", 50, 450),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "depth": trial.suggest_int("depth", 3, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "random_strength": trial.suggest_float("random_strength", 0.001, 10.0, log=True),
        "border_count": trial.suggest_int("border_count", 32, 64),
        "random_seed": trial.suggest_int("random_seed", 1, 6),
        "task_type": "GPU",  # or "CPU"
        "devices": "0"
    }

    cat_model = cb.CatBoostRegressor(**param, verbose = 0)

    scores = cross_val_score(cat_model, X_train, y_train, cv = 5, scoring = 'neg_mean_absolute_error')
    
    mae = -scores.mean()

    # MAE 평균 값 반환
    return mae

(1681, 5285) (100, 5285)


In [16]:
# Optuna Log 설정
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Optuna를 통한 학습 시작
study = optuna.create_study(direction = "minimize")
study.optimize(map_objective, n_trials = 500, show_progress_bar = True)

# 최적 하이퍼파라미터 출력
print("Best hyperparameters:", study.best_params)
best_param = study.best_params

Best trial: 255. Best value: 0.72208: 100%|██████████| 500/500 [2:53:20<00:00, 20.80s/it]   

Best hyperparameters: {'iterations': 373, 'learning_rate': 0.08668160768656591, 'depth': 5, 'l2_leaf_reg': 9.625925025121687, 'bagging_temperature': 0.957300259026022, 'random_strength': 8.08608251976459, 'border_count': 52, 'random_seed': 3}





In [17]:
final_cat_model = cb.CatBoostRegressor(**best_param)

final_cat_model.fit(X_train, y_train)

y_pred = y_scaler.inverse_transform(final_cat_model.predict(X_train).reshape(-1, 1)).reshape(-1)

y_test_pred = y_scaler.inverse_transform(final_cat_model.predict(X_test).reshape(-1, 1)).reshape(-1)

print(f"MAE Loss : {mean_absolute_error(y_scaler.inverse_transform(np.array(y_train).reshape(-1, 1)), y_pred):.2f}")

0:	learn: 0.9957157	total: 35.7ms	remaining: 13.3s
1:	learn: 0.9878969	total: 59.3ms	remaining: 11s
2:	learn: 0.9863310	total: 83.6ms	remaining: 10.3s
3:	learn: 0.9825824	total: 109ms	remaining: 10.1s
4:	learn: 0.9793970	total: 135ms	remaining: 9.91s
5:	learn: 0.9733595	total: 159ms	remaining: 9.73s
6:	learn: 0.9707211	total: 182ms	remaining: 9.53s
7:	learn: 0.9670299	total: 217ms	remaining: 9.92s
8:	learn: 0.9646504	total: 242ms	remaining: 9.8s
9:	learn: 0.9630650	total: 266ms	remaining: 9.64s
10:	learn: 0.9612254	total: 289ms	remaining: 9.51s
11:	learn: 0.9603285	total: 312ms	remaining: 9.38s
12:	learn: 0.9575868	total: 336ms	remaining: 9.31s
13:	learn: 0.9549409	total: 360ms	remaining: 9.24s
14:	learn: 0.9516928	total: 398ms	remaining: 9.5s
15:	learn: 0.9497049	total: 422ms	remaining: 9.42s
16:	learn: 0.9482831	total: 446ms	remaining: 9.35s
17:	learn: 0.9465351	total: 471ms	remaining: 9.29s
18:	learn: 0.9454976	total: 497ms	remaining: 9.27s
19:	learn: 0.9432666	total: 522ms	remainin

In [18]:
submission_df["Inhibition"] = y_test_pred

submission_df.to_csv('maplight_submission.csv', index = False)

In [21]:
importances = final_cat_model.get_feature_importance(type='PredictionValuesChange')

feature_sizes = [1024, 300, 1613, 300, 2048]
block_names = ['Avalon', 'Gin', 'Mordred', 'Mol2Vec', 'Circular']

start = 0
block_importances = []

for size in feature_sizes:
    block_imp = np.sum(importances[start:start + size])
    block_importances.append(block_imp)
    start += size

# 정규화 (전체 비중으로 보기 위해 %로 변환)
block_importances = np.array(block_importances)
block_importances_percent = block_importances / block_importances.sum() * 100

for name, imp in zip(block_names, block_importances_percent):
    print(f"{name}: {imp:.2f}%")

Avalon: 11.46%
Gin: 30.37%
Mordred: 40.03%
Mol2Vec: 14.66%
Circular: 3.49%


In [20]:
# Standard Scaler - 6.83
# Not Standard Scaler = 6.83

# HyperParameter Tuning - 