In [2]:
import catboost as cb

import optuna

from mordred import Calculator, descriptors

import numpy as np
import pandas as pd

from tqdm import tqdm

from MapLight import *

from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import cross_val_score

In [3]:
X_TRAIN_PATH = "./combined_features.csv"
TRAIN_PATH = "./data/train.csv"
TEST_PATH = "./data/test.csv"
SUBMISSION_PATH = "./data/sample_submission.csv"

In [4]:
X_train = pd.read_csv(X_TRAIN_PATH, header = None)
train_df = pd.read_csv(TRAIN_PATH)
test_df = pd.read_csv(TEST_PATH)
submission_df = pd.read_csv(SUBMISSION_PATH)

In [5]:
RDLogger.DisableLog('rdApp.*')

smiles = test_df["Canonical_Smiles"]

transformer = PretrainedDGLTransformer(kine = "gin_supervised_masking", dtype = float)
mordred_calc = Calculator(descriptors, ignore_3D=True)

mordred_list = []
avalon_list = []
morgan_list = []
# erg_list = []
rdkit_list = []
gin_list = []

for smile in tqdm(smiles):
    mol = Chem.MolFromSmiles(smile)
    
    avalon_list.append(avalon_fingerprint(mol).ToList())
    morgan_list.append(morgan_fingerprint(mol).ToList())
    # erg_list.append(erg_fingerprint(mol).tolist())
    rdkit_list.append(rdkit_features(mol))
    gin_list.append(gin_supervised_masking(transformer, mol).tolist())
    mordred_list.append(mordred_calc(mol))

avalon_list = np.array(avalon_list)
morgan_list = np.array(morgan_list)
# erg_list = np.array(erg_list)
rdkit_list = np.array(rdkit_list)
gin_list = np.squeeze(np.array(gin_list), axis = 1)
mordred_list = np.array(mordred_list)

print(avalon_list.shape, morgan_list.shape, rdkit_list.shape, gin_list.shape, mordred_list.shape)

test_combined = pd.DataFrame(np.concatenate((avalon_list, morgan_list, rdkit_list, gin_list, mordred_list), axis = 1), index = None)

print(test_combined.shape)

100%|██████████| 100/100 [00:13<00:00,  7.28it/s]

(100, 1024) (100, 1024) (100, 200) (100, 300) (100, 1613)
(100, 4161)





In [6]:
X_test = test_combined

y_scaler = scaler(log = False)
y_scaler.fit(train_df["Inhibition"].values)
train_df["Inhibition"] = y_scaler.transform(train_df["Inhibition"].values)

y_train = train_df["Inhibition"]

def map_objective(trial):
    param = {
        "loss_function": "RMSE",
        "iterations": trial.suggest_int("iterations", 50, 700),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "depth": trial.suggest_int("depth", 3, 10),
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.0, 1.0),
        "random_strength": trial.suggest_float("random_strength", 0.001, 10.0, log=True),
        "border_count": trial.suggest_int("border_count", 32, 64),
        "random_seed": trial.suggest_int("random_seed", 1, 6),
        "task_type": "GPU",  # or "CPU"
        "devices": "0"
    }

    cat_model = cb.CatBoostRegressor(**param, verbose = 0)

    scores = cross_val_score(cat_model, X_train, y_train, cv = 5, scoring = 'neg_mean_absolute_error')
    
    mae = -scores.mean()

    # MAE 평균 값 반환
    return mae

In [None]:
# Optuna Log 설정
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Optuna를 통한 학습 시작
study = optuna.create_study(direction = "minimize")
study.optimize(map_objective, n_trials = 300, show_progress_bar = True)

# 최적 하이퍼파라미터 출력
print("Best hyperparameters:", study.best_params)
best_param = study.best_params

Best trial: 8. Best value: 0.728209:  10%|▉         | 29/300 [44:44<3:13:37, 42.87s/it]  

In [None]:
final_cat_model = cb.CatBoostRegressor(**best_param)

final_cat_model.fit(X_train, y_train)

y_pred = y_scaler.inverse_transform(final_cat_model.predict(X_train).reshape(-1, 1)).reshape(-1)

y_test_pred = y_scaler.inverse_transform(final_cat_model.predict(X_test).reshape(-1, 1)).reshape(-1)

print(f"MAE Loss : {mean_absolute_error(y_scaler.inverse_transform(np.array(y_train).reshape(-1, 1)), y_pred):.2f}")

0:	learn: 0.9791990	total: 30.1ms	remaining: 8.1s
1:	learn: 0.9627693	total: 40.6ms	remaining: 5.43s
2:	learn: 0.9484678	total: 49.8ms	remaining: 4.43s
3:	learn: 0.9354621	total: 60.6ms	remaining: 4.03s
4:	learn: 0.9256396	total: 71ms	remaining: 3.77s
5:	learn: 0.9160786	total: 80.8ms	remaining: 3.56s
6:	learn: 0.9074439	total: 103ms	remaining: 3.87s
7:	learn: 0.9000996	total: 113ms	remaining: 3.69s
8:	learn: 0.8920691	total: 123ms	remaining: 3.58s
9:	learn: 0.8856254	total: 134ms	remaining: 3.47s
10:	learn: 0.8796539	total: 148ms	remaining: 3.48s
11:	learn: 0.8736952	total: 161ms	remaining: 3.46s
12:	learn: 0.8680036	total: 173ms	remaining: 3.42s
13:	learn: 0.8623263	total: 183ms	remaining: 3.35s
14:	learn: 0.8583116	total: 194ms	remaining: 3.29s
15:	learn: 0.8524638	total: 204ms	remaining: 3.25s
16:	learn: 0.8471981	total: 214ms	remaining: 3.19s
17:	learn: 0.8429170	total: 224ms	remaining: 3.14s
18:	learn: 0.8395072	total: 234ms	remaining: 3.08s
19:	learn: 0.8353034	total: 244ms	rema

In [None]:
submission_df["Inhibition"] = y_test_pred

submission_df.to_csv('maplight_submission.csv', index = False)

In [None]:
importances = final_cat_model.get_feature_importance(type='PredictionValuesChange')

feature_sizes = [1024, 1024, 200, 300, 1613]
block_names = ['block1', 'block2', 'block3', 'block4', 'block5']

start = 0
block_importances = []

for size in feature_sizes:
    block_imp = np.sum(importances[start:start + size])
    block_importances.append(block_imp)
    start += size

# 정규화 (전체 비중으로 보기 위해 %로 변환)
block_importances = np.array(block_importances)
block_importances_percent = block_importances / block_importances.sum() * 100

for name, imp in zip(block_names, block_importances_percent):
    print(f"{name}: {imp:.2f}%")

block1: 9.73%
block2: 6.04%
block3: 2.61%
block4: 8.52%
block5: 34.02%


In [None]:
# Standard Scaler - 6.83
# Not Standard Scaler = 6.83

# HyperParameter Tuning - 