In [81]:
pd.set_option('display.max_columns', None) 

In [82]:
# calc_classification.py
import matplotlib as mpl
mpl.rcParams['font.family'] = 'DejaVu Sans'
# 필요하다면 sans-serif 계열을 강제 지정
mpl.rcParams['font.sans-serif'] = ['DejaVu Sans']
import sys
# classification 패키지가 있는 상위 폴더를 직접 지정
sys.path.insert(0, "/home/cseomoon/appl/af_analysis-0.1.4/model")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pycaret.classification import *
from sklearn.metrics import (
    roc_auc_score, accuracy_score, f1_score,
    confusion_matrix, classification_report,
    precision_recall_curve, average_precision_score,
    roc_curve
)

from classification.utils.data_loader import load_and_preprocess_data

In [83]:

# 1) 경로 정의
train_fp = "/home/cseomoon/appl/af_analysis-0.1.4/model/classification/data/train/pipeline_AbNb_native_data.csv"
test_fp  = "/home/cseomoon/appl/af_analysis-0.1.4/model/classification/data/test/pipeline_ABAG_native_data.csv"

# 2) 데이터 로드 및 전처리
X_train, y_train, groups = load_and_preprocess_data(
    train_fp,
    target_column="DockQ",
    threshold=0.23,
    query_id_column="query"
)
X_test, y_test, _ = load_and_preprocess_data(
    test_fp,
    target_column="DockQ",
    threshold=0.23,
    query_id_column="query"
)

# 3) DataFrame 생성
train_df = X_train.copy()
train_df["target"] = y_train.values
test_df = X_test.copy()
test_df["target"] = y_test.values

# 4) PyCaret 설정 (5-Fold 그룹KFold, GPU 사용, 정규화)
clf_setup = setup(
    data=train_df,
    target="target",
    session_id=42,
    fold_strategy="groupkfold",
    fold=5,
    fold_groups=groups,
    normalize=True,
    use_gpu=False,
    verbose=True,
    log_experiment=False
)



Original data shape: (3650, 90)
Class distribution before NaN drop (DockQ >= 0.23): 0 (Negative) = 2529, 1 (Positive) = 1121
Found 'LIS' column in the data. It will be dropped from features.
Identified 71 potential feature columns.
Checking for NaN values in potential feature columns...
Dropped 3 rows containing NaN values in one or more feature columns.
Processed Features (X) shape after NaN drop: (3647, 71)
Processed Target (y) shape after NaN drop: (3647,)
Processed Query IDs shape after NaN drop: (3647,)
Class distribution after NaN drop (DockQ >= 0.23): 0 (Negative) = 2526, 1 (Positive) = 1121
Original data shape: (1650, 90)
Class distribution before NaN drop (DockQ >= 0.23): 0 (Negative) = 1022, 1 (Positive) = 628
Found 'LIS' column in the data. It will be dropped from features.
Identified 71 potential feature columns.
Checking for NaN values in potential feature columns...
Dropped 2 rows containing NaN values in one or more feature columns.
Processed Features (X) shape after NaN

Unnamed: 0,Description,Value
0,Session id,42
1,Target,target
2,Target type,Binary
3,Original data shape,"(3647, 72)"
4,Transformed data shape,"(3647, 72)"
5,Transformed train set shape,"(2552, 72)"
6,Transformed test set shape,"(1095, 72)"
7,Numeric features,71
8,Preprocess,True
9,Imputation type,simple


In [84]:
# add metric to PyCaret
add_metric('pr_ap', 'PR-AP', average_precision_score)

Name                                                             PR-AP
Display Name                                                     PR-AP
Score Function       <pycaret.internal.metrics.EncodedDecodedLabels...
Scorer               make_scorer(average_precision_score, response_...
Target                                                            pred
Args                                                                {}
Greater is Better                                                 True
Multiclass                                                        True
Custom                                                            True
Name: pr_ap, dtype: object

In [85]:
# 5) 모델 비교 및 선택 (전체 모델 대상, 내부 CV 결과 확인)
top5 = compare_models(
    n_select=5,
    sort="AUC"
)
print("=== Top 5 모델 (내부 5-Fold CV) ===")
print(pull().head(5))



Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,PR-AP,TT (Sec)
catboost,CatBoost Classifier,0.8695,0.9302,0.7248,0.7875,0.7528,0.6542,0.657,0.6632,4.648
et,Extra Trees Classifier,0.8723,0.9295,0.7186,0.7906,0.7502,0.655,0.6585,0.6634,0.992
rf,Random Forest Classifier,0.8632,0.9269,0.7306,0.7631,0.7426,0.6385,0.642,0.6458,0.928
lr,Logistic Regression,0.8772,0.9227,0.7896,0.757,0.7703,0.6776,0.6795,0.6722,7.306
nb,Naive Bayes,0.8979,0.9221,0.7543,0.8458,0.7962,0.72,0.7234,0.7148,0.46
lightgbm,Light Gradient Boosting Machine,0.8602,0.9174,0.7282,0.7634,0.7427,0.636,0.6384,0.6483,125.156
ridge,Ridge Classifier,0.887,0.9154,0.726,0.8337,0.775,0.6919,0.696,0.6943,0.896
gbc,Gradient Boosting Classifier,0.8627,0.9143,0.7254,0.7592,0.737,0.6352,0.6391,0.6445,2.408
svm,SVM - Linear Kernel,0.8588,0.9139,0.8111,0.7129,0.7551,0.647,0.6523,0.6415,1.196
lda,Linear Discriminant Analysis,0.8855,0.9086,0.7249,0.8232,0.7694,0.6856,0.6897,0.688,0.43


=== Top 5 모델 (내부 5-Fold CV) ===
                             Model  Accuracy     AUC  Recall   Prec.      F1  \
catboost       CatBoost Classifier    0.8695  0.9302  0.7248  0.7875  0.7528   
et          Extra Trees Classifier    0.8723  0.9295  0.7186  0.7906  0.7502   
rf        Random Forest Classifier    0.8632  0.9269  0.7306  0.7631  0.7426   
lr             Logistic Regression    0.8772  0.9227  0.7896  0.7570  0.7703   
nb                     Naive Bayes    0.8979  0.9221  0.7543  0.8458  0.7962   

           Kappa     MCC   PR-AP  TT (Sec)  
catboost  0.6542  0.6570  0.6632     4.648  
et        0.6550  0.6585  0.6634     0.992  
rf        0.6385  0.6420  0.6458     0.928  
lr        0.6776  0.6795  0.6722     7.306  
nb        0.7200  0.7234  0.7148     0.460  


In [86]:
new_data=test_df.copy()
new_data.drop('target', axis=1, inplace=True)
new_data.head()

Unnamed: 0,fraction_disordered,has_clash,ipTM,pTM,ranking_confidence,iptm_A,iptm_H,iptm_L,ptm_A,ptm_H,...,query_avg_RMSD,scaled_RMSD_ratio,scaled_model_RMSD,scaled_query_RMSD,dG_separated,dSASA_int,nres_int,delta_unsatHbonds,packstat,dG_dSASA_norm
0,0.0,0.0,0.57,0.7,0.6,0.11,0.5,0.5,0.72,0.88,...,39.246214,0.559606,0.056252,0.044806,19.708321,1472.935064,47.0,17.0,0.713741,0.01338
1,0.01,0.0,0.57,0.7,0.61,0.12,0.5,0.5,0.73,0.88,...,39.246214,0.579124,0.060631,0.044806,59.586646,1843.67107,60.0,21.0,0.674735,0.03232
2,0.0,0.0,0.55,0.69,0.58,0.07,0.48,0.48,0.74,0.89,...,39.246214,0.401814,0.030546,0.044806,56.250134,1609.165063,53.0,11.0,0.741571,0.034956
3,0.0,0.0,0.57,0.7,0.6,0.12,0.5,0.5,0.73,0.87,...,39.246214,0.609161,0.068129,0.044806,27.314403,1546.077081,54.0,21.0,0.729679,0.017667
4,0.01,0.0,0.57,0.7,0.6,0.11,0.5,0.5,0.75,0.89,...,39.246214,0.419686,0.032811,0.044806,20.159038,1206.538153,43.0,6.0,0.749578,0.016708


In [87]:
# 6) 최상위 모델 최종화
best = top5[0]
final_model = finalize_model(best)

predictions = predict_model(final_model, data=new_data)
predictions.head()


Unnamed: 0,fraction_disordered,has_clash,ipTM,pTM,ranking_confidence,iptm_A,iptm_H,iptm_L,ptm_A,ptm_H,...,scaled_model_RMSD,scaled_query_RMSD,dG_separated,dSASA_int,nres_int,delta_unsatHbonds,packstat,dG_dSASA_norm,prediction_label,prediction_score
0,0.0,0.0,0.57,0.7,0.6,0.11,0.5,0.5,0.72,0.88,...,0.056252,0.044806,19.708321,1472.935059,47.0,17.0,0.713741,0.01338,0,0.9715
1,0.01,0.0,0.57,0.7,0.61,0.12,0.5,0.5,0.73,0.88,...,0.060631,0.044806,59.586647,1843.671021,60.0,21.0,0.674735,0.03232,0,0.9835
2,0.0,0.0,0.55,0.69,0.58,0.07,0.48,0.48,0.74,0.89,...,0.030546,0.044806,56.250134,1609.165039,53.0,11.0,0.741571,0.034956,0,0.9957
3,0.0,0.0,0.57,0.7,0.6,0.12,0.5,0.5,0.73,0.87,...,0.068129,0.044806,27.314404,1546.077026,54.0,21.0,0.729679,0.017667,0,0.9827
4,0.01,0.0,0.57,0.7,0.6,0.11,0.5,0.5,0.75,0.89,...,0.032811,0.044806,20.159039,1206.538208,43.0,6.0,0.749578,0.016708,0,0.9842


In [88]:
X_test

Unnamed: 0,fraction_disordered,has_clash,ipTM,pTM,ranking_confidence,iptm_A,iptm_H,iptm_L,ptm_A,ptm_H,...,query_avg_RMSD,scaled_RMSD_ratio,scaled_model_RMSD,scaled_query_RMSD,dG_separated,dSASA_int,nres_int,delta_unsatHbonds,packstat,dG_dSASA_norm
0,0.00,0.0,0.57,0.70,0.60,0.11,0.50,0.50,0.72,0.88,...,39.246214,0.559606,0.056252,0.044806,19.708321,1472.935064,47.0,17.0,0.713741,0.013380
1,0.01,0.0,0.57,0.70,0.61,0.12,0.50,0.50,0.73,0.88,...,39.246214,0.579124,0.060631,0.044806,59.586646,1843.671070,60.0,21.0,0.674735,0.032320
2,0.00,0.0,0.55,0.69,0.58,0.07,0.48,0.48,0.74,0.89,...,39.246214,0.401814,0.030546,0.044806,56.250134,1609.165063,53.0,11.0,0.741571,0.034956
3,0.00,0.0,0.57,0.70,0.60,0.12,0.50,0.50,0.73,0.87,...,39.246214,0.609161,0.068129,0.044806,27.314403,1546.077081,54.0,21.0,0.729679,0.017667
4,0.01,0.0,0.57,0.70,0.60,0.11,0.50,0.50,0.75,0.89,...,39.246214,0.419686,0.032811,0.044806,20.159038,1206.538153,43.0,6.0,0.749578,0.016708
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1645,0.01,0.0,0.21,0.65,0.30,0.13,0.48,0.48,0.74,0.86,...,78.955890,0.307621,0.005123,0.011457,893.272378,1979.008027,103.0,15.0,0.550770,0.451374
1646,0.01,0.0,0.21,0.65,0.31,0.14,0.48,0.48,0.74,0.86,...,78.955890,0.532362,0.013022,0.011457,223.208736,1584.054934,63.0,17.0,0.608707,0.140910
1647,0.01,0.0,0.21,0.65,0.30,0.15,0.49,0.48,0.74,0.86,...,78.955890,0.570071,0.015135,0.011457,1050.032869,1595.052750,64.0,7.0,0.610768,0.658306
1648,0.01,0.0,0.21,0.65,0.30,0.14,0.49,0.48,0.74,0.86,...,78.955890,0.531169,0.012960,0.011457,19.805114,1530.011186,65.0,3.0,0.673359,0.012944


In [89]:
y_test

0       0
1       0
2       0
3       0
4       0
       ..
1645    0
1646    0
1647    0
1648    0
1649    0
Name: __target__DockQ, Length: 1648, dtype: int64

In [90]:
# interpret_model(final_model, X_new_sample=X_test,y_new_sample=y_test)

In [91]:
# lb = get_leaderboard()