In [7]:
# calc_classification.py

import sys
# Add the parent directory that contains the `classification` package to the import path.
sys.path.insert(0, "/home/cseomoon/appl/af_analysis-0.1.4/model")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from pycaret.classification import (
    setup, compare_models, finalize_model,
    predict_model, pull, plot_model
)
from sklearn.metrics import (
    roc_auc_score, accuracy_score, f1_score,
    confusion_matrix, classification_report,
    precision_recall_curve, average_precision_score,
    roc_curve
)

from classification.utils.data_loader import load_and_preprocess_data

def main():
    """Train a binary classifier with PyCaret and evaluate it on an external test set.

    Pipeline:
      1. Load the train/test CSVs through ``load_and_preprocess_data``, which
         returns features, labels and query IDs (labels are derived from the
         ``DockQ`` column with a 0.23 threshold).
      2. Run PyCaret ``setup`` with 5-fold GroupKFold (groups = query IDs),
         compare models by AUC and finalize the best one.
      3. Save diagnostic plots of the best model.
      4. Align the external test features to the training features, predict,
         print AUC / accuracy / F1 / confusion matrix / classification report.
      5. Save ``pr_curve_external.png`` and ``roc_curve_external.png`` in the
         current working directory.

    Raises:
        ValueError: if the prediction frame has no probability/score column
            (e.g. the selected estimator exposes no ``predict_proba``) or no
            predicted-label column.
    """
    # 1) Input file paths
    train_fp = "/home/cseomoon/appl/af_analysis-0.1.4/model/classification/data/train/AbNb_final_h3_l3_plddt_20250522.csv"
    test_fp  = "/home/cseomoon/appl/af_analysis-0.1.4/model/classification/data/test/ABAG_final_h3_plddt_20250522.csv"

    # 2) Load data and build labels
    X_train, y_train, groups = load_and_preprocess_data(
        train_fp,
        target_column="DockQ",
        threshold=0.23,
        query_id_column="query"
    )
    X_test, y_test, _ = load_and_preprocess_data(
        test_fp,
        target_column="DockQ",
        threshold=0.23,
        query_id_column="query"
    )

    # 3) Training frame = features + target column
    train_df = X_train.copy()
    train_df["target"] = y_train.values

    # 4) PyCaret setup (5-fold GroupKFold; CPU only, so use_gpu=False)
    setup(
        data=train_df,
        target="target",
        session_id=42,
        fold_strategy="groupkfold",
        fold=5,
        fold_groups=groups,
        normalize=True,
        use_gpu=False,
        verbose=False,
        log_experiment=False
    )

    # 5) Compare models and pick the best by AUC
    top3 = compare_models(n_select=3, sort="AUC")
    print("=== Top 3 모델 (내부 5-Fold CV) ===")
    print(pull().head(5))

    best = top3[0]
    final_model = finalize_model(best)

    # 6) Internal diagnostic plots (saved to files).
    # Plot the pre-finalize estimator: finalize_model refits on the full data
    # including PyCaret's hold-out split, so plotting `final_model` would score
    # it on rows it was trained on and give optimistic curves.
    plot_model(best, plot="auc", save=True)
    plot_model(best, plot="pr", save=True)
    plot_model(best, plot="confusion_matrix", save=True)
    plot_model(best, plot="feature", save=True)

    # 7) Align external test features to the training features.
    # Keep the training column order (Index.difference would sort it) and make
    # the zero-fill of train-only features visible instead of silent.
    feature_cols = [c for c in train_df.columns if c != "target"]
    missing_cols = [c for c in feature_cols if c not in X_test.columns]
    if missing_cols:
        print(
            f"WARNING: {len(missing_cols)} training feature(s) missing from the "
            f"test set were filled with 0: {missing_cols}"
        )
    test_aligned = X_test.reindex(columns=feature_cols, fill_value=0)

    # 8) Predict on the external test set.
    # raw_score=True yields one probability column per class; without it the
    # single score column is the probability of the *predicted* class, which is
    # not a valid ranking score for ROC / PR analysis.
    pred = predict_model(final_model, data=test_aligned, raw_score=True)

    # 9) Locate the probability and label columns PyCaret appended
    new_cols = [c for c in pred.columns if c not in test_aligned.columns]

    # Probability columns
    prob_candidates = [c for c in new_cols if any(kw in c.lower() for kw in ("score","prob","proba"))]
    if not prob_candidates:
        raise ValueError(f"확률 컬럼을 찾을 수 없습니다: {new_cols}")
    # Prefer the explicit positive-class column (suffix "_1"); otherwise fall
    # back to the last candidate, as before.
    positive_cols = [c for c in prob_candidates if c.endswith("_1")]
    prob_col = positive_cols[0] if positive_cols else prob_candidates[-1]
    y_pred_proba = pred[prob_col].values

    # Predicted-label column
    label_candidates = [c for c in new_cols if c not in prob_candidates]
    if not label_candidates:
        raise ValueError(f"예측 라벨 컬럼을 찾을 수 없습니다: {new_cols}")
    y_pred = pred[label_candidates[0]].values

    # 10) External test metrics
    auc  = roc_auc_score(y_test, y_pred_proba)
    acc  = accuracy_score(y_test, y_pred)
    f1   = f1_score(y_test, y_pred)
    cm   = confusion_matrix(y_test, y_pred)
    report = classification_report(y_test, y_pred)

    print("\n=== External Test Set Evaluation ===")
    print(f"AUC      : {auc:.4f}")
    print(f"Accuracy : {acc:.4f}")
    print(f"F1-score : {f1:.4f}")
    print("\nConfusion Matrix:")
    print(cm)
    print("\nClassification Report:")
    print(report)

    # 11) Save the external precision-recall curve
    precision, recall, _ = precision_recall_curve(y_test, y_pred_proba)
    pr_auc = average_precision_score(y_test, y_pred_proba)
    plt.figure()
    plt.plot(recall, precision, label=f"PR AUC={pr_auc:.2f}")
    # Baseline for a PR curve is the positive-class prevalence.
    plt.hlines(y=y_test.mean(), xmin=0, xmax=1, linestyles="--", label="baseline")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.title("Precision–Recall Curve (External Test)")
    plt.legend()
    plt.grid()
    plt.savefig("pr_curve_external.png")
    plt.close()

    # 12) Save the external ROC curve
    fpr, tpr, _ = roc_curve(y_test, y_pred_proba)
    plt.figure()
    plt.plot(fpr, tpr, label=f"ROC AUC={auc:.2f}")
    plt.plot([0,1], [0,1], linestyle="--", label="chance")
    plt.xlabel("False Positive Rate")
    plt.ylabel("True Positive Rate")
    plt.title("ROC Curve (External Test)")
    plt.legend()
    plt.grid()
    plt.savefig("roc_curve_external.png")
    plt.close()

# Entry point: run the pipeline only when this module is executed as the main
# program (not when it is imported).
if __name__ == "__main__":
    main()

Original data shape: (3650, 89)
Class distribution before NaN drop (DockQ >= 0.23): 0 (Negative) = 2529, 1 (Positive) = 1121
'LIS' column not found in the data.
Identified 67 potential feature columns.
Checking for NaN values in potential feature columns...
Dropped 3 rows containing NaN values in one or more feature columns.
Processed Features (X) shape after NaN drop: (3647, 67)
Processed Target (y) shape after NaN drop: (3647,)
Processed Query IDs shape after NaN drop: (3647,)
Class distribution after NaN drop (DockQ >= 0.23): 0 (Negative) = 2526, 1 (Positive) = 1121
Original data shape: (1650, 84)
Class distribution before NaN drop (DockQ >= 0.23): 0 (Negative) = 1022, 1 (Positive) = 628
Found 'LIS' column in the data. It will be dropped from features.
Identified 61 potential feature columns.
Checking for NaN values in potential feature columns...
Dropped 2 rows containing NaN values in one or more feature columns.
Processed Features (X) shape after NaN drop: (1648, 61)
Processed Ta

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8825,0.935,0.7561,0.835,0.7928,0.709,0.7117,0.106
nb,Naive Bayes,0.8889,0.9288,0.7861,0.8401,0.8109,0.7297,0.7318,0.596
gbc,Gradient Boosting Classifier,0.8589,0.928,0.778,0.7559,0.7596,0.6594,0.6651,1.028
xgboost,Extreme Gradient Boosting,0.8631,0.926,0.7695,0.7668,0.7632,0.6659,0.6695,0.094
ridge,Ridge Classifier,0.8899,0.9234,0.7817,0.8226,0.801,0.7226,0.724,0.57
lr,Logistic Regression,0.8823,0.9233,0.8361,0.7871,0.8053,0.7194,0.7255,0.604
rf,Random Forest Classifier,0.8801,0.9227,0.7619,0.8191,0.7877,0.7023,0.7051,0.688
lda,Linear Discriminant Analysis,0.8884,0.9217,0.7856,0.8184,0.8007,0.721,0.7224,0.024
ada,Ada Boost Classifier,0.8552,0.9166,0.8,0.7395,0.76,0.6572,0.6643,0.252
svm,SVM - Linear Kernel,0.8427,0.906,0.784,0.7173,0.7455,0.6292,0.6337,0.594


Processing:   0%|          | 0/71 [00:00<?, ?it/s]

KeyboardInterrupt: 