In [1]:
import sys
sys.path.append('../src')
import pandas as pd

import models, data_preprocessing


In [2]:
X,y=data_preprocessing.load_data()

In [3]:
train_idx, test_idx=data_preprocessing.split_indices(y)

In [4]:
X_train, X_test, y_train, y_test=data_preprocessing.make_splits(X,y,train_idx,test_idx)

In [5]:
print(y_train.iloc(0))

<pandas.core.indexing._iLocIndexer object at 0x0000011BC66245F0>


In [6]:
y_train_list=y_train.to_list()
list=[]
sum=0
for i in range (1,4):
    for j in range(len(y_train_list)):
        if y_train_list[j] == i:
            sum+=1
    list.append(sum)
    sum=0

print(f'train set 내 1번 클래스 수: {list[0]}')
print(f'train set 내 2번 클래스 수: {list[1]}')
print(f'train set 내 3번 클래스 수: {list[2]}')

train set 내 1번 클래스 수: 1324
train set 내 2번 클래스 수: 236
train set 내 3번 클래스 수: 140


In [7]:
## SMOTE 활용을 통한 train set 내 클래스 불균형 해소

X_train,y_train=data_preprocessing.smote_minority_class(X_train,y_train,2,650)
X_train,y_train=data_preprocessing.smote_minority_class(X_train,y_train,3,650)

[SMOTE] class 2: 236개 → 650개로 늘리기 (synthetic 414개).
[SMOTE] class 3: 140개 → 650개로 늘리기 (synthetic 510개).


In [8]:
## logistic model input 스케일링
X_train_logisitc, X_test_logistic = data_preprocessing.scale_train_test_np(
    X_train, X_test
)

logistic_model=models.get_multinomial_logistic()
logistic_model.fit(X_train_logisitc,y_train)

## xgboost model input y값 인덱싱
y_train_xgb = (y_train - 1).astype(int)
y_test_xgb  = (y_test  - 1).astype(int)

xgboost_model = models.get_xgb_multiclass()
xgboost_model.fit(X_train, y_train_xgb)



0,1,2
,objective,'multi:softprob'
,base_score,
,booster,
,callbacks,
,colsample_bylevel,
,colsample_bynode,
,colsample_bytree,
,device,
,early_stopping_rounds,
,enable_categorical,False


성능 출력.

In [22]:
import numpy as np

from sklearn.metrics import (
    confusion_matrix, accuracy_score, f1_score, recall_score, precision_score, roc_auc_score
)

y_pred_log = logistic_model.predict(X_test_logistic)          # 1,2,3 라벨
y_pred_xgb_raw = xgboost_model.predict(X_test)       # 0,1,2 라벨
y_pred_xgb = y_pred_xgb_raw + 1                     



In [23]:
def summarize_multiclass_model(name, y_true, y_pred, labels=(1, 2, 3)):
    acc        = accuracy_score(y_true, y_pred)
    macro_prec = precision_score(y_true, y_pred, average="macro", zero_division=0)
    macro_rec  = recall_score(y_true, y_pred, average="macro")
    macro_f1   = f1_score(y_true, y_pred, average="macro")
    weighted_f1 = f1_score(y_true, y_pred, average="weighted")

    prec_per = precision_score(y_true, y_pred, average=None, labels=labels, zero_division=0)
    rec_per  = recall_score(y_true, y_pred, average=None, labels=labels)
    f1_per   = f1_score(y_true, y_pred, average=None, labels=labels)

    out = {
        "model": name,
        "accuracy": acc,
        "macro_precision": macro_prec,
        "macro_recall": macro_rec,
        "macro_f1": macro_f1,
        "weighted_f1": weighted_f1,
    }
    for i, c in enumerate(labels):
        out[f"precision_cls{c}"] = prec_per[i]
        out[f"recall_cls{c}"]    = rec_per[i]
        out[f"f1_cls{c}"]        = f1_per[i]
    return out

multi_rows = []
multi_rows.append(summarize_multiclass_model("logistic_multinomial", y_test, y_pred_log))
multi_rows.append(summarize_multiclass_model("xgboost_multiclass",   y_test, y_pred_xgb))

perf_multiclass = pd.DataFrame(multi_rows)

print("\n=== Multiclass performance (CTG only) ===")
display(perf_multiclass)

multi_path = "../reports/tables/performance_ctg_multiclass.csv"
perf_multiclass.to_csv(multi_path, index=False)
print(f"Saved multiclass performance to: {multi_path}")


## 이진 분류: high-risk(2,3) vs normal(1)

def summarize_binary_model(name, task_name, y_true, y_pred, y_proba=None):
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()

    se   = tp / (tp + fn) if (tp + fn) > 0 else np.nan  # sensitivity
    sp   = tn / (tn + fp) if (tn + fp) > 0 else np.nan  # specificity
    ppv  = tp / (tp + fp) if (tp + fp) > 0 else np.nan  # PPV
    npv  = tn / (tn + fn) if (tn + fn) > 0 else np.nan  # NPV
    prev = y_true.mean()

    acc = accuracy_score(y_true, y_pred)
    f1  = f1_score(y_true, y_pred)
    auc = roc_auc_score(y_true, y_proba) if y_proba is not None else np.nan

    return {
        "task": task_name,
        "model": name,
        "accuracy": acc,
        "f1": f1,
        "sensitivity": se,
        "specificity": sp,
        "ppv": ppv,
        "npv": npv,
        "prevalence": prev,
        "auc": auc,
    }

y_test_highrisk = np.isin(y_test, [2, 3]).astype(int)
y_pred_log_highrisk = np.isin(y_pred_log, [2, 3]).astype(int)
y_pred_xgb_highrisk = np.isin(y_pred_xgb, [2, 3]).astype(int)


proba_log = logistic_model.predict_proba(X_test_logistic)   
y_proba_log_highrisk = proba_log[:, 1] + proba_log[:, 2]


proba_xgb = xgboost_model.predict_proba(X_test)
y_proba_xgb_highrisk = proba_xgb[:, 1] + proba_xgb[:, 2]

bin_rows = []
bin_rows.append(
    summarize_binary_model(
        name="logistic_multinomial",
        task_name="highrisk_vs_normal",
        y_true=y_test_highrisk,
        y_pred=y_pred_log_highrisk,
        y_proba=y_proba_log_highrisk,
    )
)
bin_rows.append(
    summarize_binary_model(
        name="xgboost_multiclass",
        task_name="highrisk_vs_normal",
        y_true=y_test_highrisk,
        y_pred=y_pred_xgb_highrisk,
        y_proba=y_proba_xgb_highrisk,
    )
)

perf_binary = pd.DataFrame(bin_rows)

print("\n=== Binary performance (high-risk vs normal) ===")
display(perf_binary)

binary_path = "../reports/tables/performance_ctg_binary.csv"
perf_binary.to_csv(binary_path, index=False)
print(f"Saved binary performance to: {binary_path}")


=== Multiclass performance (CTG only) ===


Unnamed: 0,model,accuracy,macro_precision,macro_recall,macro_f1,weighted_f1,precision_cls1,recall_cls1,f1_cls1,precision_cls2,recall_cls2,f1_cls2,precision_cls3,recall_cls3,f1_cls3
0,logistic_multinomial,0.906103,0.796485,0.860679,0.825113,0.909195,0.977918,0.936556,0.95679,0.661538,0.728814,0.693548,0.75,0.916667,0.825
1,xgboost_multiclass,0.957746,0.92669,0.918942,0.922729,0.957432,0.975976,0.981873,0.978916,0.859649,0.830508,0.844828,0.944444,0.944444,0.944444


Saved multiclass performance to: ../reports/tables/performance_ctg_multiclass.csv

=== Binary performance (high-risk vs normal) ===




Unnamed: 0,task,model,accuracy,f1,sensitivity,specificity,ppv,npv,prevalence,auc
0,highrisk_vs_normal,logistic_multinomial,0.934272,0.862745,0.926316,0.936556,0.807339,0.977918,0.223005,0.981301
1,highrisk_vs_normal,xgboost_multiclass,0.967136,0.925532,0.915789,0.981873,0.935484,0.975976,0.223005,0.991445


Saved binary performance to: ../reports/tables/performance_ctg_binary.csv


logistic model에 따른 fetal_health.csv의 변수 별 중요도

In [24]:
df=pd.read_csv('../data/fetal_health.csv')
feature_names = df.columns.drop('fetal_health').tolist()

print("classes_:", logistic_model.classes_)
coef = logistic_model.coef_          # shape: (n_classes, n_features)

coef_df = pd.DataFrame(
    coef,
    columns=feature_names,
    index=[f"class_{int(c)}" for c in logistic_model.classes_]
)

coef_df.head()

classes_: [1. 2. 3.]


Unnamed: 0,baseline value,accelerations,fetal_movement,uterine_contractions,light_decelerations,severe_decelerations,prolongued_decelerations,abnormal_short_term_variability,mean_value_of_short_term_variability,percentage_of_time_with_abnormal_long_term_variability,...,histogram_width,histogram_min,histogram_max,histogram_number_of_peaks,histogram_number_of_zeroes,histogram_mode,histogram_mean,histogram_median,histogram_variance,histogram_tendency
class_1,-0.315755,2.416457,-0.622669,0.667749,-0.050387,0.010148,-1.619698,-1.737885,0.316979,-0.769217,...,-0.014225,-0.330245,-0.604454,0.405133,-0.094596,0.869807,-0.695307,0.297289,-1.417467,-0.14603
class_2,-1.384062,-0.730282,-0.001981,-0.17133,-0.209896,-0.195294,0.163739,-0.238047,-0.132167,-0.26052,...,-0.154996,0.247235,0.071616,0.842194,-0.139351,-0.165845,1.548742,1.705264,0.015722,0.088636
class_3,1.699817,-1.686175,0.62465,-0.496419,0.260283,0.185145,1.455958,1.975932,-0.184813,1.029737,...,0.169221,0.083009,0.532838,-1.247326,0.233948,-0.703962,-0.853434,-2.002553,1.401745,0.057394


In [28]:
classes = logistic_model.classes_ 

rows = []
for class_idx, cls in enumerate(classes):
    for j, fname in enumerate(feature_names):
        beta = coef[class_idx, j]
        rows.append({
            "class": int(cls),
            "feature": fname,
            "coef": beta,
            "odds_ratio": float(np.exp(beta)),
            "abs_coef": float(abs(beta)),
        })

log_importance = pd.DataFrame(rows)

print("\n=== Top 10 features for class 3 (pathological) by |coef| ===")
display(
    log_importance[log_importance['class']== 3].sort_values("abs_coef", ascending=False)
                  .head(10)
)

log_path = "../reports/tables/logistic_feature_importance_long.csv"
log_importance.to_csv(log_path, index=False)
print(f"Saved logistic feature importance to: {log_path}")


=== Top 10 features for class 3 (pathological) by |coef| ===


Unnamed: 0,class,feature,coef,odds_ratio,abs_coef
60,3,histogram_median,-2.002553,0.13499,2.002553
49,3,abnormal_short_term_variability,1.975932,7.213342,1.975932
42,3,baseline value,1.699817,5.472947,1.699817
43,3,accelerations,-1.686175,0.185227,1.686175
48,3,prolongued_decelerations,1.455958,4.288592,1.455958
61,3,histogram_variance,1.401745,4.062283,1.401745
56,3,histogram_number_of_peaks,-1.247326,0.287272,1.247326
51,3,percentage_of_time_with_abnormal_long_term_var...,1.029737,2.800331,1.029737
59,3,histogram_mean,-0.853434,0.42595,0.853434
58,3,histogram_mode,-0.703962,0.494622,0.703962


Saved logistic feature importance to: ../reports/tables/logistic_feature_importance_long.csv
