# 성별별 비교 (6가지 데이터 버전 × 전체 모델)

**6가지 경우:**
- 로그 없음: 결측 제거 없음 / 50% 초과 제거 / 80% 초과 제거
- 로그 변환: 결측 제거 없음 / 50% 초과 제거 / 80% 초과 제거

각 경우별로 저장된 모델(로지스틱, 경사하강법, KNN, SVM, 랜덤포레스트, XGBoost, LightGBM, CatBoost)을 불러와
**전체·남·여** test AUC를 비교합니다.

**사전 조건:** 3~8번 폴더에서 해당 버전의 `new_*.ipynb`를 실행해 `results/*.pkl`이 있어야 합니다.

## 라이브러리 및 경로

In [ ]:
import os
import pickle
import numpy as np
import pandas as pd
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import matplotlib.pyplot as plt
import seaborn as sb

# Root directory of the project checkout; every path below is derived from it.
# This is a raw string, so each backslash is written exactly once (doubling
# them inside r'...' would put two literal separators into the path).
PROJECT_ROOT = r'C:\itwill_bigdata_final_project-main\itwill_bigdata_final_project'

# CSV file that the data-loading step reads.
CSV_PATH = os.path.join(PROJECT_ROOT, '1. 초기 데이터 전처리', '3.coding_book_mapping.csv')

# Short case key -> folder whose 'results' sub-directory is searched for the
# pickled results of that case.
FOLDERS = {
    'nodrop_nolog': os.path.join(PROJECT_ROOT, '3. 결측 변수 제거 없이 분석 진행'),
    '50_nolog': os.path.join(PROJECT_ROOT, '4. 결측 50% 초과 변수 제거 분석'),
    '80_nolog': os.path.join(PROJECT_ROOT, '5. 결측 80% 초과 변수 제거 분석'),
    'nodrop_log': os.path.join(PROJECT_ROOT, '6. 로그변환_결측제거없음'),
    '50_log': os.path.join(PROJECT_ROOT, '7. 로그변환_결측50초과제거'),
    '80_log': os.path.join(PROJECT_ROOT, '8. 로그변환_결측80초과제거'),
}

# Base names (without the '.pkl' extension) of the result files looked up in
# each case folder.
PKL_NAMES = [
    'new_로지스틱',
    'new_경사하강법',
    'new_KNN',
    'new_SVM',
    'new_랜덤포레스트',
    'new_XGBoost',
    'new_LightGBM',
    'new_CatBoost',
]


## 데이터 로딩 (3가지 버전: 결측 제거 없음 / 50% / 80%)

In [ ]:
from pandas import read_csv

# Names of the columns that the loader casts to pandas 'category' dtype.
# The loader only casts the names that actually exist in the CSV, so this
# list may safely contain columns that are absent from a given file.
categorical_cols = [
    'w09_fam1', 'w09_fam2', 'w09edu', 'w09gender1', 'w09marital',
    'w09edu_s', 'w09ecoact_s', 'w09enu_type',
    'w09ba069', 'w09bp1', 'w09c152', 'w09c001', 'w09c003', 'w09c005',
    # w09chronic_a ... w09chronic_m
    *[f'w09chronic_{letter}' for letter in 'abcdefghijklm'],
    'w09c056', 'w09c068', 'w09c081', 'w09c082', 'w09c085', 'w09c102',
    'w09smoke', 'w09alc', 'w09addic', 'w09c550',
    'w09f001type', 'w09g031',
    'w09cadd_19',
    # w09c142 ... w09c151
    *[f'w09c{number}' for number in range(142, 152)],
]

def load_and_split(threshold_pct=None):
    """Read the CSV at CSV_PATH and return a stratified train/test split.

    Parameters
    ----------
    threshold_pct : float or None
        Despite the name, this is compared against the per-column missing
        *fraction* (``isnull().mean()``, i.e. 0.5 rather than 50). Columns
        whose fraction is strictly greater are dropped; the target column is
        never dropped. ``None`` keeps every column.

    Returns
    -------
    tuple
        ``(x_train, x_test, y_train, y_test)`` produced by
        ``train_test_split`` with ``test_size=0.25``, ``random_state=52`` and
        stratification on the target.
    """
    target = 'dependent_ecotype'
    frame = read_csv(CSV_PATH, encoding='utf-8')

    # Cast only those categorical columns that are present in this CSV.
    present_cats = [name for name in categorical_cols if name in frame.columns]
    frame[present_cats] = frame[present_cats].astype('category')

    # This column is removed unconditionally before any missing-rate filtering.
    frame = frame.drop(columns=['dependent_wage_work'])

    if threshold_pct is not None:
        missing_fraction = frame.isnull().mean()
        too_sparse = missing_fraction.index[missing_fraction > threshold_pct]
        frame = frame.drop(columns=[name for name in too_sparse if name != target])

    # The target and 'work_ability_age' are kept out of the feature matrix
    # (the original code labels them as leakage columns).
    features = frame.drop(columns=[target, 'work_ability_age'], errors='ignore')
    labels = frame[target].astype(int)

    return tuple(
        train_test_split(
            features,
            labels,
            test_size=0.25,
            random_state=52,
            stratify=labels,
        )
    )

# load_and_split returns (x_train, x_test, y_train, y_test); the [1::2] slice
# keeps positions 1 and 3, i.e. only the held-out features and labels.
x_test_nodrop, y_test_nodrop = load_and_split(None)[1::2]
x_test_50, y_test_50 = load_and_split(0.5)[1::2]
x_test_80, y_test_80 = load_and_split(0.8)[1::2]
print(
    'nodrop test:', x_test_nodrop.shape,
    '50% test:', x_test_50.shape,
    '80% test:', x_test_80.shape,
)


## w09gender1: 남/여 기준 서브그룹 AUC 계산

In [ ]:
def get_subgroup_auc(y_true, y_proba, subgroup_series, group_a_val, group_b_val):
    """Compute ROC AUC separately for two subgroups of an evaluation set.

    All three data arguments are matched by position, so they must have the
    same length and row order. ``y_true`` may be a pandas Series or a NumPy
    array; the previous implementation called ``.nunique()`` on it and
    therefore failed with ``AttributeError`` when given an array.

    Parameters
    ----------
    y_true : array-like
        True labels.
    y_proba : array-like
        Predicted scores for the positive class.
    subgroup_series : pandas.Series or numpy.ndarray
        Group membership value for each row.
    group_a_val, group_b_val
        Values of ``subgroup_series`` selecting the rows reported under the
        keys ``'남'`` (male) and ``'여'`` (female) respectively.

    Returns
    -------
    dict
        ``{'남': auc_a, '여': auc_b}``. An entry is ``np.nan`` when the
        subgroup has no rows or contains a single class, because ROC AUC is
        undefined in that case.
    """
    labels = np.asarray(y_true)
    scores = np.asarray(y_proba)

    def _auc_for(value):
        # Convert the comparison result to a plain boolean array so that the
        # indexing below is positional regardless of any pandas index.
        mask = np.asarray(subgroup_series == value, dtype=bool)
        if not mask.any():
            return np.nan
        if np.unique(labels[mask]).size < 2:
            return np.nan
        return roc_auc_score(labels[mask], scores[mask])

    return {'남': _auc_for(group_a_val), '여': _auc_for(group_b_val)}

# Each CASES entry is (display label, FOLDERS key, test features, test labels).
# The list is the cross product of the two log settings with the three
# missing-value settings, ordered with all "no log" cases first.
#
# NOTE(review): the log-transform cases reuse exactly the same test frames as
# the non-log cases; no log transform is applied to them here. Confirm that
# the pickled estimators for those cases perform the transform internally.
_MISSING_VARIANTS = (
    ('결측제거없음', 'nodrop', x_test_nodrop, y_test_nodrop),
    ('결측50%제거', '50', x_test_50, y_test_50),
    ('결측80%제거', '80', x_test_80, y_test_80),
)
_LOG_VARIANTS = (
    ('로그없음', 'nolog'),
    ('로그변환', 'log'),
)

CASES = [
    (f'{missing_label}_{log_label}', f'{missing_key}_{log_key}', features, labels)
    for log_label, log_key in _LOG_VARIANTS
    for missing_label, missing_key, features, labels in _MISSING_VARIANTS
]


In [ ]:
# Column used to split the test set, and the two values that define the groups.
# Set group_b_val to None to compare group_a_val against "everything else".
subgroup_col_name = 'w09gender1'
group_a_val, group_b_val = ('남', '여')  # w09gender1 values: male ('남') / female ('여')

# One row per (case, model) with the overall AUC and the per-group AUCs.
rows = []
for case_label, folder_key, x_test, y_test in CASES:
    folder = FOLDERS[folder_key]
    if subgroup_col_name not in x_test.columns:
        print(f'Skip {case_label}: no column {subgroup_col_name}')
        continue

    subgroup_series = x_test[subgroup_col_name]
    if group_b_val is None:
        # One-vs-rest: reduce the column to a boolean membership flag so the
        # same helper can be used for both modes.
        subgroup_series = subgroup_series == group_a_val
        group_a_val_use, group_b_val_use = True, False
    else:
        group_a_val_use, group_b_val_use = group_a_val, group_b_val

    for pkl_name in PKL_NAMES:
        path = os.path.join(folder, 'results', pkl_name + '.pkl')
        if not os.path.isfile(path):
            # Report the gap instead of silently producing an incomplete table.
            print(f'{case_label} {pkl_name}: result file not found ({path})')
            continue

        # SECURITY: pickle.load can execute arbitrary code; only load result
        # files that were produced by this project's own notebooks.
        with open(path, 'rb') as f:
            data = pickle.load(f)

        est = data.get('estimator') if isinstance(data, dict) else None
        if est is None:
            print(f'{case_label} {pkl_name}: no estimator stored in pickle')
            continue

        try:
            y_proba = est.predict_proba(x_test)[:, 1]
        except Exception as e:
            # Best effort: one failing model must not abort the comparison.
            print(f'{case_label} {pkl_name}: predict_proba failed', e)
            continue

        overall_auc = roc_auc_score(y_test, y_proba)
        # Pass the label Series itself (not .values): the helper inspects the
        # class count of each subgroup, which needs the pandas API.
        sg = get_subgroup_auc(y_test, y_proba, subgroup_series, group_a_val_use, group_b_val_use)
        rows.append({
            'case': case_label,
            'model': pkl_name,
            'AUC_전체': overall_auc,
            '남': sg['남'],
            '여': sg['여'],
        })

# Explicit columns keep the frame well-formed even when no model was evaluated.
df_sub = pd.DataFrame(rows, columns=['case', 'model', 'AUC_전체', '남', '여'])
display(df_sub)  # `display` is not imported here; presumably the IPython/Jupyter built-in


## 요약: 경우별·모델별 AUC (전체 / 남 / 여)

In [ ]:
# Show the full result table and a model x case matrix of the overall AUC.
if df_sub.shape[0] > 0:
    # None lifts the row limit; note this changes the pandas option globally.
    pd.set_option('display.max_rows', None)
    display(df_sub.round(4))

    # pivot_table averages duplicates by default (its default aggfunc is mean).
    pivot_overall = pd.pivot_table(
        df_sub,
        values='AUC_전체',
        index='model',
        columns='case',
    )
    display(pivot_overall.round(4))


## 시각화: 경우별 전체 AUC (모델별)

In [ ]:
# Line plot of the overall test AUC per data version, one line per model.
if len(df_sub) > 0:
    fig, ax = plt.subplots(figsize=(12, 6))

    # Give every case a fixed integer x position in order of first appearance
    # in df_sub. Plotting the raw strings and then overwriting the tick labels
    # could attach labels to the wrong ticks when the first plotted model is
    # missing a case, because the axis would then order categories differently
    # from df_sub['case'].unique().
    case_order = list(df_sub['case'].unique())
    position_of = {case: idx for idx, case in enumerate(case_order)}

    for model in df_sub['model'].unique():
        d = df_sub[df_sub['model'] == model]
        ax.plot(d['case'].map(position_of), d['AUC_전체'], 'o-', label=model)

    # Fix the tick positions before assigning the labels that belong to them.
    ax.set_xticks(list(range(len(case_order))))
    ax.set_xticklabels(case_order, rotation=45, ha='right')
    ax.set_ylabel('AUC')
    ax.set_title('6가지 데이터 버전별 전체 Test AUC (모델별)')
    # Place the legend outside the axes so it does not cover the lines.
    ax.legend(bbox_to_anchor=(1.02, 1), loc='upper left')
    plt.tight_layout()
    plt.show()


## 시각화: 남 vs 여 AUC 비교 (모델·경우별)

In [ ]:
# Grouped bar chart: male vs female test AUC for every (case, model) row.
if df_sub.shape[0] > 0:
    fig, ax = plt.subplots(figsize=(10, 5))
    positions = np.arange(len(df_sub))
    bar_width = 0.35

    # The two bars of a pair sit half a bar width left and right of the tick.
    for shift, column in ((-bar_width / 2, '남'), (bar_width / 2, '여')):
        ax.bar(positions + shift, df_sub[column], width=bar_width, label=column)

    tick_text = df_sub['case'].str.cat(df_sub['model'], sep='_')
    ax.set_xticks(positions)
    ax.set_xticklabels(tick_text, rotation=90, ha='right', fontsize=8)
    ax.set_ylabel('AUC')
    ax.set_title('남 vs 여 Test AUC')
    ax.legend()
    plt.tight_layout()
    plt.show()
