In [None]:
# 1. Data loading

In [None]:
# 2. Automatic variable-type detection (rule based)
###############################################
# Type classification from the column-name suffix
#   ...n → numeric
#   ...s → score (treated as numeric)
#   ...c → categorical
###############################################

def classify_columns(df, target_col='label'):
    """Split the feature columns of ``df`` by the last character of their name.

    Columns whose name ends in ``n`` or ``s`` are reported as numeric and
    columns ending in ``c`` as categorical.  ``target_col`` is skipped, and so
    is every column that matches no suffix or whose label is not a string.

    Args:
        df: Object exposing a ``columns`` iterable of column labels.
        target_col: Label of the target column to leave out of both lists.

    Returns:
        ``(numeric_cols, categorical_cols)`` — two lists of column labels in
        the order they appear in ``df.columns``.
    """
    numeric_cols = []
    categorical_cols = []

    for col in df.columns:
        # Non-string labels (e.g. integer column names) have no suffix to
        # inspect; calling ``endswith`` on them would raise AttributeError.
        if col == target_col or not isinstance(col, str):
            continue

        if col.endswith(('n', 's')):     # numeric, or score treated as numeric
            numeric_cols.append(col)
        elif col.endswith('c'):          # categorical
            categorical_cols.append(col)

    return numeric_cols, categorical_cols

# Classify every feature column of df; 'label' is the target and is left out.
numeric_cols, categorical_cols = classify_columns(df, target_col='label')

# Echo both groups so the suffix-based classification can be checked by eye.
print("Numeric columns:", numeric_cols)
print("Categorical columns:", categorical_cols)

In [None]:
# 3. Basic label distribution
# Both views keep NaN labels (dropna=False) so that the proportions printed
# second refer to the same rows as the counts printed first.
print("=== Label distribution ===")
print(df['label'].value_counts(dropna=False))
print(df['label'].value_counts(normalize=True, dropna=False))

In [None]:
# 4. Numeric feature EDA summary
def summarize_numeric_by_label(data, numeric_cols, target_col='label'):
    """Summarise each numeric column separately for label 0 and label 1.

    Args:
        data: DataFrame holding the features and ``target_col``.
        numeric_cols: Iterable of column names to summarise.
        target_col: Column to group by; groups keyed ``0`` and ``1`` are read.

    Returns:
        DataFrame indexed by ``feature`` with the per-label mean, median and
        standard deviation plus the overall ``missing_rate`` of each column.
        A statistic is NaN when its label group is absent.  An empty
        ``numeric_cols`` yields an empty frame with the same columns.
    """
    stat_columns = [
        'label_0_mean', 'label_1_mean',
        'label_0_median', 'label_1_median',
        'label_0_std', 'label_1_std',
        'missing_rate',
    ]

    rows = []
    for col in numeric_cols:
        grouped = data.groupby(target_col)[col]
        # Evaluate each aggregate once and read both label groups from it.
        means = grouped.mean()
        medians = grouped.median()
        stds = grouped.std()
        rows.append({
            'feature': col,
            'label_0_mean': means.get(0, np.nan),
            'label_1_mean': means.get(1, np.nan),
            'label_0_median': medians.get(0, np.nan),
            'label_1_median': medians.get(1, np.nan),
            'label_0_std': stds.get(0, np.nan),
            'label_1_std': stds.get(1, np.nan),
            'missing_rate': data[col].isna().mean(),
        })

    # Naming the columns up front keeps 'feature' available to set_index even
    # when no rows were collected.
    return pd.DataFrame(rows, columns=['feature', *stat_columns]).set_index('feature')

# Build the per-label summary table for the numeric columns and display it.
num_summary = summarize_numeric_by_label(data=df, numeric_cols=numeric_cols)
print(num_summary)

In [None]:
# 5. Categorical feature EDA summary
def summarize_categorical_by_label(data, cat_cols, target_col='label'):
    """Cross-tabulate each categorical column against the target.

    Args:
        data: DataFrame holding the features and ``target_col``.
        cat_cols: Iterable of categorical column names.
        target_col: Column whose values become the crosstab columns.

    Returns:
        Dict mapping each column name to its crosstab, normalised per target
        value (``normalize='columns'``).
    """
    return {
        name: pd.crosstab(data[name], data[target_col], normalize='columns')
        for name in cat_cols
    }

# Compute the proportion tables once, then print each under its own header.
cat_summary = summarize_categorical_by_label(data=df, cat_cols=categorical_cols)
for col in cat_summary:
    ctab = cat_summary[col]
    print(f"\n=== {col} (proportion by label) ===", ctab, sep="\n")

In [None]:
# 6. Plotting helpers
def plot_numeric_by_label(data, col, target_col='label', bins=30):
    """Draw and show a histogram of ``col`` coloured by ``target_col``.

    Args:
        data: DataFrame passed to seaborn.
        col: Column plotted on the x axis.
        target_col: Column used as the hue.
        bins: Bin specification forwarded to ``sns.histplot``.
    """
    # Density scale with common_norm=False, forwarded unchanged to seaborn.
    hist_options = {'bins': bins, 'stat': 'density', 'common_norm': False}
    title = f"{col} distribution by label"

    plt.figure(figsize=(8, 4))
    sns.histplot(data=data, x=col, hue=target_col, **hist_options)
    plt.title(title)
    plt.tight_layout()
    plt.show()


def boxplot_numeric_by_label(data, col, target_col='label'):
    """Draw and show a box plot of ``col`` with one box per ``target_col`` value.

    Args:
        data: DataFrame passed to seaborn.
        col: Column plotted on the y axis.
        target_col: Column placed on the x axis.
    """
    title = f"{col} boxplot by label"

    plt.figure(figsize=(6, 4))
    sns.boxplot(x=target_col, y=col, data=data)
    plt.title(title)
    plt.tight_layout()
    plt.show()


def bar_categorical_by_label(data, col, target_col='label'):
    """Draw and show a bar chart of the ``col`` x ``target_col`` crosstab.

    The crosstab is normalised with ``normalize='columns'`` before plotting.

    Args:
        data: DataFrame holding both columns.
        col: Categorical column used as the crosstab index.
        target_col: Column used as the crosstab columns.
    """
    proportions = pd.crosstab(
        index=data[col],
        columns=data[target_col],
        normalize='columns',
    )
    title = f"{col} categorical distribution by label"

    proportions.plot(kind='bar', figsize=(6, 4))
    plt.title(title)
    plt.tight_layout()
    plt.show()

In [None]:
# 7. Automatic plots for representative features
# Numeric: only the first five columns are drawn, as an example.
for col in numeric_cols[:5]:
    print(f"\n[Plot] {col}")
    plot_numeric_by_label(data=df, col=col)
    boxplot_numeric_by_label(data=df, col=col)

# Categorical: every column gets a bar chart.
for col in categorical_cols:
    print(f"\n[Plot] {col}")
    bar_categorical_by_label(data=df, col=col)

In [None]:
# 8. Target rate per quantile bin of a selected feature
def bin_and_target_rate(data, col, target_col='label', bins=10):
    """Bin ``col`` into quantiles and aggregate ``target_col`` within each bin.

    Rows with a missing value in either column are dropped first.

    Args:
        data: DataFrame holding ``col`` and ``target_col``.
        col: Numeric column to cut with ``pd.qcut``.
        target_col: Column averaged per bin.
        bins: Number of quantiles requested; duplicate bin edges are dropped,
            so fewer bins may come back.

    Returns:
        DataFrame indexed by the interval ``bin`` with columns
        ``label_1_rate`` (mean of ``target_col`` — the share of label 1 if the
        target is coded 0/1, which the caller should confirm) and ``n``
        (non-null row count).
    """
    clean = data[[col, target_col]].dropna()

    # Keep the bins as a detached Categorical rather than a new 'bin' column:
    # nothing is written into the intermediate frame, and a feature or target
    # that is itself called 'bin' is not overwritten.
    bin_labels = pd.qcut(clean[col], q=bins, duplicates='drop').array

    # observed=False is spelled out because relying on the implicit default
    # for categorical groupers is deprecated in recent pandas releases.
    per_bin = clean[target_col].groupby(bin_labels, observed=False)

    out = per_bin.agg(['mean', 'count'])
    out = out.rename(columns={'mean': 'label_1_rate', 'count': 'n'})
    return out.rename_axis('bin')

# Example: binned target rate for the first numeric column
print(bin_and_target_rate(data=df, col=numeric_cols[0]))