In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import os
import warnings
from pathlib import Path
warnings.filterwarnings('ignore')

JUPYTER_PATH = Path.cwd()
PROJECT_ROOT = JUPYTER_PATH.parents[0]
os.chdir(PROJECT_ROOT)
print(f"Main Project Root: {os.getcwd()}")

Main Project Root: /Users/mungughyeon/Documents/contest/DACON_BDA


In [2]:
train_path = PROJECT_ROOT / "dataset/train.csv"
test_path = PROJECT_ROOT / "dataset/test.csv"

train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

print(f"Train Dataset: {train.shape}")
print(f"Test Dataset: {test.shape}")

Train Dataset: (748, 46)
Test Dataset: (814, 45)


In [3]:
from src.preprocess import Pipeline

pipe = Pipeline(config={'skip_steps': ['create_features', 'encode_text_embeddings']})
tr, te, tid = pipe.run(train, test)

X = tr.drop(columns=['completed'])
y = tr['completed']

[fill_na] 결측치 처리 완료
[fix_completed_semester] completed_semester 처리 완료
[map_major] major 매핑 완료 (test only)
[drop_useless] ['generation', 'contest_award', 'idea_contest', 'contest_participation'] 제거 완료
[encode_certs] 자격증 인코딩 완료
[encode_companies] 기업 카테고리 인코딩 완료
[encode_multi_hot] multi-hot 인코딩 완료
[encode_text_embeddings] 건너뜀
[drop_originals] 원본 컬럼 제거 완료
[create_features] 건너뜀
Preprocessing completed: train (748, 102), test (814, 101)


In [4]:
from src.models import ModelComparator

comparator = ModelComparator(n_splits=10)
results = comparator.run(X, y, pipeline=pipe)
results

LightGBM: 100%|██████████| 10/10 [00:02<00:00,  4.12it/s]


  → LightGBM: OOF F1=0.4657, Mean Fold F1=0.4786±0.0313, Threshold=0.28


XGBoost: 100%|██████████| 10/10 [00:12<00:00,  1.20s/it]


  → XGBoost: OOF F1=0.3846, Mean Fold F1=0.4125±0.0593, Threshold=0.12


CatBoost: 100%|██████████| 10/10 [00:02<00:00,  4.12it/s]


  → CatBoost: OOF F1=0.4696, Mean Fold F1=0.4828±0.0293, Threshold=0.4


RandomForest: 100%|██████████| 10/10 [00:04<00:00,  2.09it/s]


  → RandomForest: OOF F1=0.4607, Mean Fold F1=0.4938±0.0267, Threshold=0.13


ExtraTrees: 100%|██████████| 10/10 [00:03<00:00,  2.66it/s]

  → ExtraTrees: OOF F1=0.463, Mean Fold F1=0.4933±0.0233, Threshold=0.27





Unnamed: 0,name,oof_f1,mean_fold_f1,std_fold_f1,best_threshold
0,CatBoost,0.4696,0.4828,0.0293,0.4
1,LightGBM,0.4657,0.4786,0.0313,0.28
2,ExtraTrees,0.463,0.4933,0.0233,0.27
3,RandomForest,0.4607,0.4938,0.0267,0.13
4,XGBoost,0.3846,0.4125,0.0593,0.12


In [5]:
def feature_importance(
    comparator,
    X,
    model_indices,
    model_names,
    top_n=20,
    bottom_n=30
):
    imp_dfs = {}
    for i, name in zip(model_indices, model_names):
        m = comparator.models[i].models[0]
        imp = m.get_feature_importance() if name == 'CatBoost' else m.feature_importances_

        imp_dfs[name] = (
            pd.DataFrame({
                'feature': X.columns,
                'importance': imp
            })
            .sort_values('importance', ascending=False)
            .reset_index(drop=True)
        )

    top_sets = {name: set(df.head(top_n)['feature']) for name, df in imp_dfs.items()}
    bottom_sets = {name: set(df.tail(bottom_n)['feature']) for name, df in imp_dfs.items()}
    zero_sets = {name: set(df[df['importance'] == 0]['feature']) for name, df in imp_dfs.items()}

    top_common = set.intersection(*top_sets.values())
    bottom_common = set.intersection(*bottom_sets.values())
    zero_common = set.intersection(*zero_sets.values())
    rows = []

    for f in sorted(top_common):
        row = {'feature': f, 'group': f"Top_{top_n}"}
        for name, df in imp_dfs.items():
            row[name] = df.loc[df['feature'] == f, 'importance'].values[0]
        rows.append(row)

    for f in sorted(bottom_common):
        row = {'feature': f, 'group': f"Bottom_{bottom_n}"}
        for name, df in imp_dfs.items():
            row[name] = df.loc[df['feature'] == f, 'importance'].values[0]
        rows.append(row)

    for f in sorted(zero_common):
        row = {'feature': f, 'group': "Importance_0"}
        for name, df in imp_dfs.items():
            row[name] = df.loc[df['feature'] == f, 'importance'].values[0]
        rows.append(row)

    result_df = pd.DataFrame(rows)
    return result_df, imp_dfs

In [6]:
top3 = ['CatBoost', 'LightGBM', 'ExtraTrees']
top3_idx = [2, 0, 4]

common_df, imp_dfs = feature_importance(
    comparator=comparator,
    X=X,
    model_indices=top3_idx,
    model_names=top3,
    top_n=20,
    bottom_n=30
)

common_df

Unnamed: 0,feature,group,CatBoost,LightGBM,ExtraTrees
0,cert_count,Top_20,10.738569,1,0.01431
1,dje_PM기획,Top_20,0.82838,0,0.014614
2,major1_1,Top_20,1.804944,0,0.017528
3,school1,Top_20,44.614791,5,0.107371
4,whyBDA,Top_20,7.361158,1,0.019446
5,cert_구글애널리스트,Bottom_30,0.0,0,0.004668
6,cert_정보처리기사,Bottom_30,0.0,0,0.002838
7,cert_컴퓨터활용능력,Bottom_30,0.0,0,0.000597
8,cert_태블로,Bottom_30,0.0,0,0.000411
9,class4,Bottom_30,0.0,0,0.0


In [7]:
import matplotlib.pyplot as plt
from src.visualization.plot import set_korean_font, bar_plot
set_korean_font()

def plot_importance(imp_dfs, model_names, top_n=20, palette='Blues_r'):
    fig, axes = plt.subplots(1, len(model_names), figsize=(8 * len(model_names), 8))

    if len(model_names) == 1:
        axes = [axes]

    for ax, name in zip(axes, model_names):
        df_top = imp_dfs[name].head(top_n).sort_values('importance', ascending=False)

        bar_plot(
            df_top,
            x_col='importance',
            y_col='feature',
            ax=ax,
            palette=palette,
            title=f'{name} Top {top_n}',
            xlabel='Importance',
            show=False
        )

    plt.tight_layout()
    plt.show()

In [8]:
from src.preprocess import Pipeline

pipe = Pipeline(config={'skip_steps': ['encode_text_embeddings']})
tr_1, te_1, tid_1 = pipe.run(train, test)

zero_common_list = common_df[common_df['group'] == 'Importance_0']['feature'].tolist()
v1_drop_cols = zero_common_list + ['completed']

X_1 = tr_1.drop(columns=v1_drop_cols)
test = te_1.drop(columns=zero_common_list)

print(f"\nFeature: {X.shape[1]} → {X_1.shape[1]}")
print(f"X: {X.shape}, y: {y.shape}, test: {test.shape}")

[fill_na] 결측치 처리 완료
[fix_completed_semester] completed_semester 처리 완료
[map_major] major 매핑 완료 (test only)
[drop_useless] ['generation', 'contest_award', 'idea_contest', 'contest_participation'] 제거 완료
[encode_certs] 자격증 인코딩 완료
[encode_companies] 기업 카테고리 인코딩 완료
[encode_multi_hot] multi-hot 인코딩 완료
[encode_text_embeddings] 건너뜀
[drop_originals] 원본 컬럼 제거 완료
[create_features] 파생변수 23개 생성 완료
Preprocessing completed: train (748, 126), test (814, 125)

Feature: 101 → 121
X: (748, 101), y: (748,), test: (814, 121)


In [9]:
comparator_v1 = ModelComparator(n_splits=10)
result_v1 = comparator_v1.run(X_1, y, pipeline=pipe)
result_v1

LightGBM: 100%|██████████| 10/10 [00:02<00:00,  4.24it/s]


  → LightGBM: OOF F1=0.464, Mean Fold F1=0.4804±0.0241, Threshold=0.26


XGBoost: 100%|██████████| 10/10 [00:12<00:00,  1.22s/it]


  → XGBoost: OOF F1=0.417, Mean Fold F1=0.4334±0.041, Threshold=0.2


CatBoost: 100%|██████████| 10/10 [00:02<00:00,  3.79it/s]


  → CatBoost: OOF F1=0.4712, Mean Fold F1=0.4843±0.0346, Threshold=0.34


RandomForest: 100%|██████████| 10/10 [00:04<00:00,  2.34it/s]


  → RandomForest: OOF F1=0.4615, Mean Fold F1=0.5002±0.0245, Threshold=0.2


ExtraTrees: 100%|██████████| 10/10 [00:03<00:00,  2.69it/s]

  → ExtraTrees: OOF F1=0.4642, Mean Fold F1=0.5022±0.0357, Threshold=0.24





Unnamed: 0,name,oof_f1,mean_fold_f1,std_fold_f1,best_threshold
0,CatBoost,0.4712,0.4843,0.0346,0.34
1,ExtraTrees,0.4642,0.5022,0.0357,0.24
2,LightGBM,0.464,0.4804,0.0241,0.26
3,RandomForest,0.4615,0.5002,0.0245,0.2
4,XGBoost,0.417,0.4334,0.041,0.2


In [10]:
from src.models import Tuner

CAT_MODEL = 'catboost'
N_TRIALS = 500

tuner_cat = Tuner(CAT_MODEL, X_1, y, pipeline=pipe, n_trials=N_TRIALS)
study_cat = tuner_cat.run()

  0%|          | 0/500 [00:00<?, ?it/s]

Best F1: 0.5139
Best params: {'iterations': 1584, 'learning_rate': 0.13907769703668824, 'depth': 4, 'l2_leaf_reg': 2.083775185893973, 'border_count': 199, 'min_data_in_leaf': 19, 'subsample': 0.39656100701430563, 'colsample_bylevel': 0.6708556901815794, 'scale_pos_weight': 2.7770609509135653, 'random_strength': 8.844635001292021, 'bagging_temperature': 2.8114721223764607, 'random_seed': 42, 'verbose': 0, 'eval_metric': 'F1'}


In [11]:
from src.models import Tuner

GBM_MODEL = 'lightgbm'
N_TRIALS = 500

tuner_gbm = Tuner(GBM_MODEL, X_1, y, pipeline=pipe, n_trials=N_TRIALS)
study_gbm = tuner_gbm.run()

  0%|          | 0/500 [00:00<?, ?it/s]

Best F1: 0.5133
Best params: {'n_estimators': 2090, 'learning_rate': 0.45413295974562917, 'max_depth': 3, 'num_leaves': 40, 'min_child_samples': 34, 'subsample': 0.4529294322136558, 'colsample_bytree': 0.2305455275537757, 'reg_alpha': 0.00012279213442107525, 'reg_lambda': 0.021573453805613065, 'scale_pos_weight': 1.3252907265907174, 'random_state': 42, 'verbose': -1}


In [12]:
from src.models import predict_params
from src.models.tree_models import RFModel, ETModel

# CatBoost (튜닝)
oof_cat, test_probs_cat, _ = predict_params(
    'catboost', tuner_cat.best_params,
    X_1, y, test, tid_1,
    pipeline=pipe, version='cat_tuned_v1', n_splits=5, n_repeats=3
)

# LightGBM (튜닝)
oof_lgb, test_probs_lgb, _ = predict_params(
    'lightgbm', tuner_gbm.best_params,
    X_1, y, test, tid_1,
    pipeline=pipe, version='lgb_tuned_v1', n_splits=5, n_repeats=3
)

# RandomForest (기본 파라미터)
oof_rf, test_probs_rf, _ = predict_params(
    RFModel, None,
    X_1, y, test, tid_1,
    pipeline=pipe, version='rf_v1', n_splits=5, n_repeats=3
)

# ExtraTrees (기본 파라미터)
oof_et, test_probs_et, _ = predict_params(
    ETModel, None,
    X_1, y, test, tid_1,
    pipeline=pipe, version='et_v1', n_splits=5, n_repeats=3
)

OOF F1: 0.5139, Threshold: 0.51
Saved: submissions/cat_tuned_v1.csv
Class 분포:
completed
1    521
0    293
Name: count, dtype: int64
OOF F1: 0.5133, Threshold: 0.31
Saved: submissions/lgb_tuned_v1.csv
Class 분포:
completed
1    534
0    280
Name: count, dtype: int64
OOF F1: 0.4603, Threshold: 0.13
Saved: submissions/rf_v1.csv
Class 분포:
completed
1    814
Name: count, dtype: int64
OOF F1: 0.4633, Threshold: 0.24
Saved: submissions/et_v1.csv
Class 분포:
completed
1    809
0      5
Name: count, dtype: int64


In [13]:
import optuna
from sklearn.metrics import f1_score
optuna.logging.set_verbosity(optuna.logging.WARNING)

oofs = {'cat': oof_cat, 'lgb': oof_lgb, 'rf': oof_rf, 'et': oof_et}
test_dict = {'cat': test_probs_cat, 'lgb': test_probs_lgb, 'rf': test_probs_rf, 'et': test_probs_et}

def objective(trial):
      weights = {k: trial.suggest_float(f'w_{k}', 0.0, 1.0) for k in oofs}
      total = sum(weights.values())
      if total < 1e-6:
          return 0.0
      blend = sum(w * oofs[k] for k, w in weights.items()) / total
      return max(
          f1_score(y, (blend >= t).astype(int))
          for t in np.arange(0.10, 0.70, 0.01)
      )

study_blend = optuna.create_study(direction='maximize')
study_blend.optimize(objective, n_trials=500, show_progress_bar=True)

bp      = study_blend.best_params
total_w = sum(bp[f'w_{k}'] for k in oofs)
norm_w  = {k: bp[f'w_{k}'] / total_w for k in oofs}

blend_oof  = sum(norm_w[k] * oofs[k]     for k in oofs)
blend_test = sum(norm_w[k] * test_dict[k] for k in oofs)

best_t_w, best_f1_w = 0.5, 0.0
for t in np.arange(0.10, 0.70, 0.01):
    f1 = f1_score(y, (blend_oof >= t).astype(int))
    if f1 > best_f1_w:
        best_f1_w, best_t_w = f1, t

print(f"가중치: { {k: f'{v:.3f}' for k, v in norm_w.items()} }")
print(f"Weighted Blend OOF F1: {best_f1_w:.4f}, Threshold: {best_t_w:.2f}")

  0%|          | 0/500 [00:00<?, ?it/s]

가중치: {'cat': '0.154', 'lgb': '0.845', 'rf': '0.000', 'et': '0.000'}
Weighted Blend OOF F1: 0.5219, Threshold: 0.34


In [14]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_predict

meta_X_oof  = np.column_stack([oof_cat, oof_lgb, oof_rf, oof_et])
meta_X_test = np.column_stack([test_probs_cat, test_probs_lgb, test_probs_rf, test_probs_et])
meta_lr = LogisticRegression(C=0.01, class_weight='balanced', max_iter=1000, random_state=42)

meta_oof = cross_val_predict(
    meta_lr, meta_X_oof, y,
    cv=StratifiedKFold(5, shuffle=True, random_state=42),
    method='predict_proba'
)[:, 1]

best_t_s, best_f1_s = 0.5, 0.0
for t in np.arange(0.10, 0.70, 0.01):
    f1 = f1_score(y, (meta_oof >= t).astype(int))
    if f1 > best_f1_s:
        best_f1_s, best_t_s = f1, t

meta_lr.fit(meta_X_oof, y)
test_probs_stack = meta_lr.predict_proba(meta_X_test)[:, 1]
print(f"Stacking OOF F1: {best_f1_s:.4f}, Threshold: {best_t_s:.2f}")

Stacking OOF F1: 0.4593, Threshold: 0.10


In [15]:
def save_sub(probs, threshold, version):
    pred = (probs >= threshold).astype(int)
    sub = pd.DataFrame({'ID': tid_1, 'completed': pred})
    os.makedirs('submissions', exist_ok=True)
    path = f'submissions/{version}.csv'
    sub.to_csv(path, index=False)
    print(f"[{version}] threshold={threshold:.2f} | 1비율={pred.mean():.3f} | {path}")

print("=== OOF F1 비교 ===")
print(f"  Weighted Blend : {best_f1_w:.4f}")
print(f"  Stacking (LR)  : {best_f1_s:.4f}")

save_sub(blend_test,       best_t_w, 'ensemble_weighted_v1')
save_sub(test_probs_stack, best_t_s, 'ensemble_stacking_v1')

=== OOF F1 비교 ===
  Weighted Blend : 0.5219
  Stacking (LR)  : 0.4593
[ensemble_weighted_v1] threshold=0.34 | 1비율=0.667 | submissions/ensemble_weighted_v1.csv
[ensemble_stacking_v1] threshold=0.10 | 1비율=1.000 | submissions/ensemble_stacking_v1.csv
