# In [3]:  (Jupyter notebook cell prompt)
# ── 1. 라이브러리 ───────────────────────────────────────────
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split

# ── 2. Load data ────────────────────────────────────────────
# Both parquet files are read completely into memory as DataFrames.
# NOTE(review): the ArrowMemoryError recorded in this cell's output presumably
# originates from one of these reads — confirm against the full traceback.
train_df = pd.read_parquet(path="회원_신용_청구.parquet")
test_df = pd.read_parquet(path="회원_신용_청구_test.parquet")

# ── 3. Helper: detect categorical columns automatically ─────
def get_cat_features(df):
    """Return the names of the columns in *df* with object or category dtype.

    Args:
        df: DataFrame whose column dtypes are inspected.

    Returns:
        A list of the matching column names, in the frame's column order.
    """
    categorical_view = df.select_dtypes(include=["object", "category"])
    return list(categorical_view.columns)

# ── 4. Stage 1: binary classification (E vs. the rest) ──────
# Work on a copy so the helper label column is never added to train_df itself.
train_stage1 = train_df.copy()
train_stage1["Segment_bin"] = [
    "E" if segment == "E" else "Not_E" for segment in train_stage1["Segment"]
]

# The identifier, the "기준년월" (reference year-month) column and both label
# columns are excluded from the feature matrix.
y_stage1 = train_stage1["Segment_bin"]
X_stage1 = train_stage1.drop(
    ["ID", "기준년월", "Segment", "Segment_bin"], axis=1
)
cat_features_stage1 = get_cat_features(X_stage1)

model_stage1 = CatBoostClassifier(
    cat_features=cat_features_stage1,
    iterations=300,
    random_state=42,
    verbose=0,
)
model_stage1.fit(X_stage1, y_stage1)

# ── 5. Stage 2: multiclass classification (A–D) ─────────────
# Only the rows whose Segment is not "E" are used to train this model.
train_stage2 = train_df.loc[train_df["Segment"].ne("E")].copy()
y_stage2 = train_stage2["Segment"]
X_stage2 = train_stage2.drop(["ID", "기준년월", "Segment"], axis=1)
cat_features_stage2 = get_cat_features(X_stage2)

model_stage2 = CatBoostClassifier(
    cat_features=cat_features_stage2,
    iterations=500,
    random_state=42,
    verbose=0,
)
model_stage2.fit(X_stage2, y_stage2)

# ── 6. Prepare the test features and run Stage 1 ────────────
# Drop the same non-feature columns that were removed from the training data.
X_test = test_df.drop(["ID", "기준년월"], axis=1)
# Kept for parity with the training stages; no later step visible in this
# cell reads it.
cat_features_test = get_cat_features(X_test)

# One "E" / "Not_E" label per test row.
stage1_preds = model_stage1.predict(X_test)

# ── 7. Stage 2: applied only to rows Stage 1 did not label "E" ──
# CatBoost returns class predictions of a multiclass model as an (n, 1) array.
# Indexing that row by row yields one-element arrays instead of labels, which
# are unhashable and break the per-ID mode aggregation later on. Both
# prediction arrays are therefore flattened to 1-D (a no-op if already 1-D).
stage1_labels = stage1_preds.ravel()
not_e_mask = stage1_labels != "E"
X_test_stage2 = X_test[not_e_mask]
if not_e_mask.any():
    stage2_preds = model_stage2.predict(X_test_stage2).ravel()
else:
    # Stage 1 labelled every row "E": nothing is left for Stage 2, so skip
    # the model call and keep an empty prediction array.
    stage2_preds = stage1_labels[:0]

# ── 8. Merge the Stage 1 and Stage 2 predictions ────────────
# Start from the Stage 1 labels (object dtype, so longer or shorter labels are
# stored unchanged) and overwrite every non-"E" position, in row order, with
# the matching Stage 2 prediction.
final_preds = stage1_labels.astype(object)
final_preds[not_e_mask] = stage2_preds
final_preds = final_preds.tolist()

# ── 9. Pick one representative Segment per ID (mode-based) ──
def _most_frequent_segment(segments):
    """Return the first entry of ``segments.mode()`` for one ID's predictions."""
    # NOTE(review): pandas documents mode() as returning sorted values, so a
    # tie presumably resolves to the smallest label — confirm if ties matter.
    return segments.mode().iloc[0]


df_pred = pd.DataFrame(data={"ID": test_df["ID"], "Segment": final_preds})
df_mode = (
    df_pred.groupby("ID")["Segment"]
    .agg(_most_frequent_segment)
    .reset_index()
)

# ── 10. Write the submission file ───────────────────────────
sample = pd.read_csv("sample_submission.csv")
# Left-merge onto the sample file's ID column so every expected ID is present;
# IDs without a prediction come out as NaN.
submit = sample[["ID"]].merge(df_mode, on="ID", how="left")
# Fall back to "E" for IDs without a prediction. The original comment stated
# "E" while the code filled "A"; "E" is the catch-all class of Stage 1, so the
# code now matches the documented intent.
submit["Segment"] = submit["Segment"].fillna("E")
submit.to_csv("catboost_twostage_submission.csv", index=False)


# Cell output: ArrowMemoryError: malloc of size 960000000 failed