In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score

# ─── 1. Load data ─────────────────────────
# Both CSV files are read from the current working directory.
train_path = "train_주요변수.csv"
test_path = "test_주요변수.csv"
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

# ─── 2. Build labels for the first-stage binary classifier ───────────────
def binary_label(segment):
    """Map a segment name to its coarse group id.

    Returns 0 when ``segment`` is 'A', 'B' or 'C', and 1 for any other
    value (intended for 'D' and 'E').
    """
    if segment in ('A', 'B', 'C'):
        return 0
    return 1

# Coarse group id per row (0 for A/B/C, 1 otherwise), used as the stage-1 target.
train['BinarySegment'] = train['Segment'].apply(binary_label)

# ─── 3. Stand-in for CatBoostEncoder: plain label mapping ──────
# Segment names are sorted so the integer codes are deterministic.
segment_names = sorted(train['Segment'].unique())
segment_mapping = dict(zip(segment_names, range(len(segment_names))))  # name -> code
inverse_mapping = dict(enumerate(segment_names))                       # code -> name
train['Segment_encoded'] = train['Segment'].map(segment_mapping)

# ─── 4. Train the binary classifier ─────────────────────
# Everything except the identifier and the label-derived columns is a feature.
non_feature_cols = ['ID', 'Segment', 'Segment_encoded', 'BinarySegment']
X = train.drop(columns=non_feature_cols)
y_binary = train['BinarySegment']

# 70/30 hold-out split, stratified on the binary label.
X_train_bin, X_val_bin, y_train_bin, y_val_bin = train_test_split(
    X,
    y_binary,
    test_size=0.3,
    stratify=y_binary,
    random_state=42,
)

rf_bin_params = {
    'n_estimators': 200,
    'max_depth': 10,
    'class_weight': 'balanced',
    'random_state': 42,
    'n_jobs': -1,
}
rf_bin = RandomForestClassifier(**rf_bin_params)
rf_bin.fit(X_train_bin, y_train_bin)

# Report hold-out performance of the stage-1 model.
print("★ [Step 1] 이진 분류 결과:")
y_val_bin_pred = rf_bin.predict(X_val_bin)
print(classification_report(y_val_bin, y_val_bin_pred, digits=4))

# ─── 5. Train one multi-class model per group ────────────────────
group_models = {}  # group id -> fitted classifier for the segments of that group
group_labels = {0: ['A', 'B', 'C'], 1: ['D', 'E']}

for group_id in group_labels:
    group_segs = group_labels[group_id]

    # Restrict the training data to rows whose true segment belongs to this group.
    in_group_rows = train['Segment'].isin(group_segs)
    subset = train[in_group_rows]
    X_group = subset.drop(columns=['ID', 'Segment', 'Segment_encoded', 'BinarySegment'])
    y_group = subset['Segment'].map(segment_mapping)

    # 70/30 hold-out split inside the group, stratified on the segment code.
    X_train_g, X_val_g, y_train_g, y_val_g = train_test_split(
        X_group,
        y_group,
        test_size=0.3,
        stratify=y_group,
        random_state=42,
    )

    model = RandomForestClassifier(
        n_estimators=200,
        max_depth=10,
        class_weight='balanced',
        random_state=42,
        n_jobs=-1,
    )
    model.fit(X_train_g, y_train_g)
    group_models[group_id] = model

    # Report hold-out performance of this group's model.
    print(f"★ [Step 2] 다중 분류 결과 (Group {group_segs}):")
    y_val_g_pred = model.predict(X_val_g)
    print(classification_report(y_val_g, y_val_g_pred, digits=4))

# ─── 6. Predict on the test data ───────────────────────
X_submit = test.drop(columns=["ID"])

# Stage 1: route every test row to a coarse group (0: ABC, 1: DE).
binary_pred = rf_bin.predict(X_submit)

# Stage 2: run each group's model once over all rows routed to that group.
# A batched call avoids one predict() invocation per row, and the boolean
# masks are positional, so the result does not depend on the index labels of
# `test` (indexing `binary_pred` by label is only valid for a default
# RangeIndex). Passing DataFrame slices also keeps the original column dtypes.
encoded_pred = np.full(len(X_submit), -1, dtype=int)  # -1 marks "not yet predicted"
for group_id, group_model in group_models.items():
    in_group = binary_pred == group_id
    if in_group.any():  # skip groups that received no rows
        encoded_pred[in_group] = group_model.predict(X_submit.loc[in_group])

# Decode integer codes back to segment names. A row routed to a group with no
# model keeps the -1 sentinel and raises KeyError here instead of passing silently.
final_segments = [inverse_mapping[code] for code in encoded_pred]
# ─── 7. Majority vote per ID, then save ─────────────────
# One row per test record: its ID and the predicted segment name.
submit_df = pd.DataFrame({"ID": test["ID"], "Segment": final_segments})

# An ID may appear on several rows; keep the segment predicted most often for it.
segments_by_id = submit_df.groupby("ID")["Segment"]
majority_segment = segments_by_id.agg(lambda votes: votes.value_counts().idxmax())
submit_majority = majority_segment.reset_index()

submit_majority.to_csv("rf_2step_segment_prediction.csv", index=False)
print("▶ 저장 완료: rf_2step_segment_prediction.csv")


★ [Step 1] 이진 분류 결과:
              precision    recall  f1-score   support

           0     0.2786    0.8620    0.4211     38612
           1     0.9911    0.8735    0.9286    681388

    accuracy                         0.8729    720000
   macro avg     0.6348    0.8678    0.6748    720000
weighted avg     0.9529    0.8729    0.9014    720000

★ [Step 2] 다중 분류 결과 (Group ['A', 'B', 'C']):
              precision    recall  f1-score   support

           0     0.1312    0.9178    0.2296       292
           1     0.2301    0.6047    0.3333        43
           2     0.9990    0.9514    0.9746     38277

    accuracy                         0.9508     38612
   macro avg     0.4534    0.8246    0.5125     38612
weighted avg     0.9915    0.9508    0.9683     38612

★ [Step 2] 다중 분류 결과 (Group ['D', 'E']):
              precision    recall  f1-score   support

           3     0.4651    0.8236    0.5945    104773
           4     0.9627    0.8279    0.8902    576616

    accuracy          