In [2]:
import pandas as pd
import os
from glob import glob
import numpy as np
from pandas.api.types import CategoricalDtype
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [3]:
ex1 = pd.read_parquet('회원_신용_청구_잔액.parquet')
ex1 = ex1.fillna(-1)

In [4]:
ex1

Unnamed: 0,기준년월,ID,잔액_리볼빙CA이월_B0M,연체잔액_현금서비스_B0M,잔액_현금서비스_B1M,잔액_할부_B1M,잔액_일시불_B2M,연체일수_B2M,연체원금_최근,최종연체회차,...,청구금액_B0,포인트_포인트_건별_B0M,포인트_포인트_월적립_B0M,포인트_잔여포인트_B0M,마일_적립포인트_R12M,마일_잔여포인트_B0M,할인건수_R3M,할인건수_B0M,할인금액_B0M,선결제건수_R3M
0,201807,TRAIN_000000,0,0,27930,640,1083,-999999,0,0,...,12226,1444,0,0,0,0,1,1,0,0
1,201807,TRAIN_000001,0,0,0,2779,1424,-999999,0,0,...,5834,0,0,0,0,0,1,1,0,0
2,201807,TRAIN_000002,0,0,26452,9451,6143,-999999,0,0,...,21866,2305,0,0,0,0,1,1,0,0
3,201807,TRAIN_000003,0,0,34054,6134,708,-999999,0,-99,...,16356,1452,0,0,0,0,1,1,0,0
4,201807,TRAIN_000004,0,0,0,0,0,-999999,0,-99,...,0,0,0,0,0,0,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2399995,201812,TRAIN_399995,0,0,0,0,0,-999999,0,-99,...,0,0,0,0,0,0,1,1,0,0
2399996,201812,TRAIN_399996,0,0,0,0,4362,-999999,0,-99,...,14402,3339,0,0,0,0,1,1,0,0
2399997,201812,TRAIN_399997,0,0,0,3291,2898,-999999,0,-99,...,5731,0,0,0,0,0,1,1,0,0
2399998,201812,TRAIN_399998,0,0,0,0,0,-999999,0,-99,...,0,0,0,0,0,0,1,1,0,0


In [5]:
X = ex1.drop(columns=['ID', '기준년월', 'Segment'])
X = X.select_dtypes(include=[np.number])
X = X.fillna(-1)  # ← 학습 데이터도 결측치 처리

y = pd.factorize(ex1['Segment'], sort=True)[0]  # A~E → 0~4로 인코딩

X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    stratify=y,
    random_state=42
)

rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=22,
    class_weight='balanced',  # 클래스 불균형 보정
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
print("★ Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))

# ─── 제출 데이터 예측 ─────────────────────────────
test_df = pd.read_parquet("회원_신용_청구_잔액_test.parquet")

X_submit = test_df.drop(columns=['ID', '기준년월'])
X_submit = X_submit.select_dtypes(include=[np.number])
X_submit = X_submit.fillna(-1)  # ← 결측치 처리 중요!

y_submit_pred = rf.predict(X_submit)

segment_labels = pd.factorize(ex1['Segment'], sort=True)[1]  # Index([A, B, C, D, E], dtype=object)
predicted_segments = segment_labels[y_submit_pred]

predict = pd.DataFrame({
    'ID': test_df['ID'],
    'Segment': predicted_segments
})

print(predict.head())
predict.to_csv("rf_model.csv", index=False)


★ Accuracy: 0.9045625
              precision    recall  f1-score   support

           0     0.8922    0.7680    0.8255       194
           1     1.0000    0.6207    0.7660        29
           2     0.8641    0.8213    0.8422     25518
           3     0.6470    0.8482    0.7341     69848
           4     0.9721    0.9204    0.9455    384411

    accuracy                         0.9046    480000
   macro avg     0.8751    0.7957    0.8226    480000
weighted avg     0.9190    0.9046    0.9092    480000

           ID Segment
0  TEST_00000       D
1  TEST_00001       D
2  TEST_00002       E
3  TEST_00003       E
4  TEST_00004       E


In [6]:

df_pred = pd.read_csv("rf_model.csv")  # ID, Segment

df_mode = df_pred.groupby("ID")["Segment"].agg(lambda x: x.mode().iloc[0]).reset_index()

# 3. sample_submission 포맷 확인
sample = pd.read_csv("sample_submission.csv")
print("샘플 행 수:", len(sample))  # 100,000

# 4. ID 기준으로 merge
final_submission = sample[['ID']].merge(df_mode, on='ID', how='left')

# 5. Segment가 없는 경우 예비값 지정 (예: 'A')
final_submission['Segment'] = final_submission['Segment'].fillna('A')

# 6. 저장
final_submission.to_csv("final_segment_submission.csv", index=False)


샘플 행 수: 100000
