In [1]:
import pandas as pd
import os
from glob import glob
import numpy as np
from pandas.api.types import CategoricalDtype
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

In [8]:
# Load the source table used to build the training features and the `Segment`
# target in the modelling cell (it is expected to carry 'ID', '기준년월' and
# 'Segment' columns, which that cell drops/encodes).
ex1 = pd.read_parquet('회원_신용_청구.parquet')

In [11]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# ─── 2. Feature and label definition ───────────
# Drop the identifier / period / target columns, then keep numeric columns only.
X = ex1.drop(columns=['ID', '기준년월', 'Segment']).select_dtypes(include=[np.number])
# Encode the sorted Segment labels as integer codes (A~E -> 0~4 per the original note);
# `segment_labels` is kept to map predicted codes back to label values.
y, segment_labels = pd.factorize(ex1['Segment'], sort=True)

# ─── 3. Train/Test split ───────────────────────
# Stratified 70/30 split so the hold-out set keeps the class proportions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42
)

# ─── 4. Oversampling with SMOTE ────────────────
# Only the training split is resampled; the hold-out split stays untouched so the
# evaluation below reflects the original class distribution.
# NOTE(review): an "AttributeError: 'SMOTE' object has no attribute '_validate_data'"
# raised here usually means the installed imbalanced-learn release predates the
# installed scikit-learn. Upgrade imbalanced-learn (or pin scikit-learn) to a
# compatible pair — confirm the versions in this environment.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(X_train, y_train)

# ─── 5. Random forest training ─────────────────
rf = RandomForestClassifier(
    n_estimators=200,
    max_depth=10,
    class_weight=None,  # class_weight is dropped because SMOTE already rebalances the training set
    random_state=42,
    n_jobs=-1
)
rf.fit(X_train_resampled, y_train_resampled)

# ─── 6. Hold-out evaluation ────────────────────
y_pred = rf.predict(X_test)
print("★ Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred, digits=4))

# ─── 7. Prediction on the submission data ──────
test_df = pd.read_parquet("회원_신용_청구_test.parquet")

# Use exactly the columns the model was trained on, in the same order. Re-deriving
# the feature set from the test frame's dtypes could silently add, drop or reorder
# columns and make `rf.predict` fail on a feature-name mismatch.
feature_cols = X.columns
missing_cols = feature_cols.difference(test_df.columns)
if len(missing_cols) > 0:
    raise KeyError(
        f"Test data is missing {len(missing_cols)} training feature column(s): "
        f"{list(missing_cols)}"
    )
X_submit = test_df.loc[:, feature_cols]
y_submit_pred = rf.predict(X_submit)

# Map the predicted integer codes back to the original Segment labels.
predicted_segments = segment_labels[y_submit_pred]

predict = pd.DataFrame({
    'ID': test_df['ID'],
    'Segment': predicted_segments
})

# ─── 8. Save ───────────────────────────────────
predict.to_csv("rf_segment_prediction_oversampled.csv", index=False)
print("▶ 예측 결과 저장 완료: rf_segment_prediction_oversampled.csv")
print(predict.head())


AttributeError: 'SMOTE' object has no attribute '_validate_data'

In [6]:

# Load previously saved predictions (columns: ID, Segment).
# NOTE(review): this file name differs from the CSV written by the modelling cell
# ("rf_segment_prediction_oversampled.csv") — confirm which prediction file is intended.
df_pred = pd.read_csv("rf_segment_prediction1.csv")

# Collapse to one row per ID by taking the most frequent Segment among that ID's
# rows; `mode()` may return several values on ties, so the first one is used.
df_mode = (
    df_pred
    .groupby("ID")["Segment"]
    .agg(lambda segments: segments.mode().iloc[0])
    .reset_index()
)

# 3. Check the sample_submission format
sample = pd.read_csv("sample_submission.csv")
print("샘플 행 수:", sample.shape[0])  # 100,000

# 4. Left-join on ID so the output keeps the sample file's IDs and row order
final_submission = pd.merge(sample[['ID']], df_mode, on='ID', how='left')

# 5. IDs without a prediction get a fallback label (here: 'A')
final_submission = final_submission.assign(
    Segment=final_submission['Segment'].fillna('A')
)

# 6. Save
final_submission.to_csv("final_segment_submission.csv", index=False)


샘플 행 수: 100000
