In [None]:
import os

import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split

# --- Data location ---------------------------------------------------------
# Local path. For Colab, use the Drive path below instead:
#   root_path = '/content/drive/MyDrive/12조 파이널프로젝트/data'
root_path = '../data/open'

# --- File selection --------------------------------------------------------
# Base name (no extension) of the training parquet under `<root>/train/`.
# Alternative names that were listed here, currently disabled:
#   '201812_processed_All', '201812_corr_drop_All', 'corr_drop_All'
file_name = '201812_vif_drop_All'

# The test parquet path is fixed; it does not depend on `file_name`.
test_file = f'{root_path}/test/201812_processed_All.parquet'

# --- Load data -------------------------------------------------------------
train_df = pd.read_parquet(path=f'{root_path}/train/{file_name}.parquet')
test_df = pd.read_parquet(path=test_file)

# Rebuild a single target label from the Segment_* columns
def get_target_label(df):
    """Return the segment letter for every row of ``df``.

    For each row, the name of the ``Segment_*`` column holding the largest
    value is determined, and the last character of that column name is
    returned as the label.

    Args:
        df: DataFrame containing the columns ``Segment_A`` through
            ``Segment_E``.

    Returns:
        A Series indexed like ``df`` whose values are the final character
        of the winning column name.
    """
    segment_columns = ['Segment_A', 'Segment_B', 'Segment_C', 'Segment_D', 'Segment_E']
    segment_values = df.loc[:, segment_columns]
    winning_column = segment_values.idxmax(axis='columns')
    # Keep only the trailing letter of e.g. 'Segment_C'.
    return winning_column.str.get(-1)

# Attach the derived target label to the training frame.
train_df['Segment'] = get_target_label(train_df)

# Target vector, and the feature matrix with the label columns plus
# 'ID' and '기준년월' removed.
y = train_df['Segment']
X = train_df.drop(
    columns=[
        'Segment_A', 'Segment_B', 'Segment_C', 'Segment_D', 'Segment_E',
        'Segment', 'ID', '기준년월',
    ],
)

# Keep only the columns present in both X and test_df, in X's column order,
# so that both feature matrices share the same columns.
common_cols = X.columns[X.columns.isin(test_df.columns)].tolist()
X, X_test = X[common_cols], test_df[common_cols]

# Train / validation split: 20% held out, stratified on the target.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Define and train the CatBoost model.
# The triple-quoted block below is an inert string literal holding an
# explicit configuration that is currently disabled; it is never executed.
'''
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='MultiClass',
    eval_metric='TotalF1:average=Micro',
    verbose=100,
    random_seed=42
)
'''
# Active configuration: library defaults, a fixed seed, and a progress
# line every 100 iterations.
model = CatBoostClassifier(
    random_state=42,
    verbose=100,
)
model.fit(X=X_train, y=y_train)

# Evaluate on the held-out validation split using micro-averaged F1.
y_pred_val = model.predict(data=X_val)
f1 = f1_score(
    y_true=y_val,
    y_pred=y_pred_val,
    average='micro',
)
print(f"Validation F1 score (micro): {f1:.4f}")

# Final prediction on the test features. The prediction array is flattened
# to 1-D so it can be used directly as a single DataFrame column.
y_pred_test = model.predict(X_test).flatten()

# Build the submission table: one predicted segment per test ID.
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'Segment': y_pred_test
})

# `DataFrame.to_csv` does not create missing parent directories, so make
# sure the output folder exists first; otherwise the write fails only
# after the whole training run has finished.
results_dir = '../results'
os.makedirs(results_dir, exist_ok=True)

submission.to_csv(f'{results_dir}/{file_name}_catboost_submission.csv', index=False)
print(f"{file_name}_submission 저장 완료!")