In [18]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import xgboost as xgb

# Train a binary XGBoost classifier that separates Segment1 == 'E' from
# 'other' on the 2018-12 training snapshot, then report a hold-out score.

# 1. Load the data.
file_path = "/content/drive/MyDrive/12조 파이널프로젝트/data/train/201812_add_segment1.parquet"
df = pd.read_parquet(file_path)

# 2. Split into features and target.
# Identifier/date columns and both segment label columns are excluded from
# the features; errors='ignore' tolerates any of them being absent.
X = df.drop(columns=['ID', '기준년월', 'Segment1', 'Segment'], errors='ignore')
y = df['Segment1']  # 'E' or 'other'

# 3. Encode the string labels as integers.
# LabelEncoder assigns codes in sorted class order, so the mapping is
# deterministic: 'E' -> 0, 'other' -> 1.
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# 4. Stratified 80/20 train/validation split with a fixed seed.
X_train, X_val, y_train, y_val = train_test_split(
    X, y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=42,
)

# 5. Define and fit the XGBoost model.
# `use_label_encoder` is intentionally not passed: the installed XGBoost
# ignores it and only emits a "Parameters ... are not used" warning.
model = xgb.XGBClassifier(
    objective='binary:logistic',
    random_state=42,
)

model.fit(X_train, y_train)

# 6. Predict on the validation split and evaluate.
# NOTE: for single-label classification, micro-averaged F1 is identical to
# plain accuracy, so this number does not reflect per-class performance.
y_pred = model.predict(X_val)
f1 = f1_score(y_val, y_pred, average='micro')
print(f"\n✅ [Validation F1 Score (micro)]: {f1:.4f}")
print("📘 클래스 인코딩 순서:", list(le.classes_))


Parameters: { "use_label_encoder" } are not used.




✅ [Validation F1 Score (micro)]: 0.8928
📘 클래스 인코딩 순서: ['E', 'other']
