In [1]:
# 기본
import pandas as pd
import numpy as np
from functools import reduce

import xgboost as xgb
from sklearn.metrics import f1_score, classification_report
import matplotlib.pyplot as plt

# Plot settings: use the 'Malgun Gothic' font family for all matplotlib text
# (presumably so Korean column names render in figures - confirm the font is
# installed on the runtime).
plt.rc('font', family='Malgun Gothic')

# Suppress every warning for the rest of the session.
import warnings
warnings.simplefilter('ignore')

In [2]:
# Mount Google Drive at /content/drive; the data file loaded by this notebook
# is addressed by a path under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
# Location of the 201812 training snapshot on the mounted Drive.
file_path = (
    "/content/drive/MyDrive/12조 파이널프로젝트/data/train/201812_add_segment1.parquet"
)

# Read the whole parquet file into a single DataFrame.
df = pd.read_parquet(path=file_path)

In [4]:
# Display the loaded frame for a quick visual check.
df

Unnamed: 0,ID,기준년월,소지카드수_유효_신용,소지카드수_이용가능_신용,입회경과개월수_신용,이용금액_R3M_신용_가족,이용여부_3M_해외겸용_본인,이용여부_3M_해외겸용_신용_본인,2순위신용체크구분_인코딩,CA한도금액,...,_3순위교통업종_이용금액,_1순위납부업종_이용금액,RP건수_B0M,RP유형건수_B0M,이용개월수_오프라인_R6M,연속유실적개월수_기본_24M_카드,할인건수_R3M,잔액_일시불_B2M,Segment,Segment1
0,TRAIN_000000,201812,1,1,71,0,0,0,0,5795,...,0,1970,1,1,6,17,0,554,D,other
1,TRAIN_000001,201812,1,1,16,0,0,0,-1,4236,...,0,2260,0,0,6,17,0,2898,E,E
2,TRAIN_000002,201812,1,1,128,0,0,0,0,28985,...,0,0,0,0,6,8,0,4369,C,other
3,TRAIN_000003,201812,2,2,31,0,1,1,0,10156,...,0,1961,0,0,6,24,0,1018,D,other
4,TRAIN_000004,201812,1,1,6,0,1,0,0,53912,...,0,0,0,0,0,0,0,0,E,E
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399995,TRAIN_399995,201812,1,1,209,0,1,0,0,10167,...,0,0,0,0,0,0,0,0,E,E
399996,TRAIN_399996,201812,1,1,17,0,1,1,-1,31159,...,0,161,2,2,6,17,0,4362,D,other
399997,TRAIN_399997,201812,1,1,115,0,1,1,-1,19429,...,0,0,0,0,6,24,0,2898,C,other
399998,TRAIN_399998,201812,1,1,71,0,0,0,-1,4228,...,0,0,0,0,0,0,0,0,E,E


In [5]:
# Keep only the rows whose Segment is not 'E'; .copy() gives an independent
# frame so later column assignments do not touch `df`.
df_filtered = df.loc[df['Segment'].ne('E')].copy()

# Build the Segment2 label
def assign_segment2(seg):
    """Map a Segment value to its Segment2 label.

    'C' and 'D' are kept as they are; every other value is collapsed
    into the single label 'other'.
    """
    for kept_label in ('C', 'D'):
        if seg == kept_label:
            return kept_label
    return 'other'

# Attach the three-way label (C / D / other) as the 'Segment2' column.
df_filtered = df_filtered.assign(
    Segment2=df_filtered['Segment'].apply(assign_segment2)
)

In [6]:
# Display the filtered frame, now including the Segment2 column.
df_filtered

Unnamed: 0,ID,기준년월,소지카드수_유효_신용,소지카드수_이용가능_신용,입회경과개월수_신용,이용금액_R3M_신용_가족,이용여부_3M_해외겸용_본인,이용여부_3M_해외겸용_신용_본인,2순위신용체크구분_인코딩,CA한도금액,...,_1순위납부업종_이용금액,RP건수_B0M,RP유형건수_B0M,이용개월수_오프라인_R6M,연속유실적개월수_기본_24M_카드,할인건수_R3M,잔액_일시불_B2M,Segment,Segment1,Segment2
0,TRAIN_000000,201812,1,1,71,0,0,0,0,5795,...,1970,1,1,6,17,0,554,D,other,D
2,TRAIN_000002,201812,1,1,128,0,0,0,0,28985,...,0,0,0,6,8,0,4369,C,other,C
3,TRAIN_000003,201812,2,2,31,0,1,1,0,10156,...,1961,0,0,6,24,0,1018,D,other,D
8,TRAIN_000008,201812,3,3,141,13893,1,1,1,59784,...,1833,2,2,6,24,1,18955,C,other,C
10,TRAIN_000010,201812,1,1,74,0,1,1,-1,27852,...,4524,1,1,5,24,0,45241,D,other,D
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
399979,TRAIN_399979,201812,2,2,248,0,1,1,1,26268,...,2026,0,0,6,24,0,5596,D,other,D
399987,TRAIN_399987,201812,1,1,113,0,1,1,-1,10026,...,415,0,0,6,24,0,5311,C,other,C
399993,TRAIN_399993,201812,3,3,127,0,1,1,1,22513,...,1619,4,3,6,24,2,5234,C,other,C
399996,TRAIN_399996,201812,1,1,17,0,1,1,-1,31159,...,161,2,2,6,17,0,4362,D,other,D


In [7]:
# Remove the 'Segment1' helper label so it is not used as a feature.
# errors='ignore' keeps this cell re-runnable: without it, running the cell a
# second time raises KeyError because the column has already been dropped.
df_filtered = df_filtered.drop(columns=['Segment1'], errors='ignore')

In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import xgboost as xgb

# 1. Data source
# df_filtered (rows with Segment != 'E', plus the Segment2 label) is built in
# the earlier cells, so nothing is read from disk here.

# 2. Features / target
# The identifier, the reference month and both label columns are excluded
# from the feature matrix.
non_feature_cols = ['ID', '기준년월', 'Segment2', 'Segment']
X = df_filtered.drop(columns=non_feature_cols, errors='ignore')
y = df_filtered['Segment2']  # one of 'C', 'D' or 'other'

# 3. Label encoding (string -> integer)
# LabelEncoder sorts the class names, so the mapping is C=0, D=1, other=2.
le = LabelEncoder().fit(y)
y_encoded = le.transform(y)

# 4. Stratified 80/20 train/validation split with a fixed seed
split_options = dict(test_size=0.2, stratify=y_encoded, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X, y_encoded, **split_options)

# 5. Define and train the XGBoost model
# The target has three classes (C, D, other), so the multi-class objective is
# declared explicitly. The previous 'binary:logistic' setting did not describe
# this problem; XGBClassifier replaces it with 'multi:softprob' when it sees
# more than two classes, so the declared and effective objectives now agree.
# The deprecated 'use_label_encoder' argument is dropped: the labels are
# already integer-encoded above and recent XGBoost releases ignore it.
model = xgb.XGBClassifier(
    objective='multi:softprob',
    random_state=42
    #tree_method='hist'  # faster training on Colab
)

model.fit(X_train, y_train)

# 6. Predict on the validation split and evaluate
# With one label per sample, micro-averaged F1 equals plain accuracy.
y_pred = model.predict(X_val)
f1 = f1_score(y_val, y_pred, average='micro')
print(f"\n✅ [Validation F1 Score (micro)]: {f1:.4f}")
# Position i in this list is the original class name for encoded label i.
print("📘 클래스 인코딩 순서:", list(le.classes_))



✅ [Validation F1 Score (micro)]: 0.7993
📘 클래스 인코딩 순서: ['C', 'D', 'other']
