### 단일 모델링 (Segment_A~E)

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Data root when running locally.
root_path = '../data/open'

# Data root when running on Colab (uncomment to switch).
# root_path = '/content/drive/MyDrive/12조 파이널프로젝트/data'

# Training-table variant to load; previously tried alternatives are kept
# below so they can be re-enabled quickly.
#train_file = '201812_processed_All'
#train_file = '201812_corr_drop_All'
#file_name = 'corr_drop_All'
#file_name = '201812_vif_drop_All'
file_name = '201812_vif_drop_threshold20'
test_file = f'{root_path}/test/201812_processed_All.parquet'

# Read the training table first, then the test table.
train_df, test_df = (
    pd.read_parquet(parquet_path)
    for parquet_path in (f'{root_path}/train/{file_name}.parquet', test_file)
)

# 타겟 레이블 재구성
def get_target_label(df):
    """Return one segment letter per row of *df*.

    Looks at the five ``Segment_A`` .. ``Segment_E`` columns, finds for each
    row the column holding the row-wise maximum (ties resolve to the first
    such column, as ``idxmax`` does), and returns the last character of that
    column's name as a Series aligned with ``df``'s index.
    """
    segment_columns = [f'Segment_{letter}' for letter in 'ABCDE']
    winning_column = df[segment_columns].idxmax(axis=1)
    return winning_column.str.get(-1)

# Derive the single 'Segment' target column from the Segment_A..Segment_E columns.
train_df['Segment'] = get_target_label(train_df)

# Features are every column except the per-segment columns, the derived
# target, and the 'ID' / '기준년월' columns; the target is the segment letter.
X = train_df.drop(columns=['Segment_A', 'Segment_B', 'Segment_C', 'Segment_D', 'Segment_E', 'Segment', 'ID', '기준년월'])
y = train_df['Segment']

# Keep only the feature columns that exist in both the training and test tables.
common_cols = [col for col in X.columns if col in test_df.columns]
X = X[common_cols]
X_test = test_df[common_cols]

# Stratified 80/20 train/validation split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# CatBoost model definition and training.
# An explicit configuration tried earlier, kept here for reference:
# model = CatBoostClassifier(
#     iterations=1000,
#     learning_rate=0.05,
#     depth=6,
#     loss_function='MultiClass',
#     eval_metric='TotalF1:average=Micro',
#     verbose=100,
#     random_seed=42
# )
model = CatBoostClassifier(verbose=100, random_state=42)
model.fit(X_train, y_train)

# Validation score. The predictions are flattened to 1-D, exactly as is done
# for the test predictions below, so they have the same shape as y_val.
y_pred_val = model.predict(X_val).flatten()
f1 = f1_score(y_val, y_pred_val, average='micro')
print(f"Validation F1 score (micro): {f1:.4f}")

# Final predictions on the test table.
y_pred_test = model.predict(X_test).flatten()

# Build the submission table: one predicted segment per test ID.
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'Segment': y_pred_test
})

submission.to_csv(f'../results/{file_name}_catboost_submission.csv', index=False)
# Report the name of the file that was actually written.
print(f"{file_name}_catboost_submission 저장 완료!")

### 단일 모델링 (Segment)

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Data root when running locally.
root_path = '../data/open'

# Data root when running on Colab (uncomment to switch).
# root_path = '/content/drive/MyDrive/12조 파이널프로젝트/data'

# Training table to load. Change only this name to evaluate models 1, 2 and 3
# on the same data.
file_name = 'vif_one_segment'
train_path = f'{root_path}/train/{file_name}.parquet'
train_df = pd.read_parquet(train_path)

# Target is the 'Segment' column; features are everything except it and the
# 'ID' / '기준년월' columns.
y = train_df['Segment']
X = train_df.drop(columns=['ID', '기준년월', 'Segment'])

test_file = f'{root_path}/test/201812_processed_All.parquet'
test_df = pd.read_parquet(test_file)

# Restrict the features to columns that are also present in the test table.
test_columns = test_df.columns
common_cols = [name for name in X.columns if name in test_columns]
X, X_test = X[common_cols], test_df[common_cols]

# Stratified 80/20 train/validation split.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# CatBoost model definition and training. The string literal below holds an
# earlier explicit configuration and is not executed as code.
'''
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.05,
    depth=6,
    loss_function='MultiClass',
    eval_metric='TotalF1:average=Micro',
    verbose=100,
    random_seed=42
)
'''
model = CatBoostClassifier(random_state=42, verbose=100)
model.fit(X_train, y_train)

# Micro-averaged F1 on the validation split.
y_pred_val = model.predict(X_val)
f1 = f1_score(y_val, y_pred_val, average='micro')
print(f"Validation F1 score (micro): {f1:.4f}")

# Confusion matrix, wrapped in a DataFrame so rows (actual) and columns
# (predicted) carry readable labels.
labels = ['A', 'B', 'C', 'D', 'E']
cm = confusion_matrix(y_val, y_pred_val, labels=labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"실제_{label}" for label in labels],
    columns=[f"예측_{label}" for label in labels],
)

print("📊 Confusion Matrix")
display(cm_df)


### 파이프라인 모델링 (3단계)

In [None]:

from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.utils.class_weight import compute_class_weight
import pandas as pd
import numpy as np

# Data root when running locally.
root_path = '../data/open'

# Data root when running on Colab (uncomment to switch).
# root_path = '/content/drive/MyDrive/12조 파이널프로젝트/data'

# Training table to load; other variants tried are left commented out.
#train_file = '201812_vif_drop_All'
#train_file = '201812_corr_drop_All'
#train_file = '201812_processed_All'
train_file = 'vif_one_segment'
#train_file = 'corr_one_segment'
train_path = f'{root_path}/train/{train_file}.parquet'
origin_df = pd.read_parquet(train_path)

test_file = f'{root_path}/test/201812_processed_All.parquet'
test_df = pd.read_parquet(test_file)

y_origin = origin_df['Segment']

# Hold out 20% of the rows (stratified on the original segment) for scoring
# the whole pipeline; whole rows are split so the frames keep their index.
train_df, score_df = train_test_split(
    origin_df,
    test_size=0.2,
    stratify=y_origin,
    random_state=42,
)

print("✅ 단계 1: Segment == E vs Other")

# Stage-1 target: 'E' stays 'E', every other segment becomes 'other'.
train_df["Segment1"] = train_df["Segment"].apply(lambda seg: "E" if seg == "E" else "other")

# Everything except the identifier/date columns and the two target columns
# is a candidate feature.
exclude_cols = ['ID', '기준년월', 'Segment', 'Segment1']
feature_cols = [name for name in train_df.columns if name not in exclude_cols]

# Feature frame and targets: stage-1 label for training, true segment for
# the end-of-pipeline evaluation.
X = train_df[feature_cols]
y = train_df['Segment1']
y_score = score_df['Segment']

# Use only the features that are also present in the test table.
common_cols = [name for name in X.columns if name in test_df.columns]
X_train, X_score, X_test = X[common_cols], score_df[common_cols], test_df[common_cols]

# Train the stage-1 classifier.
model1 = CatBoostClassifier(random_state=42, verbose=100)
model1.fit(X_train, y)

# Stage-1 predictions on the hold-out rows; they are scored against y_score
# only after the final stage has filled in every row.
score_df['Segment_pred'] = model1.predict(X_score)

# Stage-1 predictions on the test table.
test_df['Segment_pred'] = model1.predict(X_test)

In [None]:
# A notebook cell only echoes its last bare expression, so both prediction
# distributions are displayed explicitly instead of relying on the echo.
display(test_df['Segment_pred'].value_counts())
display(score_df['Segment_pred'].value_counts())

In [None]:

print("✅ 단계 2: Segment == C or D vs Other (E 제거)")

# Training rows: everything whose true segment is not 'E'.
# Test / hold-out rows: only those the previous stage labelled 'other'.
train_df2 = train_df.loc[train_df['Segment'].ne('E')].copy()
test_df2 = test_df.loc[test_df['Segment_pred'].eq('other')].copy()
score_df2 = score_df.loc[score_df['Segment_pred'].eq('other')].copy()

# Stage-2 target: 'C' and 'D' are kept, anything else becomes 'other'.
train_df2['Segment1'] = train_df2['Segment'].apply(lambda seg: seg if seg in ('C', 'D') else 'other')

# Candidate features: all columns except identifier/date and target columns.
exclude_cols = ['ID', '기준년월', 'Segment', 'Segment1']
feature_cols = [name for name in train_df2.columns if name not in exclude_cols]

# Feature frame and stage-2 target.
X2 = train_df2[feature_cols]
y2 = train_df2['Segment1']

# Use only the features that are also present in the test subset.
common_cols = [name for name in X2.columns if name in test_df2.columns]
X_train2, X_test2, X_score2 = X2[common_cols], test_df2[common_cols], score_df2[common_cols]

# Distinct stage-2 classes; only needed by the optional class-weight code below.
classes = np.unique(train_df2['Segment1'])

# Optional balanced class weights (currently disabled):
#weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_df2['Segment1'])

# Convert the weights to a plain list:
#class_weights = weights.tolist()
#print(dict(zip(classes, class_weights)))

model2 = CatBoostClassifier(
    random_state=42,
    verbose=100,
    #class_weights=class_weights
)

model2.fit(X_train2, y2)

# Stage-2 predictions on the hold-out subset, flattened to 1-D before being
# stored; they are scored against y_score once the last stage has run.
score_df2['Segment_pred'] = model2.predict(X_score2).flatten()

# Stage-2 predictions on the test subset.
test_df2['Segment_pred'] = model2.predict(X_test2).flatten()

In [None]:
# A notebook cell only echoes its last bare expression, so both prediction
# distributions are displayed explicitly instead of relying on the echo.
display(test_df2['Segment_pred'].value_counts())
display(score_df2['Segment_pred'].value_counts())

In [None]:
# Write the stage-2 predictions back into the hold-out frame. Rows are
# matched on index labels, so only the rows that went through stage 2 change.
score_df.loc[score_df2.index, 'Segment_pred'] = score_df2['Segment_pred']

# Same write-back for the test table (no de-duplication happens here; the
# assignment simply overwrites the rows selected by index).
test_df.loc[test_df2.index, 'Segment_pred'] = test_df2['Segment_pred']

# A notebook cell only echoes its last bare expression, so both updated
# distributions are displayed explicitly.
display(test_df['Segment_pred'].value_counts())
display(score_df['Segment_pred'].value_counts())

In [None]:
print("✅ 단계 3: Segment == A or B (C, D, E 제외)")

# Training rows: true segment is 'A' or 'B'.
# Test / hold-out rows: those still labelled 'other' after the earlier stages.
is_a_or_b = train_df['Segment'].isin(['A', 'B'])
train_df3 = train_df.loc[is_a_or_b].copy()
test_df3 = test_df.loc[test_df['Segment_pred'].eq('other')].copy()
score_df3 = score_df.loc[score_df['Segment_pred'].eq('other')].copy()

# Stage-3 target is simply the true segment.
train_df3['Segment1'] = train_df3['Segment']

# Candidate features: all columns except identifier/date and target columns.
exclude_cols = ['ID', '기준년월', 'Segment', 'Segment1']
feature_cols = [name for name in train_df3.columns if name not in exclude_cols]

# Feature frame and stage-3 target.
X3 = train_df3[feature_cols]
y3 = train_df3['Segment1']

# Use only the features that are also present in the test subset.
common_cols = [name for name in X3.columns if name in test_df3.columns]
X_train3, X_test3, X_score3 = X3[common_cols], test_df3[common_cols], score_df3[common_cols]

# Train the stage-3 classifier.
model3 = CatBoostClassifier(random_state=42, verbose=100)
model3.fit(X_train3, y3)

# Stage-3 predictions on the hold-out subset; the full pipeline is scored
# against y_score after these are merged back.
score_df3['Segment_pred'] = model3.predict(X_score3)

# Stage-3 predictions on the test subset.
test_df3['Segment_pred'] = model3.predict(X_test3)

In [None]:
# A notebook cell only echoes its last bare expression, so both prediction
# distributions are displayed explicitly instead of relying on the echo.
display(test_df3['Segment_pred'].value_counts())
display(score_df3['Segment_pred'].value_counts())

In [None]:
# Write the stage-3 predictions back into the hold-out frame (matched on index).
score_df.loc[score_df3.index, 'Segment_pred'] = score_df3['Segment_pred']

# Same write-back for the test table.
test_df.loc[test_df3.index, 'Segment_pred'] = test_df3['Segment_pred']

# Save the final test predictions as 'ID' / 'Segment'.
submission = test_df[['ID', 'Segment_pred']].rename(columns={'Segment_pred': 'Segment'})
submission.to_csv(f'../results/{train_file}_model3.csv', index=False)
print(f"{train_file}_model3 저장 완료!")

# Score the complete pipeline on the hold-out rows.
f1 = f1_score(y_score, score_df["Segment_pred"], average='micro')
print(f"Validation F1 score (micro): {f1:.4f}")

# Confusion matrix, wrapped in a DataFrame so rows (actual) and columns
# (predicted) carry readable labels.
labels = ['A', 'B', 'C', 'D', 'E']
cm = confusion_matrix(y_score, score_df["Segment_pred"], labels=labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"실제_{label}" for label in labels],
    columns=[f"예측_{label}" for label in labels],
)

print("📊 Confusion Matrix")
display(cm_df)

# Per-segment hit rate: diagonal count divided by the row total of the
# confusion matrix (0 when a row is empty), rounded to four decimals.
class_accuracy = {
    label: round(hits / total if total > 0 else 0, 4)
    for label, hits, total in zip(labels, cm.diagonal(), cm.sum(axis=1))
}

print("📊 Segment별 정확도:")
for segment, rate in class_accuracy.items():
    print(f"Segment {segment}: {rate:.4f}")

### 파이프라인 모델링 (2단계)

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Data root when running locally.
root_path = '../data/open'

# Data root when running on Colab (uncomment to switch).
# root_path = '/content/drive/MyDrive/12조 파이널프로젝트/data'

# Training table to load; other variants tried are left commented out.
#train_file = '201812_vif_drop_All'
#train_file = '201812_corr_drop_All'
#train_file = '201812_processed_All'
train_file = 'vif_one_segment'
train_path = f'{root_path}/train/{train_file}.parquet'
origin_df = pd.read_parquet(train_path)

test_file = f'{root_path}/test/201812_processed_All.parquet'
test_df = pd.read_parquet(test_file)

y_origin = origin_df['Segment']

# Hold out 20% of the rows (stratified on the original segment) for scoring
# the whole pipeline; whole rows are split so the frames keep their index.
train_df, score_df = train_test_split(
    origin_df,
    test_size=0.2,
    stratify=y_origin,
    random_state=42,
)

print("✅ 단계 1: Segment == C,D,E vs Other")

# Stage-1 target: 'C', 'D' and 'E' are kept, anything else becomes 'other'.
train_df["Segment1"] = train_df["Segment"].apply(lambda seg: seg if seg in ('C', 'D', 'E') else 'other')

# Candidate features: all columns except identifier/date and target columns.
exclude_cols = ['ID', '기준년월', 'Segment', 'Segment1']
feature_cols = [name for name in train_df.columns if name not in exclude_cols]

# Feature frame and targets: stage-1 label for training, true segment for
# the end-of-pipeline evaluation.
X = train_df[feature_cols]
y = train_df['Segment1']
y_score = score_df['Segment']

# Use only the features that are also present in the test table.
common_cols = [name for name in X.columns if name in test_df.columns]
X_train, X_score, X_test = X[common_cols], score_df[common_cols], test_df[common_cols]

# Training. The string literal below holds the optional class-weight code and
# is not executed.
'''
# 클래스별 가중치 적용
classes = np.unique(train_df['Segment1'])

# 가중치 자동 계산
weights = compute_class_weight(class_weight='balanced', classes=classes, y=train_df['Segment1'])

# 리스트로 변환
class_weights = weights.tolist()
print(dict(zip(classes, class_weights)))
'''
model1 = CatBoostClassifier(
    random_state=42,
    verbose=100,
    #class_weights=class_weights
)
model1.fit(X_train, y)

# Stage-1 predictions on the hold-out rows, flattened to 1-D before storing.
score_df['Segment_pred'] = model1.predict(X_score).flatten()

# Stage-1 predictions on the test table.
test_df['Segment_pred'] = model1.predict(X_test).flatten()

In [None]:
# A notebook cell only echoes its last bare expression, so both prediction
# distributions are displayed explicitly instead of relying on the echo.
display(test_df['Segment_pred'].value_counts())
display(score_df['Segment_pred'].value_counts())

In [None]:
print("✅ 단계 2: Segment == A or B ")

# Training rows: true segment is 'A' or 'B'.
# Test / hold-out rows: only those stage 1 labelled 'other'.
is_a_or_b = train_df['Segment'].isin(['A', 'B'])
train_df2 = train_df.loc[is_a_or_b].copy()
test_df2 = test_df.loc[test_df['Segment_pred'].eq('other')].copy()
score_df2 = score_df.loc[score_df['Segment_pred'].eq('other')].copy()

# Stage-2 target is simply the true segment.
train_df2['Segment1'] = train_df2['Segment']

# Candidate features: all columns except identifier/date and target columns.
exclude_cols = ['ID', '기준년월', 'Segment', 'Segment1']
feature_cols = [name for name in train_df2.columns if name not in exclude_cols]

# Feature frame and stage-2 target.
X2 = train_df2[feature_cols]
y2 = train_df2['Segment1']

# Use only the features that are also present in the test subset.
common_cols = [name for name in X2.columns if name in test_df2.columns]
X_train2, X_test2, X_score2 = X2[common_cols], test_df2[common_cols], score_df2[common_cols]

# Train the stage-2 classifier.
model2 = CatBoostClassifier(random_state=42, verbose=100)
model2.fit(X_train2, y2)

# Stage-2 predictions on the hold-out subset, flattened to 1-D; the full
# pipeline is scored against y_score after these are merged back.
score_df2['Segment_pred'] = model2.predict(X_score2).flatten()

# Stage-2 predictions on the test subset.
test_df2['Segment_pred'] = model2.predict(X_test2).flatten()

In [None]:
# A notebook cell only echoes its last bare expression, so both prediction
# distributions are displayed explicitly instead of relying on the echo.
display(test_df2['Segment_pred'].value_counts())
display(score_df2['Segment_pred'].value_counts())

In [None]:
# Write the stage-2 predictions back into the hold-out frame (matched on index).
score_df.loc[score_df2.index, 'Segment_pred'] = score_df2['Segment_pred']

# Same write-back for the test table.
test_df.loc[test_df2.index, 'Segment_pred'] = test_df2['Segment_pred']

# Save the final test predictions as 'ID' / 'Segment'.
submission = test_df[['ID', 'Segment_pred']].rename(columns={'Segment_pred': 'Segment'})
submission.to_csv(f'../results/{train_file}_model2.csv', index=False)
print(f"{train_file}_model2 저장 완료!")

# Score the complete pipeline on the hold-out rows.
f1 = f1_score(y_score, score_df["Segment_pred"], average='micro')
print(f"Validation F1 score (micro): {f1:.4f}")

# Confusion matrix, wrapped in a DataFrame so rows (actual) and columns
# (predicted) carry readable labels.
labels = ['A', 'B', 'C', 'D', 'E']
cm = confusion_matrix(y_score, score_df["Segment_pred"], labels=labels)
cm_df = pd.DataFrame(
    cm,
    index=[f"실제_{label}" for label in labels],
    columns=[f"예측_{label}" for label in labels],
)

print("📊 Confusion Matrix")
display(cm_df)

# Per-segment hit rate: diagonal count divided by the row total of the
# confusion matrix (0 when a row is empty), rounded to four decimals.
class_accuracy = {
    label: round(hits / total if total > 0 else 0, 4)
    for label, hits, total in zip(labels, cm.diagonal(), cm.sum(axis=1))
}

print("📊 Segment별 정확도:")
for segment, rate in class_accuracy.items():
    print(f"Segment {segment}: {rate:.4f}")