### Segment_A~E 인 경우

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Data root for a local checkout.
root_path = '../data/open'

# Data root when running on Colab (uncomment to use).
# root_path = '/content/drive/MyDrive/12조 파이널프로젝트/data'

# The test set is always read from the fully processed file.
test_file = '{}/test/201812_processed_All.parquet'.format(root_path)
test_df = pd.read_parquet(test_file)

# Training-set variant to load; the other preprocessed variants tried so far
# are kept below as toggles.
# file_name = 'corr_drop_All'
# file_name = '201812_vif_drop_All'
file_name = '201812_vif_drop_threshold20'
train_df = pd.read_parquet('{}/train/{}.parquet'.format(root_path, file_name))

# 타겟 레이블 재구성
def get_target_label(df):
    """Collapse the ``Segment_A`` .. ``Segment_E`` columns into one label.

    For every row the segment column holding the row-wise maximum is picked
    (``idxmax``), and the last character of that column name is returned.

    Args:
        df: DataFrame containing the five ``Segment_*`` columns.

    Returns:
        A Series indexed like ``df`` with the letters ``'A'`` .. ``'E'``.
    """
    segment_frame = df[[f'Segment_{letter}' for letter in 'ABCDE']]
    winning_column = segment_frame.idxmax(axis=1)
    return winning_column.str.get(-1)

import os

# Build the single multiclass target from the Segment_A..Segment_E columns.
train_df['Segment'] = get_target_label(train_df)

# Features / target: the segment columns and the derived label are removed
# from the features, together with the 'ID' and '기준년월' columns.
X = train_df.drop(columns=['Segment_A', 'Segment_B', 'Segment_C', 'Segment_D', 'Segment_E', 'Segment', 'ID', '기준년월'])
y = train_df['Segment']

# Keep only the columns that exist in both the training features and the
# test set, so the model is fit on exactly what it will be asked to predict.
common_cols = [col for col in X.columns if col in test_df.columns]
X = X[common_cols]
X_test = test_df[common_cols]

# Stratified 80/20 train/validation split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# CatBoost with library defaults. An explicit configuration tried earlier:
#   iterations=1000, learning_rate=0.05, depth=6, loss_function='MultiClass',
#   eval_metric='TotalF1:average=Micro', verbose=100, random_seed=42
model = CatBoostClassifier(verbose=100, random_state=42)
model.fit(X_train, y_train)

# Validation score. The predictions are flattened so they have the same 1-D
# shape as the test predictions below.
y_pred_val = model.predict(X_val).flatten()
f1 = f1_score(y_val, y_pred_val, average='micro')
print(f"Validation F1 score (micro): {f1:.4f}")

# Final predictions on the test set.
y_pred_test = model.predict(X_test).flatten()

# Submission table: one predicted segment per test ID.
submission = pd.DataFrame({
    'ID': test_df['ID'],
    'Segment': y_pred_test
})

# Create the output directory first: to_csv does not create missing parent
# directories, and failing here would throw away the whole training run.
results_dir = '../results'
os.makedirs(results_dir, exist_ok=True)

# One name for both the file and the log line, so the message matches the
# file that was actually written.
submission_name = f'{file_name}_catboost_submission'
submission.to_csv(f'{results_dir}/{submission_name}.csv', index=False)
print(f"{submission_name} 저장 완료!")

### 각 단계별 모델 f1 점수 내보기

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Data root for a local checkout.
root_path = '../data/open'

# Data root when running on Colab (uncomment to use).
# root_path = '/content/drive/MyDrive/12조 파이널프로젝트/data'

# Load the training data.
file_name = '201812_add_segment3'  # change only this value to evaluate models 1, 2 and 3 the same way
train_df = pd.read_parquet(f'{root_path}/train/{file_name}.parquet')

# Load the test set in this cell as well, like every other cell does, rather
# than relying on a `test_df` left behind by whichever cell ran before.
test_file = f'{root_path}/test/201812_processed_All.parquet'
test_df = pd.read_parquet(test_file)

# Features / target: 'Segment1' is the target of this stage; the identifier,
# date and label columns are removed from the features.
X = train_df.drop(columns=['ID', '기준년월', 'Segment', 'Segment1'])
y = train_df['Segment1']

# Keep only the columns that exist in both the training features and the
# test set.
common_cols = [col for col in X.columns if col in test_df.columns]
X = X[common_cols]
X_test = test_df[common_cols]

# Stratified 80/20 train/validation split.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

# CatBoost with library defaults. An explicit configuration tried earlier:
#   iterations=1000, learning_rate=0.05, depth=6, loss_function='MultiClass',
#   eval_metric='TotalF1:average=Micro', verbose=100, random_seed=42
model = CatBoostClassifier(verbose=100, random_state=42)
model.fit(X_train, y_train)

# Validation score. The predictions are flattened to a 1-D array before
# scoring, matching how predictions are handled in the other cells.
y_pred_val = model.predict(X_val).flatten()
f1 = f1_score(y_val, y_pred_val, average='micro')
print(f"Validation F1 score (micro): {f1:.4f}")


### 모델 3개로 단계적으로 예측하기

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Data root for a local checkout.
root_path = '../data/open'

# Data root when running on Colab (uncomment to use).
# root_path = '/content/drive/MyDrive/12조 파이널프로젝트/data'

# Training-set variant to load; the alternatives are kept as toggles.
# train_file = '201812_vif_drop_All'
# train_file = '201812_processed_All'
train_file = '201812_corr_drop_All'
test_file = f'{root_path}/test/201812_processed_All.parquet'

train_df = pd.read_parquet(f'{root_path}/train/{train_file}.parquet')
test_df = pd.read_parquet(test_file)

print("✅ 단계 1: Segment == E vs Other")

# Stage-1 target: 'E' stays 'E', every other value becomes 'other'.
train_df["Segment1"] = train_df["Segment"].apply(
    lambda segment: "other" if segment != "E" else "E"
)

# Candidate features: everything except identifier, date and label columns.
exclude_cols = ['ID', '기준년월', 'Segment', 'Segment1']
feature_cols = [name for name in train_df.columns if name not in exclude_cols]

# Restrict the features to the columns that the test set also provides.
common_cols = [name for name in feature_cols if name in test_df.columns]
y = train_df['Segment1']
X = train_df[common_cols]
X_test = test_df[common_cols]

# Fit the stage-1 model on the full training set.
model1 = CatBoostClassifier(verbose=100, random_state=42)
model1.fit(X, y)

# Store the stage-1 prediction for every test row.
test_df['Segment_pred'] = model1.predict(X_test)

In [None]:
# Count how many test rows received each stage-1 predicted value.
test_df['Segment_pred'].value_counts()

In [None]:
print("✅ 단계 2: Segment == C or D vs Other (E 제거)")

# Stage-2 training rows: everything except segment E. 'C' and 'D' keep their
# label and every remaining value becomes 'other'.
train_df2 = train_df.loc[train_df['Segment'] != 'E'].copy()
train_df2['Segment1'] = train_df2['Segment'].apply(
    lambda segment: segment if segment in ('C', 'D') else 'other'
)

# Candidate features: everything except identifier, date and label columns.
exclude_cols = ['ID', '기준년월', 'Segment', 'Segment1']
feature_cols = [name for name in train_df.columns if name not in exclude_cols]

# Test rows to predict in this stage: those predicted 'other' in stage 1.
test_df2 = test_df.loc[test_df['Segment_pred'] == 'other'].copy()

# Restrict the features to the columns that the test subset also provides.
common_cols = [name for name in feature_cols if name in test_df2.columns]
y2 = train_df2['Segment1']
X2 = train_df2[common_cols]
X_test2 = test_df2[common_cols]

# Fit the stage-2 model.
model2 = CatBoostClassifier(verbose=100, random_state=42)
model2.fit(X2, y2)

# Overwrite the subset's prediction with the stage-2 result (1-D array).
test_df2['Segment_pred'] = model2.predict(X_test2).flatten()

In [None]:
# Count how many rows of the stage-2 subset received each predicted value.
test_df2['Segment_pred'].value_counts()

In [None]:
# Merge the stage-2 predictions back into test_df, matching rows by ID.
# Duplicate IDs in the stage-2 subset are collapsed first, keeping the last
# occurrence, so the ID -> prediction lookup is unambiguous.
test_df2_dedup = test_df2.drop_duplicates(subset='ID', keep='last')
test_df2_indexed = test_df2_dedup.set_index('ID')
pred_by_id = test_df2_indexed['Segment_pred']

# Rows of test_df whose ID was handled in stage 2 get the stage-2 value.
rows_to_update = test_df['ID'].isin(pred_by_id.index)
test_df.loc[rows_to_update, 'Segment_pred'] = test_df.loc[rows_to_update, 'ID'].map(pred_by_id)

In [None]:
print("✅ 단계 3: Segment == A or B (C, D, E 제외)")

# Stage-3 training rows: only segments A and B, with the segment itself as
# the target.
train_df3 = train_df.loc[train_df['Segment'].isin(['A', 'B'])].copy()
train_df3['Segment1'] = train_df3['Segment']

# Candidate features: everything except identifier, date and label columns.
exclude_cols = ['ID', '기준년월', 'Segment', 'Segment1']
feature_cols = [name for name in train_df.columns if name not in exclude_cols]

# Test rows to predict in this stage: those whose current prediction is still
# 'other' after the stage-2 results were merged into test_df.
test_df3 = test_df.loc[test_df['Segment_pred'] == 'other'].copy()

# Restrict the features to the columns that the test subset also provides.
common_cols = [name for name in feature_cols if name in test_df3.columns]
y3 = train_df3['Segment1']
X3 = train_df3[common_cols]
X_test3 = test_df3[common_cols]

# Fit the stage-3 model.
model3 = CatBoostClassifier(verbose=100, random_state=42)
model3.fit(X3, y3)

# Overwrite the subset's prediction with the stage-3 result.
test_df3['Segment_pred'] = model3.predict(X_test3)

In [None]:
# Count how many rows of the stage-3 subset received each predicted value.
test_df3['Segment_pred'].value_counts()

In [None]:
import os

# Merge the stage-3 predictions back into test_df, matching rows by ID.
# Duplicate IDs in the stage-3 subset are collapsed first, keeping the last
# occurrence, so the ID -> prediction lookup is unambiguous.
test_df3_dedup = test_df3.drop_duplicates(subset='ID', keep='last')
test_df3_indexed = test_df3_dedup.set_index('ID')

# Rows of test_df whose ID was handled in stage 3 get the stage-3 value.
stage3_rows = test_df['ID'].isin(test_df3_indexed.index)
test_df.loc[stage3_rows, 'Segment_pred'] = \
    test_df.loc[stage3_rows, 'ID'].map(test_df3_indexed['Segment_pred'])

# Final submission table: ID plus the merged prediction, renamed to 'Segment'.
submission = test_df[['ID', 'Segment_pred']].copy()
submission.rename(columns={'Segment_pred': 'Segment'}, inplace=True)

# Create the output directory first: to_csv does not create missing parent
# directories, and failing here would throw away all three training runs.
results_dir = '../results'
os.makedirs(results_dir, exist_ok=True)

submission_name = f'{train_file}_catboost_model3_submission'
submission.to_csv(f'{results_dir}/{submission_name}.csv', index=False)
print(f"{submission_name} 저장 완료!")

### 모델 2개로 단계적 예측

In [None]:
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np

# Data root for a local checkout.
root_path = '../data/open'

# Data root when running on Colab (uncomment to use).
# root_path = '/content/drive/MyDrive/12조 파이널프로젝트/data'

# Training-set variant to load; the alternatives are kept as toggles.
# train_file = '201812_vif_drop_All'
# train_file = '201812_processed_All'
train_file = '201812_corr_drop_All'
test_file = f'{root_path}/test/201812_processed_All.parquet'

train_df = pd.read_parquet(f'{root_path}/train/{train_file}.parquet')
test_df = pd.read_parquet(test_file)

print("✅ 단계 1: Segment == C,D,E vs Other")

# Stage-1 target: 'C', 'D' and 'E' keep their label, every other value
# becomes 'other'.
train_df["Segment1"] = train_df["Segment"].apply(
    lambda segment: segment if segment in ('C', 'D', 'E') else 'other'
)

# Candidate features: everything except identifier, date and label columns.
exclude_cols = ['ID', '기준년월', 'Segment', 'Segment1']
feature_cols = [name for name in train_df.columns if name not in exclude_cols]

# Restrict the features to the columns that the test set also provides.
common_cols = [name for name in feature_cols if name in test_df.columns]
y = train_df['Segment1']
X = train_df[common_cols]
X_test = test_df[common_cols]

# Fit the stage-1 model on the full training set.
model1 = CatBoostClassifier(verbose=100, random_state=42)
model1.fit(X, y)

# Store the stage-1 prediction for every test row (1-D array).
test_df['Segment_pred'] = model1.predict(X_test).flatten()

In [None]:
# Count how many test rows received each stage-1 predicted value.
test_df['Segment_pred'].value_counts()

In [None]:
print("✅ 단계 2: Segment == A or B ")

# Stage-2 training rows: only segments A and B, with the segment itself as
# the target.
train_df2 = train_df.loc[train_df['Segment'].isin(['A', 'B'])].copy()
train_df2['Segment1'] = train_df2['Segment']

# Candidate features: everything except identifier, date and label columns.
exclude_cols = ['ID', '기준년월', 'Segment', 'Segment1']
feature_cols = [name for name in train_df.columns if name not in exclude_cols]

# Test rows to predict in this stage: those predicted 'other' in stage 1.
test_df2 = test_df.loc[test_df['Segment_pred'] == 'other'].copy()

# Restrict the features to the columns that the test subset also provides.
common_cols = [name for name in feature_cols if name in test_df2.columns]
y2 = train_df2['Segment1']
X2 = train_df2[common_cols]
X_test2 = test_df2[common_cols]

# Fit the stage-2 model.
model2 = CatBoostClassifier(verbose=100, random_state=42)
model2.fit(X2, y2)

# Overwrite the subset's prediction with the stage-2 result (1-D array).
test_df2['Segment_pred'] = model2.predict(X_test2).flatten()

In [None]:
# Count how many rows of the stage-2 subset received each predicted value.
test_df2['Segment_pred'].value_counts()

In [None]:
import os

# Merge the stage-2 predictions back into test_df, matching rows by ID.
# Duplicate IDs in the stage-2 subset are collapsed first, keeping the last
# occurrence, so the ID -> prediction lookup is unambiguous.
test_df2_dedup = test_df2.drop_duplicates(subset='ID', keep='last')
test_df2_indexed = test_df2_dedup.set_index('ID')

# Rows of test_df whose ID was handled in stage 2 get the stage-2 value.
stage2_rows = test_df['ID'].isin(test_df2_indexed.index)
test_df.loc[stage2_rows, 'Segment_pred'] = \
    test_df.loc[stage2_rows, 'ID'].map(test_df2_indexed['Segment_pred'])

# Final submission table: ID plus the merged prediction, renamed to 'Segment'.
submission = test_df[['ID', 'Segment_pred']].copy()
submission.rename(columns={'Segment_pred': 'Segment'}, inplace=True)

# Create the output directory first: to_csv does not create missing parent
# directories, and failing here would throw away both training runs.
results_dir = '../results'
os.makedirs(results_dir, exist_ok=True)

submission_name = f'{train_file}_catboost_model2_submission'
submission.to_csv(f'{results_dir}/{submission_name}.csv', index=False)
print(f"{submission_name} 저장 완료!")