In [1]:
import sys
sys.path.append("utils/")

In [2]:
# %pip install -qU xgboost lightgbm catboost optuna

In [3]:
from data_loader import load_final_data

dataset_path = "../dataset/"
train_df, test_df = load_final_data(dataset_path)

✅ File: ../dataset/train\balance.parquet Completed!
✅ File: ../dataset/train\bill.parquet Completed!
✅ File: ../dataset/train\channel.parquet Completed!
✅ File: ../dataset/train\credit.parquet Completed!
✅ File: ../dataset/train\marketing.parquet Completed!
✅ File: ../dataset/train\member.parquet Completed!
✅ File: ../dataset/train\perf.parquet Completed!
✅ File: ../dataset/train\tx.parquet Completed!
🔹 Shape : (2400000, 292)

✅ File: ../dataset/test\balance.parquet Completed!
✅ File: ../dataset/test\bill.parquet Completed!
✅ File: ../dataset/test\channel.parquet Completed!
✅ File: ../dataset/test\credit.parquet Completed!
✅ File: ../dataset/test\marketing.parquet Completed!
✅ File: ../dataset/test\member.parquet Completed!
✅ File: ../dataset/test\perf.parquet Completed!
✅ File: ../dataset/test\tx.parquet Completed!
🔹 Shape : (600000, 291)


In [4]:
from sklearn.preprocessing import LabelEncoder
import numpy as np

X = train_df.drop(columns=['ID', '기준년월', 'Segment'])
X_test = test_df.drop(columns=['ID', '기준년월'])
y = train_df['Segment']

le = LabelEncoder()
y_encoded = le.fit_transform(y).astype(np.uint8)

In [5]:
X.replace([np.inf, -np.inf], 0, inplace=True)
X_test.replace([np.inf, -np.inf], 0, inplace=True)

In [6]:
counts = np.bincount(y_encoded)
n_classes = len(counts)
total_samples = counts.sum()

class_weights = {i: total_samples / (n_classes * count) for i, count in enumerate(counts)}
class_weights

{0: 493.82716049382714,
 1: 3333.3333333333335,
 2: 3.7620503174229953,
 3: 1.3744051402752246,
 4: 0.2497330977517778}

In [7]:
# from sklearn.tree import DecisionTreeClassifier
# from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBClassifier
# from lightgbm import LGBMClassifier
# from catboost import CatBoostClassifier

# models = {
#     "decision_tree": DecisionTreeClassifier(random_state=42, max_depth=10),
#     "random_forest": RandomForestClassifier(class_weight='balanced', random_state=42, n_jobs=-1, n_estimators=100, max_depth=10),
#     "xgboost": XGBClassifier(random_state=42, n_estimators=100,),
#     "lightgbm": LGBMClassifier(random_state=42, n_jobs=-1, n_estimators=100, max_depth=10, class_weight='balanced',),
#     "catboost": CatBoostClassifier(random_state=42, n_estimators=100, max_depth=10, class_weights=class_weights),
# }

In [8]:
# from sklearn.model_selection import StratifiedKFold
# from sklearn.metrics import f1_score
# import numpy as np

# def kfold_train_eval(X, y, model, k=5):
#     skf = StratifiedKFold(n_splits=k)
#     scores = []

#     for i, (train_idx, test_idx) in enumerate(skf.split(X, y)):
#         X_train, X_val = X.iloc[train_idx], X.iloc[test_idx]
#         y_train, y_val = y[train_idx], y[test_idx]

#         model.fit(X_train, y_train)
#         y_pred = model.predict(X_val)

#         score = f1_score(y_val, y_pred, average='macro')
#         scores.append(score)
#         print(f"Fold {i+1}/{k} - F1 Score: {score:.4f}")

#     return np.mean(scores), np.std(scores)

In [9]:
# for name, model in models.items():
#     print(f"\n📌 Model: {name}")
#     mean_score, std_score = kfold_train_eval(X, y_encoded, model)
#     print(f"Mean F1 Score: {mean_score:.4f} ± {std_score:.4f}")

| Model         | Mean F1 Score   | Std (±)           | 
| ------------- | --------------- | ----------------- | 
| Decision Tree | 0.4599          | ±0.0134           | 
| Random Forest | 0.6005          | ±0.0242           | 
| XGBoost       | 0.7422          | ±0.0340           | 
| LightGBM      | 0.7372          | ±0.0181           | 
| CatBoost      | **0.7501**      | ± 0.0324          | 


In [None]:
import optuna
import numpy as np
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
def cat_objective(trial):
    
    param = {
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3, log=True),
        "objective": "MultiClassOneVsAll",
        "l2_leaf_reg": trial.suggest_float("l2_leaf_reg", 3.0, 10.0, log=True),
        "random_strength": trial.suggest_float("random_strength", 1.0, 10.0),
        "bagging_temperature": trial.suggest_float("bagging_temperature", 0.1, 1.0),
        "depth": trial.suggest_int("depth", 6, 10),
        "iterations": trial.suggest_int("iterations", 400, 1000),
        "class_weights": class_weights, 
        "task_type": "GPU", 
        "random_state": 42,
        "verbose": 200
    }

    # 그룹 KFold 설정
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    model = CatBoostClassifier(**param)
    scores = []

    for train_idx, val_idx in cv.split(X, y_encoded):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]

        model.fit(
            X_train,
            y_train,
            eval_set=(X_val, y_val),
        )

        y_pred = model.predict(X_val)
        score = f1_score(y_val, y_pred, average='macro')
        scores.append(score)
        print(f"std : {np.std(scores)}")

    return np.mean(scores)


In [None]:
import json

def save_best_trial_json(study, trial):
    if study.best_trial == trial:
        best_result = {
            "value": trial.value,
            "params": trial.params
        }
        with open("./result/best_trial_result.json", "w", encoding="utf-8") as f:
            json.dump(best_result, f, indent=4)
        print("✅ Best trial updated and saved to JSON.")


In [12]:
cat_study = optuna.create_study(direction='maximize')
cat_study.optimize(cat_objective, n_trials=30, callbacks=[save_best_trial_json])

print("Best trial:")
trial = cat_study.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")

for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2025-05-16 13:26:36,612] A new study created in memory with name: no-name-0fa9498e-21d1-4ede-9230-e3e0dbb7c81c
[WinError 2] 지정된 파일을 찾을 수 없습니다
  File "c:\Users\user\Desktop\Credit-Card-Segment-Classfication\.venv\lib\site-packages\joblib\externals\loky\backend\context.py", line 257, in _count_physical_cores
    cpu_info = subprocess.run(
  File "C:\Users\user\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 501, in run
    with Popen(*popenargs, **kwargs) as process:
  File "C:\Users\user\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 969, in __init__
    self._execute_child(args, executable, preexec_fn, close_fds,
  File "C:\Users\user\AppData\Local\Programs\Python\Python310\lib\subprocess.py", line 1438, in _execute_child
    hp, ht, pid, tid = _winapi.CreateProcess(executable, args,


0:	learn: 0.6771942	test: 0.6850484	best: 0.6850484 (0)	total: 560ms	remaining: 3m 52s
200:	learn: 0.0250021	test: 0.3840615	best: 0.3840615 (200)	total: 1m 42s	remaining: 1m 49s
400:	learn: 0.0040212	test: 0.3354703	best: 0.3354703 (400)	total: 3m 17s	remaining: 7.89s
416:	learn: 0.0037202	test: 0.3329358	best: 0.3329358 (416)	total: 3m 25s	remaining: 0us
bestTest = 0.3329357872
bestIteration = 416
std : 0.0


[W 2025-05-16 13:39:32,989] Trial 0 failed with parameters: {'learning_rate': 0.01698598660601052, 'l2_leaf_reg': 4.206373078910925, 'random_strength': 3.561833542412253, 'bagging_temperature': 0.4088511342625908, 'depth': 10, 'iterations': 417} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "c:\Users\user\Desktop\Credit-Card-Segment-Classfication\.venv\lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\user\AppData\Local\Temp\ipykernel_37244\3759987235.py", line 34, in cat_objective
    X_resampled, y_resampled = sm.fit_resample(X_train, y_train)
  File "c:\Users\user\Desktop\Credit-Card-Segment-Classfication\.venv\lib\site-packages\imblearn\base.py", line 202, in fit_resample
    return super().fit_resample(X, y, **params)
  File "c:\Users\user\Desktop\Credit-Card-Segment-Classfication\.venv\lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_meth

KeyboardInterrupt: 

In [None]:
import json
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score

# JSON에서 파라미터 로드
with open("result/best_trial_result.json", "r", encoding="utf-8") as f:
    best_result = json.load(f)
params = best_result["params"]

# CatBoost 설정
params.update({
    "random_state": 42,
    "class_weights": class_weights,
    "task_type": "GPU",
    "verbose": 200,
})

print(params)

model = CatBoostClassifier(**params)

print("파라미터 업데이트 완료...!")

# CV 준비
n_splits = 5
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
scores = []

print("K-fold CV 시작")

for i, (train_idx, val_idx) in enumerate(skf.split(X, y_encoded)):
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y_encoded[train_idx], y_encoded[val_idx]

    model.fit(X_train, y_train, eval_set=(X_val, y_val))

    # test에 대한 확률 예측
    y_pred = model.predict(X_val)  # 각 클래스별 확률

    score = f1_score(y_val, y_pred, average='macro')
    
    scores.append(score)
    print(f"Fold {i+1}/{n_splits} - F1 Score: {score:.4f}")

y_pred_labels = model.predict(X_test)
y_pred_labels = le.inverse_transform(y_pred_labels)

model.save_model('result/catboost_final_model.cbm')

# 제출 파일 생성
test_data = test_df.copy()
test_data["pred_label"] = y_pred_labels

submission = test_data.groupby("ID")["pred_label"].agg(lambda x: x.value_counts().idxmax()).reset_index()
submission.columns = ["ID", "Segment"]
submission.to_csv("submission.csv", index=False)
print("✅ Submission 생성 완료! ")


{'learning_rate': 0.14443040357384754, 'l2_leaf_reg': 3.579206732697243, 'random_strength': 9.260795094764234, 'bagging_temperature': 0.8085060703189194, 'depth': 9, 'iterations': 940, 'random_state': 42, 'class_weights': {0: 493.82716049382714, 1: 3333.3333333333335, 2: 3.7620503174229953, 3: 1.3744051402752246, 4: 0.2497330977517778}, 'task_type': 'GPU', 'verbose': 100}
파라미터 업데이트 완료...!
K-fold CV 시작
0:	learn: 1.3007837	test: 1.3291329	best: 1.3291329 (0)	total: 129ms	remaining: 2m 1s
100:	learn: 0.2921535	test: 0.4258981	best: 0.4163611 (88)	total: 13.3s	remaining: 1m 50s
200:	learn: 0.2505161	test: 0.4073600	best: 0.4061890 (197)	total: 26.2s	remaining: 1m 36s
300:	learn: 0.2256003	test: 0.3930573	best: 0.3922999 (297)	total: 39.2s	remaining: 1m 23s
400:	learn: 0.2072978	test: 0.3776745	best: 0.3768182 (396)	total: 52.2s	remaining: 1m 10s
500:	learn: 0.1936344	test: 0.3750940	best: 0.3733882 (467)	total: 1m 4s	remaining: 56.8s
600:	learn: 0.1812814	test: 0.3649985	best: 0.3623688 (5

  y = column_or_1d(y, warn=True)


✅ Submission 생성 완료! 
