In [1]:
!pip install catboost
!pip install optuna

Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m9.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8
Collecting optuna
  Downloading optuna-4.4.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.4-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Downloading optuna-4.4.0-py3-none-any.whl (395 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m395.9/395.9 kB[0m [31m14.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading alembic-1.16.4-py3-none-any.whl (247 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m247.0/247.0 kB[0m [31m23.6 MB/s[0m eta [36m

In [2]:
import pandas as pd
import gc
from google.colab import drive
drive.mount('/content/drive')

data_root = '/content/drive/MyDrive/부트캠프/'
data_path = data_root + '11_파이널/'

Mounted at /content/drive


In [3]:
origin_train = pd.read_parquet(data_path + 'processed/train_all.parquet')
# origin_test = pd.read_parquet(data_path + 'processed/test_all.parquet')

In [4]:
segment = pd.read_csv(data_path + 'processed/segment.csv')
segment['CD'] = (segment['Segment'].isin(['C','D'])).astype(int)

In [5]:
train_cd = origin_train.loc[segment['CD'] == 1,:]
segment_cd = segment.loc[segment['CD'] == 1,:'Segment']

In [6]:
x = train_cd.drop(['ID', '기준년월'], axis=1)
y = segment_cd == 'D'

In [7]:
y.drop('ID', axis=1, inplace=True)

In [9]:
print(x.shape)
y.value_counts()

(476832, 310)


Unnamed: 0_level_0,count
Segment,Unnamed: 1_level_1
True,349242
False,127590


In [10]:
from sklearn.preprocessing import RobustScaler

scaler = RobustScaler()
x_scaled = scaler.fit_transform(x)

In [11]:
from sklearn.model_selection import train_test_split

x_train1, x_val, y_train1, y_val = train_test_split(x_scaled, y, test_size=0.2, random_state=42, stratify=y)
x_train, x_opt, y_train, y_opt = train_test_split(x_train1, y_train1, test_size=0.2, random_state=42, stratify=y_train1)

In [None]:
# from imblearn.over_sampling import SMOTE

# sampler = SMOTE(random_state=42)
# x_sample, y_sample = sampler.fit_resample(x_train, y_train)

In [12]:
import optuna
from catboost import CatBoostClassifier
from sklearn.metrics import f1_score

def cat_objective(trial):
    params = {
        'depth': trial.suggest_int('depth', 4, 10),
        'class_weights': trial.suggest_float('class_weights', 1.0, 5.0),
        'border_count': trial.suggest_int('border_count', 32, 255),
        'l2_leaf_reg': trial.suggest_int('l2_leaf_reg', 1, 10),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-3)
    }

    clf = CatBoostClassifier(depth= params.get('depth'),
                             class_weights= [params.get('class_weights'), 1.0],
                             border_count= params.get('border_count'),
                             l2_leaf_reg= params.get('l2_leaf_reg'),
                             learning_rate = params.get('learning_rate'),
                             iterations=1000, early_stopping_rounds=20,
                             task_type="GPU", devices='0',
                             verbose=False,
                             random_state=42)
    clf.fit(x_train, y_train, eval_set=(x_opt, y_opt))

    y_val_pred = clf.predict(x_val)
    f1_macro = f1_score(y_val, y_val_pred, average='macro')

    return f1_macro

In [13]:
study = optuna.create_study(direction='maximize')
study.optimize(cat_objective, n_trials=10)

print("Best trial:")
trial = study.best_trial
print("  Value: {}".format(trial.value))
print("  Params: ")
for key, value in trial.params.items():
    print("    {}: {}".format(key, value))

[I 2025-07-14 08:31:14,384] A new study created in memory with name: no-name-2f993f7a-3e0c-49e9-a119-19eaef2ad2f8
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-3)
[I 2025-07-14 08:32:26,055] Trial 0 finished with value: 0.7523459858714765 and parameters: {'depth': 6, 'class_weights': 1.6057074526121689, 'border_count': 32, 'l2_leaf_reg': 6, 'learning_rate': 2.491202164558461e-05}. Best is trial 0 with value: 0.7523459858714765.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-3)
[I 2025-07-14 08:34:06,837] Trial 1 finished with value: 0.7678117733090524 and parameters: {'depth': 9, 'class_weights': 2.2602437705346903, 'border_count': 184, 'l2_leaf_reg': 1, 'learning_rate': 0.0004036812188988321}. Best is trial 1 with value: 0.7678117733090524.
  'learning_rate': trial.suggest_loguniform('learning_rate', 1e-5, 1e-3)
[I 2025-07-14 08:35:25,958] Trial 2 finished with value: 0.6945576112485479 and parameters: {'depth': 6, 'class_weights': 4.4230

Best trial:
  Value: 0.7770840355833937
  Params: 
    depth: 10
    class_weights: 1.8313457035613827
    border_count: 75
    l2_leaf_reg: 1
    learning_rate: 0.0007365925961101593


In [14]:
from catboost import CatBoostClassifier

model = CatBoostClassifier(depth=10,
                           learning_rate=7e-4,
                           l2_leaf_reg=1,
                           early_stopping_rounds=20, iterations=1000,
                           class_weights=[1.83, 1],
                           border_count = 75,
                           task_type="GPU", devices='0',
                           random_state=42)
# model.fit(x_sample, y_sample, eval_set=(x_opt, y_opt))
model.fit(x_train, y_train, eval_set=(x_opt, y_opt))

0:	learn: 0.6927837	test: 0.6927854	best: 0.6927854 (0)	total: 163ms	remaining: 2m 42s
1:	learn: 0.6924266	test: 0.6924293	best: 0.6924293 (1)	total: 295ms	remaining: 2m 27s
2:	learn: 0.6920671	test: 0.6920714	best: 0.6920714 (2)	total: 389ms	remaining: 2m 9s
3:	learn: 0.6917105	test: 0.6917174	best: 0.6917174 (3)	total: 501ms	remaining: 2m 4s
4:	learn: 0.6913553	test: 0.6913631	best: 0.6913631 (4)	total: 615ms	remaining: 2m 2s
5:	learn: 0.6909957	test: 0.6910050	best: 0.6910050 (5)	total: 727ms	remaining: 2m
6:	learn: 0.6906338	test: 0.6906452	best: 0.6906452 (6)	total: 847ms	remaining: 2m
7:	learn: 0.6902853	test: 0.6902976	best: 0.6902976 (7)	total: 962ms	remaining: 1m 59s
8:	learn: 0.6899284	test: 0.6899424	best: 0.6899424 (8)	total: 1.07s	remaining: 1m 58s
9:	learn: 0.6895777	test: 0.6895927	best: 0.6895927 (9)	total: 1.19s	remaining: 1m 57s
10:	learn: 0.6892280	test: 0.6892447	best: 0.6892447 (10)	total: 1.3s	remaining: 1m 56s
11:	learn: 0.6888825	test: 0.6889010	best: 0.6889010 

<catboost.core.CatBoostClassifier at 0x7a49b20e7ad0>

In [15]:

y_val_pred = model.predict(x_val)
f1_micro = f1_score(y_val, y_val_pred, average='micro')
print(f"F1 score (micro): {f1_micro:.4f}")

f1_macro = f1_score(y_val, y_val_pred, average='macro')
print(f"F1 score (macro): {f1_macro:.4f}")

F1 score (micro): 0.8259
F1 score (macro): 0.7768


In [16]:
from sklearn.metrics import confusion_matrix

conf_matrix = confusion_matrix(y_val, y_val_pred)
print(conf_matrix)

[[17011  8507]
 [ 8094 61755]]


In [None]:
from sklearn.metrics import roc_auc_score

y_val_pred_proba = model.predict_proba(x_val)[:, 1]
roc_auc = roc_auc_score(y_val, y_val_pred_proba)

In [17]:
import pickle

with open(data_path + 'data/step3_Op_SMOTE.dat', 'wb') as fp:
    pickle.dump(model, fp)
    pickle.dump(scaler, fp)