In [None]:
# Report the total physical memory of the machine the kernel runs on and say
# whether it clears the 20 GB mark used here to recognise a high-RAM runtime.
from psutil import virtual_memory

ram_gb = virtual_memory().total / 1e9  # bytes -> decimal gigabytes
print('{:.1f}GB RAM\n'.format(ram_gb))

# The printed messages are user-facing and intentionally left as written.
print('고성능 램 사용중ㅋ!' if ram_gb >= 20 else '고성능 램 사용 안하고 있네')

# Package Install

In [None]:
!pip install catboost==1.2.8
!pip install optuna==4.4.0

# Library load

In [None]:
import pandas as pd
import numpy as np
import os
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder
import optuna

# Data Load

In [None]:
# Read the train and test tables from parquet files in the working directory.
train = pd.read_parquet('train.parquet')
test = pd.read_parquet('test.parquet')

# Target data split (Segment = [C,D,E])

In [None]:
# IDs that have at least one row labelled 'A' or 'B'.
is_ab_row = train['Segment'].isin(['A', 'B'])
ab_ids = train.loc[is_ab_row, 'ID'].unique()

# Drop every row belonging to those IDs (not only the A/B rows themselves),
# so the remaining frame only contains IDs that never appear as A or B.
train = train.loc[~train['ID'].isin(ab_ids)].copy()

# Class balance of what is left.
train['Segment'].value_counts()

In [None]:
# Encode the target labels as integers (written back into train['Segment']).
label_encoder = LabelEncoder()
train['Segment'] = label_encoder.fit_transform(train['Segment'])

# Split features from the target; 'ID' is an identifier, not a feature.
X = train.drop(columns=['Segment', 'ID'])
y = train['Segment']

# Build the test matrix from exactly the training feature columns, in training
# order. This drops 'ID' (and any other non-feature column) and fails fast with
# a KeyError if the test table is missing a feature the models are trained on.
X_test = test[X.columns].copy()

# Columns to hand to CatBoost as categorical features. Besides plain `object`
# columns this also picks up pandas `category` and `string` dtypes, which a
# bare `dtype == 'object'` check would silently skip.
cat_features = [
    col for col in X.columns
    if X[col].dtype == 'object'
    or isinstance(X[col].dtype, (pd.CategoricalDtype, pd.StringDtype))
]

# Cast categorical columns to str in both frames so they share one
# representation (missing values become their string form as well).
for col in cat_features:
    X[col] = X[col].astype(str)
    X_test[col] = X_test[col].astype(str)

# Modeling (Segment = [C,D,E])

In [None]:
# Random seed shared by the hold-out split, CatBoost and the K-fold splitter.
seed = 14


def objective(trial):
    """Optuna objective: fit one CatBoost model and return its hold-out accuracy.

    The trial chooses the bootstrap type, learning rate, L2 regularisation,
    random strength and border count, plus `bagging_temperature` (Bayesian
    bootstrap) or `subsample` (Bernoulli bootstrap). Iterations, depth, class
    weights, loss and device are fixed.

    Args:
        trial: the `optuna` trial used to draw hyper-parameters.

    Returns:
        Accuracy of the fitted model on a stratified 20% hold-out of (X, y).
    """
    # Hyper-parameters drawn from the trial (draw order is kept stable).
    boot_kind = trial.suggest_categorical('bootstrap_type', ['Bayesian', 'Bernoulli'])
    tuned = {
        'learning_rate': trial.suggest_float("learning_rate", 0.01, 0.3),
        'l2_leaf_reg': trial.suggest_float("l2_leaf_reg", 1.0, 10.0),
        'random_strength': trial.suggest_float("random_strength", 1e-9, 10.0),
        'border_count': trial.suggest_int("border_count", 32, 255),
        'bootstrap_type': boot_kind,
    }
    # Each bootstrap type has its own extra knob.
    if boot_kind != 'Bayesian':
        tuned['subsample'] = trial.suggest_float("subsample", 0.5, 1.0)
    else:
        tuned['bagging_temperature'] = trial.suggest_float("bagging_temperature", 0.0, 1.0)

    # Settings that are not searched over.
    fixed = {
        'iterations': 1500,
        'depth': 8,
        'task_type': 'GPU',  # running on a Colab A100
        'loss_function': 'MultiClass',
        'eval_metric': 'Accuracy',  # evaluation metric: accuracy
        'verbose': 0,
        'random_seed': seed,
        'class_weights': [2, 1, 1],
    }

    # Stratified hold-out; test_size = 0.2 as a first pass.
    X_fit, X_hold, y_fit, y_hold = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=seed
    )

    clf = CatBoostClassifier(**fixed, **tuned)
    clf.fit(
        X_fit, y_fit,
        cat_features=cat_features,
        eval_set=(X_hold, y_hold),
        early_stopping_rounds=100,
        use_best_model=True,
    )

    return accuracy_score(y_hold, clf.predict(X_hold))

# Seed the sampler so the hyper-parameter search proposes the same trials on
# every run. TPESampler is what `create_study` uses by default for a
# single-objective study; only the seed is added here.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=seed),
)
study.optimize(objective, n_trials=5)  # only 5 trials, just to check the pipeline runs

# `best_trial.params` already contains every value the trial drew, including
# `bagging_temperature` or `subsample`, so no extra copying is needed.
best_params = dict(study.best_trial.params)

# Add the settings that were fixed during the search. `verbose` is raised to
# 100 so that the final fits log their progress.
best_params.update({
    'iterations': 1500,
    'loss_function': 'MultiClass',
    'eval_metric': 'Accuracy',
    'task_type': 'GPU',
    'verbose': 100,
    'random_seed': seed,
    'class_weights': [2, 1, 1],
    'depth': 8
})

In [None]:
# Stratified 10-fold ensemble: train one model per fold with the tuned
# parameters, score it on the fold it did not see, and average the test
# probabilities of all fold models.

n_classes = len(np.unique(y))
kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
all_test_probs = np.zeros((X_test.shape[0], n_classes))
fold_scores = []  # held-out accuracy of each fold's model

for fold, (train_idx, valid_idx) in enumerate(kf.split(X, y)):
    print(f"🚀 Fold {fold+1} training...")

    X_train_fold, y_train_fold = X.iloc[train_idx], y.iloc[train_idx]
    X_valid_fold, y_valid_fold = X.iloc[valid_idx], y.iloc[valid_idx]

    model = CatBoostClassifier(**best_params)
    model.fit(X_train_fold, y_train_fold, cat_features=cat_features)

    # Score on the held-out fold after fitting. The fold is deliberately not
    # passed as `eval_set`, so the fitted model (and the submission) is the
    # same as without this check.
    fold_acc = accuracy_score(y_valid_fold, np.ravel(model.predict(X_valid_fold)))
    fold_scores.append(fold_acc)
    print(f"Fold {fold+1} validation accuracy: {fold_acc:.4f}")

    fold_probs = model.predict_proba(X_test)
    all_test_probs += fold_probs

print(f"Mean CV accuracy: {np.mean(fold_scores):.4f} (std {np.std(fold_scores):.4f})")

# Average the class probabilities over the fold models.
avg_test_probs = all_test_probs / kf.get_n_splits()
prob_df = pd.DataFrame(avg_test_probs, columns=range(n_classes))
prob_df['ID'] = test['ID'].values

# Rows sharing an ID are averaged into a single probability vector per ID,
# and the most probable class index is taken as that ID's prediction.
mean_probs = prob_df.groupby('ID').mean().reset_index()
mean_probs['Segment'] = mean_probs.drop(columns='ID').values.argmax(axis=1)

# NOTE(review): assumes the label encoder assigned C -> 0, D -> 1, E -> 2 and
# that exactly these three classes are present after filtering; confirm
# against `label_encoder.classes_`.
segment_mapping = {0: 'C', 1: 'D', 2: 'E'}
mean_probs['Segment'] = mean_probs['Segment'].map(segment_mapping)

submission = pd.DataFrame({'ID': mean_probs['ID'], 'Segment': mean_probs['Segment']})
submission.to_csv('base_catboost_kfold.csv', index=False)

print("예측 완료")

# Modeling (Segment = [A, B])