# CatBoost Experiment with Optuna

This notebook performs hyperparameter tuning for CatBoost using Optuna.

In [1]:
import pandas as pd
import numpy as np
import catboost as cb
import optuna
from optuna.samplers import TPESampler
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
from sklearn.preprocessing import LabelEncoder
import warnings

warnings.filterwarnings('ignore')
SEED = 42
np.random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


## 1. Data Loading & Feature Engineering

In [2]:
def create_advanced_features(df):
    df = df.copy()
    activity_cols = ['hobby_engagement_level', 'physical_activity_index', 
                     'creative_expression_index', 'altruism_score']
    df['total_activity'] = df[activity_cols].sum(axis=1)
    df['support_guidance_combo'] = df['support_environment_score'] * (df['external_guidance_usage'] + 1)
    df['focus_efficiency'] = df['focus_intensity'] / (df['consistency_score'] + 1)
    df['consistency_gap'] = 30 - df['consistency_score']
    df['focus_sq'] = df['focus_intensity'] ** 2
    df['focus_X_consistency'] = df['focus_intensity'] * df['consistency_score']
    df['low_focus_high_consist'] = ((df['focus_intensity'] < 5) & (df['consistency_score'] > 24)).astype(int)
    return df

try:
    train_df = pd.read_csv('./dataset/train.csv')
    test_df = pd.read_csv('./dataset/test.csv')
except FileNotFoundError:
    print("❌ Upload train.csv and test.csv!")

train_df = create_advanced_features(train_df)
test_df = create_advanced_features(test_df)

X = train_df.drop(['participant_id', 'personality_cluster'], axis=1)
y = train_df['personality_cluster']
test_ids = test_df['participant_id']
X_test = test_df.drop(['participant_id'], axis=1)

## 2. Preprocessing (CatBoost Native)

In [3]:
cat_cols = [
    'identity_code', 'cultural_background', 'age_group', 
    'upbringing_influence', 'support_environment_score', 
    'hobby_engagement_level', 'physical_activity_index',
    'creative_expression_index', 'altruism_score',
    'low_focus_high_consist'
]

# Ensure categorical columns are strings or integers for CatBoost
for col in cat_cols:
    X[col] = X[col].astype(str)
    X_test[col] = X_test[col].astype(str)

# Encode target
target_le = LabelEncoder()
y_encoded = target_le.fit_transform(y)
num_classes = len(target_le.classes_)

## 3. Optuna Optimization

In [4]:
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'random_strength': trial.suggest_float('random_strength', 1e-9, 10, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 100, log=True),
        'loss_function': 'MultiClass',
        'eval_metric': 'TotalF1',
        'random_seed': SEED,
        'verbose': False,
        'cat_features': cat_cols
    }
    
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
    scores = []
    
    for train_idx, val_idx in skf.split(X, y_encoded):
        X_train_fold, X_val_fold = X.iloc[train_idx], X.iloc[val_idx]
        y_train_fold, y_val_fold = y_encoded[train_idx], y_encoded[val_idx]
        
        train_pool = cb.Pool(X_train_fold, y_train_fold, cat_features=cat_cols)
        val_pool = cb.Pool(X_val_fold, y_val_fold, cat_features=cat_cols)
        
        model = cb.CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=val_pool, early_stopping_rounds=50)
        
        preds = model.predict(X_val_fold)
        scores.append(f1_score(y_val_fold, preds, average='macro'))
        
    return np.mean(scores)

study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=SEED))
study.optimize(objective, n_trials=30)

print("Best params:", study.best_params)

[I 2025-11-27 17:47:06,152] A new study created in memory with name: no-name-03632056-519f-448f-9f7e-4eb2edee49e8
[I 2025-11-27 17:47:28,475] Trial 0 finished with value: 0.5668123485006045 and parameters: {'iterations': 437, 'depth': 10, 'learning_rate': 0.06504856968981275, 'random_strength': 0.0009695826644515227, 'bagging_temperature': 0.15601864044243652, 'l2_leaf_reg': 3.6303224667798554e-07}. Best is trial 0 with value: 0.5668123485006045.
[I 2025-11-27 17:47:43,348] Trial 1 finished with value: 0.5643157121773372 and parameters: {'iterations': 152, 'depth': 10, 'learning_rate': 0.030834348179355788, 'random_strength': 0.01204275297251681, 'bagging_temperature': 0.020584494295802447, 'l2_leaf_reg': 50.014798288569374}. Best is trial 0 with value: 0.5668123485006045.
[I 2025-11-27 17:47:46,079] Trial 2 finished with value: 0.5068009894270256 and parameters: {'iterations': 850, 'depth': 5, 'learning_rate': 0.002820996133514492, 'random_strength': 6.824095540630416e-08, 'bagging_te

Best params: {'iterations': 228, 'depth': 6, 'learning_rate': 0.1420513118443535, 'random_strength': 3.387360934138429e-07, 'bagging_temperature': 0.4215710277462361, 'l2_leaf_reg': 0.8840028010515606}


## 4. Final Training & Submission

In [5]:
best_params = study.best_params
best_params.update({
    'loss_function': 'MultiClass',
    'eval_metric': 'TotalF1',
    'random_seed': SEED,
    'verbose': 100,
    'cat_features': cat_cols
})

final_model = cb.CatBoostClassifier(**best_params)
final_model.fit(X, y_encoded)

test_preds = final_model.predict(X_test)
final_labels = target_le.inverse_transform(test_preds.flatten())

submission = pd.DataFrame({
    'participant_id': test_ids,
    'personality_cluster': final_labels
})
submission.to_csv('submission_catboost.csv', index=False)
print("✅ Saved submission_catboost.csv")

0:	learn: 0.7160867	total: 23.3ms	remaining: 5.29s
100:	learn: 0.8764189	total: 1.52s	remaining: 1.91s
200:	learn: 0.9410964	total: 3.39s	remaining: 456ms
227:	learn: 0.9617823	total: 3.86s	remaining: 0us
✅ Saved submission_catboost.csv
