# XGBoost Experiment with Optuna

This notebook performs hyperparameter tuning for XGBoost using Optuna.

In [1]:
import warnings

import numpy as np
import optuna
import pandas as pd
import xgboost as xgb
from optuna.samplers import TPESampler
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.utils.class_weight import compute_class_weight

# Silence every warning category for the rest of the session (this also hides
# deprecation / convergence warnings, so drop this line when debugging).
warnings.filterwarnings('ignore')
# Single seed reused by the notebook's stochastic components.
SEED = 42
# Seed NumPy's legacy global random generator.
np.random.seed(SEED)

  from .autonotebook import tqdm as notebook_tqdm


## 1. Data Loading & Feature Engineering

In [2]:
def create_advanced_features(df):
    """Return a copy of ``df`` extended with seven engineered columns.

    The input frame is not modified. New columns, appended in this order:

    - ``total_activity``: row-wise sum of the four activity columns.
    - ``support_guidance_combo``: support score times (guidance usage + 1).
    - ``focus_efficiency``: focus intensity divided by (consistency score + 1).
    - ``consistency_gap``: 30 minus the consistency score.
    - ``focus_sq``: focus intensity squared.
    - ``focus_X_consistency``: focus intensity times consistency score.
    - ``low_focus_high_consist``: 1 where focus < 5 and consistency > 24, else 0.
    """
    focus = df['focus_intensity']
    consistency = df['consistency_score']
    activity_total = df[
        ['hobby_engagement_level', 'physical_activity_index',
         'creative_expression_index', 'altruism_score']
    ].sum(axis=1)

    # Keyword order below fixes the order of the appended columns.
    return df.copy().assign(
        total_activity=activity_total,
        support_guidance_combo=df['support_environment_score'] * (df['external_guidance_usage'] + 1),
        focus_efficiency=focus / (consistency + 1),
        consistency_gap=30 - consistency,
        focus_sq=focus ** 2,
        focus_X_consistency=focus * consistency,
        low_focus_high_consist=((focus < 5) & (consistency > 24)).astype(int),
    )

# Load the raw train / test splits. If either CSV is missing, print the hint and
# re-raise: carrying on would only fail a few lines later with a NameError on
# `train_df`, or silently reuse stale frames left in the kernel by an earlier run.
try:
    train_df = pd.read_csv('./dataset/train.csv')
    test_df = pd.read_csv('./dataset/test.csv')
except FileNotFoundError:
    print("❌ Upload train.csv and test.csv!")
    raise

# Apply the same feature engineering to both splits so their columns line up.
train_df = create_advanced_features(train_df)
test_df = create_advanced_features(test_df)

# Model inputs exclude the row identifier (and, for train, the target).
X = train_df.drop(['participant_id', 'personality_cluster'], axis=1)
y = train_df['personality_cluster']
# Identifiers are kept aside for the submission file.
test_ids = test_df['participant_id']
X_test = test_df.drop(['participant_id'], axis=1)

## 2. Preprocessing

In [3]:
# Columns handled as categorical: each is cast to str and integer-encoded below.
# NOTE(review): several of these look like numeric scores; after the str cast the
# encoder orders classes lexicographically (e.g. '10' before '2'), so any natural
# numeric ordering is not preserved -- confirm this is intended.
cat_cols = [
    'identity_code', 'cultural_background', 'age_group', 
    'upbringing_influence', 'support_environment_score', 
    'hobby_engagement_level', 'physical_activity_index',
    'creative_expression_index', 'altruism_score',
    'low_focus_high_consist'
]

# NOTE: this cell overwrites X / X_test in place, so re-running it without first
# re-running the data-loading cell re-encodes and re-scales already-processed values.

# Label Encoding for XGBoost
# Each encoder is fitted on train and test values together so that a category
# that only occurs in the test split does not make transform() raise.
for col in cat_cols:
    le = LabelEncoder()
    full_data = pd.concat([X[col], X_test[col]], axis=0).astype(str)
    le.fit(full_data)
    X[col] = le.transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

# Scale Numerical Features (as per demo.py)
# StandardScaler is imported with the other dependencies in the import cell.
# The scaler is fitted on the training features only and then applied to test.
num_cols = [c for c in X.columns if c not in cat_cols]
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

# Encode the target labels as integers; `target_le` is kept so predictions can
# be mapped back to the original labels.
target_le = LabelEncoder()
y_encoded = target_le.fit_transform(y)
num_classes = len(target_le.classes_)

## 3. Optuna Optimization

In [4]:
def objective(trial):
    """Optuna objective: mean macro-F1 of an XGBoost classifier over 3 stratified folds.

    Samples one hyperparameter configuration from ``trial``, fits a fresh
    ``xgb.XGBClassifier`` on each training fold of the module-level ``X`` /
    ``y_encoded`` and scores it on the held-out fold.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        The average macro-averaged F1 score across the folds.
    """
    # Searched hyperparameters. The suggest_* calls stay in this order so the
    # seeded sampler draws the same values.
    tuned = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'learning_rate': trial.suggest_float('learning_rate', 1e-3, 0.3, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'reg_alpha': trial.suggest_float('reg_alpha', 1e-8, 10.0, log=True),
        'reg_lambda': trial.suggest_float('reg_lambda', 1e-8, 10.0, log=True),
    }
    # Settings that are identical for every trial.
    fixed = {
        'objective': 'multi:softmax',
        'num_class': num_classes,
        'tree_method': 'hist',
        'eval_metric': 'mlogloss',
        'random_state': SEED,
        'n_jobs': -1,
    }
    params = {**fixed, **tuned}

    # Same seeded splitter every trial, so all trials are scored on identical folds.
    splitter = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)

    fold_scores = []
    for fit_rows, holdout_rows in splitter.split(X, y_encoded):
        clf = xgb.XGBClassifier(**params)
        clf.fit(X.iloc[fit_rows], y_encoded[fit_rows])

        holdout_pred = clf.predict(X.iloc[holdout_rows])
        fold_f1 = f1_score(y_encoded[holdout_rows], holdout_pred, average='macro')
        fold_scores.append(fold_f1)

    return np.mean(fold_scores)

# Create an in-memory study that maximizes the value returned by `objective`,
# using a TPE sampler seeded with SEED.
study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=SEED))
# Run 30 trials; each trial calls `objective` once.
study.optimize(objective, n_trials=30)

# Report only the sampled (tuned) hyperparameters of the best trial.
print("Best params:", study.best_params)

[I 2025-11-27 18:01:29,976] A new study created in memory with name: no-name-56898d0c-7e0a-4ca8-9d08-67fc1b0a9fb9
[I 2025-11-27 18:01:42,881] Trial 0 finished with value: 0.5631908658383744 and parameters: {'n_estimators': 437, 'max_depth': 10, 'learning_rate': 0.06504856968981275, 'subsample': 0.7993292420985183, 'colsample_bytree': 0.5780093202212182, 'min_child_weight': 2, 'reg_alpha': 3.3323645788192616e-08, 'reg_lambda': 0.6245760287469887}. Best is trial 0 with value: 0.5631908658383744.
[I 2025-11-27 18:02:01,975] Trial 1 finished with value: 0.5523027194376559 and parameters: {'n_estimators': 641, 'max_depth': 8, 'learning_rate': 0.001124579825911934, 'subsample': 0.9849549260809971, 'colsample_bytree': 0.9162213204002109, 'min_child_weight': 3, 'reg_alpha': 4.329370014459266e-07, 'reg_lambda': 4.4734294104626844e-07}. Best is trial 0 with value: 0.5631908658383744.
[I 2025-11-27 18:02:12,031] Trial 2 finished with value: 0.5713274944229729 and parameters: {'n_estimators': 374,

Best params: {'n_estimators': 708, 'max_depth': 9, 'learning_rate': 0.007967225414701476, 'subsample': 0.6183051824215566, 'colsample_bytree': 0.5942586002997777, 'min_child_weight': 5, 'reg_alpha': 1.982316621457163e-05, 'reg_lambda': 0.024010129635236837}


## 4. Final Training & Submission

In [5]:
# Start from the tuned values of the best trial and add the fixed settings that
# the objective also used (they are not part of `study.best_params`).
best_params = {
    **study.best_params,
    'objective': 'multi:softmax',
    'num_class': num_classes,
    'tree_method': 'hist',
    'eval_metric': 'mlogloss',
    'random_state': SEED,
    'n_jobs': -1,
}

# Refit on the full training set with the selected configuration.
final_model = xgb.XGBClassifier(**best_params).fit(X, y_encoded)

# Predict encoded classes for the test split and map them back to the original labels.
test_preds = final_model.predict(X_test)
final_labels = target_le.inverse_transform(test_preds)

# Assemble and write the submission: one row per test participant.
submission = pd.DataFrame(
    {'participant_id': test_ids, 'personality_cluster': final_labels}
)
submission.to_csv('submission_xgboost.csv', index=False)
print("✅ Saved submission_xgboost.csv")

✅ Saved submission_xgboost.csv
