# KNN Experiment with Optuna

This notebook performs hyperparameter tuning for K-Nearest Neighbors using Optuna.

In [3]:
import pandas as pd
import numpy as np
import optuna
from optuna.samplers import TPESampler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import f1_score
import warnings

warnings.filterwarnings('ignore')
SEED = 42
np.random.seed(SEED)

## 1. Data Loading & Feature Engineering

In [4]:
def create_advanced_features(df):
    df = df.copy()
    activity_cols = ['hobby_engagement_level', 'physical_activity_index', 
                     'creative_expression_index', 'altruism_score']
    df['total_activity'] = df[activity_cols].sum(axis=1)
    df['support_guidance_combo'] = df['support_environment_score'] * (df['external_guidance_usage'] + 1)
    df['focus_efficiency'] = df['focus_intensity'] / (df['consistency_score'] + 1)
    df['consistency_gap'] = 30 - df['consistency_score']
    df['focus_sq'] = df['focus_intensity'] ** 2
    df['focus_X_consistency'] = df['focus_intensity'] * df['consistency_score']
    df['low_focus_high_consist'] = ((df['focus_intensity'] < 5) & (df['consistency_score'] > 24)).astype(int)
    return df

try:
    train_df = pd.read_csv('./dataset/train.csv')
    test_df = pd.read_csv('./dataset/test.csv')
except FileNotFoundError:
    print("❌ Upload train.csv and test.csv!")

train_df = create_advanced_features(train_df)
test_df = create_advanced_features(test_df)

X = train_df.drop(['participant_id', 'personality_cluster'], axis=1)
y = train_df['personality_cluster']
test_ids = test_df['participant_id']
X_test = test_df.drop(['participant_id'], axis=1)

## 2. Preprocessing (Scaling is Critical for KNN)

In [5]:
cat_cols = [
    'identity_code', 'cultural_background', 'age_group', 
    'upbringing_influence', 'support_environment_score', 
    'hobby_engagement_level', 'physical_activity_index',
    'creative_expression_index', 'altruism_score',
    'low_focus_high_consist'
]

# Label Encode Categorical Features
for col in cat_cols:
    le = LabelEncoder()
    full_data = pd.concat([X[col], X_test[col]], axis=0).astype(str)
    le.fit(full_data)
    X[col] = le.transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))

# Scale ALL features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(X_test)

target_le = LabelEncoder()
y_encoded = target_le.fit_transform(y)
num_classes = len(target_le.classes_)

## 3. Optuna Optimization

In [6]:
def objective(trial):
    n_neighbors = trial.suggest_int('n_neighbors', 3, 50)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    p = trial.suggest_int('p', 1, 2) # 1: Manhattan, 2: Euclidean
    
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
    scores = []
    
    for train_idx, val_idx in skf.split(X_scaled, y_encoded):
        X_train_fold, X_val_fold = X_scaled[train_idx], X_scaled[val_idx]
        y_train_fold, y_val_fold = y_encoded[train_idx], y_encoded[val_idx]
        
        model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, p=p, n_jobs=-1)
        model.fit(X_train_fold, y_train_fold)
        
        preds = model.predict(X_val_fold)
        scores.append(f1_score(y_val_fold, preds, average='macro'))
        
    return np.mean(scores)

study = optuna.create_study(direction='maximize', sampler=TPESampler(seed=SEED))
study.optimize(objective, n_trials=30)

print("Best params:", study.best_params)

[I 2025-11-27 17:44:51,571] A new study created in memory with name: no-name-4d32476a-c2cd-46f6-8078-89fe9cc6a116
[I 2025-11-27 17:44:51,766] Trial 0 finished with value: 0.41157923922411704 and parameters: {'n_neighbors': 20, 'weights': 'uniform', 'p': 2}. Best is trial 0 with value: 0.41157923922411704.
[I 2025-11-27 17:44:51,780] Trial 1 finished with value: 0.45308562163859084 and parameters: {'n_neighbors': 10, 'weights': 'uniform', 'p': 2}. Best is trial 1 with value: 0.45308562163859084.
[I 2025-11-27 17:44:51,801] Trial 2 finished with value: 0.3837109079336029 and parameters: {'n_neighbors': 31, 'weights': 'uniform', 'p': 2}. Best is trial 1 with value: 0.45308562163859084.
[I 2025-11-27 17:44:51,836] Trial 3 finished with value: 0.3968019490360639 and parameters: {'n_neighbors': 42, 'weights': 'uniform', 'p': 1}. Best is trial 1 with value: 0.45308562163859084.
[I 2025-11-27 17:44:51,863] Trial 4 finished with value: 0.4466091401874843 and parameters: {'n_neighbors': 17, 'wei

Best params: {'n_neighbors': 9, 'weights': 'distance', 'p': 1}


## 4. Final Training & Submission

In [7]:
best_params = study.best_params
final_model = KNeighborsClassifier(**best_params, n_jobs=-1)
final_model.fit(X_scaled, y_encoded)

test_preds = final_model.predict(X_test_scaled)
final_labels = target_le.inverse_transform(test_preds)

submission = pd.DataFrame({
    'participant_id': test_ids,
    'personality_cluster': final_labels
})
submission.to_csv('submission_knn.csv', index=False)
print("✅ Saved submission_knn.csv")

✅ Saved submission_knn.csv
