# Tabular Classification with PyTorch & Optuna

This notebook demonstrates advanced tabular classification using PyTorch, Optuna for hyperparameter tuning, and feature engineering.

## 1. Import Libraries and Set Seed
Import all required libraries and set the random seed for reproducibility. Print the device being used (CPU/GPU).

In [1]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random
import os
import optuna
from optuna.samplers import TPESampler
from torch.utils.data import Dataset, DataLoader
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import f1_score

# Set seed for reproducibility
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

SEED = 42
seed_everything(SEED)

DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"✅ Processing on: {DEVICE}")

BATCH_SIZE = 64
N_TRIALS = 500  # How many Optuna trials to run

  from .autonotebook import tqdm as notebook_tqdm


✅ Processing on: cpu


## 2. Load Data and Feature Engineering
Load train and test data, and apply advanced feature engineering.

In [2]:
def create_advanced_features(df):
    df = df.copy()
    activity_cols = ['hobby_engagement_level', 'physical_activity_index', 
                     'creative_expression_index', 'altruism_score']
    df['total_activity'] = df[activity_cols].sum(axis=1)
    df['support_guidance_combo'] = df['support_environment_score'] * (df['external_guidance_usage'] + 1)
    df['focus_efficiency'] = df['focus_intensity'] / (df['consistency_score'] + 1)
    df['consistency_gap'] = 30 - df['consistency_score']
    df['focus_sq'] = df['focus_intensity'] ** 2
    df['focus_X_consistency'] = df['focus_intensity'] * df['consistency_score']
    df['low_focus_high_consist'] = ((df['focus_intensity'] < 5) & (df['consistency_score'] > 24)).astype(int)
    return df

try:
    train_df = pd.read_csv('../dataset/train.csv')
    test_df = pd.read_csv('../dataset/test.csv')
except FileNotFoundError:
    raise FileNotFoundError("❌ Upload train.csv and test.csv!")

train_df = create_advanced_features(train_df)
test_df = create_advanced_features(test_df)

X = train_df.drop(['participant_id', 'personality_cluster'], axis=1)
y = train_df['personality_cluster']
test_ids = test_df['participant_id']
X_test = test_df.drop(['participant_id'], axis=1)

## 3. Preprocess Data (Encoding & Scaling)
Encode categorical features and scale numerical features for both train and test sets.

In [3]:
cat_cols = [
    'identity_code', 'cultural_background', 'age_group', 
    'upbringing_influence', 'support_environment_score', 
    'hobby_engagement_level', 'physical_activity_index',
    'creative_expression_index', 'altruism_score',
    'low_focus_high_consist'
]

cat_dims = []
for col in cat_cols:
    le = LabelEncoder()
    full_data = pd.concat([X[col], X_test[col]], axis=0).astype(str)
    le.fit(full_data)
    X[col] = le.transform(X[col].astype(str))
    X_test[col] = le.transform(X_test[col].astype(str))
    num_classes = len(le.classes_)
    emb_dim = min(50, (num_classes + 1) // 2)
    cat_dims.append((num_classes, emb_dim))

num_cols = [c for c in X.columns if c not in cat_cols]
scaler = StandardScaler()
X[num_cols] = scaler.fit_transform(X[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

X_cat = X[cat_cols].values.astype(np.int64)
X_num = X[num_cols].values.astype(np.float32)
X_test_cat = X_test[cat_cols].values.astype(np.int64)
X_test_num = X_test[num_cols].values.astype(np.float32)

## 4. Prepare Datasets and Class Weights
Encode target labels, compute class weights, and prepare PyTorch datasets for training and validation.

In [4]:
target_le = LabelEncoder()
y_encoded = target_le.fit_transform(y)
num_classes_target = len(target_le.classes_)

class_weights = compute_class_weight('balanced', classes=np.unique(y_encoded), y=y_encoded)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(DEVICE)

## 5. Define Dynamic Model Architecture
Implement dataset and model classes for flexible tabular neural network modeling.

In [5]:
class AdvancedTabularDataset(Dataset):
    def __init__(self, x_cat, x_num, y=None):
        self.x_cat = torch.tensor(x_cat, dtype=torch.long)
        self.x_num = torch.tensor(x_num, dtype=torch.float)
        self.y = torch.tensor(y, dtype=torch.long) if y is not None else None
    def __len__(self): return len(self.x_cat)
    def __getitem__(self, idx):
        if self.y is not None: return self.x_cat[idx], self.x_num[idx], self.y[idx]
        return self.x_cat[idx], self.x_num[idx]

class DynamicTabularModel(nn.Module):
    def __init__(self, cat_dims, num_dim, output_dim, 
                 l1_size, l2_size, l3_size, 
                 emb_dropout, hidden_dropout):
        super(DynamicTabularModel, self).__init__()
        self.embeddings = nn.ModuleList([
            nn.Embedding(n, d) for n, d in cat_dims
        ])
        self.emb_dropout = nn.Dropout(emb_dropout)
        total_emb_dim = sum([d for _, d in cat_dims])
        input_dim = total_emb_dim + num_dim
        self.bn0 = nn.BatchNorm1d(num_dim)
        self.fc1 = nn.Linear(input_dim, l1_size)
        self.bn1 = nn.BatchNorm1d(l1_size)
        self.act1 = nn.GELU()
        self.drop1 = nn.Dropout(hidden_dropout)
        self.fc2 = nn.Linear(l1_size, l2_size)
        self.bn2 = nn.BatchNorm1d(l2_size)
        self.act2 = nn.GELU()
        self.drop2 = nn.Dropout(hidden_dropout)
        self.fc3 = nn.Linear(l2_size, l3_size)
        self.bn3 = nn.BatchNorm1d(l3_size)
        self.act3 = nn.GELU()
        self.drop3 = nn.Dropout(hidden_dropout / 2)
        self.output = nn.Linear(l3_size, output_dim)
    def forward(self, x_cat, x_num):
        emb_list = [emb(x_cat[:, i]) for i, emb in enumerate(self.embeddings)]
        x_emb = self.emb_dropout(torch.cat(emb_list, dim=1))
        x_num = self.bn0(x_num)
        x = torch.cat([x_emb, x_num], dim=1)
        x = self.drop1(self.act1(self.bn1(self.fc1(x))))
        x = self.drop2(self.act2(self.bn2(self.fc2(x))))
        x = self.drop3(self.act3(self.bn3(self.fc3(x))))
        return self.output(x)

## 6. Optuna Hyperparameter Tuning
Define the Optuna objective function and run the study to find the best hyperparameters using cross-validation.

In [6]:
print(f"--- Starting Optuna Tuning ({N_TRIALS} Trials) ---")

def objective(trial):
    l1_size = trial.suggest_int('l1_size', 128, 512)
    l2_size = trial.suggest_int('l2_size', 64, 256)
    l3_size = trial.suggest_int('l3_size', 32, 128)
    emb_dropout = trial.suggest_float('emb_dropout', 0.1, 0.5)
    hidden_dropout = trial.suggest_float('hidden_dropout', 0.1, 0.5)
    lr = trial.suggest_float('lr', 1e-4, 1e-2, log=True)
    weight_decay = trial.suggest_float('weight_decay', 1e-6, 1e-3, log=True)
    skf_tune = StratifiedKFold(n_splits=3, shuffle=True, random_state=SEED)
    scores = []
    for train_idx, val_idx in skf_tune.split(X_num, y_encoded):
        train_ds = AdvancedTabularDataset(X_cat[train_idx], X_num[train_idx], y_encoded[train_idx])
        val_ds = AdvancedTabularDataset(X_cat[val_idx], X_num[val_idx], y_encoded[val_idx])
        train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
        val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE*2, shuffle=False, num_workers=0)
        model = DynamicTabularModel(cat_dims, X_num.shape[1], num_classes_target,
                                    l1_size, l2_size, l3_size, emb_dropout, hidden_dropout).to(DEVICE)
        criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
        optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
        for epoch in range(25):
            model.train()
            for c, n, y_batch in train_loader:
                c, n, y_batch = c.to(DEVICE), n.to(DEVICE), y_batch.to(DEVICE)
                optimizer.zero_grad()
                loss = criterion(model(c, n), y_batch)
                loss.backward()
                optimizer.step()
        model.eval()
        preds, labels = [], []
        with torch.no_grad():
            for c, n, y_batch in val_loader:
                c, n = c.to(DEVICE), n.to(DEVICE)
                p = torch.argmax(model(c, n), dim=1)
                preds.extend(p.cpu().numpy())
                labels.extend(y_batch.numpy())
        scores.append(f1_score(labels, preds, average='macro'))
        trial.report(scores[-1], 0)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()
    return np.mean(scores)

sampler = TPESampler(seed=SEED)
study = optuna.create_study(direction="maximize", sampler=sampler)
study.optimize(objective, n_trials=N_TRIALS)

print("\n✅ BEST PARAMS FOUND:")
print(study.best_params)
best_p = study.best_params

[I 2025-11-27 16:46:26,392] A new study created in memory with name: no-name-b185cea5-2680-490e-9329-b1861c0057bd


--- Starting Optuna Tuning (500 Trials) ---


[I 2025-11-27 16:46:35,109] Trial 0 finished with value: 0.5568763884287226 and parameters: {'l1_size': 272, 'l2_size': 247, 'l3_size': 103, 'emb_dropout': 0.3394633936788147, 'hidden_dropout': 0.1624074561769746, 'lr': 0.00020511104188433984, 'weight_decay': 1.493656855461763e-06}. Best is trial 0 with value: 0.5568763884287226.
[I 2025-11-27 16:46:40,634] Trial 1 finished with value: 0.5723278969143774 and parameters: {'l1_size': 461, 'l2_size': 180, 'l3_size': 100, 'emb_dropout': 0.10823379771832098, 'hidden_dropout': 0.4879639408647978, 'lr': 0.004622589001020831, 'weight_decay': 4.335281794951567e-06}. Best is trial 1 with value: 0.5723278969143774.
[I 2025-11-27 16:46:45,176] Trial 2 finished with value: 0.5975094421005186 and parameters: {'l1_size': 198, 'l2_size': 99, 'l3_size': 61, 'emb_dropout': 0.30990257265289517, 'hidden_dropout': 0.2727780074568463, 'lr': 0.0003823475224675188, 'weight_decay': 6.847920095574779e-05}. Best is trial 2 with value: 0.5975094421005186.
[I 2025


✅ BEST PARAMS FOUND:
{'l1_size': 169, 'l2_size': 67, 'l3_size': 108, 'emb_dropout': 0.478614857301453, 'hidden_dropout': 0.4168510741311652, 'lr': 0.004873287679167104, 'weight_decay': 0.0009067328775340047}


## 7. Final Model Training with Best Hyperparameters
Retrain the model using the best hyperparameters found by Optuna on full 5-fold cross-validation. Aggregate test predictions across folds.

In [7]:
print("\n--- Retraining Final Model with Best Params (5 Folds) ---")

skf_final = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
test_probs_sum = np.zeros((len(X_test), num_classes_target))
final_f1_scores = []
FINAL_EPOCHS = 50

for fold, (train_idx, val_idx) in enumerate(skf_final.split(X_num, y_encoded)):
    train_ds = AdvancedTabularDataset(X_cat[train_idx], X_num[train_idx], y_encoded[train_idx])
    val_ds = AdvancedTabularDataset(X_cat[val_idx], X_num[val_idx], y_encoded[val_idx])
    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True, num_workers=0)
    val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE*2, shuffle=False, num_workers=0)
    model = DynamicTabularModel(
        cat_dims, X_num.shape[1], num_classes_target,
        best_p['l1_size'], best_p['l2_size'], best_p['l3_size'],
        best_p['emb_dropout'], best_p['hidden_dropout']
    ).to(DEVICE)
    criterion = nn.CrossEntropyLoss(weight=class_weights_tensor, label_smoothing=0.1)
    optimizer = optim.AdamW(model.parameters(), lr=best_p['lr'], weight_decay=best_p['weight_decay'])
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=best_p['lr']*10, 
                                              steps_per_epoch=len(train_loader), epochs=FINAL_EPOCHS)
    best_val_f1 = 0
    best_state = None
    for epoch in range(FINAL_EPOCHS):
        model.train()
        for c, n, y_batch in train_loader:
            c, n, y_batch = c.to(DEVICE), n.to(DEVICE), y_batch.to(DEVICE)
            optimizer.zero_grad()
            loss = criterion(model(c, n), y_batch)
            loss.backward()
            optimizer.step()
            scheduler.step()
        model.eval()
        preds, labels = [], []
        with torch.no_grad():
            for c, n, y_batch in val_loader:
                c, n = c.to(DEVICE), n.to(DEVICE)
                p = torch.argmax(model(c, n), dim=1)
                preds.extend(p.cpu().numpy())
                labels.extend(y_batch.numpy())
        f1 = f1_score(labels, preds, average='macro')
        if f1 > best_val_f1:
            best_val_f1 = f1
            best_state = model.state_dict()
    print(f"Fold {fold+1} | Best F1: {best_val_f1:.4f}")
    final_f1_scores.append(best_val_f1)
    model.load_state_dict(best_state)
    model.eval()
    test_ds = AdvancedTabularDataset(X_test_cat, X_test_num)
    test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE*2, shuffle=False, num_workers=0)
    fold_probs = []
    with torch.no_grad():
        for c, n in test_loader:
            c, n = c.to(DEVICE), n.to(DEVICE)
            probs = torch.softmax(model(c, n), dim=1)
            fold_probs.append(probs.cpu().numpy())
    test_probs_sum += np.concatenate(fold_probs)


--- Retraining Final Model with Best Params (5 Folds) ---
Fold 1 | Best F1: 0.6674
Fold 2 | Best F1: 0.6453
Fold 3 | Best F1: 0.6894
Fold 4 | Best F1: 0.6424
Fold 5 | Best F1: 0.6296


## 8. Save Predictions and Submission Files
Save the test set prediction probabilities and final submission file as CSVs.

In [8]:
print(f"\n🏆 Final Average F1: {np.mean(final_f1_scores):.4f}")

avg_test_probs = test_probs_sum / 5

# Save Probs
prob_df = pd.DataFrame(avg_test_probs, columns=[f'prob_{i}' for i in range(num_classes_target)])
prob_df['participant_id'] = test_ids.values
prob_df.to_csv('nn_optuna_probs.csv', index=False)
print("✅ Saved 'nn_optuna_probs.csv'")

# Save Submission
final_indices = np.argmax(avg_test_probs, axis=1)
final_labels = target_le.inverse_transform(final_indices)
submission_df = pd.DataFrame({
    'participant_id': test_ids,
    'personality_cluster': final_labels
})
submission_df.to_csv('submission_nn_optuna.csv', index=False)
print("✅ Saved 'submission_nn_optuna.csv'")


🏆 Final Average F1: 0.6548
✅ Saved 'nn_optuna_probs.csv'
✅ Saved 'submission_nn_optuna.csv'
