# Loading Data set

In [None]:
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import numpy as np # For numerical operations and array handling

# Read the competition's train and test CSVs from the Kaggle input directory.
train_ds_original, test_ds_original = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/multidimensional-personality-cluster-prediction/train.csv',
        '/kaggle/input/multidimensional-personality-cluster-prediction/test.csv',
    )
)

# (rows, columns) of the training frame
print("Given Train Data set Shape: ", train_ds_original.shape)

# Column dtypes and non-null counts: train first, then test, split by a divider line
train_ds_original.info()
print('%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%')
test_ds_original.info()

In [None]:
# Column roles in the training frame:
#   participant_id      -> row identifier (index column)
#   personality_cluster -> prediction target
index_column = 'participant_id'
target_column = 'personality_cluster'

# Feature-only frame: identifier and target removed.
train_ds = train_ds_original.drop([index_column, target_column], axis=1)

# Summary statistics of the feature columns (displayed as the cell output)
train_ds.describe()

# Exploratory Data Analysis (EDA)
1. Correlation Matrix
2. Box plot
3. Scatter plot
4. Histogram

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Pairwise correlations of the feature frame; numeric_only skips non-numeric columns.
corr = train_ds.corr(numeric_only=True)

# Annotated heatmap drawn on an explicitly created 12x8 axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap="coolwarm", linewidths=0.5, ax=ax)
ax.set_title("Correlation Heatmap")
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Numeric columns of the raw training frame, excluding the row identifier.
# participant_id is an index column, not a feature; plotting it alongside the
# features would stretch the shared y-axis and flatten every real boxplot.
numeric_cols = [
    col
    for col in train_ds_original.select_dtypes(include=['int64', 'float64']).columns
    if col != "participant_id"
]

# One box per numeric feature on a common axis.
plt.figure(figsize=(12, 6))
plt.boxplot(train_ds_original[numeric_cols].values, labels=numeric_cols)
plt.xticks(rotation=90)  # long column names: rotate so they do not overlap
plt.title("Boxplot of All Numeric Features")
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np

feature = "consistency_score"   # change to any feature you want

# Distinct target labels, in order of first appearance.
classes = train_ds_original["personality_cluster"].unique()

# One series of `feature` values per cluster, aligned with `classes`.
data = [
    train_ds_original.loc[train_ds_original["personality_cluster"] == cls, feature]
    for cls in classes
]

# Filled boxes in a translucent blue with a solid blue median line.
plt.figure()
plt.boxplot(
    data,
    labels=classes,
    patch_artist=True,
    boxprops={'facecolor': '#01519a', 'alpha': 0.15},
    medianprops={'color': '#01519a'},
)

plt.title(f"Boxplot of {feature} vs Personality Cluster")
plt.xlabel("Personality Cluster")
plt.ylabel(feature)
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import math

# Numeric feature columns (the participant_id identifier is excluded).
numeric_cols = [
    col
    for col in train_ds_original.select_dtypes(include=['int64', 'float64']).columns
    if col != "participant_id"
]

clusters = train_ds_original["personality_cluster"].unique()

# Grid layout: three panels per row, as many rows as needed.
n_features = len(numeric_cols)
cols = 3
rows = math.ceil(n_features / cols)

plt.figure(figsize=(cols*5, rows*4))

for plot_idx, feature in enumerate(numeric_cols):
    # Values of this feature, one series per cluster (same order as `clusters`).
    data = [
        train_ds_original.loc[train_ds_original["personality_cluster"] == cls, feature]
        for cls in clusters
    ]

    # subplot positions are 1-based
    plt.subplot(rows, cols, plot_idx + 1)
    plt.boxplot(
        data,
        labels=clusters,
        patch_artist=True,
        boxprops={"facecolor": "#01519a", "alpha": 0.15},
        medianprops={"color": "#01519a"},
    )

    plt.title(feature)
    plt.xlabel("Cluster")
    plt.ylabel("Value")
    plt.grid(True)

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

x_feature = "focus_intensity"
y_feature = "consistency_score"

clusters = train_ds_original["personality_cluster"].unique()

plt.figure(figsize=(8, 6))

# One scatter call per cluster so each gets its own legend entry.
# No explicit colour is passed: matplotlib's colour cycle then assigns a
# different colour to every cluster. (A single fixed colour for all clusters
# made the legend entries indistinguishable.)
for cls in clusters:
    subset = train_ds_original[train_ds_original["personality_cluster"] == cls]
    plt.scatter(
        subset[x_feature],
        subset[y_feature],
        alpha=0.4,
        label=cls,
    )

plt.xlabel(x_feature)
plt.ylabel(y_feature)
plt.title("Scatter Plot of Focus Intensity vs Consistency Score")
plt.legend()
plt.grid(True)
plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt
import numpy as np
import math
from sklearn.preprocessing import LabelEncoder

# Encode the target as integers so it can be placed on the y-axis.
le_scatter = LabelEncoder()
y_encoded_scatter = le_scatter.fit_transform(
    train_ds_original["personality_cluster"]
)

# Select numeric features (exclude ID)
numeric_cols = train_ds_original.select_dtypes(
    include=['int64', 'float64']
).columns
numeric_cols = [col for col in numeric_cols if col != "participant_id"]

X_data = train_ds_original[numeric_cols]

# Grid layout: three panels per row.
n_features = len(numeric_cols)
cols = 3
rows = math.ceil(n_features / cols)

# Seeded generator so the jittered figure is identical on every run.
jitter_rng = np.random.default_rng(42)

plt.figure(figsize=(cols * 5, rows * 4))

for i, feature in enumerate(numeric_cols, 1):
    plt.subplot(rows, cols, i)

    # Small vertical jitter so points of the same class do not overplot on one line.
    jitter = jitter_rng.normal(0, 0.05, size=len(y_encoded_scatter))

    plt.scatter(
        X_data[feature],
        y_encoded_scatter + jitter,
        alpha=0.4,
        color="#01519a"   # iiitbblue
    )

    # Show the original class names at the integer tick positions.
    plt.yticks(
        np.arange(len(le_scatter.classes_)),
        le_scatter.classes_
    )

    plt.xlabel(feature)
    plt.ylabel("Personality Cluster")
    plt.title(f"{feature} vs Target")
    plt.grid(True)

plt.tight_layout()
plt.show()


In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import gaussian_kde

# Non-missing values of the feature as a NumPy array.
data = train_ds_original["consistency_score"].dropna().values

# Gaussian kernel density estimate evaluated on 300 evenly spaced points
# spanning the observed range.
kde = gaussian_kde(data)
x_vals = np.linspace(data.min(), data.max(), 300)
kde_vals = kde(x_vals)

fig, ax = plt.subplots(figsize=(8, 5))

# Density-normalised histogram (iiitbblue) so it shares a scale with the KDE curve.
ax.hist(
    data,
    bins=30,
    density=True,
    color="#01519a",
    edgecolor="black",
    label="Histogram",
)

# KDE curve (iiitbgray) drawn over the histogram.
ax.plot(x_vals, kde_vals, color="#838280", linewidth=2, label="Density")

ax.set_xlabel("Consistency Score")
ax.set_ylabel("Density")
ax.set_title("Distribution of Consistency Score")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


# Model Training Arc

# 1. MLP

In [None]:
!pip uninstall -y protobuf
!pip install protobuf==3.20.3
!pip install --upgrade tensorflow


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from tensorflow.keras.utils import to_categorical
import numpy as np

target_column = 'personality_cluster'
y = train_ds_original[target_column]

# Stratified 80/20 hold-out split of the feature frame.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    train_ds,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Standardise using statistics of the training part only, then apply to both parts.
scaler = StandardScaler().fit(X_train_split)
X_train_split_scaled = scaler.transform(X_train_split)
X_val_split_scaled = scaler.transform(X_val_split)

# Integer-encode the labels; the encoder is fitted on the training labels.
le = LabelEncoder().fit(y_train_split)
y_train_encoded = le.transform(y_train_split)
y_val_encoded = le.transform(y_val_split)

# Number of distinct classes seen in the training labels.
num_classes = len(le.classes_)

# One-hot targets for the categorical cross-entropy loss.
y_train_onehot = to_categorical(y_train_encoded, num_classes)
y_val_onehot = to_categorical(y_val_encoded, num_classes)


In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping

from sklearn.metrics import accuracy_score, classification_report

# Two-hidden-layer MLP (256 -> 128) with batch norm after the first layer,
# dropout after both, and a softmax output over the classes.
n_inputs = X_train_split_scaled.shape[1]
model = Sequential([
    Dense(256, activation='relu', input_shape=(n_inputs,)),
    BatchNormalization(),
    Dropout(0.3),

    Dense(128, activation='relu'),
    Dropout(0.2),

    Dense(num_classes, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# Stop after 10 epochs without val_loss improvement and restore the best weights.
early_stop = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

history = model.fit(
    X_train_split_scaled,
    y_train_onehot,
    validation_data=(X_val_split_scaled, y_val_onehot),
    epochs=150,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1,
)

# Hold-out evaluation: the most probable class per row vs. the encoded labels.
val_preds = model.predict(X_val_split_scaled)
val_preds_labels = np.argmax(val_preds, axis=1)

print("MLP Validation Accuracy:", accuracy_score(y_val_encoded, val_preds_labels))
print(classification_report(y_val_encoded, val_preds_labels))

In [None]:
import matplotlib.pyplot as plt

# Per-epoch accuracy curves recorded by model.fit().
fig, ax = plt.subplots()

# Training accuracy (iiitbblue)
ax.plot(history.history['accuracy'], label='Train Accuracy', color='#01519a', linewidth=2)

# Validation accuracy (iiitbgray)
ax.plot(history.history['val_accuracy'], label='Validation Accuracy', color='#838280', linewidth=2)

ax.set_xlabel("Epochs")
ax.set_ylabel("Accuracy")
ax.set_title("MLP Training vs Validation Accuracy")
ax.legend()
ax.grid(True)
fig.tight_layout()
plt.show()


In [None]:
# Test features: remove the identifier column (ignored if it is absent).
X_test_final = test_ds_original.drop(columns=['participant_id'], errors='ignore')

# Standardise with the scaler that was fitted on the training split.
X_test_final_scaled = scaler.transform(X_test_final)

# Class probabilities -> most probable class index -> original label.
test_preds = model.predict(X_test_final_scaled)
test_preds_labels = np.argmax(test_preds, axis=1)
test_preds_clusters = le.inverse_transform(test_preds_labels)

# Two-column submission frame: identifier and predicted cluster.
submission = pd.DataFrame(
    {
        "participant_id": test_ds_original['participant_id'],
        "personality_cluster": test_preds_clusters,
    }
)

submission.to_csv("mlp_submission_26_14_44.csv", index=False)
print("‚úÖ Submission file saved as mlp_submission_26_14_44.csv")

In [None]:
# Relative frequency of each target class, used to check how balanced the classes are.
y.value_counts(normalize=True)

Observation: the target column is heavily imbalanced.

In [None]:
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np

# 'balanced' weights are inversely proportional to class frequency in the
# training labels; Keras expects them as {class_index: weight}.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_encoded),
    y=y_train_encoded
)
class_weights = dict(enumerate(class_weights))

# Wider/deeper MLP than the baseline: 512 -> 256 -> 128 with batch norm and dropout.
model = Sequential([
    Dense(512, activation='relu', input_shape=(X_train_split_scaled.shape[1],)),
    BatchNormalization(),
    Dropout(0.4),

    Dense(256, activation='relu'),
    BatchNormalization(),
    Dropout(0.3),

    Dense(128, activation='relu'),
    Dropout(0.2),

    Dense(num_classes, activation='softmax')
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.0007),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Stop after 15 epochs without val_loss improvement and restore the best weights.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=15,
    restore_best_weights=True
)

# class_weight re-weights the loss so that rare classes count more.
history = model.fit(
    X_train_split_scaled, y_train_onehot,
    validation_data=(X_val_split_scaled, y_val_onehot),
    epochs=150,
    batch_size=32,
    class_weight=class_weights,
    callbacks=[early_stop],
    verbose=1
)

# Test features: remove the identifier column (ignored if it is absent).
X_test_final = test_ds_original.drop(columns=['participant_id'], errors='ignore')

# Apply the SAME scaler used for training
X_test_final_scaled = scaler.transform(X_test_final)

# Class probabilities -> most probable class index -> original label.
test_preds = model.predict(X_test_final_scaled)
test_preds_labels = np.argmax(test_preds, axis=1)

test_preds_clusters = le.inverse_transform(test_preds_labels)

submission = pd.DataFrame({
    "participant_id": test_ds_original['participant_id'],
    "personality_cluster": test_preds_clusters
})

# The confirmation message must name the file that is actually written.
submission.to_csv("mlp_submission_26_14_57.csv", index=False)
print("‚úÖ Submission file saved as mlp_submission_26_14_57.csv")

In [None]:
# Accuracy values recorded at the final training epoch.
# NOTE(review): the fit above uses EarlyStopping(restore_best_weights=True), so the
# model's weights may come from an earlier epoch than these last-epoch figures — confirm.
print("Train Accuracy:", history.history['accuracy'][-1])
print("Val Accuracy:", history.history['val_accuracy'][-1])


# 2. SVM

In [None]:
from sklearn.preprocessing import StandardScaler

# `test_ds` is referenced by this and later cells but is never defined in the
# notebook; alias it to the loaded test frame so Restart & Run All works.
test_ds = test_ds_original

# Features and target (full training set; the SVM uses cross-validation instead of a hold-out).
X_train = train_ds_original.drop(columns=['participant_id', 'personality_cluster'])
y_train = train_ds_original['personality_cluster']

X_test = test_ds_original.drop(columns=['participant_id'])

# Scaling: statistics come from the full training set and are reused for the test set.
# NOTE: this rebinds the global `scaler` that the MLP cells fitted on the 80% split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)


In [None]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# RBF-kernel search space: regularisation strength C x kernel width gamma.
param_grid = {
    'C': [0.1, 1, 10, 50, 100],
    'gamma': [0.1, 0.01, 0.001, 'scale'],
    'kernel': ['rbf']
}

# 5-fold cross-validated grid search using all CPU cores.
svm_grid = GridSearchCV(
    SVC(probability=True),
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2
)

svm_grid.fit(X_train_scaled, y_train)

best_svm = svm_grid.best_estimator_
print("Best SVM Params:", svm_grid.best_params_)
print("Best CV Accuracy:", svm_grid.best_score_)


svm_test_preds = best_svm.predict(X_test_scaled)

# Identifiers come from the loaded test frame (`test_ds` is not defined in the notebook).
submission = pd.DataFrame({
    "participant_id": test_ds_original["participant_id"],
    "personality_cluster": svm_test_preds
})

submission.to_csv("svm_submission.csv", index=False)
print("‚úÖ SVM submission saved")


In [None]:
from sklearn.utils.class_weight import compute_class_weight
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np

# Balanced class weights as {class_index: weight}.
# NOTE(review): these are computed but not passed to model.fit() below — confirm
# whether this run was meant to be unweighted.
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train_encoded),
    y=y_train_encoded
)
class_weights = dict(enumerate(class_weights))

# Smaller MLP: 128 -> 64 with batch norm after the first layer and dropout after both.
model = Sequential([
    Dense(128, activation='relu', input_shape=(X_train_split_scaled.shape[1],)),
    BatchNormalization(),
    Dropout(0.30),

    Dense(64, activation='relu'),
    Dropout(0.20),

    Dense(num_classes, activation='softmax')
])


model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)
early_stop = EarlyStopping(monitor='val_loss', patience=12, restore_best_weights=True)

history = model.fit(
    X_train_split_scaled, y_train_onehot,
    validation_data=(X_val_split_scaled, y_val_onehot),
    epochs=150,
    batch_size=32,
    callbacks=[early_stop],
    verbose=1
)

# The network was trained on X_train_split standardised with that split's own
# statistics. The global `scaler` / `X_test_scaled` have since been refit on the
# full training set by the SVM cells, so the test set is scaled here with a
# scaler fitted on the same split the model saw.
mlp_scaler = StandardScaler().fit(X_train_split)
X_test_mlp_scaled = mlp_scaler.transform(
    test_ds_original.drop(columns=['participant_id'])
)

# Class probabilities -> most probable class index -> original label.
test_preds = model.predict(X_test_mlp_scaled)
test_preds_numeric = np.argmax(test_preds, axis=1)
test_preds_clusters = le.inverse_transform(test_preds_numeric)

# Identifiers come from the loaded test frame (`test_ds` is not defined in the notebook).
submission = pd.DataFrame({
    "participant_id": test_ds_original["participant_id"],
    "personality_cluster": test_preds_clusters
})

submission.to_csv("mlp_submission_v3.csv", index=False)
print("‚úÖ MLP submission saved")


In [None]:
# Predicted labels from the two previously saved submissions.
mlp_preds = pd.read_csv("mlp_submission_v3.csv")["personality_cluster"]
svm_preds = pd.read_csv("svm_submission.csv")["personality_cluster"]

# With only two voters, every disagreement is a tie, and ties are resolved in
# favour of the MLP. The combined prediction therefore always equals the MLP
# prediction; state that directly instead of looping over identical branches,
# and report how often the two models agree.
agreement_rate = (mlp_preds.to_numpy() == svm_preds.to_numpy()).mean()
print(f"MLP/SVM agreement rate: {agreement_rate:.3f}")

final_preds = mlp_preds.tolist()

# Identifiers come from the loaded test frame (`test_ds` is not defined in the notebook).
submission = pd.DataFrame({
    "participant_id": test_ds_original["participant_id"],
    "personality_cluster": final_preds
})

submission.to_csv("svm_mlp_ensemble.csv", index=False)
print("‚úÖ Ensemble submission saved")


In [None]:
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#             Applied PCA and then SVM
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
import pandas as pd

# Separate features and target
X_train = train_ds_original.drop(columns=['participant_id', 'personality_cluster'])
y_train = train_ds_original['personality_cluster']

# Test features come from the loaded test frame (`test_ds` is not defined in the notebook).
X_test = test_ds_original.drop(columns=['participant_id'])

# Standardise before PCA; statistics come from the training set only.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled  = scaler.transform(X_test)

# Keep as many principal components as needed to explain 95% of the variance.
pca = PCA(n_components=0.95, random_state=42)

X_train_pca = pca.fit_transform(X_train_scaled)
X_test_pca  = pca.transform(X_test_scaled)

print("Original features:", X_train.shape[1])
print("Reduced features after PCA:", X_train_pca.shape[1])

# RBF-kernel search space on the PCA-reduced features.
param_grid = {
    'C': [0.5, 1, 5, 10, 50],
    'gamma': ['scale', 0.01, 0.001],
    'kernel': ['rbf']
}

# 5-fold cross-validated grid search using all CPU cores.
svm_grid = GridSearchCV(
    SVC(probability=True),
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2
)

svm_grid.fit(X_train_pca, y_train)

best_svm = svm_grid.best_estimator_

print("Best SVM Params:", svm_grid.best_params_)
print("Best CV Accuracy:", svm_grid.best_score_)

svm_test_preds = best_svm.predict(X_test_pca)

submission = pd.DataFrame({
    "participant_id": test_ds_original["participant_id"],
    "personality_cluster": svm_test_preds
})

submission.to_csv("svm_pca_submission.csv", index=False)
print("‚úÖ svm_pca_submission.csv saved")


In [None]:
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#             Using Grid Search CV
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

# RBF-kernel search space for the class-weighted SVM.
param_grid = {
    'C': [0.5, 1, 2, 5, 10, 50],
    'gamma': [0.05, 0.01, 0.005, 'scale'],
    'kernel': ['rbf']
}

# class_weight='balanced' re-weights C per class inversely to class frequency;
# probability=True enables predict_proba on the fitted estimator.
svm_grid = GridSearchCV(
    SVC(class_weight='balanced', probability=True),
    param_grid,
    cv=5,
    n_jobs=-1,
    verbose=2
)

svm_grid.fit(X_train_scaled, y_train)

best_svm = svm_grid.best_estimator_

print("Best Params:", svm_grid.best_params_)
print("Best CV Accuracy:", svm_grid.best_score_)

svm_preds = best_svm.predict(X_test_scaled)

# Identifiers come from the loaded test frame (`test_ds` is not defined in the notebook).
submission = pd.DataFrame({
    "participant_id": test_ds_original["participant_id"],
    "personality_cluster": svm_preds
})

submission.to_csv("svm_weighted_submission.csv", index=False)
print("‚úÖ svm_weighted_submission.csv saved")


In [None]:
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#            Applied Poly
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


# Polynomial-kernel search space: regularisation strength C x polynomial degree.
param_grid_poly = {
    'C': [1, 5, 10],
    'degree': [2, 3, 4],
    'gamma': ['scale'],
    'kernel': ['poly']
}

# 5-fold cross-validated grid search over class-weighted polynomial SVMs.
svm_poly = GridSearchCV(
    SVC(class_weight='balanced'),
    param_grid_poly,
    cv=5,
    n_jobs=-1,
    verbose=2
)

svm_poly.fit(X_train_scaled, y_train)

best_poly = svm_poly.best_estimator_

poly_preds = best_poly.predict(X_test_scaled)

# Identifiers come from the loaded test frame (`test_ds` is not defined in the notebook).
submission = pd.DataFrame({
    "participant_id": test_ds_original["participant_id"],
    "personality_cluster": poly_preds
})

submission.to_csv("svm_poly_submission.csv", index=False)


# 3. Ensemble SVM and MLP

In [None]:

# SVM probabilities: X_test_scaled uses the full-training-set scaler, which is
# the same scaling the SVM was fitted with.
svm_probs = best_svm.predict_proba(X_test_scaled)

# The MLP in `model` was trained on X_train_split standardised with that split's
# own statistics, so its test inputs are scaled with a scaler fitted on the
# same split rather than with the SVM's full-data scaler.
mlp_scaler = StandardScaler().fit(X_train_split)
X_test_mlp_scaled = mlp_scaler.transform(
    test_ds_original.drop(columns=['participant_id'])
)
mlp_probs = model.predict(X_test_mlp_scaled)

# Equal-weight soft vote, then most probable class index -> original label.
# NOTE(review): assumes best_svm.classes_ is ordered like le.classes_ — confirm.
final_probs = 0.5 * svm_probs + 0.5 * mlp_probs
final_preds_numeric = np.argmax(final_probs, axis=1)
final_preds = le.inverse_transform(final_preds_numeric)

# Identifiers come from the loaded test frame (`test_ds` is not defined in the notebook).
submission = pd.DataFrame({
    "participant_id": test_ds_original["participant_id"],
    "personality_cluster": final_preds
})

submission.to_csv("svm_mlp_prob_ensemble.csv", index=False)
print("‚úÖ svm_mlp_prob_ensemble.csv saved")

Understood that we can't go ahead with SVM.

# 4. Coming back to MLP

In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import numpy as np

X = train_ds_original.drop(columns=['participant_id', 'personality_cluster'])
y = train_ds_original['personality_cluster']

# Test features and identifiers come from the loaded test frame
# (`test_ds` is not defined in the notebook).
X_test = test_ds_original.drop(columns=['participant_id'])
test_ids = test_ds_original['participant_id']

# Encode labels (fitted on all training labels)
le = LabelEncoder()
y_encoded = le.fit_transform(y)

# Stratified 80/20 train/validation split on the encoded labels
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Scale: statistics from the training split, applied to validation and test
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_split)
X_val_scaled   = scaler.transform(X_val_split)
X_test_scaled  = scaler.transform(X_test)

# One-hot for TensorFlow
num_classes = len(np.unique(y_encoded))
y_train_onehot = to_categorical(y_train_split, num_classes)
y_val_onehot   = to_categorical(y_val_split, num_classes)



In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Three-hidden-layer MLP (128 -> 96 -> 64); batch norm after the first two
# layers, dropout after all three, softmax output over the classes.
n_inputs = X_train_scaled.shape[1]
model = Sequential([
    Dense(128, activation='relu', input_shape=(n_inputs,)),
    BatchNormalization(),
    Dropout(0.30),

    Dense(96, activation='relu'),
    BatchNormalization(),
    Dropout(0.25),

    Dense(64, activation='relu'),
    Dropout(0.20),

    Dense(num_classes, activation='softmax'),
])

model.compile(
    loss='categorical_crossentropy',
    optimizer=Adam(learning_rate=0.001),
    metrics=['accuracy'],
)

# Stop after 15 stagnant epochs (restoring the best weights) ...
early_stop = EarlyStopping(monitor='val_loss', patience=15, restore_best_weights=True)

# ... and halve the learning rate after 6 stagnant epochs, down to 1e-5.
lr_reduce = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=6, min_lr=1e-5)

history = model.fit(
    X_train_scaled,
    y_train_onehot,
    validation_data=(X_val_scaled, y_val_onehot),
    epochs=300,
    batch_size=32,
    callbacks=[early_stop, lr_reduce],
    verbose=1,
)



In [None]:
from sklearn.metrics import accuracy_score, classification_report

# Hold-out evaluation: the most probable class per row vs. the encoded labels.
val_preds = model.predict(X_val_scaled)
val_preds_labels = np.argmax(val_preds, axis=1)

print("MLP Validation Accuracy:", accuracy_score(y_val_split, val_preds_labels))
print(classification_report(y_val_split, val_preds_labels))

# Test predictions: class probabilities -> class index -> original label.
test_preds = model.predict(X_test_scaled)
test_preds_numeric = np.argmax(test_preds, axis=1)
test_preds_clusters = le.inverse_transform(test_preds_numeric)

submission = pd.DataFrame({
    "participant_id": test_ids,
    "personality_cluster": test_preds_clusters
})

# The confirmation message must name the file that is actually written.
submission.to_csv("mlp_submission_v4.csv", index=False)
print("‚úÖ mlp_submission_v4.csv saved")


In [None]:
# Accuracy values recorded at the final training epoch.
# NOTE(review): the fit above uses EarlyStopping(restore_best_weights=True), so the
# model's weights may come from an earlier epoch than these last-epoch figures — confirm.
print("Train Accuracy:", history.history['accuracy'][-1])
print("Val Accuracy:", history.history['val_accuracy'][-1])


In [None]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split
from tensorflow.keras.utils import to_categorical
import numpy as np


# All training rows: feature columns and the target column.
X_full = train_ds_original.drop(['participant_id', 'personality_cluster'], axis=1)
y_full = train_ds_original['personality_cluster']

# Integer-encode the target labels.
le = LabelEncoder().fit(y_full)
y_encoded = le.transform(y_full)

# Standardise with statistics of the FULL training set.
scaler = StandardScaler().fit(X_full)
X_full_scaled = scaler.transform(X_full)

# Test features (identifier removed), scaled with the SAME scaler.
X_test = test_ds_original.drop(['participant_id'], axis=1)
X_test_scaled = scaler.transform(X_test)


In [None]:
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
#           Feature Engineering
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%


# X_full['focus_x_consistency'] = X_full['focus_intensity'] * X_full['consistency_score']
# X_test['focus_x_consistency'] = X_test['focus_intensity'] * X_test['consistency_score']

# X_full['support_x_focus'] = X_full['support_environment_score'] * X_full['focus_intensity']
# X_test['support_x_focus'] = X_test['support_environment_score'] * X_test['focus_intensity']


In [None]:
from sklearn.model_selection import StratifiedKFold
# 5-fold stratified splitter; shuffling with a fixed seed makes the fold assignment repeatable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)


In [None]:
def build_model(input_dim, num_classes):
    """Create and compile the 128-96-64 MLP classifier.

    Args:
        input_dim: number of input features per sample.
        num_classes: number of output classes (width of the softmax layer).

    Returns:
        A compiled Keras ``Sequential`` model using Adam (lr=0.001),
        categorical cross-entropy loss and the accuracy metric.
    """
    network = Sequential()

    # Hidden block 1: dense -> batch norm -> dropout
    network.add(Dense(128, activation='relu', input_shape=(input_dim,)))
    network.add(BatchNormalization())
    network.add(Dropout(0.3))

    # Hidden block 2: dense -> batch norm -> dropout
    network.add(Dense(96, activation='relu'))
    network.add(BatchNormalization())
    network.add(Dropout(0.25))

    # Hidden block 3: dense -> dropout (no batch norm)
    network.add(Dense(64, activation='relu'))
    network.add(Dropout(0.2))

    # Output: class probabilities
    network.add(Dense(num_classes, activation='softmax'))

    network.compile(
        loss='categorical_crossentropy',
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        metrics=['accuracy'],
    )
    return network


In [None]:
num_classes = len(np.unique(y_encoded))

# Running sum of per-fold test probabilities, shape (n_test_rows, num_classes).
test_prob_sum = np.zeros((X_test_scaled.shape[0], num_classes))

# Take the fold count from the splitter itself so the average below stays
# correct if `skf` is ever configured with a different n_splits.
n_folds = skf.get_n_splits()

for fold, (train_idx, val_idx) in enumerate(skf.split(X_full_scaled, y_encoded)):
    print(f"\n--- Fold {fold+1} ---")

    X_tr = X_full_scaled[train_idx]
    X_va = X_full_scaled[val_idx]

    y_tr = y_encoded[train_idx]
    y_va = y_encoded[val_idx]

    y_tr_oh = to_categorical(y_tr, num_classes)
    y_va_oh = to_categorical(y_va, num_classes)

    # Fresh model per fold; the held-out fold drives early stopping.
    model = build_model(X_tr.shape[1], num_classes)

    early_stop = EarlyStopping(monitor='val_loss', patience=12, restore_best_weights=True)

    model.fit(
        X_tr, y_tr_oh,
        validation_data=(X_va, y_va_oh),
        epochs=200,
        batch_size=32,
        callbacks=[early_stop],
        verbose=0
    )

    # Predict on Kaggle test set
    test_fold_probs = model.predict(X_test_scaled)
    test_prob_sum += test_fold_probs

# Average the fold probabilities, then class index -> original label.
final_test_probs = test_prob_sum / n_folds
final_test_numeric = np.argmax(final_test_probs, axis=1)
final_test_clusters = le.inverse_transform(final_test_numeric)

# Identifiers come from the loaded test frame (`test_ds` is not defined in the notebook).
submission = pd.DataFrame({
    "participant_id": test_ds_original["participant_id"],
    "personality_cluster": final_test_clusters
})
# greater than 1m. 1m 12s for v2 (without feature)
# v3 with feature engineering
# The confirmation message must name the file that is actually written.
submission.to_csv("nn_kfold_ensemble_v2.csv", index=False)
print("‚úÖ nn_kfold_ensemble_v2.csv saved")

Trying MULTI-RUN K-FOLD NEURAL NETWORK AVERAGING (FINAL STAGE)

In [None]:
def build_model(input_dim, num_classes, seed=None):
    """Create and compile the 128-96-64 MLP classifier, optionally seeded.

    This definition replaces the earlier two-argument ``build_model`` in the
    notebook namespace. ``seed`` is optional so that cells written against the
    earlier signature (``build_model(input_dim, num_classes)``) keep working
    after this cell has run.

    Args:
        input_dim: number of input features per sample.
        num_classes: number of output classes (width of the softmax layer).
        seed: if not None, passed to ``tf.keras.utils.set_random_seed`` before
            the layers are created; if None, no seed is set.

    Returns:
        A compiled Keras ``Sequential`` model using Adam (lr=0.001),
        categorical cross-entropy loss and the accuracy metric.
    """
    if seed is not None:
        tf.keras.utils.set_random_seed(seed)

    model = Sequential([
        Dense(128, activation='relu', input_shape=(input_dim,)),
        BatchNormalization(),
        Dropout(0.30),

        Dense(96, activation='relu'),
        BatchNormalization(),
        Dropout(0.25),

        Dense(64, activation='relu'),
        Dropout(0.20),

        Dense(num_classes, activation='softmax')
    ])

    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
        loss='categorical_crossentropy',
        metrics=['accuracy']
    )

    return model


In [None]:
from sklearn.model_selection import StratifiedKFold

RUNS = 5   # number of independent NN runs
FOLDS = 5  # k-fold inside each run

num_classes = len(np.unique(y_encoded))

# Sum over runs of each run's fold-averaged test probabilities.
final_test_prob_sum = np.zeros((X_test_scaled.shape[0], num_classes))

for run in range(RUNS):
    print(f"\n================ RUN {run+1} ================")

    # A different shuffle seed per run gives each run different fold boundaries.
    skf = StratifiedKFold(
        n_splits=FOLDS,
        shuffle=True,
        random_state=42 + run
    )

    test_prob_sum = np.zeros((X_test_scaled.shape[0], num_classes))

    for fold, (train_idx, val_idx) in enumerate(skf.split(X_full_scaled, y_encoded)):
        print(f"  Fold {fold+1}")

        X_tr = X_full_scaled[train_idx]
        X_va = X_full_scaled[val_idx]

        y_tr = y_encoded[train_idx]
        y_va = y_encoded[val_idx]

        y_tr_oh = to_categorical(y_tr, num_classes)
        y_va_oh = to_categorical(y_va, num_classes)

        # Distinct seed for every (run, fold) pair.
        model = build_model(X_tr.shape[1], num_classes, seed=100 + run*10 + fold)

        early_stop = EarlyStopping(
            monitor='val_loss',
            patience=12,
            restore_best_weights=True
        )

        model.fit(
            X_tr, y_tr_oh,
            validation_data=(X_va, y_va_oh),
            epochs=200,
            batch_size=32,
            callbacks=[early_stop],
            verbose=0
        )

        fold_test_probs = model.predict(X_test_scaled)
        test_prob_sum += fold_test_probs

    # Average this run's folds, then add to the cross-run total.
    test_prob_sum /= FOLDS
    final_test_prob_sum += test_prob_sum

# Average over runs, then class index -> original label.
final_test_probs = final_test_prob_sum / RUNS
final_test_numeric = np.argmax(final_test_probs, axis=1)
final_test_clusters = le.inverse_transform(final_test_numeric)

# Take the identifiers straight from the loaded test frame rather than from
# `test_ids`, which only exists if an earlier, unrelated experiment cell ran.
submission = pd.DataFrame({
    "participant_id": test_ds_original["participant_id"],
    "personality_cluster": final_test_clusters
})
# 5m 26s
submission.to_csv("nn_multirun_ensemble.csv", index=False)
print("‚úÖ nn_multirun_ensemble.csv saved")


# 5. Top 5 Model training Part 1

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score
import warnings
warnings.filterwarnings("ignore")


# ============================================================
# 1. PREPROCESSING STRATEGIES
# ============================================================

def get_preprocessor(strategy, categorical_cols, numerical_cols, binary_cols):
    """Build the preprocessing step for a named strategy.

    Args:
        strategy: "standard", "minmax" or "robust" select the numeric scaler;
            "labelencode_only" and any other value yield a sentinel string.
        categorical_cols: columns to one-hot encode.
        numerical_cols: columns to scale.
        binary_cols: columns passed through unchanged.

    Returns:
        A ``ColumnTransformer`` for the three scaling strategies; the string
        "labelencode_only" for that strategy (the caller handles it
        specially); the string "none" for anything else.
    """
    # Sentinel strategies: no transformer is built.
    if strategy == "labelencode_only":
        return "labelencode_only"   # special handling
    if strategy not in ("standard", "minmax", "robust"):
        return "none"

    # Scaler for the numeric columns, chosen by strategy name.
    if strategy == "standard":
        numeric_scaler = StandardScaler()
    elif strategy == "minmax":
        numeric_scaler = MinMaxScaler()
    else:
        numeric_scaler = RobustScaler()

    # Every scaling strategy uses the same dense one-hot encoder, which
    # ignores categories not seen during fit.
    onehot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    steps = [
        ("num", numeric_scaler, numerical_cols),
        ("cat", onehot, categorical_cols),
        ("bin", "passthrough", binary_cols),
    ]
    return ColumnTransformer(transformers=steps)


# ============================================================
# 2. MAIN FUNCTION ‚Äì SVM + MLP TESTING (WITH TOP 5 MODELS)
# ============================================================

def evaluate_models(train_df, test_df):
    """Compare SVM and MLP classifiers across preprocessing strategies.

    The training frame is split 80/20 (stratified, ``random_state=42``).
    For every preprocessing strategy an RBF ``SVC`` is fitted for each C value
    and an ``MLPClassifier`` for each hidden-layer layout. Models are ranked by
    macro F1 on the validation split, and the five best each write a
    submission CSV named ``top{rank}_{name}.csv``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain "participant_id", "personality_cluster" and the feature
        columns listed below.
    test_df : pandas.DataFrame
        Must contain "participant_id" and the same feature columns.

    Returns
    -------
    list of dict
        The five best results, best first. Each dict has the keys "name",
        "f1", "model", "preprocessor" and "label_encoders".
    """
    # ---------------------------------------------
    # Step A: Split X and y
    # ---------------------------------------------
    X = train_df.drop(["participant_id", "personality_cluster"], axis=1)
    y = train_df["personality_cluster"]

    le_target = LabelEncoder()
    y_encoded = le_target.fit_transform(y)

    # Categorical & numerical groups
    categorical_cols = ["cultural_background"]

    numerical_cols = [
        "age_group", "upbringing_influence", "focus_intensity",
        "consistency_score", "support_environment_score"
    ]

    binary_cols = [
        "identity_code", "external_guidance_usage", "hobby_engagement_level",
        "physical_activity_index", "creative_expression_index", "altruism_score"
    ]

    # Split
    X_train, X_val, y_train, y_val = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )

    preprocessing_strategies = ["standard", "minmax", "robust", "labelencode_only", "none"]
    svm_C_values = [1, 10, 50]

    # Expanded MLP sizes
    mlp_sizes = [
        (64,),
        (128,),
        (256,),
        (64, 32),
        (128, 64),
        (256, 128),
        (128, 128),
        (256, 128, 64)
    ]

    # Track ALL models
    model_results = []

    # ============================================================
    # 3. LOOP OVER PREPROCESSORS + SVM + MLP
    # ============================================================

    for strategy in preprocessing_strategies:

        print("\n\n==============================================")
        print(f" Testing Preprocessing Strategy: {strategy} ")
        print("==============================================")

        # --------------------------------------
        # Prepare preprocessing
        # --------------------------------------
        # The encoders are stored with every result so the test-time transform
        # never depends on a variable left over from a previous iteration.
        label_encoders = None

        if strategy == "labelencode_only":
            label_encoders = {}
            Xt = X_train.copy()
            Xv = X_val.copy()

            for col in categorical_cols:
                le = LabelEncoder()
                Xt[col] = le.fit_transform(Xt[col])
                Xv[col] = le.transform(Xv[col])
                label_encoders[col] = le

            fitted_preprocessor = "labelencode_only"

        elif strategy == "none":
            Xt = X_train.copy()
            Xv = X_val.copy()
            fitted_preprocessor = "none"

        else:
            # A fresh transformer per strategy, fitted on the training split only.
            fitted_preprocessor = get_preprocessor(
                strategy, categorical_cols, numerical_cols, binary_cols
            )
            Xt = fitted_preprocessor.fit_transform(X_train)
            Xv = fitted_preprocessor.transform(X_val)

        # ============================================================
        # SVM MODELS
        # ============================================================

        for C in svm_C_values:
            svm = SVC(C=C, kernel="rbf", gamma="scale")
            svm.fit(Xt, y_train)

            preds = svm.predict(Xv)
            f1 = f1_score(y_val, preds, average="macro")
            print(f"SVM (strategy={strategy}, C={C}) ‚Üí F1={f1:.4f}")

            model_results.append({
                "name": f"SVM_C{C}_{strategy}",
                "f1": f1,
                "model": svm,
                "preprocessor": fitted_preprocessor,
                "label_encoders": label_encoders
            })

        # ============================================================
        # MLP MODELS
        # ============================================================

        for layers in mlp_sizes:
            mlp = MLPClassifier(
                hidden_layer_sizes=layers,
                max_iter=1200,
                activation="relu",
                solver="adam",
                learning_rate_init=0.001,
                # Fixed seed: without it weight initialisation and batch order
                # differ on every run, so the top-5 ranking was not reproducible.
                random_state=42
            )

            mlp.fit(Xt, y_train)
            preds = mlp.predict(Xv)
            f1 = f1_score(y_val, preds, average="macro")

            print(f"MLP (strategy={strategy}, layers={layers}) ‚Üí F1={f1:.4f}")

            model_results.append({
                "name": f"MLP_{layers}_{strategy}",
                "f1": f1,
                "model": mlp,
                "preprocessor": fitted_preprocessor,
                "label_encoders": label_encoders
            })

    # ============================================================
    # 4. SELECT TOP 5 MODELS
    # ============================================================

    model_results = sorted(model_results, key=lambda x: x["f1"], reverse=True)
    top5 = model_results[:5]

    print("\n========== TOP 5 MODELS ==========")
    for i, m in enumerate(top5, 1):
        print(f"{i}. {m['name']} ‚Üí F1 = {m['f1']:.4f}")

    # ============================================================
    # 5. GENERATE CSV FOR TOP 5 MODELS
    # ============================================================

    X_test = test_df.drop(["participant_id"], axis=1)
    test_ids = test_df["participant_id"]

    for rank, m in enumerate(top5, 1):
        model = m["model"]
        preproc = m["preprocessor"]
        label_encs = m["label_encoders"]

        # Apply the same preprocessing that the model was trained with.
        if preproc == "none":
            Xtst = X_test.copy()

        elif preproc == "labelencode_only":
            Xtst = X_test.copy()
            for col in categorical_cols:
                Xtst[col] = label_encs[col].transform(X_test[col])

        else:
            Xtst = preproc.transform(X_test)

        preds = model.predict(Xtst)
        preds_labels = le_target.inverse_transform(preds)

        filename = f"top{rank}_{m['name']}.csv"
        output = pd.DataFrame({
            "participant_id": test_ids,
            "personality_cluster": preds_labels
        })
        output.to_csv(filename, index=False)
        print(f"Saved {filename}")

    return top5


import pandas as pd

# Load the train/test CSVs used by this section.
train_df = pd.read_csv("/content/train.csv")
test_df = pd.read_csv("/content/test.csv")

# evaluate_models returns the five best result dicts (best first). Unpacking
# that list straight into two names raised "too many values to unpack", so
# read the best model's name and score from the first entry instead.
top5 = evaluate_models(train_df, test_df)
best_model_name, best_f1 = top5[0]["name"], top5[0]["f1"]

print("Best model:", best_model_name)
print("Best F1 score:", best_f1)

# 6. Top 5 Best Models - Best Kaggle Score

In [None]:
from tensorflow.keras.utils import to_categorical
import numpy as np

# One-hot encode the full target vector for the Keras models.
# y_encoded comes from an earlier cell of the notebook.
num_classes = len(np.unique(y_encoded))  # number of distinct encoded labels
y_full_onehot = to_categorical(y_encoded, num_classes=num_classes)

print("num_classes:", num_classes)
print("y_full_onehot shape:", y_full_onehot.shape)


In [None]:
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out of the scaled features; X_full_scaled and
# y_encoded come from earlier cells.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_full_scaled, y_encoded,
    test_size=0.2, random_state=42, stratify=y_encoded,
)

# One-hot for NN
y_train_onehot, y_val_onehot = (
    to_categorical(labels, num_classes)
    for labels in (y_train_split, y_val_split)
)


In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers import Adam

# Feed-forward classifier: three hidden ReLU layers (256 -> 128 -> 64) with
# batch normalisation after the first two and dropout after each, followed by
# a softmax layer with one unit per class.
model = Sequential([
    Dense(256, activation='relu', input_shape=(X_train_split.shape[1],)),
    BatchNormalization(),
    Dropout(0.35),

    Dense(128, activation='relu'),
    BatchNormalization(),
    Dropout(0.25),

    Dense(64, activation='relu'),
    Dropout(0.20),

    Dense(num_classes, activation='softmax')
])

# categorical_crossentropy matches the one-hot targets built in the split cell.
model.compile(
    optimizer=Adam(learning_rate=0.0008),
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

# Stop once validation loss has not improved for 12 epochs and roll back to
# the best weights seen.
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=12,
    restore_best_weights=True
)

history = model.fit(
    X_train_split,
    y_train_onehot,
    validation_data=(X_val_split, y_val_onehot),
    epochs=200,
    batch_size=64,
    callbacks=[early_stop],
    verbose=1
)

# X_test_scaled and `le` are defined in earlier cells of the notebook.
test_probs = model.predict(X_test_scaled)
test_preds_numeric = np.argmax(test_probs, axis=1)

# Convert back to Cluster labels
test_preds_clusters = le.inverse_transform(test_preds_numeric)

submission = pd.DataFrame({
    "participant_id": test_ds_original["participant_id"],
    "personality_cluster": test_preds_clusters
})

submission.to_csv("nn_submission_v1.csv", index=False)
# The message now names the file that is actually written above
# (it previously reported "submission_nn_final.csv").
print("‚úÖ nn_submission_v1.csv saved")



In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings("ignore")


# ============================================================
# 1. PREPROCESSING STRATEGIES
# ============================================================

def get_preprocessor(strategy, categorical_cols, numerical_cols, binary_cols):
    """
    Return an unfitted ColumnTransformer for a scaling strategy.

    "standard", "minmax" and "robust" scale ``numerical_cols`` with the
    matching scaler, one-hot encode ``categorical_cols`` and pass
    ``binary_cols`` through. For 'labelencode_only' the marker string
    "labelencode_only" is returned; any other strategy returns "none".
    """
    # String markers are resolved before any transformer is built.
    if strategy == "labelencode_only":
        return "labelencode_only"
    if strategy not in ("standard", "minmax", "robust"):
        return "none"

    scaler_by_strategy = {
        "standard": StandardScaler,
        "minmax": MinMaxScaler,
        "robust": RobustScaler,
    }
    scaler = scaler_by_strategy[strategy]()
    encode = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    return ColumnTransformer(
        transformers=[
            ("num", scaler, numerical_cols),
            ("cat", encode, categorical_cols),
            ("bin", "passthrough", binary_cols),
        ]
    )


# ============================================================
# 2. MAIN FUNCTION ‚Äì SVM + MLP TESTING
# ============================================================

def evaluate_models(train_df, test_df):
    """Grid-search SVM and MLP classifiers over five preprocessing strategies.

    The training frame is split 80/20 (stratified, ``random_state=42``).
    Every strategy is combined with each (C, gamma) pair for an RBF ``SVC``
    and each (layers, alpha, learning rate) triple for an ``MLPClassifier``.
    Candidates are ranked by macro F1 on the validation split, and the five
    best write ``mlp_submission_v10.csv`` to ``mlp_submission_v14.csv``.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain "participant_id", "personality_cluster" and the feature
        columns listed below.
    test_df : pandas.DataFrame
        Must contain "participant_id" and the same feature columns.

    Returns
    -------
    list of dict
        The five best results, best first, with keys "name", "f1", "model",
        "preprocessor" and "label_encoders".
    """
    id_col = "participant_id"
    target_col = "personality_cluster"

    # ---------------------------------------------
    # Features and encoded target
    # ---------------------------------------------
    X = train_df.drop([id_col, target_col], axis=1)
    le_target = LabelEncoder()
    y_encoded = le_target.fit_transform(train_df[target_col])

    categorical_cols = ["cultural_background"]
    numerical_cols = [
        "age_group", "upbringing_influence", "focus_intensity",
        "consistency_score", "support_environment_score",
    ]
    binary_cols = [
        "identity_code", "external_guidance_usage", "hobby_engagement_level",
        "physical_activity_index", "creative_expression_index", "altruism_score",
    ]

    X_train, X_val, y_train, y_val = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )

    # Hyper-parameter grids, flattened in the order they are evaluated.
    svm_grid = [
        (C, gamma)
        for C in (0.1, 1, 10, 50, 100)
        for gamma in (0.01, 0.05, 0.1, "scale")
    ]
    mlp_grid = [
        (layers, alpha, lr)
        for layers in ((64,), (128,), (256,), (128, 64), (256, 128), (256, 128, 64))
        for alpha in (1e-5, 1e-4, 1e-3)
        for lr in (0.001, 0.0005)
    ]

    model_results = []
    banner = "=============================================="

    def validation_f1(estimator, Xt, Xv):
        """Fit on the training split and return macro F1 on the validation split."""
        estimator.fit(Xt, y_train)
        return f1_score(y_val, estimator.predict(Xv), average="macro")

    # ============================================================
    # Strategy loop: preprocess once, then run both model families
    # ============================================================
    for strategy in ("standard", "minmax", "robust", "labelencode_only", "none"):

        print("\n" + banner)
        print(f" Testing Preprocessing Strategy: {strategy} ")
        print(banner)

        preprocessor = get_preprocessor(strategy, categorical_cols, numerical_cols, binary_cols)

        if preprocessor == "labelencode_only":
            # Integer-code the categorical columns; encoders are kept for test time.
            Xt, Xv = X_train.copy(), X_val.copy()
            label_encoders = {}
            for col in categorical_cols:
                encoder = LabelEncoder()
                Xt[col] = encoder.fit_transform(Xt[col])
                Xv[col] = encoder.transform(Xv[col])
                label_encoders[col] = encoder
            fitted_preprocessor = "labelencode_only"

        elif preprocessor == "none":
            Xt, Xv = X_train.copy(), X_val.copy()
            label_encoders = None
            fitted_preprocessor = "none"

        else:
            # Fit on the training split only, then transform both splits.
            label_encoders = None
            fitted_preprocessor = preprocessor.fit(X_train)
            Xt = fitted_preprocessor.transform(X_train)
            Xv = fitted_preprocessor.transform(X_val)

        # ----- SVM candidates -----
        for C, gamma in svm_grid:
            svm = SVC(
                C=C,
                gamma=gamma,
                kernel="rbf",
                class_weight="balanced",
                probability=True,
            )
            f1 = validation_f1(svm, Xt, Xv)

            print(f"SVM (strategy={strategy}, C={C}, gamma={gamma}) ‚Üí F1={f1:.4f}")

            model_results.append({
                "name": f"SVM_C{C}_G{gamma}_{strategy}",
                "f1": f1,
                "model": svm,
                "preprocessor": fitted_preprocessor,
                "label_encoders": label_encoders,
            })

        # ----- MLP candidates -----
        for layers, alpha, lr in mlp_grid:
            mlp = MLPClassifier(
                hidden_layer_sizes=layers,
                max_iter=1500,
                activation="relu",
                solver="adam",
                learning_rate_init=lr,
                alpha=alpha,
                early_stopping=True,
                n_iter_no_change=20,
                random_state=42,
            )
            f1 = validation_f1(mlp, Xt, Xv)

            print(
                f"MLP (strategy={strategy}, layers={layers}, "
                f"alpha={alpha}, lr={lr}) ‚Üí F1={f1:.4f}"
            )

            model_results.append({
                "name": f"MLP_{layers}_A{alpha}_LR{lr}_{strategy}",
                "f1": f1,
                "model": mlp,
                "preprocessor": fitted_preprocessor,
                "label_encoders": label_encoders,
            })

    # ============================================================
    # Rank by validation F1 and keep the five best
    # ============================================================
    top5 = sorted(model_results, key=lambda entry: entry["f1"], reverse=True)[:5]

    print("\n========== TOP 5 MODELS ==========")
    for i, m in enumerate(top5, 1):
        print(f"{i}. {m['name']} ‚Üí F1 = {m['f1']:.4f}")

    # ============================================================
    # One submission CSV per top-5 model
    # ============================================================
    X_test = test_df.drop([id_col], axis=1)
    test_ids = test_df[id_col]

    # Submission versions continue at 10 (rank 1 -> v10, ..., rank 5 -> v14).
    for version, entry in enumerate(top5, 10):
        stored_preprocessor = entry["preprocessor"]

        # Re-apply exactly the preprocessing the model was trained with.
        if stored_preprocessor == "none":
            Xtst = X_test.copy()
        elif stored_preprocessor == "labelencode_only":
            Xtst = X_test.copy()
            for col in categorical_cols:
                Xtst[col] = entry["label_encoders"][col].transform(X_test[col])
        else:
            Xtst = stored_preprocessor.transform(X_test)

        preds_labels = le_target.inverse_transform(entry["model"].predict(Xtst))

        filename = f"mlp_submission_v{version}.csv"
        pd.DataFrame({
            "participant_id": test_ids,
            "personality_cluster": preds_labels,
        }).to_csv(filename, index=False)
        print(f"‚úÖ Saved {filename}")

    return top5


# ============================================================
# 3. RUN PIPELINE
# ============================================================

# Load the train/test CSVs from /content.
# NOTE(review): the notebook's first cell reads the same files from
# /kaggle/input/multidimensional-personality-cluster-prediction/ -- confirm
# which location is intended for this section.
train_df = pd.read_csv("/content/train.csv")
test_df  = pd.read_csv("/content/test.csv")

# Run the model search; the return value is the list of the five best results.
top5 = evaluate_models(train_df, test_df)
print("‚úÖ Pipeline completed successfully.")


# 7. Top 5 Best Models - applied Feature Engineering
Tried to improve on the previous section's score by adding engineered features, but the score did not improve.

In [None]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import f1_score

import warnings
warnings.filterwarnings("ignore")


# ============================================================
# 1. PREPROCESSING STRATEGIES
# ============================================================

def get_preprocessor(strategy, categorical_cols, numerical_cols, binary_cols):
    """
    Return an unfitted ColumnTransformer for the given scaling strategy.

    For 'labelencode_only' the string "labelencode_only" is returned, and for
    every strategy other than "standard", "minmax" or "robust" the string
    "none" is returned; the caller handles those cases itself.
    """
    scaled_strategies = ("standard", "minmax", "robust")

    # Non-scaling strategies are reported back as string markers.
    if strategy not in scaled_strategies:
        return "labelencode_only" if strategy == "labelencode_only" else "none"

    # The position in scaled_strategies selects the matching scaler class.
    scaler_classes = (StandardScaler, MinMaxScaler, RobustScaler)
    scaler = scaler_classes[scaled_strategies.index(strategy)]()
    encode = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

    transformers = [
        ("num", scaler, numerical_cols),
        ("cat", encode, categorical_cols),
        ("bin", "passthrough", binary_cols),
    ]
    return ColumnTransformer(transformers=transformers)


# ============================================================
# 2. MAIN FUNCTION ‚Äì SVM + MLP TESTING
# ============================================================

def evaluate_models(train_df, test_df):
    """Grid-search SVM and MLP classifiers over five preprocessing strategies.

    The training frame is split 80/20 (stratified, ``random_state=42``).
    Every strategy is combined with an RBF ``SVC`` grid (C x gamma) and an
    ``MLPClassifier`` grid (layers x alpha x learning rate). Candidates are
    ranked by macro F1 on the validation split, and the five best write
    ``mlp_submission_v15.csv`` to ``mlp_submission_v19.csv``.

    Any numeric feature column that is not in the fixed categorical,
    numerical or binary lists (for example engineered features added before
    the call) is treated as an additional numerical column. Without this the
    ColumnTransformer's default ``remainder='drop'`` silently discarded such
    columns for the "standard", "minmax" and "robust" strategies.

    Parameters
    ----------
    train_df : pandas.DataFrame
        Must contain "participant_id", "personality_cluster" and the feature
        columns listed below; extra numeric columns are allowed.
    test_df : pandas.DataFrame
        Must contain "participant_id" and the same feature columns.

    Returns
    -------
    list of dict
        The five best results, best first, with keys "name", "f1", "model",
        "preprocessor" and "label_encoders".
    """
    # ---------------------------------------------
    # Step A: Split X and y
    # ---------------------------------------------
    X = train_df.drop(["participant_id", "personality_cluster"], axis=1)
    y = train_df["personality_cluster"]

    le_target = LabelEncoder()
    y_encoded = le_target.fit_transform(y)

    # Categorical & numerical groups
    categorical_cols = ["cultural_background"]

    numerical_cols = [
        "age_group", "upbringing_influence", "focus_intensity",
        "consistency_score", "support_environment_score"
    ]

    binary_cols = [
        "identity_code", "external_guidance_usage", "hobby_engagement_level",
        "physical_activity_index", "creative_expression_index", "altruism_score"
    ]

    # Route numeric columns outside the fixed lists through the scaler so they
    # reach the models under every strategy. With only the original columns
    # present this list is empty and behaviour is unchanged.
    known_cols = set(categorical_cols) | set(numerical_cols) | set(binary_cols)
    extra_numeric_cols = [
        col for col in X.select_dtypes(include="number").columns
        if col not in known_cols
    ]
    numerical_cols = numerical_cols + extra_numeric_cols

    # Split train / val
    X_train, X_val, y_train, y_val = train_test_split(
        X, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
    )

    # What we'll try
    preprocessing_strategies = ["standard", "minmax", "robust", "labelencode_only", "none"]

    svm_C_values = [0.1, 1, 10, 50, 100]
    svm_gamma_values = [0.01, 0.05, 0.1, "scale"]

    mlp_sizes = [
        (64,),
        (128,),
        (256,),
        (128, 64),
        (256, 128),
        (256, 128, 64)
    ]

    alphas = [1e-5, 1e-4, 1e-3]
    learning_rates = [0.001, 0.0005]

    model_results = []

    # ============================================================
    # 3. LOOP OVER PREPROCESSORS + SVM + MLP
    # ============================================================

    for strategy in preprocessing_strategies:

        print("\n==============================================")
        print(f" Testing Preprocessing Strategy: {strategy} ")
        print("==============================================")

        label_encoders = None
        preprocessor = get_preprocessor(strategy, categorical_cols, numerical_cols, binary_cols)

        # ----- Preprocessing -----
        if preprocessor == "labelencode_only":
            # Integer-code the categorical columns; keep the encoders so the
            # test frame can be transformed the same way later.
            label_encoders = {}
            X_train_copy = X_train.copy()
            X_val_copy = X_val.copy()

            for col in categorical_cols:
                le = LabelEncoder()
                X_train_copy[col] = le.fit_transform(X_train_copy[col])
                X_val_copy[col] = le.transform(X_val_copy[col])
                label_encoders[col] = le

            Xt, Xv = X_train_copy, X_val_copy
            fitted_preprocessor = "labelencode_only"

        elif preprocessor == "none":
            Xt, Xv = X_train.copy(), X_val.copy()
            fitted_preprocessor = "none"

        else:
            # Fit transformer on train, apply to train/val
            fitted_preprocessor = preprocessor.fit(X_train)
            Xt = fitted_preprocessor.transform(X_train)
            Xv = fitted_preprocessor.transform(X_val)

        # ============================================================
        # SVM MODELS
        # ============================================================

        for C in svm_C_values:
            for gamma in svm_gamma_values:

                svm = SVC(
                    C=C,
                    gamma=gamma,
                    kernel="rbf",
                    class_weight="balanced",
                    probability=True
                )

                svm.fit(Xt, y_train)
                preds = svm.predict(Xv)

                f1 = f1_score(y_val, preds, average="macro")

                print(f"SVM (strategy={strategy}, C={C}, gamma={gamma}) ‚Üí F1={f1:.4f}")

                model_results.append({
                    "name": f"SVM_C{C}_G{gamma}_{strategy}",
                    "f1": f1,
                    "model": svm,
                    "preprocessor": fitted_preprocessor,
                    "label_encoders": label_encoders
                })

        # ============================================================
        # MLP MODELS
        # ============================================================

        for layers in mlp_sizes:
            for alpha in alphas:
                for lr in learning_rates:

                    mlp = MLPClassifier(
                        hidden_layer_sizes=layers,
                        max_iter=1500,
                        activation="relu",
                        solver="adam",
                        learning_rate_init=lr,
                        alpha=alpha,
                        early_stopping=True,
                        n_iter_no_change=20,
                        random_state=42
                    )

                    mlp.fit(Xt, y_train)
                    preds = mlp.predict(Xv)

                    f1 = f1_score(y_val, preds, average="macro")

                    print(
                        f"MLP (strategy={strategy}, layers={layers}, "
                        f"alpha={alpha}, lr={lr}) ‚Üí F1={f1:.4f}"
                    )

                    model_results.append({
                        "name": f"MLP_{layers}_A{alpha}_LR{lr}_{strategy}",
                        "f1": f1,
                        "model": mlp,
                        "preprocessor": fitted_preprocessor,
                        "label_encoders": label_encoders
                    })

    # ============================================================
    # 4. SELECT TOP 5 MODELS
    # ============================================================

    model_results = sorted(model_results, key=lambda x: x["f1"], reverse=True)
    top5 = model_results[:5]

    print("\n========== TOP 5 MODELS ==========")
    for i, m in enumerate(top5, 1):
        print(f"{i}. {m['name']} ‚Üí F1 = {m['f1']:.4f}")

    # ============================================================
    # 5. GENERATE CSV FOR TOP 5 MODELS
    # ============================================================

    X_test = test_df.drop(["participant_id"], axis=1)
    test_ids = test_df["participant_id"]

    for rank, m in enumerate(top5, 1):

        model = m["model"]
        fitted_preprocessor = m["preprocessor"]
        label_encs = m["label_encoders"]

        # Apply same preprocessing to test
        if fitted_preprocessor == "none":
            Xtst = X_test.copy()

        elif fitted_preprocessor == "labelencode_only":
            Xtst = X_test.copy()
            for col in categorical_cols:
                Xtst[col] = label_encs[col].transform(X_test[col])

        else:
            Xtst = fitted_preprocessor.transform(X_test)

        preds = model.predict(Xtst)
        preds_labels = le_target.inverse_transform(preds)

        # rank 1..5 maps to submission versions 15..19.
        filename = f"mlp_submission_v{rank+14}.csv"
        output = pd.DataFrame({
            "participant_id": test_ids,
            "personality_cluster": preds_labels
        })

        output.to_csv(filename, index=False)
        print(f"‚úÖ Saved {filename}")

    return top5


# ============================================================
# 3. RUN PIPELINE
# ============================================================

# Load the train/test CSVs from /content.
# NOTE(review): the notebook's first cell reads the same files from
# /kaggle/input/multidimensional-personality-cluster-prediction/ -- confirm
# which location is intended for this section.
train_df = pd.read_csv("/content/train.csv")
test_df  = pd.read_csv("/content/test.csv")
def add_safe_fe(df):
    """Return a new frame with six engineered features added to ``df``.

    The input frame is not modified; previously the columns were written into
    the caller's DataFrame in place as well as being returned.

    Added columns:
      * focus_consistency   -- focus_intensity * consistency_score
      * support_guidance    -- mean of support_environment_score and
                               external_guidance_usage
      * creative_hobby_mean -- mean of creative_expression_index and
                               hobby_engagement_level
      * activity_strength   -- mean of physical_activity_index,
                               hobby_engagement_level and
                               creative_expression_index
      * stability_mean      -- mean of consistency_score and
                               support_environment_score
      * guidance_ratio      -- external_guidance_usage /
                               (1 + support_environment_score)

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain every source column named above.

    Returns
    -------
    pandas.DataFrame
        Copy of ``df`` with the new columns appended in the order listed
        (existing columns of the same name are overwritten).
    """
    # small interaction features (scaled naturally)
    return df.assign(
        focus_consistency=df["focus_intensity"] * df["consistency_score"],
        # averaged, NOT multiplied
        support_guidance=(
            df["support_environment_score"] + df["external_guidance_usage"]
        ) / 2,
        creative_hobby_mean=(
            df["creative_expression_index"] + df["hobby_engagement_level"]
        ) / 2,
        activity_strength=(
            df["physical_activity_index"]
            + df["hobby_engagement_level"]
            + df["creative_expression_index"]
        ) / 3,
        stability_mean=(
            df["consistency_score"] + df["support_environment_score"]
        ) / 2,
        guidance_ratio=df["external_guidance_usage"] / (
            1 + df["support_environment_score"]
        ),
    )

# Add the engineered feature columns to both frames.
train_df=add_safe_fe(train_df)
test_df=add_safe_fe(test_df)

# Re-run the model search on the extended frames; the return value is the
# list of the five best results.
top5 = evaluate_models(train_df, test_df)
print("‚úÖ Pipeline completed successfully.")

# 8. KNN

In [None]:
# Separate features, target and ids for the KNN model.
# NOTE(review): near the top of the notebook train_ds is created by dropping
# participant_id and personality_cluster from train_ds_original. These lookups
# only work if train_ds was rebound to a frame that still has both columns,
# and test_ds must be defined by an earlier cell -- confirm both.
X_train = train_ds.drop(["participant_id", "personality_cluster"], axis=1)
y_train = train_ds["personality_cluster"]

X_test = test_ds.drop(["participant_id"], axis=1)
test_ids = test_ds["participant_id"]


In [None]:
# ---------------------------
# 3. Scaling
# ---------------------------
# Statistics are learned from the training features only and reused for test.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ---------------------------
# 4. KNN Model (k = 5)
# ---------------------------
knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_scaled, y_train)

# ---------------------------
# 5. Predict on Test Dataset
# ---------------------------
preds = knn.predict(X_test_scaled)

# ---------------------------
# 6. Save Submission CSV
# ---------------------------
submission = pd.DataFrame(
    {"participant_id": test_ids, "personality_cluster": preds}
)
submission.to_csv("knn_submission_21_1923.csv", index=False)
print("File saved as knn_submission_21_1923.csv")

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Stratified 80/20 hold-out of the training rows for a quick sanity check.
X_train_split, X_val_split, y_train_split, y_val_split = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

# Fit the scaler on the training part only, then reuse it for validation.
# Note: this rebinds `scaler` and `knn` from the previous cell.
scaler = StandardScaler().fit(X_train_split)
X_train_split_scaled = scaler.transform(X_train_split)
X_val_split_scaled = scaler.transform(X_val_split)

knn = KNeighborsClassifier(n_neighbors=5).fit(X_train_split_scaled, y_train_split)

val_pred = knn.predict(X_val_split_scaled)
print("Validation Accuracy:", accuracy_score(y_val_split, val_pred))
