## 1. Import Libraries and Set Seed
Import required libraries and set the random seed for reproducibility.

In [1]:
import pandas as pd
import numpy as np
import random
import os
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, confusion_matrix, classification_report

# Seed every random number generator this notebook relies on.
def seed_everything(seed=42):
    """Seed Python's and NumPy's global RNGs and export PYTHONHASHSEED.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``; its
            string form is stored in the ``PYTHONHASHSEED`` environment
            variable.

    Note:
        Python reads PYTHONHASHSEED at interpreter start-up, so the assignment
        made here only influences processes launched after this call.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seed_fn in (random.seed, np.random.seed):
        seed_fn(seed)


SEED = 42
seed_everything(SEED)

## 2. Load Data and Feature Engineering
Load the train and test CSV files and add engineered features (an activity total, interaction and ratio terms, a squared term, and a threshold flag) to both.

In [2]:
def create_advanced_features(df):
    """Return a copy of ``df`` extended with seven engineered columns.

    The input frame is left untouched. New columns are appended in this order:
    ``total_activity``, ``support_guidance_combo``, ``focus_efficiency``,
    ``consistency_gap``, ``focus_sq``, ``focus_X_consistency`` and
    ``low_focus_high_consist``.

    Args:
        df: DataFrame containing the four activity columns plus
            ``support_environment_score``, ``external_guidance_usage``,
            ``focus_intensity`` and ``consistency_score``.

    Returns:
        A new DataFrame with the original columns followed by the derived ones.
    """
    out = df.copy()
    focus = out['focus_intensity']
    consistency = out['consistency_score']
    activity_cols = [
        'hobby_engagement_level',
        'physical_activity_index',
        'creative_expression_index',
        'altruism_score',
    ]

    # Every derived column depends only on raw inputs, so all of them can be
    # computed up front and attached afterwards in a fixed order.
    derived = {
        'total_activity': out[activity_cols].sum(axis=1),
        # +1 keeps the support score from being zeroed when guidance usage is 0.
        'support_guidance_combo': out['support_environment_score'] * (out['external_guidance_usage'] + 1),
        # +1 in the denominator avoids dividing by a zero consistency score.
        'focus_efficiency': focus / (consistency + 1),
        # NOTE(review): 30 is presumably the maximum consistency score - confirm.
        'consistency_gap': 30 - consistency,
        'focus_sq': focus ** 2,
        'focus_X_consistency': focus * consistency,
        # 0/1 flag for rows with focus below 5 and consistency above 24.
        'low_focus_high_consist': ((focus < 5) & (consistency > 24)).astype(int),
    }
    for column_name, values in derived.items():
        out[column_name] = values
    return out

# Read each split and add the engineered columns to it.
train_df = create_advanced_features(pd.read_csv('../dataset/train.csv'))
test_df = create_advanced_features(pd.read_csv('../dataset/test.csv'))

# Separate the target and the row identifiers from the model inputs.
y = train_df['personality_cluster']
X = train_df.drop(columns=['participant_id', 'personality_cluster'])
test_ids = test_df['participant_id']
X_test = test_df.drop(columns=['participant_id'])

## 3. Preprocess Data (Encoding & Scaling)
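Label-encode the categorical features (each encoder is fit on train and test combined so both share one mapping) and standardize the remaining numeric features using statistics from the full training set, computed before the validation split.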

In [3]:
# Columns handled as categories: each is cast to string and label-encoded.
cat_cols = [
    'identity_code',
    'cultural_background',
    'age_group',
    'upbringing_influence',
    'support_environment_score',
    'hobby_engagement_level',
    'physical_activity_index',
    'creative_expression_index',
    'altruism_score',
    'low_focus_high_consist',
]

# One encoder per column, fit on train and test values together so that both
# frames are mapped through the same set of classes.
for col in cat_cols:
    full_data = pd.concat([X[col], X_test[col]], axis=0).astype(str)
    le = LabelEncoder().fit(full_data)
    for frame in (X, X_test):
        frame[col] = le.transform(frame[col].astype(str))

# Everything not listed above is numeric; column order follows X.
num_cols = X.columns[~X.columns.isin(cat_cols)].tolist()

# Standardize numeric columns with statistics learned from the training frame
# only, then apply the same transform to the test frame.
scaler = StandardScaler().fit(X[num_cols])
X[num_cols] = scaler.transform(X[num_cols])
X_test[num_cols] = scaler.transform(X_test[num_cols])

## 4. Encode Target Labels
Encode target labels for training.

In [4]:
# Convert the cluster labels to integer codes; the fitted encoder is kept so
# predictions can be mapped back to the original label names.
target_le = LabelEncoder().fit(y)
y_encoded = target_le.transform(y)
num_classes = len(target_le.classes_)

## 5. Train/Validation Split
Split the data for training and validation.

In [5]:
# Hold out 20% of the rows for validation, stratified on the encoded target so
# both parts keep the same class proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y_encoded,
    test_size=0.2,
    stratify=y_encoded,
    random_state=SEED,
)

## 6. Train Logistic Regression Model (Softmax)
Train multinomial logistic regression using the processed features.

In [6]:
# `multi_class` is deprecated since scikit-learn 1.5. With the lbfgs solver and
# more than two classes the default already fits a multinomial (softmax) model,
# so the argument is omitted: same model, no FutureWarning, and no breakage once
# the parameter is removed.
logreg = LogisticRegression(solver='lbfgs', max_iter=500, random_state=SEED)
logreg.fit(X_train, y_train)



0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,42
,solver,'lbfgs'
,max_iter,500


## 7. Evaluate Model
Evaluate the model using F1 score, confusion matrix, and classification report.

In [7]:
# Score the held-out split; macro averaging gives every class equal weight.
val_preds = logreg.predict(X_val)
f1 = f1_score(y_val, val_preds, average='macro')
print(f'Validation Macro F1: {f1:.4f}')

# Each heading is printed on its own line, followed by its table.
print('Confusion Matrix:', confusion_matrix(y_val, val_preds), sep='\n')
print(
    'Classification Report:',
    classification_report(y_val, val_preds, target_names=target_le.classes_),
    sep='\n',
)

Validation Macro F1: 0.5410
Confusion Matrix:
[[  4   8   1   1   3]
 [  1  14  21   2   6]
 [  2  10  33  14   2]
 [  0   1   8  38  19]
 [  0   0   2   9 184]]
Classification Report:
              precision    recall  f1-score   support

   Cluster_A       0.57      0.24      0.33        17
   Cluster_B       0.42      0.32      0.36        44
   Cluster_C       0.51      0.54      0.52        61
   Cluster_D       0.59      0.58      0.58        66
   Cluster_E       0.86      0.94      0.90       195

    accuracy                           0.71       383
   macro avg       0.59      0.52      0.54       383
weighted avg       0.70      0.71      0.70       383



## 8. Predict on Test Set & Save Submission
Predict test set labels and save the submission file.

In [8]:
# Class probabilities for every test row; columns follow logreg.classes_.
test_probs = logreg.predict_proba(X_test)

# Translate the winning column index through logreg.classes_ instead of assuming
# column i is encoded label i. The two coincide only when every encoded class
# (0..n-1) was present in the training split.
test_preds = logreg.classes_[np.argmax(test_probs, axis=1)]

# Decode integer codes back to the original cluster names.
test_labels = target_le.inverse_transform(test_preds)

submission_df = pd.DataFrame({
    'participant_id': test_ids,
    'personality_cluster': test_labels
})
submission_df.to_csv('logreg_submission.csv', index=False)
print("✅ Saved 'logreg_submission.csv'")

✅ Saved 'logreg_submission.csv'
