# IMPORTING LIB

In [1]:
pip install -r requirements.txt

Collecting xgboost (from -r requirements.txt (line 6))
  Downloading xgboost-3.0.3-py3-none-manylinux_2_28_x86_64.whl.metadata (2.1 kB)
Collecting lightgbm (from -r requirements.txt (line 7))
  Downloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl.metadata (17 kB)
Collecting nvidia-nccl-cu12 (from xgboost->-r requirements.txt (line 6))
  Downloading nvidia_nccl_cu12-2.27.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.0 kB)
Downloading xgboost-3.0.3-py3-none-manylinux_2_28_x86_64.whl (253.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.8/253.8 MB[0m [31m4.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading lightgbm-4.6.0-py3-none-manylinux_2_28_x86_64.whl (3.6 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m97.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading nvidia_nccl_cu12-2.27.6-py3-none-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (322.5 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [2]:
# Standard library
import pickle

# Data manipulation
import pandas as pd

# Scikit-learn modules
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import RandomizedSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Boosting libraries
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# IMPORTING DATASET

### Load Dataset

In [3]:
# Load Data
train_df = pd.read_csv('train_processed.csv')
val_df = pd.read_csv('val_processed.csv')
test_df = pd.read_csv('test_processed.csv')

In [4]:
# Load Encoders
with open('preprocessing_pipeline.pkl', 'rb') as f:
    pipeline_objects = pickle.load(f)
label_encoders = pipeline_objects['label_encoders']
le_y = label_encoders['Personality']

In [5]:
# Split train/val
X_train = train_df.drop(columns='Personality')
y_train = le_y.transform(train_df['Personality'])

X_val = val_df.drop(columns='Personality')
y_val = le_y.transform(val_df['Personality'])

X_test = test_df.drop(columns='id')
test_ids = test_df['id']

## Modeling

### Models

In [6]:
# Define candidate models
models = {
    'RandomForest': RandomForestClassifier(),
    'GradientBoosting': GradientBoostingClassifier(),
    'LogisticRegression': LogisticRegression(max_iter=1000),
    'SVC': SVC(),
    'KNN': KNeighborsClassifier(),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': LGBMClassifier()
}

### Ensemble

In [7]:
# Add stacking ensemble
models['Stacking'] = StackingClassifier(
    estimators=[
        ('xgb', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
        ('lgbm', LGBMClassifier())
    ],
    final_estimator=LogisticRegression(),
    cv=5,
    n_jobs=-1
)

In [8]:
# Train and evaluate all models
accuracies = {}
for name, model in models.items():
    model.fit(X_train, y_train)
    preds = model.predict(X_val)
    acc = accuracy_score(y_val, preds)
    accuracies[name] = acc
    print(f"{name} Accuracy: {acc:.4f}")

RandomForest Accuracy: 0.9698
GradientBoosting Accuracy: 0.9719
LogisticRegression Accuracy: 0.9717
SVC Accuracy: 0.9719
KNN Accuracy: 0.9714
XGBoost Accuracy: 0.9717
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000789 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 244
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


LightGBM Accuracy: 0.9719
Stacking Accuracy: 0.9722


In [9]:
# Select best model
best_model_name = max(accuracies, key=accuracies.get)
best_model = models[best_model_name]
print(f"\n✅ Best Model (Before Tuning): {best_model_name}")


✅ Best Model (Before Tuning): Stacking


In [10]:
# Define tuning grid
param_grid = {}
if best_model_name == 'RandomForest':
    param_grid = {
        'n_estimators': [100, 200],
        'max_depth': [None, 10, 20]
    }
elif best_model_name == 'GradientBoosting':
    param_grid = {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1]
    }
elif best_model_name == 'LogisticRegression':
    param_grid = {'C': [0.1, 1.0, 10.0]}
elif best_model_name == 'SVC':
    param_grid = {'C': [0.1, 1.0], 'kernel': ['linear', 'rbf']}
elif best_model_name == 'KNN':
    param_grid = {'n_neighbors': [3, 5, 7]}
elif best_model_name == 'XGBoost':
    param_grid = {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'max_depth': [3, 5, 7]
    }
elif best_model_name == 'LightGBM':
    param_grid = {
        'n_estimators': [100, 200],
        'learning_rate': [0.05, 0.1],
        'num_leaves': [15, 31]
    }

In [11]:
# Tune with RandomizedSearch
search = RandomizedSearchCV(
    best_model,
    param_distributions=param_grid,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    n_jobs=-1,
    random_state=42
)
search.fit(X_train, y_train)
final_tuned_model = search.best_estimator_



In [12]:
# Evaluate tuned model on val set
val_preds = final_tuned_model.predict(X_val)
val_acc = accuracy_score(y_val, val_preds)
print(f"\n✅ Tuned {best_model_name} Validation Accuracy: {val_acc:.4f}")
print(f"📌 Best Hyperparameters: {search.best_params_}")


✅ Tuned Stacking Validation Accuracy: 0.9722
📌 Best Hyperparameters: {}


In [13]:
# Retrain on full (train + val)
full_df = pd.concat([train_df, val_df], ignore_index=True)
X_full = full_df.drop(columns='Personality')
y_full = le_y.transform(full_df['Personality'])

final_tuned_model.fit(X_full, y_full)
print("🎯 Final model retrained on full dataset.")

🎯 Final model retrained on full dataset.


In [None]:
# Save full-data model
with open('../model/modelling/final_full_model.pkl', 'wb') as f:
    pickle.dump(final_tuned_model, f)
print("✅ Final model saved as 'final_full_model.pkl'.")

✅ Final model saved as 'final_full_model.pkl'.


In [15]:
# Predict and decode test set
full_test_preds = final_tuned_model.predict(X_test)
decoded_preds = le_y.inverse_transform(full_test_preds)

In [None]:
# Save new submission file
submission_full = pd.DataFrame({
    'id': test_ids,
    'Personality': decoded_preds
})
submission_full.to_csv('../data/submission/submission_full.csv', index=False)
print("📄 New submission saved as 'submission_full.csv'.")

📄 New submission saved as 'submission_full.csv'.
