# Predicting Introverts vs Extroverts 🎭 S5E7
--------------------------------------------------------------------------------------------------------------------------------------------------
## Submitted by Dur e yashfeen 

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/playground-series-s5e7/sample_submission.csv
/kaggle/input/playground-series-s5e7/train.csv
/kaggle/input/playground-series-s5e7/test.csv


## 🧠 Part 1: Understanding the Dataset
### 🔍 Initial Steps

## 📌 Step 1: Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier, StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

import lightgbm as lgb
import xgboost as xgb
import catboost as cb
import optuna
import warnings
warnings.filterwarnings('ignore')

## 📌 Step 2: Load Dataset

In [3]:
train = pd.read_csv('/kaggle/input/playground-series-s5e7/train.csv')
test = pd.read_csv('/kaggle/input/playground-series-s5e7/test.csv')
submission = pd.read_csv('/kaggle/input/playground-series-s5e7/sample_submission.csv')

In [4]:
train.head(10)

Unnamed: 0,id,Time_spent_Alone,Stage_fear,Social_event_attendance,Going_outside,Drained_after_socializing,Friends_circle_size,Post_frequency,Personality
0,0,0.0,No,6.0,4.0,No,15.0,5.0,Extrovert
1,1,1.0,No,7.0,3.0,No,10.0,8.0,Extrovert
2,2,6.0,Yes,1.0,0.0,,3.0,0.0,Introvert
3,3,3.0,No,7.0,3.0,No,11.0,5.0,Extrovert
4,4,1.0,No,4.0,4.0,No,13.0,,Extrovert
5,5,2.0,No,8.0,5.0,No,,3.0,Extrovert
6,6,1.0,No,8.0,,No,,4.0,Extrovert
7,7,2.0,No,8.0,3.0,No,4.0,5.0,Extrovert
8,8,4.0,Yes,2.0,1.0,,0.0,2.0,Introvert
9,9,1.0,No,8.0,6.0,No,14.0,9.0,Extrovert


In [5]:
# 🔍 EDA
print("Shape of training data:", train.shape)
print("Missing values:\n", train.isnull().sum())

Shape of training data: (18524, 9)
Missing values:
 id                              0
Time_spent_Alone             1190
Stage_fear                   1893
Social_event_attendance      1180
Going_outside                1466
Drained_after_socializing    1149
Friends_circle_size          1054
Post_frequency               1264
Personality                     0
dtype: int64


## 📌 Step 3: Preprocessing

In [6]:
# Separate target and ID
X = train.drop(columns=['id', 'Personality'])
y = train['Personality']
X_test = test.drop(columns=['id'])

# Identify columns
cat_cols = ['Stage_fear', 'Drained_after_socializing']
num_cols = [col for col in X.columns if col not in cat_cols]

# Imputation
for col in cat_cols:
    X[col] = X[col].fillna(X[col].mode()[0])
    X_test[col] = X_test[col].fillna(X[col].mode()[0])

for col in num_cols:
    X[col] = X[col].fillna(X[col].median())
    X_test[col] = X_test[col].fillna(X[col].median())

# Encode categoricals
le = LabelEncoder()
for col in cat_cols:
    X[col] = le.fit_transform(X[col])
    X_test[col] = le.transform(X_test[col])

# Encode target (Introvert/Extrovert → 0/1)
y = le.fit_transform(y)

## 📌 Step 4: Hyperparameter Tuning for LightGBM using Optuna

In [7]:
def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'accuracy',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'feature_fraction': trial.suggest_float('feature_fraction', 0.6, 1.0),
        'num_leaves': trial.suggest_int('num_leaves', 20, 100),
        'max_depth': trial.suggest_int('max_depth', 3, 12),
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.2),
        'n_estimators': trial.suggest_int('n_estimators', 100, 500)
    }
    model = lgb.LGBMClassifier(**params)
    return cross_val_score(model, X, y, cv=3, scoring='accuracy').mean()

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)
best_params_lgb = study.best_params

[I 2025-07-22 17:41:56,197] A new study created in memory with name: no-name-a4ba7584-348a-4962-a8bb-837bf31be939
[I 2025-07-22 17:41:58,003] Trial 0 finished with value: 0.9648024998830586 and parameters: {'feature_fraction': 0.956686252298093, 'num_leaves': 71, 'max_depth': 11, 'learning_rate': 0.08122432350121417, 'n_estimators': 252}. Best is trial 0 with value: 0.9648024998830586.
[I 2025-07-22 17:42:01,882] Trial 1 finished with value: 0.9637768588574174 and parameters: {'feature_fraction': 0.7159045604451, 'num_leaves': 93, 'max_depth': 11, 'learning_rate': 0.055825407460164546, 'n_estimators': 411}. Best is trial 0 with value: 0.9648024998830586.
[I 2025-07-22 17:42:03,566] Trial 2 finished with value: 0.9636148893076927 and parameters: {'feature_fraction': 0.6347849104570761, 'num_leaves': 77, 'max_depth': 9, 'learning_rate': 0.15792792214731427, 'n_estimators': 214}. Best is trial 0 with value: 0.9648024998830586.
[I 2025-07-22 17:42:04,351] Trial 3 finished with value: 0.966

## 📌 Step 5: Define Base Models

In [8]:
model_lgb = lgb.LGBMClassifier(**best_params_lgb)
model_xgb = xgb.XGBClassifier(eval_metric='logloss', use_label_encoder=False)
model_cb = cb.CatBoostClassifier(verbose=0)
model_rf = RandomForestClassifier(n_estimators=150, random_state=42)
model_et = ExtraTreesClassifier(n_estimators=150, random_state=42)

## 📌 Step 6: Build Stacking Ensemble Model

In [9]:
stack_model = StackingClassifier(
    estimators=[
        ('lgb', model_lgb),
        ('xgb', model_xgb),
        ('cb', model_cb),
        ('rf', model_rf),
        ('et', model_et),
    ],
    final_estimator=LogisticRegression(),
    cv=5,
    n_jobs=-1
)

stack_model.fit(X, y)

[LightGBM] [Info] Number of positive: 4825, number of negative: 13699
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000997 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 62
[LightGBM] [Info] Number of data points in the train set: 18524, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260473 -> initscore=-1.043512
[LightGBM] [Info] Start training from score -1.043512
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009011 seconds.
You can set `force_row_wise=true` to remove the overhead.
And 

## 📌 Step 7: Cross-Validation Score

In [10]:
cv_scores = cross_val_score(stack_model, X, y, cv=5, scoring='accuracy')
print("✅ Stacked Model CV Accuracy:", round(cv_scores.mean(), 4))

[LightGBM] [Info] Number of positive: 3860, number of negative: 10959
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000868 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 62
[LightGBM] [Info] Number of data points in the train set: 14819, number of used features: 7
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.260476 -> initscore=-1.043494
[LightGBM] [Info] Start training from score -1.043494
[LightGBM] [Info] Number of positive: 3088, number of negative: 8767
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003924 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Number of positive: 3088, number of negative: 8767
[LightGBM] [Info] Total Bins 62
[LightGBM] [Info] Number of data points in the tra

## 📌 Step 8: Make Predictions and Save Submission

In [11]:
preds = stack_model.predict(X_test)
submission['Personality'] = le.inverse_transform(preds)
submission.to_csv('submission.csv', index=False)