In [6]:
import os, sys
import lightgbm as lgb
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Put the parent of the current working directory on the import path so the
# project-local `src` package imported just below can be resolved.
# NOTE(review): assumes the kernel was started inside the notebooks folder.
notebooks_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(notebooks_dir, os.pardir))
root_is_importable = project_root in sys.path
if not root_is_importable:
    # Prepend (rather than append) so this checkout wins over any other `src`.
    sys.path[:0] = [project_root]

from src.data      import load_data
from src.utils     import set_seed
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
# Seed through the project helper before any stochastic work runs.
# NOTE(review): which RNGs set_seed covers is decided in src.utils — not visible here.
SEED = 42
set_seed(SEED)

# Apply seaborn's "whitegrid" style to the plots drawn later in the notebook.
sns.set(style="whitegrid")


In [7]:
# Load the pre-scaled training table and encode the label column as 0/1.
LABEL_ENCODING = {'Extrovert': 0, 'Introvert': 1}

train = load_data('../data/processed/train_scaled.csv')
encoded_target = train['Personality'].map(LABEL_ENCODING)

# Series.map yields NaN for every value that is not a key of the dict (typo,
# different casing, missing label). Fail loudly here instead of letting NaN
# labels reach the CV splitter and the booster.
unmapped = encoded_target.isna()
if unmapped.any():
    bad_values = train.loc[unmapped, 'Personality'].unique().tolist()
    raise ValueError(
        f"Unmapped 'Personality' values (expected {list(LABEL_ENCODING)}): {bad_values}"
    )

# With no NaN left the column is integral; pin the dtype explicitly.
train['target'] = encoded_target.astype('int64')

# Features: everything except the row identifier and both label columns.
x = train.drop(columns=['id', 'Personality', 'target'])
y = train['target']

print("X shape:", x.shape)
print("y distribution:\n", y.value_counts(normalize=True))


X shape: (18524, 7)
y distribution:
 target
0    0.739527
1    0.260473
Name: proportion, dtype: float64


In [8]:
# LightGBM training parameters shared by every CV fold:
# binary objective evaluated with AUC, fixed seed, library logging silenced.
params = dict(
    objective='binary',
    metric='auc',
    learning_rate=0.05,
    num_leaves=31,
    verbose=-1,
    seed=42,
)


In [9]:
# Stratified 5-fold cross-validation.
# `models` collects one booster per fold; `oof_preds` holds, for every row of
# `x`, the prediction made by the booster whose training split excluded it.
models = []
oof_preds = np.zeros(len(x))

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_rows, valid_rows) in enumerate(skf.split(x, y)):
    X_tr = x.iloc[train_rows]
    y_tr = y.iloc[train_rows]
    X_val = x.iloc[valid_rows]
    y_val = y.iloc[valid_rows]

    # Datasets and callbacks are built per fold, so no callback object is
    # shared between training runs.
    model = lgb.train(
        params,
        lgb.Dataset(X_tr, label=y_tr),
        num_boost_round=1000,
        valid_sets=[lgb.Dataset(X_val, label=y_val)],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=100),
        ],
    )

    # NOTE(review): the fold scored here is the same one that drives early
    # stopping, so the per-fold AUC is likely somewhat optimistic.
    fold_pred = model.predict(X_val)
    oof_preds[valid_rows] = fold_pred
    fold_auc = roc_auc_score(y_val, fold_pred)
    print(f'Fold {fold} AUC: {fold_auc:.4f}')

    models.append(model)

# Single AUC over the pooled out-of-fold predictions of all folds.
oof_auc = roc_auc_score(y, oof_preds)
print(f'OOF ROC AUC: {oof_auc:.4f}')


Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[42]	valid_0's auc: 0.972125
Fold 0 AUC: 0.9721
Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.96719
Early stopping, best iteration is:
[69]	valid_0's auc: 0.968827
Fold 1 AUC: 0.9688
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[19]	valid_0's auc: 0.965517
Fold 2 AUC: 0.9655
Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.9693
Early stopping, best iteration is:
[56]	valid_0's auc: 0.969557
Fold 3 AUC: 0.9696
Training until validation scores don't improve for 50 rounds
[100]	valid_0's auc: 0.973508
[200]	valid_0's auc: 0.972946
Early stopping, best iteration is:
[157]	valid_0's auc: 0.97376
Fold 4 AUC: 0.9738
OOF ROC AUC: 0.9662


In [10]:
# Persist the CV artifacts: the out-of-fold prediction vector as .npy and one
# LightGBM text model file per fold, all under ../models (created if absent).
os.makedirs('../models', exist_ok=True)

for fold_no, booster in enumerate(models):
    booster.save_model(f'../models/lgbm_fold{fold_no}.txt')

np.save('../models/oof_preds.npy', oof_preds)

print("Modelos y OOF guardados en ../models/")

Modelos y OOF guardados en ../models/
