In [2]:
import pickle
from pathlib import Path

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold

# Load the preprocessed artefacts (presumably written by an earlier
# preprocessing step). The pickle holds a 4-tuple: the labelled feature
# matrix, its labels, the feature matrix to predict on, and a `test` frame
# that still carries the PassengerId column used for the submission.
# NOTE: pickle.load can execute arbitrary code from the file, so only load
# pickles produced by this project.
with open('models/preprocessed.pkl', 'rb') as f:
    bundle = pickle.load(f)
x, y, X_test, test = bundle

# Print the shapes as a quick sanity check on what was loaded.
print(
    f"x shape: {x.shape}, y shape: {y.shape}, "
    f"X_test shape: {X_test.shape}, test shape: {test.shape}"
)

# Hold out 20% of the labelled rows for validation. The split is stratified
# on the target and seeded so it is reproducible.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(x, y, **split_options)

# Hyper-parameter search for a random forest on the training split only;
# the validation split is not passed to the search.
# The fold splitter and the base estimator are both seeded for reproducibility.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf = RandomForestClassifier(random_state=42)

# 2 tree counts x 3 depths = 6 candidate configurations.
param_grid = dict(n_estimators=[100, 200], max_depth=[4, 6, 8])

# Each candidate is scored by accuracy over the 5 stratified folds;
# n_jobs=-1 lets scikit-learn use every available core.
grid_rf = GridSearchCV(rf, param_grid, scoring='accuracy', cv=cv, n_jobs=-1)
grid_rf.fit(X_train, y_train)

print('Mejores parámetros RF:', grid_rf.best_params_)
print('Mejor score CV:   ', grid_rf.best_score_)

# Score the winning configuration on the held-out validation split.
# best_estimator_ is the model GridSearchCV refitted on X_train (its
# default refit behaviour), so it can predict straight away.
best_rf = grid_rf.best_estimator_
y_pred = best_rf.predict(X_val)
val_accuracy = accuracy_score(y_val, y_pred)
print('Accuracy en validación:', val_accuracy)

# Retrain the same estimator on every labelled row, then predict the test
# set. fit() returns the fitted estimator, which allows the chained call.
# NOTE(review): this refits grid_rf.best_estimator_ in place. From here on
# the model has seen X_val, so the accuracy printed above does not describe
# it, and re-scoring it on X_val would be optimistic.
preds = best_rf.fit(x, y).predict(X_test)


# Build the submission table: the PassengerId column of `test` paired with
# the predicted label for each row.
submission = pd.DataFrame({
    'PassengerId': test['PassengerId'],
    'Survived':    preds
})

# One path object drives both the write and the confirmation message, so
# the message always reports the location that was actually written.
submission_path = Path('../submission/titanic_submission.csv')

# to_csv does not create missing directories; make sure the target folder
# exists so the run does not fail at the last step after training.
submission_path.parent.mkdir(parents=True, exist_ok=True)

submission.to_csv(submission_path, index=False)
print(f"Guardado {submission_path}")

x shape: (891, 26), y shape: (891,), X_test shape: (418, 26), test shape: (418, 11)
Mejores parámetros RF: {'max_depth': 4, 'n_estimators': 200}
Mejor score CV:    0.8329065300896288
Accuracy en validación: 0.7932960893854749
Guardado submission/titanic_submission.csv
