### data

In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb
import lightgbm as lgb
import warnings

warnings.filterwarnings("ignore")
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from catboost import CatBoostClassifier

In [2]:
# Default number of stratified CV folds used by get_oof_predictions.
N_FOLDS = 3
# Seed passed to the fold splitter and to every model in get_models.
RANDOM_STATE = 42
# Label column read from the training frame.
TARGET_COL = "Survived"
# Row-identifier column: dropped from the features, written to submissions.
ID_COL = "PassengerId"

In [3]:
def load_data(data_dir="/kaggle/input/competitions/titanic"):
    """Read the three competition CSV files from ``data_dir``.

    Parameters
    ----------
    data_dir : str or os.PathLike, default "/kaggle/input/competitions/titanic"
        Directory that contains ``train.csv``, ``test.csv`` and
        ``gender_submission.csv``. The default is the path the notebook
        originally hard-coded, so ``load_data()`` behaves as before.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, test, submission)``, one frame per file, in that order.

    Raises
    ------
    FileNotFoundError
        Propagated from ``pandas.read_csv`` when one of the files is missing.
    """
    # Function-scope import: keeps this cell runnable on its own.
    from pathlib import Path

    base = Path(data_dir)
    train = pd.read_csv(base / "train.csv")
    test = pd.read_csv(base / "test.csv")
    submission = pd.read_csv(base / "gender_submission.csv")

    return train, test, submission

In [4]:
def create_features(df):
    """Feature-engineering hook; currently a no-op.

    Returns the very same ``df`` object it was given (no copy is made), so
    any later in-place change to the result also changes the argument.
    """
    return df

In [5]:
def preprocess_data(train, test):
    """Turn the raw train/test frames into scaled, fully numeric matrices.

    Steps, in order: extract (and, for object dtype, label-encode) the
    target; drop the target, id, ``Ticket`` and ``Name`` columns; keep only
    the columns present in both frames; run ``create_features``; impute
    numeric columns with the median and object columns with the most
    frequent value; one-hot encode the object columns; keep the dummy
    columns both frames share; standardise every column.

    Parameters
    ----------
    train : pandas.DataFrame
        Training frame; must contain ``TARGET_COL``.
    test : pandas.DataFrame
        Test frame.

    Returns
    -------
    X_scaled : pandas.DataFrame
        Standardised training features, indexed like ``train``.
    X_test_scaled : pandas.DataFrame
        Test features transformed with the scaler fitted on the training data.
    y : pandas.Series or numpy.ndarray
        The target; an ndarray when it had to be label-encoded.

    Notes
    -----
    The imputers and the scaler are fitted on the whole training frame, so
    any cross-validation done afterwards sees statistics computed from its
    own validation rows.
    """

    y = train[TARGET_COL].copy()

    if y.dtype == "object":
        le = LabelEncoder()
        y = le.fit_transform(y)
        print(f"  Target encoding: {dict(zip(le.classes_, le.transform(le.classes_)))}")

    cols_to_drop = [TARGET_COL, ID_COL, "Ticket", "Name"]

    X = train.drop(columns=[c for c in cols_to_drop if c in train.columns])
    X_test = test.drop(columns=[c for c in cols_to_drop if c in test.columns])

    # Ordered intersection: iterating a set of strings yields an order that
    # depends on hash randomisation, which would make the feature order (and
    # with it order-sensitive model behaviour) differ between kernel restarts.
    test_columns = set(X_test.columns)
    common_cols = [c for c in X.columns if c in test_columns]

    # Explicit copies so the column assignments below write to frames this
    # function owns instead of to a selection of another frame.
    X = X[common_cols].copy()
    X_test = X_test[common_cols].copy()

    print("\n[INFO] Aplicando Feature Engineering...")

    X = create_features(X)
    X_test = create_features(X_test)

    num_cols = X.select_dtypes(include=[np.number]).columns.tolist()
    cat_cols = X.select_dtypes(include=["object"]).columns.tolist()

    print(f"  Features numéricas: {len(num_cols)}")
    print(f"  Features categóricas: {len(cat_cols)}")
    print(f"  Total de features: {len(X.columns)}")

    imputer_num = SimpleImputer(strategy="median")
    imputer_cat = SimpleImputer(strategy="most_frequent")

    # Imputers are fitted on train only and then applied to test.
    if num_cols:
        X[num_cols] = imputer_num.fit_transform(X[num_cols])
        X_test[num_cols] = imputer_num.transform(X_test[num_cols])

    if cat_cols:
        X[cat_cols] = imputer_cat.fit_transform(X[cat_cols])
        X_test[cat_cols] = imputer_cat.transform(X_test[cat_cols])

        X = pd.get_dummies(X, columns=cat_cols, drop_first=True)
        X_test = pd.get_dummies(X_test, columns=cat_cols, drop_first=True)

        # Each frame is encoded on its own, so their dummy columns can differ;
        # the inner join keeps only the columns that exist in both.
        X, X_test = X.align(X_test, join="inner", axis=1, fill_value=0)

    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
    X_test_scaled = pd.DataFrame(
        scaler.transform(X_test), columns=X_test.columns, index=X_test.index
    )

    return X_scaled, X_test_scaled, y

In [6]:
def get_models():
    """Instantiate the four base classifiers of the ensemble.

    Returns
    -------
    dict
        Maps a display name to an unfitted estimator, in the order
        CatBoost, LightGBM, XGBoost, LogisticRegression. Every estimator
        is seeded with ``RANDOM_STATE``.
    """
    # Settings the three gradient-boosting models have in common.
    n_rounds = 1500
    shrinkage = 0.02
    tree_depth = 8
    row_fraction = 0.8
    col_fraction = 0.8

    catboost_clf = CatBoostClassifier(
        iterations=n_rounds,
        learning_rate=shrinkage,
        depth=tree_depth,
        l2_leaf_reg=3,
        border_count=128,
        random_state=RANDOM_STATE,
        verbose=False,
        thread_count=-1,
        eval_metric="AUC",
    )

    lightgbm_clf = lgb.LGBMClassifier(
        n_estimators=n_rounds,
        learning_rate=shrinkage,
        max_depth=tree_depth,
        num_leaves=31,
        subsample=row_fraction,
        colsample_bytree=col_fraction,
        min_child_samples=20,
        random_state=RANDOM_STATE,
        verbose=-1,
        n_jobs=-1,
        metric="auc",
    )

    xgboost_clf = xgb.XGBClassifier(
        n_estimators=n_rounds,
        learning_rate=shrinkage,
        max_depth=tree_depth,
        subsample=row_fraction,
        colsample_bytree=col_fraction,
        min_child_weight=1,
        gamma=0,
        random_state=RANDOM_STATE,
        eval_metric="auc",
        n_jobs=-1,
    )

    logreg_clf = LogisticRegression(
        C=0.5,
        penalty="l2",
        solver="lbfgs",
        max_iter=2000,
        random_state=RANDOM_STATE,
        n_jobs=-1,
    )

    return {
        "CatBoost": catboost_clf,
        "LightGBM": lightgbm_clf,
        "XGBoost": xgboost_clf,
        "LogisticRegression": logreg_clf,
    }

In [7]:
def get_oof_predictions(model, X, y, X_test, n_folds=N_FOLDS):
    """
    Return out-of-fold predictions as binary values (0 or 1).

    The model is fitted once per stratified fold. Each fit predicts its
    held-out rows (filling the out-of-fold vector) and the full test set;
    the per-fold test predictions are then combined by majority vote.

    Parameters
    ----------
    model : estimator
        Object exposing ``fit(X, y)`` and ``predict(X)``. The same instance
        is refitted on every fold and is left fitted on the last one.
    X : pandas.DataFrame
        Training features (indexed positionally via ``.iloc``).
    y : pandas.Series or numpy.ndarray
        Training labels.
    X_test : pandas.DataFrame
        Test features.
    n_folds : int, default N_FOLDS
        Number of stratified folds.

    Returns
    -------
    oof_train : numpy.ndarray, shape (len(X), 1)
        Out-of-fold class predictions for the training rows.
    oof_test : numpy.ndarray, shape (len(X_test), 1)
        Majority-vote class predictions for the test rows.
    mean_acc : float
        Mean validation accuracy over the folds.
    """
    skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=RANDOM_STATE)

    oof_train = np.zeros(len(X))
    oof_test_skf = np.zeros((n_folds, len(X_test)))

    fold_scores = []

    for fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        print(f"    Fold {fold + 1}/{n_folds}...", end=" ")

        X_tr, X_val = X.iloc[train_idx], X.iloc[val_idx]
        # y is a Series unless the target was label-encoded into an ndarray.
        y_tr, y_val = (
            y.iloc[train_idx] if hasattr(y, "iloc") else y[train_idx],
            y.iloc[val_idx] if hasattr(y, "iloc") else y[val_idx],
        )

        model.fit(X_tr, y_tr)

        # ravel() keeps the assignment valid if predict returns an (n, 1) array.
        oof_train[val_idx] = np.ravel(model.predict(X_val))
        oof_test_skf[fold, :] = np.ravel(model.predict(X_test))

        fold_acc = accuracy_score(y_val, oof_train[val_idx])
        fold_scores.append(fold_acc)
        print(f"Accuracy: {fold_acc:.6f}")

    # Majority vote over folds. np.round rounds halves to the nearest even
    # value, so a tied vote (mean exactly 0.5, possible with an even n_folds)
    # would become 0; ">= 0.5" resolves ties to 1, the same rule the final
    # ensemble step applies. Results are identical for odd fold counts.
    oof_test = (oof_test_skf.mean(axis=0) >= 0.5).astype(int)

    mean_acc = np.mean(fold_scores)
    std_acc = np.std(fold_scores)
    print(f"    {'=' * 50}")
    print(f"    Mean CV Accuracy: {mean_acc:.6f} (+/- {std_acc:.6f})")
    print(f"    {'=' * 50}")

    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1), mean_acc

In [8]:
def save_submission(ids, predictions, filename):
    """
    Write a submission CSV holding integer (0 or 1) predictions.

    ``predictions`` is cast to int and flattened, paired with ``ids`` under
    the ``ID_COL`` / ``TARGET_COL`` headers, written to ``filename`` without
    the index, and summarised on stdout. The resulting frame is returned.
    """
    labels = predictions.astype(int).flatten()

    submission = pd.DataFrame({ID_COL: ids, TARGET_COL: labels})
    submission.to_csv(filename, index=False)

    # Short summary so the written file can be sanity-checked at a glance.
    target = submission[TARGET_COL]
    report = [
        f"\n✓ Submissão salva: {filename}",
        f"  Shape: {submission.shape}",
        f"  Valores únicos: {sorted(target.unique())}",
        f"  Distribuição: {target.value_counts().to_dict()}",
    ]
    for line in report:
        print(line)

    return submission

In [9]:
# Driver cell: load -> preprocess -> per-model OOF predictions -> majority-vote
# ensemble -> one submission file for the ensemble and one per model.
if __name__ == "__main__":
    print("=" * 60)
    print("LOADING DATA...")
    print("=" * 60)
    # NOTE(review): `submission` (the sample file) is not read again in this cell.
    train, test, submission = load_data()

    print("\n" + "=" * 60)
    print("PREPROCESSING DATA...")
    print("=" * 60)
    X, X_test, y = preprocess_data(train, test)

    print("\n" + "=" * 60)
    print("TRAINING ALL MODELS...")
    print("=" * 60)

    models = get_models()

    # One (n_rows, 1) column of 0/1 predictions per model, in `models` order.
    all_oof_train = []
    all_oof_test = []
    model_scores = {}

    for model_name, model in models.items():
        print(f"\n>>> Training {model_name}...")
        print("-" * 60)

        oof_train, oof_test, cv_score = get_oof_predictions(model, X, y, X_test)

        all_oof_train.append(oof_train)
        all_oof_test.append(oof_test)
        model_scores[model_name] = cv_score

    print("\n" + "=" * 60)
    print("CREATING ENSEMBLE (MAJORITY VOTING)...")
    print("=" * 60)

    # Stack the per-model columns side by side: shape (n_rows, n_models).
    ensemble_train = np.hstack(all_oof_train)
    ensemble_test = np.hstack(all_oof_test)

    # A row is labelled 1 when at least half of the models voted 1, so an
    # even split between the models counts as 1.
    final_predictions_train = (ensemble_train.mean(axis=1) >= 0.5).astype(int)
    final_predictions_test = (ensemble_test.mean(axis=1) >= 0.5).astype(int)

    # Accuracy of the voted out-of-fold predictions against the training labels.
    ensemble_train_acc = accuracy_score(y, final_predictions_train)

    print("\nMODEL SCORES:")
    for model_name, score in model_scores.items():
        print(f"  {model_name:20s}: {score:.6f}")

    print(f"\nENSEMBLE (Majority Voting): {ensemble_train_acc:.6f}")

    print("\n" + "=" * 60)
    print("SAVING SUBMISSIONS...")
    print("=" * 60)

    save_submission(test[ID_COL], final_predictions_test, "submission_ensemble.csv")

    # all_oof_test was filled in models' iteration order, so index i matches.
    for i, model_name in enumerate(models.keys()):
        filename = f"submission_{model_name.lower().replace(' ', '_')}.csv"
        save_submission(test[ID_COL], all_oof_test[i], filename)

    print("\n" + "=" * 60)

LOADING DATA...

PREPROCESSING DATA...

[INFO] Aplicando Feature Engineering...
  Features numéricas: 5
  Features categóricas: 3
  Total de features: 8

TRAINING ALL MODELS...

>>> Training CatBoost...
------------------------------------------------------------
    Fold 1/3... Accuracy: 0.845118
    Fold 2/3... Accuracy: 0.804714
    Fold 3/3... Accuracy: 0.838384
    Mean CV Accuracy: 0.829405 (+/- 0.017675)

>>> Training LightGBM...
------------------------------------------------------------
    Fold 1/3... Accuracy: 0.835017
    Fold 2/3... Accuracy: 0.781145
    Fold 3/3... Accuracy: 0.808081
    Mean CV Accuracy: 0.808081 (+/- 0.021993)

>>> Training XGBoost...
------------------------------------------------------------
    Fold 1/3... Accuracy: 0.841751
    Fold 2/3... Accuracy: 0.781145
    Fold 3/3... Accuracy: 0.814815
    Mean CV Accuracy: 0.812570 (+/- 0.024793)

>>> Training LogisticRegression...
------------------------------------------------------------
    Fold 1/3.