# Model Used for Kaggle

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import f1_score

from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.linear_model import LogisticRegression


# =========================
# Load data
# =========================
# Both CSVs are read from the current working directory.
train = pd.read_csv("dataset-train-vf.csv")
test = pd.read_csv("dataset-test-vf.csv")

# Features: every training column except the row identifier and the target.
X_full = train.drop(columns=["ID", "y"]).copy()
# The target is cast to str so it can be compared against string class names.
y_full = train["y"].astype(str).copy()

# Keep the test identifiers aside for the submission file.
test_ids = test["ID"].copy()
X_test = test.drop(columns=["ID"]).copy()

# "f11" is the only column handled as categorical; the rest go down the
# numeric branch.
cat_cols = ["f11"]
num_cols = [col for col in X_full.columns if col not in cat_cols]

# Numeric branch: median imputation (plus missing-value indicator columns),
# then standardisation.
numeric_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median", add_indicator=True)),
    ("scaler", StandardScaler()),
])

# Categorical branch: mode imputation, then one-hot encoding that ignores
# categories not seen during fit.
categorical_steps = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("onehot", OneHotEncoder(handle_unknown="ignore")),
])

preprocess = ColumnTransformer(
    transformers=[
        ("num", numeric_steps, num_cols),
        ("cat", categorical_steps, cat_cols),
    ],
    remainder="drop",
)

# Helper to get P(circle) even after calibration
def p_circle_from_estimator(est, X, label="circle"):
    """Return the predicted probability of ``label`` for every row of ``X``.

    The column of ``predict_proba`` that belongs to ``label`` is looked up via
    ``est.classes_`` rather than assumed, so the result does not depend on the
    order in which the estimator stores its classes.

    Args:
        est: Fitted estimator exposing ``classes_`` and ``predict_proba``.
        X: Samples, passed unchanged to ``est.predict_proba``.
        label: Class whose probability is returned. Defaults to ``"circle"``.

    Returns:
        The ``label`` column of ``est.predict_proba(X)``.

    Raises:
        ValueError: If ``label`` is not one of ``est.classes_``.
    """
    classes = list(est.classes_)
    if label not in classes:
        # Same exception type as list.index(), but with a message that names
        # the classes the estimator actually knows about.
        raise ValueError(
            f"{label!r} is not among the estimator's classes: {classes}"
        )
    return est.predict_proba(X)[:, classes.index(label)]

# Shuffled, seeded 5-fold stratified splitter used to build the OOF features.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# =========================
# Define base learners
# =========================
# Hyper-parameters common to both tree ensembles; only n_estimators differs.
forest_kwargs = dict(
    max_depth=None,
    min_samples_split=10,
    min_samples_leaf=2,
    class_weight="balanced",
    random_state=42,
    n_jobs=-1,
)

mlp_model = MLPClassifier(
    hidden_layer_sizes=(50,),
    alpha=1e-4,
    learning_rate_init=0.01,
    max_iter=700,
    random_state=42,
)
rf_model = RandomForestClassifier(n_estimators=700, **forest_kwargs)
et_model = ExtraTreesClassifier(n_estimators=900, **forest_kwargs)

# MLP pipeline (already probabilistic, so it is used without calibration)
mlp_pipe = Pipeline([
    ("preprocess", preprocess),
    ("model", mlp_model),
])

# RF and ET pipelines; these "raw" versions get wrapped in a calibrator
# before they are fitted.
rf_pipe_raw = Pipeline([
    ("preprocess", preprocess),
    ("model", rf_model),
])

et_pipe_raw = Pipeline([
    ("preprocess", preprocess),
    ("model", et_model),
])

# =========================
# OOF meta features: [MLP, CalRF, CalET]
# =========================
# One row per training sample, one column per base learner, in the order
# [MLP, calibrated RF, calibrated ET]. Each row is filled exactly once, from
# the fold in which that sample is held out.
X_meta_oof = np.zeros((len(X_full), 3), dtype=float)

for fold, (tr_idx, va_idx) in enumerate(skf.split(X_full, y_full), start=1):
    X_tr, X_va = X_full.iloc[tr_idx], X_full.iloc[va_idx]
    y_tr, y_va = y_full.iloc[tr_idx], y_full.iloc[va_idx]

    # Fit MLP; the shared helper resolves the "circle" column through the
    # pipeline's classes_, the same way it does for the calibrated models.
    mlp_pipe.fit(X_tr, y_tr)
    p_mlp = p_circle_from_estimator(mlp_pipe, X_va)

    # Fit and calibrate RF (sigmoid); calibration uses an inner 3-fold CV
    # on the training part of this outer fold only.
    rf_cal = CalibratedClassifierCV(rf_pipe_raw, method="sigmoid", cv=3)
    rf_cal.fit(X_tr, y_tr)
    p_rf = p_circle_from_estimator(rf_cal, X_va)

    # Fit and calibrate ET (sigmoid)
    et_cal = CalibratedClassifierCV(et_pipe_raw, method="sigmoid", cv=3)
    et_cal.fit(X_tr, y_tr)
    p_et = p_circle_from_estimator(et_cal, X_va)

    X_meta_oof[va_idx, 0] = p_mlp
    X_meta_oof[va_idx, 1] = p_rf
    X_meta_oof[va_idx, 2] = p_et

    # Per-fold sanity check: plain average of the three base probabilities,
    # thresholded at 0.5.
    p_avg = X_meta_oof[va_idx].mean(axis=1)
    pred_avg = np.where(p_avg >= 0.5, "circle", "square")
    print(f"Fold {fold} F1(circle) avg bases @0.50:", round(f1_score(y_va, pred_avg, pos_label="circle"), 4))

# =========================
# Meta model + threshold tuning on OOF
# =========================
# Second-level model: logistic regression over the three OOF probabilities.
meta = LogisticRegression(max_iter=3000, class_weight="balanced", random_state=42)
meta.fit(X_meta_oof, y_full)

circle_col = list(meta.classes_).index("circle")
p_meta_oof = meta.predict_proba(X_meta_oof)[:, circle_col]
# NOTE(review): the meta model is scored on the same OOF matrix it was fitted
# on, so the F1 reported below may be somewhat optimistic - confirm whether
# that is acceptable.

# Candidate decision thresholds: 0.05 to 0.95 in 181 evenly spaced points.
thresholds = np.linspace(0.05, 0.95, 181)
f1_by_threshold = [
    f1_score(
        y_full,
        np.where(p_meta_oof >= cut, "circle", "square"),
        pos_label="circle",
    )
    for cut in thresholds
]

# argmax returns the first maximum, i.e. the lowest threshold among ties.
best_pos = int(np.argmax(f1_by_threshold))
best_t = float(thresholds[best_pos])
best_f1 = float(f1_by_threshold[best_pos])

print("\nCalibrated stacking OOF best F1(circle):", round(best_f1, 4))
print("Best threshold:", best_t)

# =========================
# Fit on ALL data and predict test
# =========================
# Refit every base learner on the complete training set.
mlp_pipe.fit(X_full, y_full)

rf_cal_full = CalibratedClassifierCV(rf_pipe_raw, method="sigmoid", cv=3)
rf_cal_full.fit(X_full, y_full)

et_cal_full = CalibratedClassifierCV(et_pipe_raw, method="sigmoid", cv=3)
et_cal_full.fit(X_full, y_full)

# P(circle) on the test set, extracted through the same helper for all three
# base learners so the class-column lookup lives in one place.
p_mlp_test = p_circle_from_estimator(mlp_pipe, X_test)
p_rf_test  = p_circle_from_estimator(rf_cal_full, X_test)
p_et_test  = p_circle_from_estimator(et_cal_full, X_test)

# Column order must match the OOF matrix the meta model was trained on:
# [MLP, calibrated RF, calibrated ET].
X_meta_test = np.column_stack([p_mlp_test, p_rf_test, p_et_test])
p_meta_test = p_circle_from_estimator(meta, X_meta_test)

# Apply the threshold tuned on the OOF predictions, then encode
# circle -> 1 and square -> 0 for the submission.
y_test_label = np.where(p_meta_test >= best_t, "circle", "square")
y_test_bin = np.where(y_test_label == "circle", 1, 0)

unique, counts = np.unique(y_test_bin, return_counts=True)
print("\nTest prediction distribution:", dict(zip(unique, counts)))

submission = pd.DataFrame({"ID": test_ids, "y": y_test_bin})
submission.to_csv("Submission_STACKING_CALIBRATED.csv", index=False)
print("Saved Submission_STACKING_CALIBRATED.csv")

Fold 1 F1(circle) avg bases @0.50: 0.5227
Fold 2 F1(circle) avg bases @0.50: 0.6337
Fold 3 F1(circle) avg bases @0.50: 0.7129
Fold 4 F1(circle) avg bases @0.50: 0.6214
Fold 5 F1(circle) avg bases @0.50: 0.5435

Calibrated stacking OOF best F1(circle): 0.6647
Best threshold: 0.6549999999999999

Test prediction distribution: {np.int64(0): np.int64(707), np.int64(1): np.int64(85)}
Saved Submission_STACKING_CALIBRATED.csv
