In [1]:
!pip -q install -U catboost

In [2]:
import numpy as np
import pandas as pd
import sklearn
import catboost

print("numpy:", np.__version__)
print("pandas:", pd.__version__)
print("sklearn:", sklearn.__version__)
print("catboost:", catboost.__version__)

numpy: 1.26.4
pandas: 2.2.2
sklearn: 1.4.2
catboost: 1.2.8


In [3]:
import pandas as pd
import numpy as np

DATA_DIR = "/content/Project"

TRAIN_PATH = f"{DATA_DIR}/train.csv"
TEST_PATH  = f"{DATA_DIR}/test.csv"
SUB_PATH   = f"{DATA_DIR}/sample_submission.csv"

TARGET = "survived_to_18jan"
ID_COL = "apartment_id"

train = pd.read_csv(TRAIN_PATH)
test  = pd.read_csv(TEST_PATH)
sub   = pd.read_csv(SUB_PATH)

print("Shapes")
print("train:", train.shape)
print("test :", test.shape)
print("sub  :", sub.shape)

assert ID_COL in train.columns and ID_COL in test.columns, "Нет apartment_id в train/test"
assert TARGET in train.columns, "Нет target в train"
assert TARGET not in test.columns, "Target не должен быть в test"
assert list(sub.columns) == [ID_COL, TARGET], "sample_submission должен иметь колонки: apartment_id, survived_to_18jan"

train_feats = [c for c in train.columns if c != TARGET]
test_feats = list(test.columns)

assert set(train_feats) == set(test_feats), "Набор колонок train (без target) и test различается"

y = train[TARGET]
print("\nTarget stats")
print(y.value_counts(dropna=False))
print("target mean:", float(y.mean()))

X = train.drop(columns=[TARGET])
cat_cols = [c for c in X.columns if X[c].dtype == "object"]
num_cols = [c for c in X.columns if X[c].dtype != "object" and c != ID_COL]

print("\nColumns")
print("categorical:", cat_cols)
print("numeric (count):", len(num_cols))

print("\nTop-10 missing rate (train)")
print((train.isna().mean().sort_values(ascending=False).head(10)))

print("\nTop-10 missing rate (test)")
print((test.isna().mean().sort_values(ascending=False).head(10)))

print("\ntrain head")
display(train.head(3))

print("\ntest head")
display(test.head(3))

Shapes
train: (30000, 30)
test : (18000, 29)
sub  : (18000, 2)

Target stats
survived_to_18jan
0    15583
1    14417
Name: count, dtype: int64
target mean: 0.48056666666666664

Columns
categorical: ['apartment_id', 'wing', 'window_quality', 'heating_type', 'tree_species', 'tree_form', 'stand_type', 'tinsel_level']
numeric (count): 21

Top-10 missing rate (train)
humidity_pct             0.072533
waterings_per_week       0.058267
ornaments_weight_kg      0.051133
garland_hours_per_day    0.049333
radiator_distance_m      0.040833
ceiling_height_m         0.029733
apartment_id             0.000000
tree_species             0.000000
led_garland              0.000000
tinsel_level             0.000000
dtype: float64

Top-10 missing rate (test)
humidity_pct             0.069111
waterings_per_week       0.058944
ornaments_weight_kg      0.051722
garland_hours_per_day    0.046778
radiator_distance_m      0.040278
ceiling_height_m         0.028556
robot_vacuum             0.000000
led_garland   

Unnamed: 0,apartment_id,building_age_years,wing,entrance,floor,apartment_area_m2,ceiling_height_m,window_quality,heating_type,corner_apartment,...,stand_type,cut_days_before_jan1,potted_tree,waterings_per_week,mist_spray,tinsel_level,ornaments_weight_kg,led_garland,garland_hours_per_day,survived_to_18jan
0,apt_train_000001,27,east,6,16,58.7,2.73,new,central,1,...,bucket,10,0,1.0,1,high,4.53,0,11.3,1
1,apt_train_000002,27,west,9,20,61.9,2.72,normal,electric_heater,1,...,bucket,12,0,1.0,1,low,,0,2.9,0
2,apt_train_000003,27,north,2,9,48.0,2.48,normal,central,0,...,simple_stand,0,0,2.0,0,medium,3.17,1,15.2,0



test head


Unnamed: 0,apartment_id,building_age_years,wing,entrance,floor,apartment_area_m2,ceiling_height_m,window_quality,heating_type,corner_apartment,...,tree_form,stand_type,cut_days_before_jan1,potted_tree,waterings_per_week,mist_spray,tinsel_level,ornaments_weight_kg,led_garland,garland_hours_per_day
0,apt_000001,27,south,4,21,45.8,2.57,normal,central,0,...,dense,simple_stand,9,0,3.0,1,low,1.3,1,14.9
1,apt_000002,27,south,5,14,18.0,2.75,normal,central,1,...,dense,simple_stand,3,0,3.0,1,medium,3.32,1,5.6
2,apt_000003,27,west,8,29,70.1,2.75,new,central,0,...,normal,water_reservoir,15,0,7.0,0,high,3.92,1,9.0


In [4]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score

TARGET = "survived_to_18jan"
ID_COL = "apartment_id"
SEED = 42

X = train.drop(columns=[TARGET, ID_COL])
y = train[TARGET].astype(int)

X_test = test.drop(columns=[ID_COL])

cat_cols = [c for c in X.columns if X[c].dtype == "object"]

assert X.shape[0] == y.shape[0], "Несовпадение размеров X и y"
assert set(X.columns) == set(X_test.columns), "Колонки X и X_test различаются"
assert len(cat_cols) > 0, "Не найдены категориальные колонки"
assert y.isin([0, 1]).all(), "Target должен быть 0/1"

print("Train features:", X.shape, " Test features:", X_test.shape)
print("Categorical cols:", cat_cols)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)

oof = np.zeros(len(X), dtype=float)
test_pred = np.zeros(len(X_test), dtype=float)
fold_scores = []

for fold, (tr_idx, va_idx) in enumerate(skf.split(X, y), 1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    tr_pool = Pool(X_tr, y_tr, cat_features=cat_cols)
    va_pool = Pool(X_va, y_va, cat_features=cat_cols)

    model = CatBoostClassifier(
        loss_function="Logloss",
        eval_metric="AUC",
        iterations=6000,
        learning_rate=0.03,
        depth=7,
        l2_leaf_reg=4.0,
        random_seed=SEED,
        od_type="Iter",
        od_wait=300,
        verbose=300
    )

    model.fit(tr_pool, eval_set=va_pool, use_best_model=True)

    va_pred = model.predict_proba(va_pool)[:, 1]
    oof[va_idx] = va_pred

    fold_auc = roc_auc_score(y_va, va_pred)
    fold_scores.append(fold_auc)

    test_pred += model.predict_proba(Pool(X_test, cat_features=cat_cols))[:, 1] / skf.n_splits

    print(f"Fold {fold} AUC: {fold_auc:.6f} | best_iter: {model.get_best_iteration()}")

cv_auc = roc_auc_score(y, oof)
print("\nCV summary")
print("OOF AUC:", float(cv_auc))
print("Fold AUCs:", [float(s) for s in fold_scores])
print("Mean:", float(np.mean(fold_scores)), "Std:", float(np.std(fold_scores)))

submission = sub.copy()
submission[TARGET] = np.clip(test_pred, 0.0, 1.0)

assert submission.shape[0] == test.shape[0], "submission должен иметь 18000 строк"
assert list(submission.columns) == [ID_COL, TARGET], "Неверные колонки в submission"
assert submission[TARGET].between(0.0, 1.0).all(), "Вероятности должны быть в [0,1]"
assert submission[ID_COL].equals(test[ID_COL]), "apartment_id должен совпадать с test"

OUT_PATH = "/content/submission_catboost_baseline.csv"
submission.to_csv(OUT_PATH, index=False)

print("\nSaved:", OUT_PATH)
display(submission.head(5))

Train features: (30000, 28)  Test features: (18000, 28)
Categorical cols: ['wing', 'window_quality', 'heating_type', 'tree_species', 'tree_form', 'stand_type', 'tinsel_level']
0:	test: 0.6398915	best: 0.6398915 (0)	total: 262ms	remaining: 26m 11s
300:	test: 0.6694956	best: 0.6695891 (250)	total: 31.9s	remaining: 10m 4s
600:	test: 0.6690059	best: 0.6713741 (392)	total: 53.6s	remaining: 8m 1s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.6713741122
bestIteration = 392

Shrink model to first 393 iterations.
Fold 1 AUC: 0.671374 | best_iter: 392
0:	test: 0.6351237	best: 0.6351237 (0)	total: 63.5ms	remaining: 6m 20s
300:	test: 0.6728201	best: 0.6728435 (299)	total: 20.6s	remaining: 6m 29s
600:	test: 0.6696979	best: 0.6731858 (377)	total: 41.5s	remaining: 6m 12s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.6731858209
bestIteration = 377

Shrink model to first 378 iterations.
Fold 2 AUC: 0.673186 | best_iter: 377
0:	test: 0.6274414	best: 0.627441

Unnamed: 0,apartment_id,survived_to_18jan
0,apt_000001,0.43521
1,apt_000002,0.388434
2,apt_000003,0.355798
3,apt_000004,0.213985
4,apt_000005,0.533275


In [6]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np

TARGET = "survived_to_18jan"
ID_COL = "apartment_id"
SEED = 42

X = train.drop(columns=[TARGET, ID_COL]).copy()
y = train[TARGET].astype(int).copy()
X_test = test.drop(columns=[ID_COL]).copy()

cat_cols = [c for c in X.columns if X[c].dtype == "object"]

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
folds = list(skf.split(X, y))

def fit_cv_catboost(X, y, X_test, cat_cols, folds, params):
    oof = np.zeros(len(X), dtype=float)
    test_pred = np.zeros(len(X_test), dtype=float)
    fold_scores = []

    for fold_id, (tr_idx, va_idx) in enumerate(folds, 1):
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

        tr_pool = Pool(X_tr, y_tr, cat_features=cat_cols)
        va_pool = Pool(X_va, y_va, cat_features=cat_cols)
        te_pool = Pool(X_test, cat_features=cat_cols)

        model = CatBoostClassifier(**params)
        model.fit(tr_pool, eval_set=va_pool, use_best_model=True)

        va_pred = model.predict_proba(va_pool)[:, 1]
        oof[va_idx] = va_pred

        fold_auc = roc_auc_score(y_va, va_pred)
        fold_scores.append(fold_auc)

        test_pred += model.predict_proba(te_pool)[:, 1] / len(folds)

        print(f"Fold {fold_id} AUC: {fold_auc:.6f} | best_iter: {model.get_best_iteration()}")

    cv_auc = roc_auc_score(y, oof)
    return oof, test_pred, cv_auc, fold_scores

params_a = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    iterations=6000,
    learning_rate=0.03,
    depth=7,
    l2_leaf_reg=4.0,
    random_seed=SEED,
    od_type="Iter",
    od_wait=300,
    verbose=400
)

params_b = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    iterations=9000,
    learning_rate=0.02,
    depth=8,
    l2_leaf_reg=8.0,
    random_seed=SEED,
    od_type="Iter",
    od_wait=500,
    bootstrap_type="Bayesian",
    bagging_temperature=1.0,
    rsm=0.9,
    random_strength=1.0,
    verbose=500
)

print("Model A CV")
oof_a, test_a, auc_a, folds_a = fit_cv_catboost(X, y, X_test, cat_cols, folds, params_a)
print("\nModel B CV")
oof_b, test_b, auc_b, folds_b = fit_cv_catboost(X, y, X_test, cat_cols, folds, params_b)

print("\nSummary")
print("A OOF AUC:", float(auc_a), "Mean:", float(np.mean(folds_a)), "Std:", float(np.std(folds_a)))
print("B OOF AUC:", float(auc_b), "Mean:", float(np.mean(folds_b)), "Std:", float(np.std(folds_b)))

best_w = None
best_auc = -1.0

for w in np.linspace(0, 1, 101):
    oof_blend = w * oof_a + (1 - w) * oof_b
    auc = roc_auc_score(y, oof_blend)
    if auc > best_auc:
        best_auc = auc
        best_w = float(w)

print("\nBest blend")
print("best_w (A weight):", best_w)
print("best_blend_OOF_AUC:", float(best_auc))

test_blend = best_w * test_a + (1 - best_w) * test_b
test_blend = np.clip(test_blend, 0.0, 1.0)

submission_best = sub.copy()
submission_best[TARGET] = test_blend

assert submission_best.shape[0] == test.shape[0]
assert list(submission_best.columns) == [ID_COL, TARGET]
assert submission_best[TARGET].between(0.0, 1.0).all()
assert submission_best[ID_COL].equals(test[ID_COL])

OUT_PATH = "/content/submission_best_blend.csv"
submission_best.to_csv(OUT_PATH, index=False)

print("\nSaved:", OUT_PATH)
display(submission_best.head(5))

Model A CV
0:	test: 0.6398915	best: 0.6398915 (0)	total: 71.6ms	remaining: 7m 9s
400:	test: 0.6712343	best: 0.6713741 (392)	total: 27.4s	remaining: 6m 22s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.6713741122
bestIteration = 392

Shrink model to first 393 iterations.
Fold 1 AUC: 0.671374 | best_iter: 392
0:	test: 0.6351237	best: 0.6351237 (0)	total: 61ms	remaining: 6m 5s
400:	test: 0.6728194	best: 0.6731858 (377)	total: 26.6s	remaining: 6m 11s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.6731858209
bestIteration = 377

Shrink model to first 378 iterations.
Fold 2 AUC: 0.673186 | best_iter: 377
0:	test: 0.6274414	best: 0.6274414 (0)	total: 67.9ms	remaining: 6m 47s
400:	test: 0.6720023	best: 0.6721663 (385)	total: 26.5s	remaining: 6m 9s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.6724566955
bestIteration = 494

Shrink model to first 495 iterations.
Fold 3 AUC: 0.672457 | best_iter: 494
0:	test: 0.6281969	best: 0.6

Unnamed: 0,apartment_id,survived_to_18jan
0,apt_000001,0.432833
1,apt_000002,0.378515
2,apt_000003,0.36319
3,apt_000004,0.217191
4,apt_000005,0.531434


In [9]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd

TARGET = "survived_to_18jan"
ID_COL = "apartment_id"
SEED = 42

X = train.drop(columns=[TARGET, ID_COL]).copy()
y = train[TARGET].astype(int).copy()
X_test = test.drop(columns=[ID_COL]).copy()

cat_cols = [c for c in X.columns if X[c].dtype == "object"]

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
folds = list(skf.split(X, y))

def cv_auc_catboost(params: dict) -> float:
    oof = np.zeros(len(X), dtype=float)

    for tr_idx, va_idx in folds:
        X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
        y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

        tr_pool = Pool(X_tr, y_tr, cat_features=cat_cols)
        va_pool = Pool(X_va, y_va, cat_features=cat_cols)

        model = CatBoostClassifier(**params)
        model.fit(tr_pool, eval_set=va_pool, use_best_model=True)

        oof[va_idx] = model.predict_proba(va_pool)[:, 1]

    return float(roc_auc_score(y, oof))

base_common = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    random_seed=SEED,
    od_type="Iter",
    verbose=False
)

candidates = [
    ("A_baseline", dict(base_common, iterations=6000, learning_rate=0.03, depth=7, l2_leaf_reg=4.0, od_wait=300)),
    ("C_deeper_reg", dict(base_common, iterations=8000, learning_rate=0.025, depth=9, l2_leaf_reg=10.0, od_wait=400,
                          bootstrap_type="Bayesian", bagging_temperature=0.5, rsm=0.9, random_strength=1.0)),
    ("D_shallower_fast", dict(base_common, iterations=8000, learning_rate=0.03, depth=6, l2_leaf_reg=6.0, od_wait=400,
                              bootstrap_type="Bayesian", bagging_temperature=1.0, rsm=0.95, random_strength=1.0)),
    ("E_balanced", dict(base_common, iterations=10000, learning_rate=0.02, depth=8, l2_leaf_reg=6.0, od_wait=500,
                        bootstrap_type="Bayesian", bagging_temperature=1.0, rsm=0.9, random_strength=1.0)),
]

results = []
for name, params in candidates:
    params_run = params.copy()
    params_run["verbose"] = 0
    auc = cv_auc_catboost(params_run)
    results.append((name, auc, params))

res_df = pd.DataFrame(results, columns=["name", "oof_auc", "params"]).sort_values("oof_auc", ascending=False)
display(res_df[["name", "oof_auc"]])

best_name = res_df.iloc[0]["name"]
best_params = res_df.iloc[0]["params"]
print("Best config:", best_name, "AUC:", float(res_df.iloc[0]["oof_auc"]))

oof = np.zeros(len(X), dtype=float)
test_pred = np.zeros(len(X_test), dtype=float)
fold_scores = []

for fold_id, (tr_idx, va_idx) in enumerate(folds, 1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    tr_pool = Pool(X_tr, y_tr, cat_features=cat_cols)
    va_pool = Pool(X_va, y_va, cat_features=cat_cols)
    te_pool = Pool(X_test, cat_features=cat_cols)

    model = CatBoostClassifier(**best_params)
    model.set_params(verbose=400)
    model.fit(tr_pool, eval_set=va_pool, use_best_model=True)

    va_pred = model.predict_proba(va_pool)[:, 1]
    oof[va_idx] = va_pred
    fold_auc = roc_auc_score(y_va, va_pred)
    fold_scores.append(float(fold_auc))

    test_pred += model.predict_proba(te_pool)[:, 1] / len(folds)

    print(f"Fold {fold_id} AUC: {fold_auc:.6f} | best_iter: {model.get_best_iteration()}")

final_auc = float(roc_auc_score(y, oof))
print("Final OOF AUC:", final_auc)
print("Fold mean:", float(np.mean(fold_scores)), "std:", float(np.std(fold_scores)))

submission_best_single = sub.copy()
submission_best_single[TARGET] = np.clip(test_pred, 0.0, 1.0)

assert submission_best_single.shape[0] == test.shape[0]
assert list(submission_best_single.columns) == [ID_COL, TARGET]
assert submission_best_single[TARGET].between(0.0, 1.0).all()
assert submission_best_single[ID_COL].equals(test[ID_COL])

OUT_PATH = "/content/submission_best_single_model.csv"
submission_best_single.to_csv(OUT_PATH, index=False)

print("Saved:", OUT_PATH)
display(submission_best_single.head(5))


Unnamed: 0,name,oof_auc
0,A_baseline,0.672022
2,D_shallower_fast,0.671506
3,E_balanced,0.670339
1,C_deeper_reg,0.670154


Best config: A_baseline AUC: 0.6720217012623126
0:	test: 0.6398915	best: 0.6398915 (0)	total: 91.3ms	remaining: 9m 7s
400:	test: 0.6712343	best: 0.6713741 (392)	total: 26.6s	remaining: 6m 12s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.6713741122
bestIteration = 392

Shrink model to first 393 iterations.
Fold 1 AUC: 0.671374 | best_iter: 392
0:	test: 0.6351237	best: 0.6351237 (0)	total: 64.5ms	remaining: 6m 27s
400:	test: 0.6728194	best: 0.6731858 (377)	total: 25.7s	remaining: 5m 58s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.6731858209
bestIteration = 377

Shrink model to first 378 iterations.
Fold 2 AUC: 0.673186 | best_iter: 377
0:	test: 0.6274414	best: 0.6274414 (0)	total: 69.5ms	remaining: 6m 56s
400:	test: 0.6720023	best: 0.6721663 (385)	total: 26.1s	remaining: 6m 4s
Stopped by overfitting detector  (300 iterations wait)

bestTest = 0.6724566955
bestIteration = 494

Shrink model to first 495 iterations.
Fold 3 AUC: 0.672457 | bes

Unnamed: 0,apartment_id,survived_to_18jan
0,apt_000001,0.43521
1,apt_000002,0.388434
2,apt_000003,0.355798
3,apt_000004,0.213985
4,apt_000005,0.533275


In [13]:
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd

TARGET = "survived_to_18jan"
ID_COL = "apartment_id"
SEED = 42

X_base = train.drop(columns=[TARGET, ID_COL]).copy()
y = train[TARGET].astype(int).copy()
X_test_base = test.drop(columns=[ID_COL]).copy()

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
folds = list(skf.split(X_base, y))

def add_stable_features(df: pd.DataFrame) -> pd.DataFrame:
    df = df.copy()

    #(без Interval)
    num_bins = {
        "floor": [1, 3, 6, 10, 15, 20, 25, 30],
        "apartment_area_m2": [15, 25, 35, 45, 55, 65, 80, 100],
        "room_temp_c": [15, 18, 20, 22, 24, 26, 28],
        "humidity_pct": [20, 30, 40, 50, 60, 70, 80],
        "tree_height_cm": [80, 120, 150, 180, 210, 240],
        "garland_hours_per_day": [0, 2, 4, 6, 8, 12, 16, 20, 24],
        "cut_days_before_jan1": [0, 2, 4, 7, 10, 14, 18, 22],
        "waterings_per_week": [0, 1, 2, 3, 5, 7, 10, 14],
    }

    for col, bins in num_bins.items():
        if col in df.columns:
            b = pd.cut(df[col], bins=bins, include_lowest=True)
            df[f"{col}_bin"] = b.map(lambda x: str(x) if pd.notna(x) else np.nan).astype("object")

    cat_pairs = [
        ("wing", "window_quality"),
        ("heating_type", "window_quality"),
        ("stand_type", "potted_tree"),
        ("tree_species", "tree_form"),
        ("tinsel_level", "led_garland"),
        ("cat_present", "children_count"),
    ]
    for a, b in cat_pairs:
        if a in df.columns and b in df.columns:
            df[f"{a}__{b}"] = df[a].astype("string") + "|" + df[b].astype("string")
            df[f"{a}__{b}"] = df[f"{a}__{b}"].astype("object")

    eps = 1e-6
    if "ornaments_weight_kg" in df.columns and "tree_height_cm" in df.columns:
        df["orn_w_per_h"] = df["ornaments_weight_kg"] / (df["tree_height_cm"] + eps)

    if "radiator_distance_m" in df.columns:
        df["radiator_inv"] = 1.0 / (df["radiator_distance_m"] + 0.1)

    if "room_temp_c" in df.columns and "humidity_pct" in df.columns:
        df["temp_minus_hum"] = df["room_temp_c"] - (df["humidity_pct"] / 10.0)

    return df

X = add_stable_features(X_base)
X_test = add_stable_features(X_test_base)

cat_cols = [c for c in X.columns if X[c].dtype == "object"]

CAT_NA = "__NA__"
for c in cat_cols:
    X[c] = X[c].astype("string").fillna(CAT_NA)
    X_test[c] = X_test[c].astype("string").fillna(CAT_NA)

assert list(X.columns) == list(X_test.columns), "Колонки X и X_test различаются после FE"
assert y.isin([0, 1]).all(), "Target должен быть 0/1"
assert X[cat_cols].isna().sum().sum() == 0, "В категориальных (train) остались NaN"
assert X_test[cat_cols].isna().sum().sum() == 0, "В категориальных (test) остались NaN"

print("Base features:", X_base.shape[1])
print("New features :", X.shape[1], "Added:", X.shape[1] - X_base.shape[1])
print("Categorical  :", len(cat_cols))

# CV обучение
params = dict(
    loss_function="Logloss",
    eval_metric="AUC",
    iterations=9000,
    learning_rate=0.02,
    depth=7,
    l2_leaf_reg=6.0,
    random_seed=SEED,
    od_type="Iter",
    od_wait=400,
    verbose=400
)

oof = np.zeros(len(X), dtype=float)
test_pred = np.zeros(len(X_test), dtype=float)
fold_scores = []

for fold_id, (tr_idx, va_idx) in enumerate(folds, 1):
    X_tr, X_va = X.iloc[tr_idx], X.iloc[va_idx]
    y_tr, y_va = y.iloc[tr_idx], y.iloc[va_idx]

    tr_pool = Pool(X_tr, y_tr, cat_features=cat_cols)
    va_pool = Pool(X_va, y_va, cat_features=cat_cols)
    te_pool = Pool(X_test, cat_features=cat_cols)

    model = CatBoostClassifier(**params)
    model.fit(tr_pool, eval_set=va_pool, use_best_model=True)

    va_pred = model.predict_proba(va_pool)[:, 1]
    oof[va_idx] = va_pred

    fold_auc = roc_auc_score(y_va, va_pred)
    fold_scores.append(float(fold_auc))

    test_pred += model.predict_proba(te_pool)[:, 1] / len(folds)

    print(f"Fold {fold_id} AUC: {fold_auc:.6f} | best_iter: {model.get_best_iteration()}")

final_auc = float(roc_auc_score(y, oof))
print("Final OOF AUC:", final_auc)
print("Fold mean:", float(np.mean(fold_scores)), "std:", float(np.std(fold_scores)))

# Сабмит
submission_fe = sub.copy()
submission_fe[TARGET] = np.clip(test_pred, 0.0, 1.0)

assert submission_fe.shape[0] == test.shape[0]
assert list(submission_fe.columns) == [ID_COL, TARGET]
assert submission_fe[TARGET].between(0.0, 1.0).all()
assert submission_fe[ID_COL].equals(test[ID_COL])

OUT_PATH = "/content/submission_catboost_stable_fe.csv"
submission_fe.to_csv(OUT_PATH, index=False)

print("Saved:", OUT_PATH)
display(submission_fe.head(5))


Base features: 28
New features : 45 Added: 17
Categorical  : 21
0:	test: 0.6389087	best: 0.6389087 (0)	total: 221ms	remaining: 33m 9s
400:	test: 0.6732260	best: 0.6734055 (366)	total: 1m 3s	remaining: 22m 35s
800:	test: 0.6744317	best: 0.6746523 (726)	total: 2m 10s	remaining: 22m 11s
Stopped by overfitting detector  (400 iterations wait)

bestTest = 0.6746523469
bestIteration = 726

Shrink model to first 727 iterations.
Fold 1 AUC: 0.674652 | best_iter: 726
0:	test: 0.6340191	best: 0.6340191 (0)	total: 167ms	remaining: 25m 4s
400:	test: 0.6737997	best: 0.6738597 (382)	total: 1m 2s	remaining: 22m 19s
800:	test: 0.6732296	best: 0.6740963 (682)	total: 2m 8s	remaining: 21m 55s
Stopped by overfitting detector  (400 iterations wait)

bestTest = 0.6740962933
bestIteration = 682

Shrink model to first 683 iterations.
Fold 2 AUC: 0.674096 | best_iter: 682
0:	test: 0.6277066	best: 0.6277066 (0)	total: 178ms	remaining: 26m 45s
400:	test: 0.6711039	best: 0.6712317 (379)	total: 1m 3s	remaining: 22m

Unnamed: 0,apartment_id,survived_to_18jan
0,apt_000001,0.440558
1,apt_000002,0.412524
2,apt_000003,0.303071
3,apt_000004,0.220038
4,apt_000005,0.509977
