In [1]:
import gc
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import matthews_corrcoef
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import FunctionTransformer
import scipy.special

from catboost import CatBoostClassifier, Pool
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import HistGradientBoostingClassifier

# =====================
# CONFIG
# =====================
DATA_PATH = "/kaggle/input/playground-series-s4e8/"
N_SPLITS = 5
SEED = 42
THRESHOLD = 0.5
TARGET = "class"
USE_GPU = True
LABEL_SMOOTH = 0.05

# Важно: на этом соревновании сильнее всего работает ансамблирование и OOF-порог.
MODEL_SPECS = [
    {"seed": 1,  "params": dict(depth=8, l2_leaf_reg=6.0, random_strength=1.0, bagging_temperature=0.8)},
    {"seed": 7,  "params": dict(depth=9, l2_leaf_reg=10.0, random_strength=2.0, bagging_temperature=0.6)},
    {"seed": 42, "params": dict(depth=7, l2_leaf_reg=4.0, random_strength=1.5, bagging_temperature=1.0)},
]

# “джиттер” только на реальных numeric
JITTER_STRENGTH = 0.01

# сколько бинов для числовых (обычно 32-64 норм)
N_BINS = 48

# rare threshold для категорий (уменьшаем кардинальность)
RARE_MIN_COUNT = 50

# немного “хэширующих” кроссов, но БЕЗ строк (экономим память)
MAX_HASHED_CROSSES = 8


# =====================
# LOAD
# =====================
train = pd.read_csv(DATA_PATH + "train.csv")
test  = pd.read_csv(DATA_PATH + "test.csv")

y = (train["class"] == "p").astype(np.int8)
train_ids = train["id"].values
test_ids  = test["id"].values

X = train.drop(columns=["class", "id"])
X_test = test.drop(columns=["id"])

# =====================
# PREPROCESS + FE
# =====================
def replace_question_marks(df, cat_cols):
    df = df.copy()
    for c in cat_cols:
        df[c] = df[c].replace("?", np.nan)
    return df

def add_missing_flags_and_fill(df, cat_cols):
    df = df.copy()
    # флаги пропусков
    for c in cat_cols:
        df[c + "__isna"] = df[c].isna().astype(np.int8)
        df[c] = df[c].fillna("missing").astype("string")
    # общий счётчик пропусков (по исходным cat)
    df["missing_count"] = df[[c + "__isna" for c in cat_cols]].sum(axis=1).astype(np.int16)
    return df

def group_rare_categories(train_df, test_df, cat_cols, min_count=50):
    train_df = train_df.copy()
    test_df = test_df.copy()
    for c in cat_cols:
        vc = train_df[c].value_counts(dropna=False)
        rare_vals = set(vc[vc < min_count].index.tolist())
        # rare в train
        train_df[c] = train_df[c].where(~train_df[c].isin(rare_vals), "rare")
        # unseen/rare в test
        seen_vals = set(vc.index.tolist())
        test_df[c] = test_df[c].where(test_df[c].isin(seen_vals), "unseen")
        test_df[c] = test_df[c].where(~test_df[c].isin(rare_vals), "rare")
    return train_df, test_df

def add_freq_enc(train_df, test_df, cat_cols):
    train_df = train_df.copy()
    test_df = test_df.copy()
    n = len(train_df)
    for c in cat_cols:
        vc = train_df[c].value_counts(dropna=False)
        train_df[c + "__freq"] = train_df[c].map(vc).fillna(0).astype(np.float32)
        test_df[c + "__freq"]  = test_df[c].map(vc).fillna(0).astype(np.float32)

        train_df[c + "__freq_norm"] = (train_df[c + "__freq"] / n).astype(np.float32)
        test_df[c + "__freq_norm"]  = (test_df[c + "__freq"] / n).astype(np.float32)

        train_df[c + "__logcnt"] = np.log1p(train_df[c + "__freq"]).astype(np.float32)
        test_df[c + "__logcnt"]  = np.log1p(test_df[c + "__freq"]).astype(np.float32)
    return train_df, test_df

def add_numeric_bins(train_df, test_df, num_cols, n_bins=48):
    train_df = train_df.copy()
    test_df = test_df.copy()
    for c in num_cols:
        # квантили по train
        qs = np.quantile(train_df[c].values, np.linspace(0, 1, n_bins + 1))
        qs = np.unique(qs)
        # если мало уникальных — пропускаем биннинг
        if len(qs) <= 3:
            continue
        # digitize: 0..(len(qs)-2)
        cut_points = qs[1:-1]
        train_df[c + "__bin"] = np.digitize(train_df[c].values, cut_points, right=True).astype(np.int16)
        test_df[c + "__bin"]  = np.digitize(test_df[c].values,  cut_points, right=True).astype(np.int16)
    return train_df, test_df

def add_numeric_transforms(train_df, test_df, num_cols):
    train_df = train_df.copy()
    test_df = test_df.copy()
    for c in num_cols:
        # безопасный log1p (на случай нулей)
        train_df[c + "__log1p"] = np.log1p(np.maximum(train_df[c].values, 0)).astype(np.float32)
        test_df[c + "__log1p"]  = np.log1p(np.maximum(test_df[c].values,  0)).astype(np.float32)

    # типичные взаимодействия для грибов (если такие колонки есть)
    def has(col): return col in train_df.columns
    if has("stem-height") and has("stem-width"):
        train_df["stem_hw"] = (train_df["stem-height"] * train_df["stem-width"]).astype(np.float32)
        test_df["stem_hw"]  = (test_df["stem-height"]  * test_df["stem-width"]).astype(np.float32)

        train_df["stem_h_div_w"] = (train_df["stem-height"] / (train_df["stem-width"] + 1e-6)).astype(np.float32)
        test_df["stem_h_div_w"]  = (test_df["stem-height"]  / (test_df["stem-width"]  + 1e-6)).astype(np.float32)

    if has("cap-diameter") and has("stem-height"):
        train_df["cap_div_stem_h"] = (train_df["cap-diameter"] / (train_df["stem-height"] + 1e-6)).astype(np.float32)
        test_df["cap_div_stem_h"]  = (test_df["cap-diameter"]  / (test_df["stem-height"]  + 1e-6)).astype(np.float32)

    return train_df, test_df

def add_hashed_crosses(train_df, test_df, cat_cols, max_pairs=8):
    """
    Делает “комбо”-фичи, но без строк:
    хэш от пары значений -> int64, и мы скажем CatBoost, что это categorical.
    """
    train_df = train_df.copy()
    test_df = test_df.copy()

    # берём умеренно низкокардинальные колонки, чтобы пары были осмысленными
    nun = train_df[cat_cols].nunique()
    chosen = nun.sort_values().index.tolist()[:10]  # топ-10 по низкой кардинальности
    pairs = []
    for i in range(len(chosen)):
        for j in range(i+1, len(chosen)):
            pairs.append((chosen[i], chosen[j]))
    pairs = pairs[:max_pairs]

    for a, b in pairs:
        name = f"{a}__X__{b}"
        train_df[name] = pd.util.hash_pandas_object(train_df[[a, b]], index=False).astype(np.int64)
        test_df[name]  = pd.util.hash_pandas_object(test_df[[a, b]],  index=False).astype(np.int64)

    return train_df, test_df

# --- определяем исходные типы
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(exclude=["object"]).columns.tolist()

# 1) ? -> NaN
X = replace_question_marks(X, cat_cols)
X_test = replace_question_marks(X_test, cat_cols)

# 2) numeric: median impute (сразу, чтобы дальше bins/трансформы работали)
for c in num_cols:
    med = X[c].median()
    X[c] = X[c].fillna(med)
    X_test[c] = X_test[c].fillna(med)

# 3) missing flags + fill cats
X = add_missing_flags_and_fill(X, cat_cols)
X_test = add_missing_flags_and_fill(X_test, cat_cols)

# 4) rare grouping
X, X_test = group_rare_categories(X, X_test, cat_cols, min_count=RARE_MIN_COUNT)

# 5) freq encodings
X, X_test = add_freq_enc(X, X_test, cat_cols)

# 6) numeric bins + transforms + interactions
X, X_test = add_numeric_bins(X, X_test, num_cols, n_bins=N_BINS)
X, X_test = add_numeric_transforms(X, X_test, num_cols)

# 7) hashed crosses (пара штук)
X, X_test = add_hashed_crosses(X, X_test, cat_cols, max_pairs=MAX_HASHED_CROSSES)

In [2]:
def train_base_model(X, y, X_test, model_name, params={}, n_splits=N_SPLITS, save_dir="models"):
    os.makedirs(save_dir, exist_ok=True)
    
    oof = np.zeros((X.shape[0], 2))
    test_pred = np.zeros((X_test.shape[0], 2))

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=SEED)
    for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
        X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
        y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

        y_tr_smooth = y_tr

        model = None

        if model_name == "CatBoost":
            pool_tr = Pool(X_tr, y_tr_smooth, cat_features=cat_cols)
            pool_val = Pool(X_val, y_val, cat_features=cat_cols)
            model = CatBoostClassifier(
                iterations=10000,
                learning_rate=0.05,
                depth=8,
                task_type="GPU" if USE_GPU else "CPU",
                loss_function="Logloss",
                **params,
                verbose=500
            )
            model.fit(pool_tr, eval_set=pool_val, early_stopping_rounds=100)
            oof[val_idx] = model.predict_proba(X_val)
            test_pred += model.predict_proba(X_test) / n_splits

            # Сохраняем модель
            model.save_model(os.path.join(save_dir, f"catboost_fold{fold+1}.cbm"))

        elif model_name == "LightGBM":
            X_tr_lgb = cast_categoricals_for_lgb(X_tr, cat_cols)
            X_val_lgb = cast_categoricals_for_lgb(X_val, cat_cols)
            X_test_lgb = cast_categoricals_for_lgb(X_test, cat_cols)
        
            dtrain = lgb.Dataset(
                X_tr_lgb,
                label=y_tr_smooth,
                categorical_feature=cat_cols
            )
            dval = lgb.Dataset(
                X_val_lgb,
                label=y_val,
                categorical_feature=cat_cols,
                reference=dtrain
            )
        
            model = lgb.train(
                params,
                dtrain,
                num_boost_round=10000,
                valid_sets=[dtrain, dval],
                callbacks=[
                    lgb.early_stopping(stopping_rounds=100),
                    lgb.log_evaluation(period=500)
                ]
            )     
            val_pred_1 = model.predict(X_val_lgb)
            val_pred_0 = 1.0 - val_pred_1
            oof[val_idx] = np.column_stack([val_pred_0, val_pred_1])
            
            test_pred_1 = model.predict(X_test_lgb)
            test_pred_0 = 1.0 - test_pred_1
            test_pred += np.column_stack([test_pred_0, test_pred_1]) / n_splits

            # Сохраняем модель
            model.save_model(os.path.join(save_dir, f"lgb_fold{fold+1}.txt"))

        elif model_name == "XGBoost":
            X_tr_xgb   = cast_categoricals_for_lgb(X_tr, cat_cols)
            X_val_xgb  = cast_categoricals_for_lgb(X_val, cat_cols)
            X_test_xgb = cast_categoricals_for_lgb(X_test, cat_cols)
        
            dtrain = xgb.DMatrix(X_tr_xgb, label=y_tr_smooth, enable_categorical=True)
            dval   = xgb.DMatrix(X_val_xgb, label=y_val, enable_categorical=True)
            dtest  = xgb.DMatrix(X_test_xgb, enable_categorical=True)
        
            model = xgb.train(
                params,
                dtrain,
                num_boost_round=10000,
                evals=[(dtrain, "train"), (dval, "valid")],
                early_stopping_rounds=100,
                verbose_eval=500
            )
        
            val_pred_1 = model.predict(dval)
            oof[val_idx] = np.column_stack([1 - val_pred_1, val_pred_1])
        
            test_pred_1 = model.predict(dtest)
            test_pred += np.column_stack([1 - test_pred_1, test_pred_1]) / n_splits

            # Сохраняем модель
            model.save_model(os.path.join(save_dir, f"xgb_fold{fold+1}.json"))

        score = matthews_corrcoef(y_val, (oof[val_idx][:,1]>=0.5).astype(int))
        print(f"{model_name} fold {fold+1} MCC: {score:.6f}")

        del X_tr, X_val, y_tr, y_val, model
        gc.collect()

    return oof, test_pred

In [3]:
def cast_categoricals_for_lgb(df, cat_cols):
    df = df.copy()
    for c in cat_cols:
        df[c] = df[c].astype("category")
    return df

In [4]:
import os
model_names = ["XGBoost", "LightGBM", "CatBoost"]
oof_pred_probs = {}
test_pred_probs = {}
for mname in model_names:
    print(f"\n=== Training {mname} ===")
    oof_pred_probs[mname], test_pred_probs[mname] = train_base_model(X, y, X_test, mname)


=== Training XGBoost ===
[0]	train-rmse:0.40316	valid-rmse:0.40341
[500]	train-rmse:0.08747	valid-rmse:0.09195
[1000]	train-rmse:0.08255	valid-rmse:0.09007
[1500]	train-rmse:0.07954	valid-rmse:0.08941
[2000]	train-rmse:0.07724	valid-rmse:0.08928
[2373]	train-rmse:0.07582	valid-rmse:0.08922
XGBoost fold 1 MCC: 0.983690
[0]	train-rmse:0.40329	valid-rmse:0.40319
[500]	train-rmse:0.08738	valid-rmse:0.09168
[1000]	train-rmse:0.08259	valid-rmse:0.08978
[1500]	train-rmse:0.07962	valid-rmse:0.08926
[2000]	train-rmse:0.07734	valid-rmse:0.08909
[2151]	train-rmse:0.07673	valid-rmse:0.08911
XGBoost fold 2 MCC: 0.983682
[0]	train-rmse:0.40328	valid-rmse:0.40315
[500]	train-rmse:0.08776	valid-rmse:0.09189
[1000]	train-rmse:0.08286	valid-rmse:0.08991
[1500]	train-rmse:0.07982	valid-rmse:0.08934
[2000]	train-rmse:0.07747	valid-rmse:0.08918
[2409]	train-rmse:0.07584	valid-rmse:0.08908
XGBoost fold 3 MCC: 0.983875
[0]	train-rmse:0.40323	valid-rmse:0.40328
[500]	train-rmse:0.08716	valid-rmse:0.09194
[10

In [37]:
X_stack = np.column_stack(list(oof_pred_probs.values())).clip(1e-15, 1 - 1e-15)
X_test_stack = np.column_stack(list(test_pred_probs.values())).clip(1e-15, 1 - 1e-15)

stack_model = make_pipeline(
    FunctionTransformer(scipy.special.logit),
    LogisticRegression(
        max_iter=10000,
        penalty=None,
        class_weight="balanced"
    )
)

# skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=SEED)

# oof_stack = np.zeros((X_stack.shape[0], 2))
# test_stack = np.zeros((X_test_stack.shape[0], 2))

# for fold, (tr_idx, val_idx) in enumerate(skf.split(X_stack, y)):
#     X_tr, X_val = X_stack[tr_idx], X_stack[val_idx]
#     y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

#     stack_model.fit(X_tr, y_tr)

#     oof_stack[val_idx] = stack_model.predict_proba(X_val)
#     test_stack += stack_model.predict_proba(X_test_stack) / N_SPLITS

#     fold_mcc = matthews_corrcoef(
#         y_val,
#         (oof_stack[val_idx][:, 1] >= THRESHOLD).astype(int)
#     )
#     print(f"Stack fold {fold+1} MCC: {fold_mcc:.6f}")

stack_model.fit(X_stack, y)

oof_stack = stack_model.predict_proba(X_stack)
test_stack = stack_model.predict_proba(X_test_stack)

In [38]:
from sklearn.metrics import matthews_corrcoef

best_thr = 0.5
best_mcc = -1

for thr in np.linspace(0.2, 0.8, 301):
    mcc = matthews_corrcoef(y, (oof_stack[:,1] >= thr).astype(int))
    if mcc > best_mcc:
        best_mcc = mcc
        best_thr = thr

print(f"Best threshold = {best_thr:.4f}, MCC = {best_mcc:.6f}")

Best threshold = 0.4580, MCC = 0.984515


In [39]:
test_proba = test_stack[:, 1]
test_pred  = np.where(test_proba >= best_thr, "p", "e")

test_ids = test["id"].values

submission = pd.DataFrame({
    "id": test_ids,
    "class": test_pred
})

submission.to_csv("submission.csv", index=False)

In [37]:
!pip install kaggle



In [15]:
import json

os.makedirs(os.path.expanduser("~/.kaggle"), exist_ok=True)

kaggle_json = {
    "username": "aleks9921",
    "key": "85a9b540183c46bec0906f4b4be20819"
}

# Сохраняем
with open(os.path.expanduser("~/.kaggle/kaggle.json"), "w") as f:
    json.dump(kaggle_json, f)

# Устанавливаем права доступа
os.chmod(os.path.expanduser("~/.kaggle/kaggle.json"), 0o600)

In [44]:
!kaggle competitions list

ref                                                                                 deadline             category                reward  teamCount  userHasEntered  
----------------------------------------------------------------------------------  -------------------  ---------------  -------------  ---------  --------------  
https://www.kaggle.com/competitions/ai-mathematical-olympiad-progress-prize-3       2026-04-15 23:59:00  Featured         2,207,152 Usd        784           False  
https://www.kaggle.com/competitions/vesuvius-challenge-surface-detection            2026-02-13 23:59:00  Research           200,000 Usd        344           False  
https://www.kaggle.com/competitions/hull-tactical-market-prediction                 2025-12-15 23:59:00  Featured           100,000 Usd       3688           False  
https://www.kaggle.com/competitions/google-tunix-hackathon                          2026-01-12 23:59:00  Featured           100,000 Usd         82           False  
https://ww

In [40]:
!kaggle competitions submit -c playground-series-s4e8 -f submission.csv -m "no kfold"

100%|██████████████████████████████████████| 19.8M/19.8M [00:01<00:00, 12.2MB/s]
Successfully submitted to Binary Prediction of Poisonous Mushrooms

In [30]:
g = pd.read_csv("submission.csv")
g

Unnamed: 0,id,class
0,3116945,0
1,3116946,1
2,3116947,1
3,3116948,1
4,3116949,0
...,...,...
2077959,5194904,1
2077960,5194905,1
2077961,5194906,1
2077962,5194907,0


In [31]:
submission.head()

Unnamed: 0,id,class
0,3116945,0
1,3116946,1
2,3116947,1
3,3116948,1
4,3116949,0
