In [7]:
import gc
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split


# =====================
# CONFIG
# =====================
# Directory that holds the competition input files.
DATA_PATH = "/kaggle/input/playground-series-s4e8/"
N_SPLITS = 3
USE_GPU = True  # picks the CatBoost task_type ("GPU" vs "CPU") in the training cell

# Important: ensembling and an OOF-tuned threshold are what work best in this competition.
# "Jitter" is meant only for the real numeric columns.
JITTER_STRENGTH = 0.01

# Number of bins for numeric features (32-64 is usually fine).
N_BINS = 48

# Rare-value threshold for categories (reduces cardinality).
RARE_MIN_COUNT = 50

# A few "hashed" crosses, but WITHOUT strings (saves memory).
MAX_HASHED_CROSSES = 8


# =====================
# LOAD
# =====================
# NOTE(review): `train_merged` is built by a cell that appears further down in this
# notebook (the concat of the competition and secondary data). That cell has to be
# executed first, so a plain top-to-bottom run raises NameError on the next line.
train = train_merged
test  = pd.read_csv(DATA_PATH + "test.csv")

# Binary target: 1 where class == "p", 0 otherwise.
y = (train["class"] == "p").astype(np.int8)
train_ids = train["id"].values
test_ids  = test["id"].values

# Feature frames without the target / id columns.
X = train.drop(columns=["class", "id"])
X_test = test.drop(columns=["id"])

# =====================
# PREPROCESS + FE
# =====================
def replace_question_marks(df, cat_cols):
    """Return a copy of ``df`` where "?" in the given columns becomes NaN.

    Args:
        df: input frame; it is not modified.
        cat_cols: names of the columns to clean.

    Returns:
        A new DataFrame with the "?" placeholder replaced by ``np.nan``.
    """
    cleaned = df.copy()
    for col in cat_cols:
        column = cleaned[col]
        cleaned[col] = column.replace("?", np.nan)
    return cleaned

def add_missing_flags_and_fill(df, cat_cols):
    """Flag missing categorical values, then fill them with "missing".

    For every column in ``cat_cols`` an int8 ``<col>__isna`` indicator is
    appended and the column itself is filled with the literal "missing" and
    cast to the pandas "string" dtype. A final int16 ``missing_count`` column
    holds the number of flagged columns per row.

    Args:
        df: input frame; it is not modified.
        cat_cols: names of the categorical columns to process.

    Returns:
        A new DataFrame with the indicator and count columns appended.
    """
    out = df.copy()
    # Take the NaN mask once, before any column is filled.
    na_mask = out[list(cat_cols)].isna()
    for col in cat_cols:
        out[col + "__isna"] = na_mask[col].astype(np.int8)
        out[col] = out[col].fillna("missing").astype("string")
    # Overall count of missing values (over the original categorical columns).
    out["missing_count"] = na_mask.sum(axis=1).astype(np.int16)
    return out

def group_rare_categories(train_df, test_df, cat_cols, min_count=50):
    """Collapse infrequent category values into shared buckets.

    Counts are taken from the training frame only. In both frames, values
    seen fewer than ``min_count`` times in train become "rare"; in the test
    frame, values that never occur in train become "unseen".

    Args:
        train_df: training frame; it is not modified.
        test_df: test frame; it is not modified.
        cat_cols: names of the categorical columns to process.
        min_count: minimum training count for a value to be kept as is.

    Returns:
        Tuple ``(train_df, test_df)`` of new frames.
    """
    train_out, test_out = train_df.copy(), test_df.copy()
    for col in cat_cols:
        counts = train_out[col].value_counts(dropna=False)
        known = set(counts.index.tolist())
        infrequent = set(counts[counts < min_count].index.tolist())

        # Train: infrequent values -> "rare".
        train_col = train_out[col]
        train_out[col] = train_col.mask(train_col.isin(infrequent), "rare")

        # Test: values never seen in train -> "unseen", then infrequent -> "rare".
        test_col = test_out[col]
        test_col = test_col.mask(~test_col.isin(known), "unseen")
        test_out[col] = test_col.mask(test_col.isin(infrequent), "rare")
    return train_out, test_out

def add_freq_enc(train_df, test_df, cat_cols):
    """Append frequency encodings computed from the training frame.

    For each column in ``cat_cols`` three float32 columns are added to both
    frames: ``<col>__freq`` (training count of the value, 0 when the value
    does not occur in train), ``<col>__freq_norm`` (count divided by the
    number of training rows) and ``<col>__logcnt`` (``log1p`` of the count).

    Args:
        train_df: training frame; it is not modified.
        test_df: test frame; it is not modified.
        cat_cols: names of the categorical columns to encode.

    Returns:
        Tuple ``(train_df, test_df)`` of new frames.
    """
    train_out, test_out = train_df.copy(), test_df.copy()
    total_rows = len(train_out)
    for col in cat_cols:
        counts = train_out[col].value_counts(dropna=False)
        freq_name = col + "__freq"
        for frame in (train_out, test_out):
            frame[freq_name] = frame[col].map(counts).fillna(0).astype(np.float32)
            frame[col + "__freq_norm"] = (frame[freq_name] / total_rows).astype(np.float32)
            frame[col + "__logcnt"] = np.log1p(frame[freq_name]).astype(np.float32)
    return train_out, test_out

def add_numeric_bins(train_df, test_df, num_cols, n_bins=48):
    """Append quantile-bin indices for numeric columns.

    Bin edges are the unique quantiles of the training column. Columns that
    yield three or fewer unique edges are skipped. Otherwise an int16
    ``<col>__bin`` column is added to both frames using the interior edges.

    Args:
        train_df: training frame; it is not modified.
        test_df: test frame; it is not modified.
        num_cols: names of the numeric columns to bin.
        n_bins: number of quantile intervals requested.

    Returns:
        Tuple ``(train_df, test_df)`` of new frames.
    """
    train_out, test_out = train_df.copy(), test_df.copy()
    probs = np.linspace(0, 1, n_bins + 1)
    for col in num_cols:
        # Quantiles come from train only.
        edges = np.unique(np.quantile(train_out[col].values, probs))
        if len(edges) > 3:
            # Drop the outermost edges; digitize then yields 0..(len(edges)-2).
            inner_edges = edges[1:-1]
            bin_name = col + "__bin"
            train_bins = np.digitize(train_out[col].values, inner_edges, right=True)
            test_bins = np.digitize(test_out[col].values, inner_edges, right=True)
            train_out[bin_name] = train_bins.astype(np.int16)
            test_out[bin_name] = test_bins.astype(np.int16)
    return train_out, test_out

def add_numeric_transforms(train_df, test_df, num_cols):
    """Append log transforms and a few stem/cap interaction features.

    Every column in ``num_cols`` gets a float32 ``<col>__log1p`` column, with
    negative inputs clipped to 0 before the log. When the training frame has
    the required columns, the product and ratio of ``stem-height`` and
    ``stem-width`` and the ratio of ``cap-diameter`` to ``stem-height`` are
    added as float32 columns to both frames.

    Args:
        train_df: training frame; it is not modified.
        test_df: test frame; it is not modified.
        num_cols: names of the numeric columns to log-transform.

    Returns:
        Tuple ``(train_df, test_df)`` of new frames.
    """
    train_out, test_out = train_df.copy(), test_df.copy()
    frames = (train_out, test_out)

    for col in num_cols:
        for frame in frames:
            # Clip at 0 so that log1p stays defined.
            clipped = np.maximum(frame[col].values, 0)
            frame[col + "__log1p"] = np.log1p(clipped).astype(np.float32)

    # Interactions are added only if the training frame has the needed columns.
    if "stem-height" in train_out.columns and "stem-width" in train_out.columns:
        for frame in frames:
            height, width = frame["stem-height"], frame["stem-width"]
            frame["stem_hw"] = (height * width).astype(np.float32)
            frame["stem_h_div_w"] = (height / (width + 1e-6)).astype(np.float32)

    if "cap-diameter" in train_out.columns and "stem-height" in train_out.columns:
        for frame in frames:
            ratio = frame["cap-diameter"] / (frame["stem-height"] + 1e-6)
            frame["cap_div_stem_h"] = ratio.astype(np.float32)

    return train_out, test_out

def add_hashed_crosses(train_df, test_df, cat_cols, max_pairs=8):
    """Append pairwise "combo" features stored as integer hashes, not strings.

    The (at most) ten columns of ``cat_cols`` with the fewest distinct values
    in train are paired in order, and the first ``max_pairs`` pairs are kept.
    Each pair ``(a, b)`` yields an int64 column ``a__X__b`` holding a row-wise
    hash of the two values; the caller is expected to declare these columns
    as categorical to CatBoost.

    Args:
        train_df: training frame; it is not modified.
        test_df: test frame; it is not modified.
        cat_cols: names of the candidate categorical columns.
        max_pairs: maximum number of cross features to create.

    Returns:
        Tuple ``(train_df, test_df)`` of new frames.
    """
    train_out, test_out = train_df.copy(), test_df.copy()

    # Prefer low-cardinality columns so that the pairs stay meaningful.
    cardinality = train_out[cat_cols].nunique()
    candidates = cardinality.sort_values().index.tolist()[:10]
    all_pairs = [
        (left, right)
        for pos, left in enumerate(candidates)
        for right in candidates[pos + 1:]
    ]

    for left, right in all_pairs[:max_pairs]:
        cross_name = f"{left}__X__{right}"
        for frame in (train_out, test_out):
            hashed = pd.util.hash_pandas_object(frame[[left, right]], index=False)
            frame[cross_name] = hashed.astype(np.int64)

    return train_out, test_out

# --- determine the original column types (object dtype = categorical, everything else = numeric)
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(exclude=["object"]).columns.tolist()

# 1) ? -> NaN
X = replace_question_marks(X, cat_cols)
X_test = replace_question_marks(X_test, cat_cols)

# 2) numeric: median impute (done right away so the bins/transforms below work);
#    the median is taken from train and reused for test
for c in num_cols:
    med = X[c].median()
    X[c] = X[c].fillna(med)
    X_test[c] = X_test[c].fillna(med)

# 3) missing flags + fill cats
X = add_missing_flags_and_fill(X, cat_cols)
X_test = add_missing_flags_and_fill(X_test, cat_cols)

# 4) rare grouping
X, X_test = group_rare_categories(X, X_test, cat_cols, min_count=RARE_MIN_COUNT)

# 5) freq encodings
X, X_test = add_freq_enc(X, X_test, cat_cols)

# 6) numeric bins + transforms + interactions
X, X_test = add_numeric_bins(X, X_test, num_cols, n_bins=N_BINS)
X, X_test = add_numeric_transforms(X, X_test, num_cols)

# 7) hashed crosses (just a few)
X, X_test = add_hashed_crosses(X, X_test, cat_cols, max_pairs=MAX_HASHED_CROSSES)

# --- list of "categorical" features for CatBoost:
# original cat (string) + bins (int16) + hashed crosses (int64) are treated as categorical
bin_cols = [c for c in X.columns if c.endswith("__bin")]
cross_cols = [c for c in X.columns if "__X__" in c]
cat_like_cols = cat_cols + bin_cols + cross_cols

# positional indices of the categorical features
cat_idx = [X.columns.get_loc(c) for c in cat_like_cols if c in X.columns]

# jitter only on the "real" num_cols (freq/log/bin/cross columns are left untouched)
NUMERIC_FOR_JITTER = [c for c in num_cols if c in X.columns]

def smooth_labels(y, eps=0.05):
    """Shrink labels toward 0.5 by a factor ``eps``.

    Args:
        y: scalar or array of labels.
        eps: smoothing strength; 0 leaves the labels unchanged.

    Returns:
        ``y * (1 - eps) + 0.5 * eps``, with the same shape as ``y``.
    """
    keep_fraction = 1 - eps
    shift = 0.5 * eps
    return y * keep_fraction + shift

def cast_categoricals_for_lgb(df, cat_cols):
    """Return a copy of ``df`` with the given columns cast to "category".

    Args:
        df: input frame; it is not modified.
        cat_cols: names of the columns to convert.

    Returns:
        A new DataFrame in which every listed column has category dtype.
    """
    target_dtypes = {col: "category" for col in cat_cols}
    return df.astype(target_dtypes)

def jitter_inplace(df, cols, strength, seed):
    """Add Gaussian noise to the given columns of ``df``, modifying it in place.

    Each column receives zero-mean noise whose standard deviation is
    ``strength`` times the column's own standard deviation. Columns whose
    standard deviation is zero or not finite are left unchanged.

    Args:
        df: frame to perturb; it is mutated.
        cols: names of the columns to jitter.
        strength: noise scale relative to each column's standard deviation.
        seed: seed for ``np.random.default_rng``.

    Returns:
        The same ``df`` object.
    """
    generator = np.random.default_rng(seed)
    n_rows = len(df)
    for col in cols:
        spread = df[col].std()
        # Skip degenerate columns; no random numbers are drawn for them.
        if not spread or not np.isfinite(spread) or spread <= 0:
            continue
        noise = generator.normal(0, strength * spread, size=n_rows)
        df[col] = df[col].values + noise
    return df

In [9]:
from catboost import CatBoostClassifier, Pool

# Shared CatBoost hyper-parameters for both training stages below.
cat_params = dict(
    iterations=4000,
    learning_rate=0.03,
    depth=8,
    loss_function="Logloss",
    eval_metric="MCC",
    random_seed=42,
    verbose=200,
    task_type="GPU" if USE_GPU else "CPU"
)

# Hold out 10% of the training rows, stratified by the target.
X_tr, X_val, y_tr, y_val = train_test_split(
    X,
    y,
    test_size=0.1,
    stratify=y,
    random_state=42
)

train_pool = Pool(
    X_tr,
    y_tr,
    cat_features=cat_idx
)

val_pool = Pool(
    X_val,
    y_val,
    cat_features=cat_idx
)

test_pool = Pool(
    X_test,
    cat_features=cat_idx
)


# Stage 1: fit on the training split, using the hold-out pool as eval set
# (use_best_model=True).
model = CatBoostClassifier(**cat_params)

model.fit(
    train_pool,
    eval_set=val_pool,
    use_best_model=True
)


# Positive-class probabilities. Despite the name, `oof` holds predictions for the
# single hold-out split, not out-of-fold predictions from cross-validation.
oof = model.predict_proba(val_pool)[:, 1].astype(np.float32)
test_pred = model.predict_proba(test_pool)[:, 1].astype(np.float32)

# Pseudo-labelling settings: a test row is used when its predicted probability is
# above CONF or below 1 - CONF; such rows get sample weight PSEUDO_WEIGHT.
CONF = 0.95
PSEUDO_WEIGHT = 0.6

mask = (test_pred > CONF) | (test_pred < 1 - CONF)

X_pseudo = X_test[mask]
y_pseudo = (test_pred[mask] > 0.5).astype(int)

# NOTE(review): both frames keep their own row labels, so the index of X_aug / y_aug
# may contain duplicates; the weights below are laid out in concat order.
X_aug = pd.concat([X_tr, X_pseudo], axis=0)
y_aug = pd.concat([y_tr, pd.Series(y_pseudo, index=X_pseudo.index)], axis=0)

# Weight 1 for the original training rows, PSEUDO_WEIGHT for pseudo-labelled rows.
weights = np.concatenate([
    np.ones(len(X_tr)),
    np.full(len(X_pseudo), PSEUDO_WEIGHT)
])

aug_pool = Pool(
    X_aug,
    y_aug,
    cat_features=cat_idx,
    weight=weights
)

# Stage 2: refit from scratch on train + pseudo-labelled test rows, evaluated on the
# same hold-out pool as stage 1.
final_model = CatBoostClassifier(**cat_params)

final_model.fit(
    aug_pool,
    eval_set=val_pool,
    use_best_model=True
)

# Overwrite the stage-1 predictions with those of the final model.
oof = final_model.predict_proba(val_pool)[:, 1].astype(np.float32)
test_pred = final_model.predict_proba(test_pool)[:, 1].astype(np.float32)

0:	learn: 0.8584850	test: 0.8580257	best: 0.8580257 (0)	total: 5.12s	remaining: 5h 41m 21s
200:	learn: 0.9827653	test: 0.9828011	best: 0.9828011 (200)	total: 2m 10s	remaining: 41m 5s
400:	learn: 0.9837283	test: 0.9837859	best: 0.9837859 (399)	total: 4m 8s	remaining: 37m 8s
600:	learn: 0.9839649	test: 0.9839701	best: 0.9839765 (588)	total: 6m 4s	remaining: 34m 20s
800:	learn: 0.9841427	test: 0.9841481	best: 0.9841482 (778)	total: 7m 59s	remaining: 31m 54s
1000:	learn: 0.9842761	test: 0.9842561	best: 0.9842561 (997)	total: 9m 57s	remaining: 29m 49s
1200:	learn: 0.9843862	test: 0.9843387	best: 0.9843768 (1153)	total: 11m 59s	remaining: 27m 56s
1400:	learn: 0.9844914	test: 0.9844150	best: 0.9844213 (1365)	total: 14m 1s	remaining: 26m
1600:	learn: 0.9845619	test: 0.9844723	best: 0.9844787 (1584)	total: 16m	remaining: 23m 58s
1800:	learn: 0.9846523	test: 0.9845928	best: 0.9845992 (1793)	total: 18m 6s	remaining: 22m 6s
2000:	learn: 0.9847249	test: 0.9846689	best: 0.9846690 (1927)	total: 20m 9

NameError: name 'X_tr_lgb' is not defined

In [13]:
# Without pseudo-labelling: use the predictions of the first-stage model only.
# oof = model.predict_proba(val_pool)[:, 1].astype(np.float32)
# test_pred = model.predict_proba(test_pool)[:, 1].astype(np.float32)

In [14]:
# Scan candidate decision thresholds and keep the one with the highest MCC on the
# hold-out predictions.
ths = np.linspace(0.01, 0.99, 981)
mccs = [
    matthews_corrcoef(y_val, (oof >= t).astype(np.int8))
    for t in ths
]
best_idx = int(np.argmax(mccs))
best_t = float(ths[best_idx])
best_mcc = float(np.max(mccs))

print(f"OOF MCC = {best_mcc:.6f} at threshold t = {best_t:.4f}")

# =====================
# SUBMISSION
# =====================
# Apply the tuned threshold to the test probabilities: "p" at or above it, "e" below.
test_labels = np.where(test_pred >= best_t, "p", "e")
sub = pd.DataFrame({"id": test_ids, "class": test_labels})
sub.to_csv("submission.csv", index=False)
print(sub.head())

OOF MCC = 0.984930 at threshold t = 0.5090
        id class
0  3116945     e
1  3116946     p
2  3116947     p
3  3116948     p
4  3116949     e


In [11]:
import json
import os

# Write ~/.kaggle/kaggle.json for the Kaggle CLI.
#
# The username and API key are read from the KAGGLE_USERNAME / KAGGLE_KEY environment
# variables so that no secret is stored in the notebook itself.
# NOTE: an API key was previously hard-coded in this cell; treat that key as
# compromised and regenerate it in the Kaggle account settings.
kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)

try:
    kaggle_json = {
        "username": os.environ["KAGGLE_USERNAME"],
        "key": os.environ["KAGGLE_KEY"],
    }
except KeyError as missing_var:
    raise RuntimeError(
        f"Environment variable {missing_var.args[0]} is not set; "
        "export KAGGLE_USERNAME and KAGGLE_KEY before running this cell."
    ) from missing_var

kaggle_json_path = os.path.join(kaggle_dir, "kaggle.json")

# Create the file with owner-only permissions from the start, so that the key is
# never readable by other users between the write and the chmod.
fd = os.open(kaggle_json_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(fd, "w") as f:
    json.dump(kaggle_json, f)

# Also tighten permissions when the file already existed with a looser mode.
os.chmod(kaggle_json_path, 0o600)

In [15]:
!kaggle competitions submit -c playground-series-s4e8 -f submission.csv -m "catboost"

100%|██████████████████████████████████████| 19.8M/19.8M [00:02<00:00, 9.94MB/s]
Successfully submitted to Binary Prediction of Poisonous Mushrooms

In [3]:
import pandas as pd

import kagglehub

# Fetch the latest version of the secondary mushroom dataset.
# The returned `path` is not used below; both CSVs are read from fixed /kaggle/input locations.
path = kagglehub.dataset_download("shrutisaxena/secondary-mushroom-dataset-data-set")

# Secondary dataset: a semicolon-separated CSV.
secondary_csv = "/kaggle/input/secondary-mushroom-dataset-data-set/MushroomDataset/secondary_data.csv"
train_secondary = pd.read_csv(secondary_csv, sep=";")

# Competition training data, indexed by its "id" column.
competition_csv = "/kaggle/input/playground-series-s4e8/train.csv"
train_competition = pd.read_csv(competition_csv, index_col="id")

In [4]:
# Stack the competition rows (with "id" restored as a column) on top of the secondary
# dataset, then shuffle all rows with a fixed seed and renumber the index.
train_merged = (
    pd.concat([train_competition.reset_index(), train_secondary], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)