In [9]:
import gc
import numpy as np
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import matthews_corrcoef
from sklearn.model_selection import train_test_split

from catboost import CatBoostClassifier, Pool


# =====================
# CONFIG
# =====================
DATA_PATH = "/kaggle/input/playground-series-s4e8/"
N_SPLITS = 3
USE_GPU = True

# Keyword arguments unpacked into LGBMClassifier(**lgb_params).
lgb_params = {
    "n_estimators": 4000,
    "random_state": 42,
    "max_bin": 1024,
    "colsample_bytree": 0.6,
    "reg_lambda": 80,
}

# Important: in this competition, ensembling and the OOF threshold matter most.
# "Jitter" is applied to the real numeric columns only.
JITTER_STRENGTH = 0.01

# Number of bins for numeric columns (32-64 is usually fine).
N_BINS = 48

# Rare threshold for categories (reduces cardinality).
RARE_MIN_COUNT = 50

# A few "hashed" crosses, but WITHOUT strings (saves memory).
MAX_HASHED_CROSSES = 8


# =====================
# LOAD
# =====================
# NOTE(review): `train_merged` is not created in this cell. It must already
# exist in the kernel (it is built by the pd.concat cell further down in this
# export), so this cell fails on a fresh "Restart & Run All" in the shown order.
train = train_merged
test = pd.read_csv(f"{DATA_PATH}test.csv")

# Binary target: 1 where class == "p", 0 otherwise.
y = train["class"].eq("p").astype(np.int8)
train_ids = train["id"].values
test_ids = test["id"].values

# Feature frames: the label and the id are dropped.
X = train.drop(columns=["class", "id"])
X_test = test.drop(columns=["id"])

# =====================
# PREPROCESS + FE
# =====================
def replace_question_marks(df, cat_cols):
    """Return a copy of ``df`` where the literal "?" is turned into NaN.

    Only the columns listed in ``cat_cols`` are touched; the input frame is
    left unmodified.
    """
    cleaned = df.copy()
    for col in cat_cols:
        column = cleaned[col]
        cleaned[col] = column.replace("?", np.nan)
    return cleaned

def add_missing_flags_and_fill(df, cat_cols):
    """Flag and fill missing categorical values on a copy of ``df``.

    For every column ``c`` in ``cat_cols``:
      * ``c__isna`` (int8) is 1 where ``c`` was missing, else 0;
      * ``c`` has its missing values replaced by "missing" and is cast to
        the pandas "string" dtype.
    A final ``missing_count`` (int16) column holds the per-row sum of the
    flags. The input frame is not modified.
    """
    out = df.copy()
    flag_names = []
    for col in cat_cols:
        flag = col + "__isna"
        flag_names.append(flag)
        # missing-value flag must be taken before the fill below
        out[flag] = out[col].isna().astype(np.int8)
        out[col] = out[col].fillna("missing").astype("string")
    # overall count of missing values (over the original categorical columns)
    out["missing_count"] = out[flag_names].sum(axis=1).astype(np.int16)
    return out

def group_rare_categories(train_df, test_df, cat_cols, min_count=50):
    """Collapse infrequent category values, using counts from ``train_df`` only.

    For each column in ``cat_cols``:
      * train values occurring fewer than ``min_count`` times become "rare";
      * test values never seen in train become "unseen";
      * test values that are infrequent in train become "rare".
    Returns modified copies ``(train_df, test_df)``; inputs are untouched.
    """
    train_out, test_out = train_df.copy(), test_df.copy()
    for col in cat_cols:
        counts = train_out[col].value_counts(dropna=False)
        known = set(counts.index.tolist())
        infrequent = set(counts[counts < min_count].index.tolist())

        # train: infrequent -> "rare"
        rare_in_train = train_out[col].isin(infrequent)
        train_out[col] = train_out[col].where(~rare_in_train, "rare")

        # test: first mark values absent from train, then fold infrequent ones
        known_in_test = test_out[col].isin(known)
        test_out[col] = test_out[col].where(known_in_test, "unseen")
        rare_in_test = test_out[col].isin(infrequent)
        test_out[col] = test_out[col].where(~rare_in_test, "rare")
    return train_out, test_out

def add_freq_enc(train_df, test_df, cat_cols):
    """Add frequency encodings derived from ``train_df`` to both frames.

    For each column ``c`` in ``cat_cols`` three float32 columns are added:
      * ``c__freq``      - count of the value in train (0 if not in train);
      * ``c__freq_norm`` - that count divided by ``len(train_df)``;
      * ``c__logcnt``    - ``log1p`` of that count.
    Returns modified copies ``(train_df, test_df)``.
    """
    train_out = train_df.copy()
    test_out = test_df.copy()
    n_train = len(train_out)
    for col in cat_cols:
        counts = train_out[col].value_counts(dropna=False)
        freq_name = col + "__freq"
        norm_name = col + "__freq_norm"
        log_name = col + "__logcnt"
        # the same train-derived counts are applied to train and test
        for frame in (train_out, test_out):
            frame[freq_name] = frame[col].map(counts).fillna(0).astype(np.float32)
            frame[norm_name] = (frame[freq_name] / n_train).astype(np.float32)
            frame[log_name] = np.log1p(frame[freq_name]).astype(np.float32)
    return train_out, test_out

def add_numeric_bins(train_df, test_df, num_cols, n_bins=48):
    """Add quantile-bin codes ``<col>__bin`` (int16) for numeric columns.

    Bin edges are the quantiles of the *train* column and are reused for test.

    Parameters
    ----------
    train_df, test_df : pandas.DataFrame
        Input frames; they are copied, not modified.
    num_cols : iterable of str
        Numeric columns to bin.
    n_bins : int, default 48
        Number of quantile intervals requested (duplicates are merged).

    Returns
    -------
    (train_df, test_df) : tuple of pandas.DataFrame
        Copies with the extra ``__bin`` columns.

    Notes
    -----
    Missing values are ignored when the quantiles are computed; previously a
    single NaN in a training column turned every quantile into NaN and the
    column was silently skipped. Rows whose value is missing get the code -1.
    For NaN-free input the result is unchanged.
    """
    train_df = train_df.copy()
    test_df = test_df.copy()
    # the quantile levels do not depend on the column
    probs = np.linspace(0, 1, n_bins + 1)

    def _digitize(values, cut_points):
        # digitize: 0..(len(qs)-2); missing values are marked with -1
        codes = np.digitize(values, cut_points, right=True).astype(np.int16)
        codes[pd.isna(values)] = -1
        return codes

    for c in num_cols:
        train_vals = train_df[c].values
        observed = train_vals[~pd.isna(train_vals)]
        if observed.size == 0:
            # nothing to estimate quantiles from
            continue
        # quantiles over train (non-missing values only)
        qs = np.unique(np.quantile(observed, probs))
        # too few distinct edges - skip binning for this column
        if len(qs) <= 3:
            continue
        cut_points = qs[1:-1]
        train_df[c + "__bin"] = _digitize(train_vals, cut_points)
        test_df[c + "__bin"] = _digitize(test_df[c].values, cut_points)
    return train_df, test_df

def add_numeric_transforms(train_df, test_df, num_cols):
    """Add log and ratio/product features to copies of both frames.

    * ``<col>__log1p`` for every column in ``num_cols`` (values below zero are
      clipped to zero before ``log1p``), stored as float32.
    * ``stem_hw`` and ``stem_h_div_w`` when train has both "stem-height" and
      "stem-width".
    * ``cap_div_stem_h`` when train has both "cap-diameter" and "stem-height".
    Divisions add 1e-6 to the denominator. Whether an interaction is built is
    decided from the train columns only.
    """
    train_out = train_df.copy()
    test_out = test_df.copy()
    frames = (train_out, test_out)

    # safe log1p (negative inputs are clipped to 0 first)
    for col in num_cols:
        for frame in frames:
            clipped = np.maximum(frame[col].values, 0)
            frame[col + "__log1p"] = np.log1p(clipped).astype(np.float32)

    # typical mushroom interactions (only if the columns exist in train)
    if "stem-height" in train_out.columns and "stem-width" in train_out.columns:
        for frame in frames:
            product = frame["stem-height"] * frame["stem-width"]
            frame["stem_hw"] = product.astype(np.float32)
        for frame in frames:
            ratio = frame["stem-height"] / (frame["stem-width"] + 1e-6)
            frame["stem_h_div_w"] = ratio.astype(np.float32)

    if "cap-diameter" in train_out.columns and "stem-height" in train_out.columns:
        for frame in frames:
            ratio = frame["cap-diameter"] / (frame["stem-height"] + 1e-6)
            frame["cap_div_stem_h"] = ratio.astype(np.float32)

    return train_out, test_out

def add_hashed_crosses(train_df, test_df, cat_cols, max_pairs=8):
    """
    Build "combo" features without creating strings: each pair of columns is
    hashed row-wise into an int64 column named ``<a>__X__<b>``.

    Pairs are enumerated over the (up to) 10 columns of ``cat_cols`` with the
    lowest number of unique values in train, and only the first ``max_pairs``
    pairs are kept. The original note says these columns are meant to be
    declared as categorical to CatBoost.
    Returns modified copies ``(train_df, test_df)``.
    """
    train_out = train_df.copy()
    test_out = test_df.copy()

    # moderately low-cardinality columns, so that the pairs stay meaningful
    cardinality = train_out[cat_cols].nunique()
    lowest = cardinality.sort_values().index.tolist()[:10]
    candidate_pairs = [
        (lowest[i], lowest[j])
        for i in range(len(lowest))
        for j in range(i + 1, len(lowest))
    ]

    for left, right in candidate_pairs[:max_pairs]:
        feature = f"{left}__X__{right}"
        for frame in (train_out, test_out):
            hashed = pd.util.hash_pandas_object(frame[[left, right]], index=False)
            frame[feature] = hashed.astype(np.int64)

    return train_out, test_out

# --- determine the original column types (object -> categorical, everything else -> numeric)
cat_cols = X.select_dtypes(include=["object"]).columns.tolist()
num_cols = X.select_dtypes(exclude=["object"]).columns.tolist()

# 1) ? -> NaN
X = replace_question_marks(X, cat_cols)
X_test = replace_question_marks(X_test, cat_cols)

# 2) numeric: median impute (done right away so the later bins/transforms work);
#    the median is computed on X and reused for X_test
for c in num_cols:
    med = X[c].median()
    X[c] = X[c].fillna(med)
    X_test[c] = X_test[c].fillna(med)

# 3) missing flags + fill cats
X = add_missing_flags_and_fill(X, cat_cols)
X_test = add_missing_flags_and_fill(X_test, cat_cols)

# 4) rare grouping
X, X_test = group_rare_categories(X, X_test, cat_cols, min_count=RARE_MIN_COUNT)

# 5) freq encodings
X, X_test = add_freq_enc(X, X_test, cat_cols)

# 6) numeric bins + transforms + interactions
X, X_test = add_numeric_bins(X, X_test, num_cols, n_bins=N_BINS)
X, X_test = add_numeric_transforms(X, X_test, num_cols)

# 7) hashed crosses (a few of them)
X, X_test = add_hashed_crosses(X, X_test, cat_cols, max_pairs=MAX_HASHED_CROSSES)

# --- list of "categorical" features for CatBoost:
# original cat (string) + bins (int16) + hashed crosses (int64) are treated as categorical
# NOTE(review): the visible training loop uses LightGBM and casts only `cat_cols`
# to category, so `cat_like_cols` / `cat_idx` are not consumed there - confirm
# whether another cell still needs them.
bin_cols = [c for c in X.columns if c.endswith("__bin")]
cross_cols = [c for c in X.columns if "__X__" in c]
cat_like_cols = cat_cols + bin_cols + cross_cols

# positional indices of the categorical features
cat_idx = [X.columns.get_loc(c) for c in cat_like_cols if c in X.columns]

# jitter only on the "real" num_cols (freq/log/bin/cross are left untouched)
NUMERIC_FOR_JITTER = [c for c in num_cols if c in X.columns]



def cast_categoricals_for_lgb(df, cat_cols):
    """Return a new frame with every column in ``cat_cols`` converted to the
    pandas ``category`` dtype; the remaining columns and the input frame are
    left unchanged."""
    return df.astype({col: "category" for col in cat_cols})

def jitter_inplace(df, cols, strength, seed):
    """Add zero-mean Gaussian noise to ``cols`` of ``df`` in place.

    The noise scale for a column is ``strength`` times that column's standard
    deviation. Columns whose standard deviation is zero, NaN or non-finite
    are left untouched (and draw nothing from the generator). The generator
    is seeded with ``seed``. Returns the same ``df`` object.
    """
    generator = np.random.default_rng(seed)
    n_rows = len(df)
    for col in cols:
        spread = df[col].std()
        if not spread or not np.isfinite(spread) or spread <= 0:
            continue
        noise = generator.normal(0, strength * spread, size=n_rows)
        df[col] = df[col].values + noise
    return df

# Out-of-fold probabilities for train and fold-averaged probabilities for test.
oof = np.zeros(len(X), dtype=np.float32)
pred_test = np.zeros(len(X_test), dtype=np.float32)

skf = StratifiedKFold(n_splits=N_SPLITS, shuffle=True, random_state=42)

# The test frame does not depend on the fold, so the categorical cast is done
# once here instead of being repeated (and re-allocated) inside every fold.
X_test_lgb_tmp = cast_categoricals_for_lgb(X_test, cat_cols)

for fold, (tr_idx, val_idx) in enumerate(skf.split(X, y)):
    print(f"FOLD {fold+1}")
    X_tr, X_val = X.iloc[tr_idx], X.iloc[val_idx]
    y_tr, y_val = y.iloc[tr_idx], y.iloc[val_idx]

    # Only the original categorical columns are cast to `category`.
    # NOTE(review): the hashed cross columns stay int64, so LightGBM presumably
    # treats them as plain numeric features - confirm this is intended.
    X_tr_lgb = cast_categoricals_for_lgb(X_tr, cat_cols)
    X_val_lgb = cast_categoricals_for_lgb(X_val, cat_cols)

    # Noise is added to the training part only; validation and test stay clean.
    # The seed varies per fold so each fold gets a different noise draw.
    X_tr_lgb = jitter_inplace(X_tr_lgb, NUMERIC_FOR_JITTER, JITTER_STRENGTH, seed=42+fold)

    model = LGBMClassifier(**lgb_params)
    model.fit(
        X_tr_lgb, y_tr,
    )

    oof[val_idx] = model.predict_proba(X_val_lgb)[:,1]

    # Equal-weight average of the per-fold test probabilities.
    pred_test += model.predict_proba(X_test_lgb_tmp)[:,1] / N_SPLITS

FOLD 1
[LightGBM] [Info] Number of positive: 1159523, number of negative: 959153
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.595347 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10163
[LightGBM] [Info] Number of data points in the train set: 2118676, number of used features: 105
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.547287 -> initscore=0.189713
[LightGBM] [Info] Start training from score 0.189713
FOLD 2
[LightGBM] [Info] Number of positive: 1159523, number of negative: 959153
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.584501 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 10158
[LightGBM] [Info] Number of data points in the train set: 2118676, number of used featur

In [10]:
# Scan candidate decision thresholds over the out-of-fold probabilities and
# keep the one that maximises the Matthews correlation coefficient.
ths = np.linspace(0.01, 0.99, 981)
mccs = list(
    matthews_corrcoef(y, (oof >= cut).astype(np.int8))
    for cut in ths
)

# np.argmax returns the first maximum, i.e. the lowest of any tied thresholds
best_mcc = float(np.max(mccs))
best_t = float(ths[np.argmax(mccs)])

print(f"VAL MCC = {best_mcc:.6f} at threshold t = {best_t:.4f}")

VAL MCC = 0.984815 at threshold t = 0.4980


In [12]:
# Turn the averaged test probabilities into labels with the OOF-tuned
# threshold ("p" at or above it, "e" below) and write the submission file.
pred_labels = np.where(pred_test >= best_t, "p", "e")
sub = pd.DataFrame({"id": test_ids, "class": pred_labels})
sub.to_csv("submission.csv", index=False)
print(sub.head())

        id class
0  3116945     e
1  3116946     p
2  3116947     p
3  3116948     p
4  3116949     e


In [14]:
import json
import os

# SECURITY: the Kaggle API credentials are no longer embedded in the notebook.
# They are read from the environment (KAGGLE_USERNAME / KAGGLE_KEY), which can
# be populated from a secrets store. A key that was ever committed or shared
# inside a notebook must be treated as leaked and regenerated in the Kaggle
# account settings.
kaggle_dir = os.path.expanduser("~/.kaggle")
os.makedirs(kaggle_dir, exist_ok=True)

missing_vars = [
    name for name in ("KAGGLE_USERNAME", "KAGGLE_KEY") if not os.environ.get(name)
]
if missing_vars:
    raise RuntimeError(
        "Kaggle credentials are not configured; set the environment variable(s): "
        + ", ".join(missing_vars)
    )

kaggle_json = {
    "username": os.environ["KAGGLE_USERNAME"],
    "key": os.environ["KAGGLE_KEY"],
}

# Save. The file is created with owner-only permissions from the start, so the
# secret is never readable by others between the write and a later chmod.
kaggle_json_path = os.path.join(kaggle_dir, "kaggle.json")
fd = os.open(kaggle_json_path, os.O_WRONLY | os.O_CREAT | os.O_TRUNC, 0o600)
with os.fdopen(fd, "w") as f:
    json.dump(kaggle_json, f)

# Set access permissions (the mode passed to os.open only applies to a newly
# created file, so tighten a pre-existing one explicitly).
os.chmod(kaggle_json_path, 0o600)

In [15]:
# Upload submission.csv to the playground-series-s4e8 competition through the
# Kaggle CLI; the text after -m is the submission message.
!kaggle competitions submit -c playground-series-s4e8 -f submission.csv -m "lgbm kfold"

100%|██████████████████████████████████████| 19.8M/19.8M [00:02<00:00, 7.71MB/s]
Successfully submitted to Binary Prediction of Poisonous Mushrooms

In [2]:
import kagglehub

# Download the latest version of the secondary mushroom dataset via kagglehub.
# NOTE(review): `path` is not referenced by the other visible cells, which read
# the CSV from a fixed /kaggle/input/... location instead - confirm the two agree.
path = kagglehub.dataset_download("shrutisaxena/secondary-mushroom-dataset-data-set")

In [5]:
import pandas as pd

# Secondary mushroom dataset; this file is semicolon-separated.
train_secondary = pd.read_csv(
    "/kaggle/input/secondary-mushroom-dataset-data-set/MushroomDataset/secondary_data.csv",
    sep=";"
)
# Competition training data, with the `id` column used as the index.
train_competition = pd.read_csv(
    "/kaggle/input/playground-series-s4e8/train.csv",
    index_col="id"
)

In [6]:
# Preview of the competition training frame (rich display of the last expression).
train_competition

Unnamed: 0_level_0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,e,8.80,f,s,u,f,a,c,w,4.51,...,,,w,,,f,f,,d,a
1,p,4.51,x,h,o,f,a,c,n,4.79,...,,y,o,,,t,z,,d,w
2,e,6.94,f,s,b,f,x,c,w,6.85,...,,s,n,,,f,f,,l,w
3,e,3.88,f,y,g,f,s,,g,4.16,...,,,w,,,f,f,,d,u
4,e,5.85,x,l,w,f,d,,w,3.37,...,,,w,,,f,f,,g,a
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3116940,e,9.29,f,,n,t,,,w,12.14,...,b,,w,u,w,t,g,,d,u
3116941,e,10.88,s,,w,t,d,c,p,6.65,...,,,w,,,f,f,,d,u
3116942,p,7.82,x,e,e,f,a,,w,9.51,...,,,y,,w,t,z,,d,a
3116943,e,9.45,p,i,n,t,e,,p,9.13,...,,y,w,,,t,p,,d,u


In [7]:
# Preview of the secondary dataset frame (rich display of the last expression).
train_secondary

Unnamed: 0,class,cap-diameter,cap-shape,cap-surface,cap-color,does-bruise-or-bleed,gill-attachment,gill-spacing,gill-color,stem-height,...,stem-root,stem-surface,stem-color,veil-type,veil-color,has-ring,ring-type,spore-print-color,habitat,season
0,p,15.26,x,g,o,f,e,,w,16.95,...,s,y,w,u,w,t,g,,d,w
1,p,16.60,x,g,o,f,e,,w,17.99,...,s,y,w,u,w,t,g,,d,u
2,p,14.07,x,g,o,f,e,,w,17.80,...,s,y,w,u,w,t,g,,d,w
3,p,14.17,f,h,e,f,e,,w,15.77,...,s,y,w,u,w,t,p,,d,w
4,p,14.64,x,h,o,f,e,,w,16.53,...,s,y,w,u,w,t,p,,d,w
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61064,p,1.18,s,s,y,f,f,f,f,3.93,...,,,y,,,f,f,,d,a
61065,p,1.27,f,s,y,f,f,f,f,3.18,...,,,y,,,f,f,,d,a
61066,p,1.27,s,s,y,f,f,f,f,3.86,...,,,y,,,f,f,,d,u
61067,p,1.24,f,s,y,f,f,f,f,3.56,...,,,y,,,f,f,,d,u


In [6]:
# Stack the competition rows (with `id` restored as a regular column) on top of
# the secondary dataset, then shuffle reproducibly and renumber the index.
# NOTE(review): the secondary frame presumably has no `id` column, so those rows
# would get NaN ids after the concat - harmless only if `id` is dropped later.
train_merged = (
    pd.concat([train_competition.reset_index(), train_secondary], ignore_index=True)
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)