In [1]:
import os

import numpy as np
import pandas as pd
from PIL import Image

import torch
import torch.nn as nn
from torchvision import models, transforms

from tqdm import tqdm

from catboost import CatBoostRegressor, CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    mean_squared_error,
    mean_absolute_error,
    accuracy_score,
    f1_score
)


2. Конфиг (пути, колонки, тип задачи)

In [2]:
# Путь к train/test таблицам
train_path = "train.csv"
test_path = "test.csv"

# Название колонки с ID картинки
IMAGE_ID_COL = "Id"     # поменяй, если у тебя другой столбец

# Название колонки с таргетом
TARGET_COL = "Pawpularity"         # сюда поставь имя своей целевой переменной

# Тип задачи: "regression" или "classification"
TASK_TYPE = "regression"      # если классификация, поставь "classification"

# Папки с картинками
train_images_dir = "train"
test_images_dir = "test"

# Расширение картинок
# Если в таблице image_id уже содержит ".jpg", поставь IMAGE_EXT = ""
IMAGE_EXT = ".jpg"

# Имя файла сабмита
submission_path = "submission.csv"


3. Читаем данные

In [3]:
train = pd.read_csv(train_path)
test = pd.read_csv(test_path)

print("Train shape:", train.shape)
print("Test shape:", test.shape)
print("Train columns:", train.columns.tolist())
print("Test columns:", test.columns.tolist())
# Если в IMAGE_ID_COL уже лежит something.jpg, можно сделать IMAGE_EXT = "" и не дописывать расширение.

Train shape: (9912, 14)
Test shape: (8, 13)
Train columns: ['Id', 'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory', 'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur', 'Pawpularity']
Test columns: ['Id', 'Subject Focus', 'Eyes', 'Face', 'Near', 'Action', 'Accessory', 'Group', 'Collage', 'Human', 'Occlusion', 'Info', 'Blur']


4. Настройка устройства (GPU)

In [4]:
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

print("Using device:", device)


Using device: cuda


**ФОТКИ**


5. Загружаем ResNet и режем классификатор

In [5]:
# Загружаем предобученную ResNet50
resnet = models.resnet50(weights=models.ResNet50_Weights.DEFAULT)

# Обрезаем последний полносвязный слой, чтобы модель возвращала эмбеддинг (2048 чисел)
resnet.fc = nn.Identity()

# Переносим модель на GPU/CPU
resnet.to(device)

# Включаем eval-режим (инференс, без обучения)
resnet.eval()


ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): Bottleneck(
      (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (conv3): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
      (bn3): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (downsample): Sequential(
        (0): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 

6. Трансформации для картинок

In [6]:
image_transform = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225]
    )
])


7. Функция: получить L2-нормализованный эмбеддинг

In [7]:
def get_normalized_embedding(image_path):
    """
    1. Загружает картинку по пути image_path.
    2. Применяет стандартные трансформации под ResNet.
    3. Прогоняет через ResNet на GPU/CPU.
    4. Возвращает L2-нормализованный вектор (numpy, shape = (2048,)).
    """
    # Загружаем изображение и приводим к RGB
    image = Image.open(image_path).convert("RGB")
    
    # Применяем трансформации
    image = image_transform(image)
    
    # Добавляем batch dimension: [C, H, W] -> [1, C, H, W]
    image = image.unsqueeze(0)
    
    # Переносим на устройство
    image = image.to(device)
    
    # Прогоняем через модель
    with torch.no_grad():
        emb_tensor = resnet(image)   # shape: [1, 2048]
    
    # Переводим в numpy и убираем размер батча
    emb = emb_tensor.cpu().numpy().reshape(-1)  # shape: (2048,)
    
    # L2-нормализация
    norm = np.linalg.norm(emb)
    if norm > 0:
        emb = emb / norm
    
    return emb


8. Извлекаем эмбеддинги для train

In [8]:
train_embeddings = []

for image_id in tqdm(train[IMAGE_ID_COL], desc="Train embeddings"):
    # Если в image_id уже есть ".jpg", используем IMAGE_EXT = ""
    img_name = str(image_id) + IMAGE_EXT
    img_path = os.path.join(train_images_dir, img_name)
    
    emb = get_normalized_embedding(img_path)
    train_embeddings.append(emb)

train_embeddings = np.array(train_embeddings)

print("Train embeddings shape:", train_embeddings.shape)


Train embeddings: 100%|██████████| 9912/9912 [02:19<00:00, 71.06it/s]


Train embeddings shape: (9912, 2048)


9. Извлекаем эмбеддинги для test

In [9]:
test_embeddings = []

for image_id in tqdm(test[IMAGE_ID_COL], desc="Test embeddings"):
    img_name = str(image_id) + IMAGE_EXT
    img_path = os.path.join(test_images_dir, img_name)
    
    emb = get_normalized_embedding(img_path)
    test_embeddings.append(emb)

test_embeddings = np.array(test_embeddings)

print("Test embeddings shape:", test_embeddings.shape)


Test embeddings: 100%|██████████| 8/8 [00:00<00:00, 113.99it/s]

Test embeddings shape: (8, 2048)





10. Добавляем эмбеддинги в таблицы

In [10]:
emb_dim = train_embeddings.shape[1]
emb_cols = [f"f_{i}" for i in range(emb_dim)]

train_emb_df = pd.DataFrame(train_embeddings, columns=emb_cols)
test_emb_df = pd.DataFrame(test_embeddings, columns=emb_cols)

train_full = pd.concat([train.reset_index(drop=True), train_emb_df], axis=1)
test_full = pd.concat([test.reset_index(drop=True), test_emb_df], axis=1)

print("Train full shape:", train_full.shape)
print("Test full shape:", test_full.shape)


Train full shape: (9912, 2062)
Test full shape: (8, 2061)


11. (Опционально) фича косинусной похожести до среднего эмбеддинга

In [11]:
# Считаем средний эмбеддинг по train
mean_emb = train_embeddings.mean(axis=0)

# L2-нормализуем центр
mean_norm = np.linalg.norm(mean_emb)
if mean_norm > 0:
    mean_emb = mean_emb / mean_norm

def cosine_similarity(a, b):
    """
    Косинусная похожесть между двумя векторами a и b.
    a и b могут быть уже нормализованы, но мы перестрахуемся.
    """
    denom = (np.linalg.norm(a) * np.linalg.norm(b))
    if denom == 0:
        return 0.0
    value = np.dot(a, b) / denom
    return value

# Добавляем фичу для train
train_cos_sim = []
for emb in train_embeddings:
    sim = cosine_similarity(emb, mean_emb)
    train_cos_sim.append(sim)

train_full["cos_sim_to_mean"] = train_cos_sim

# Добавляем фичу для test
test_cos_sim = []
for emb in test_embeddings:
    sim = cosine_similarity(emb, mean_emb)
    test_cos_sim.append(sim)

test_full["cos_sim_to_mean"] = test_cos_sim

print("Added feature cos_sim_to_mean")


Added feature cos_sim_to_mean


12. Явно задаём списки фич по типам

In [None]:
train

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,28
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,15
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9907,ffbfa0383c34dc513c95560d6e1fdb57,0,0,0,1,0,0,0,0,0,0,0,1,15
9908,ffcc8532d76436fc79e50eb2e5238e45,0,1,1,1,0,0,0,0,0,0,0,0,70
9909,ffdf2e8673a1da6fb80342fa3b119a20,0,1,1,1,0,0,0,0,1,1,0,0,20
9910,fff19e2ce11718548fa1c5d039a5192a,0,1,1,1,0,0,0,0,1,0,0,0,20


In [12]:
#Тут важное место: как раз про категориальные фичи — их оставляем строками и отмечаем индексы колонок, а не значения.
# Эмбеддинги (все колонки, начинающиеся с "f_")
embedding_features = []
for col in train_full.columns:
    if col.startswith("f_"):
        embedding_features.append(col)

# Числовые фичи (пример — добавь свои)
numeric_features = [
    # "price",
    # "age",
]

# Бинарные фичи (0/1)
binary_features = [
    # "is_new",
    # "has_discount",
    'Subject Focus',
    'Eyes',
    'Face',
    'Near',
    'Action',
    'Accessory',
    'Group',
    'Collage',
    'Human',
    'Occlusion',
    'Info',
    'Blur'
]

# Фичи расстояний (если добавил cos_sim_to_mean — он числовой)
distance_features = [
    "cos_sim_to_mean"  # убери, если не использовал блок с косинусом
]

# Категориальные фичи (ОБЯЗАТЕЛЬНО ОСТАВИТЬ ИХ СТРОКАМИ)
categorical_features = [
    # "color",
    # "store_type",
]

# Собираем общий список фич в правильном порядке
feature_cols = []
feature_cols.extend(embedding_features)
feature_cols.extend(numeric_features)
feature_cols.extend(binary_features)
feature_cols.extend(distance_features)
feature_cols.extend(categorical_features)

print("Всего фич:", len(feature_cols))
print("Первые 10 фич:", feature_cols[:10])


Всего фич: 2061
Первые 10 фич: ['f_0', 'f_1', 'f_2', 'f_3', 'f_4', 'f_5', 'f_6', 'f_7', 'f_8', 'f_9']


13. Индексы категориальных фичей (именно колонок, не значений!)

In [13]:
cat_feature_indices = []

for cat_col in categorical_features:
    if cat_col in feature_cols:
        idx = feature_cols.index(cat_col)
        cat_feature_indices.append(idx)

print("Категориальные столбцы:", categorical_features)
print("Индексы категориальных столбцов:", cat_feature_indices)
#Здесь мы как раз делаем то, о чём говорили: передаём CatBoost индексы колонок, а сами значения оставляем строками.

Категориальные столбцы: []
Индексы категориальных столбцов: []


14. Готовим X, y, train/valid split

In [14]:
X = train_full[feature_cols]
y = train_full[TARGET_COL]

X_test = test_full[feature_cols]


если запускаешь блок с оптуной, то после вот этого(верхнего) шага - стоп

In [None]:
# Для классификации лучше использовать stratify=y (но здесь пока простой split):
if TASK_TYPE == "classification":
    stratify_param = y
else:
    stratify_param = None

X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    shuffle=True,
    stratify=stratify_param
)

print("X_train shape:", X_train.shape)
print("X_valid shape:", X_valid.shape)


X_train shape: (7929, 2061)
X_valid shape: (1983, 2061)


15. Создаём CatBoost Pool (GPU)

In [None]:
train_pool = Pool(
    data=X_train,
    label=y_train,
    cat_features=cat_feature_indices
)

valid_pool = Pool(
    data=X_valid,
    label=y_valid,
    cat_features=cat_feature_indices
)

test_pool = Pool(
    data=X_test,
    cat_features=cat_feature_indices
)


16. Обучаем CatBoost на GPU

Вариант A: регрессия

In [None]:
if TASK_TYPE == "regression":
    model = CatBoostRegressor(
        iterations=1500,
        learning_rate=0.03,
        depth=6,
        loss_function="RMSE",
        eval_metric="RMSE",
        task_type="GPU",   # использование GPU
        devices="0",
        verbose=100
    )
    
    model.fit(train_pool, eval_set=valid_pool)
    
    # Оценка
    y_pred_valid = model.predict(valid_pool)
    
    rmse = mean_squared_error(y_valid, y_pred_valid, squared=False)
    mae = mean_absolute_error(y_valid, y_pred_valid)
    
    print(f"Validation RMSE: {rmse:.4f}")
    print(f"Validation MAE:  {mae:.4f}")


0:	learn: 20.3992422	test: 20.9437192	best: 20.9437192 (0)	total: 153ms	remaining: 3m 48s
100:	learn: 17.6270768	test: 18.6580389	best: 18.6580389 (100)	total: 13.5s	remaining: 3m 6s
200:	learn: 16.7546571	test: 18.4063336	best: 18.4063336 (200)	total: 26.1s	remaining: 2m 48s
300:	learn: 16.1064923	test: 18.3234788	best: 18.3199075 (298)	total: 38.8s	remaining: 2m 34s
400:	learn: 15.4959616	test: 18.2638497	best: 18.2638497 (400)	total: 51.4s	remaining: 2m 20s
500:	learn: 14.9246098	test: 18.2334244	best: 18.2313189 (487)	total: 1m 4s	remaining: 2m 7s
600:	learn: 14.3902424	test: 18.2155794	best: 18.2144278 (565)	total: 1m 16s	remaining: 1m 53s
700:	learn: 13.9271420	test: 18.1965719	best: 18.1923867 (680)	total: 1m 28s	remaining: 1m 41s
800:	learn: 13.4811928	test: 18.1826251	best: 18.1805761 (765)	total: 1m 40s	remaining: 1m 27s
900:	learn: 13.0723778	test: 18.1747866	best: 18.1636985 (852)	total: 1m 52s	remaining: 1m 14s
1000:	learn: 12.6910641	test: 18.1604508	best: 18.1599779 (999



Вариант B: классификация

In [None]:
# if TASK_TYPE == "classification":
#     model = CatBoostClassifier(
#         iterations=500,
#         learning_rate=0.05,
#         depth=6,
#         loss_function="Logloss",
#         eval_metric="AUC",
#         task_type="GPU",   # использование GPU
#         devices="0",
#         verbose=100
#     )
    
#     model.fit(train_pool, eval_set=valid_pool)
    
#     # Предикт на валидации
#     y_pred_proba_valid = model.predict_proba(valid_pool)[:, 1]  # для бинарной
#     y_pred_label_valid = (y_pred_proba_valid > 0.5).astype(int)
    
#     acc = accuracy_score(y_valid, y_pred_label_valid)
#     f1 = f1_score(y_valid, y_pred_label_valid, average="macro")
    
#     print(f"Validation ACC: {acc:.4f}")
#     print(f"Validation F1-macro: {f1:.4f}")


17. Предсказания на test

In [None]:
if TASK_TYPE == "regression":
    test_pred = model.predict(test_pool)

elif TASK_TYPE == "classification":
    # Для сабмита обычно полезны либо вероятности, либо классы.
    # Здесь возьмём вероятности класса 1.
    test_pred = model.predict_proba(test_pool)[:, 1]

print("Test predictions shape:", test_pred.shape)


Test predictions shape: (8,)


18. Сабмит id, prediction

In [None]:
submission = pd.DataFrame()
submission["Id"] = test[IMAGE_ID_COL]      # колонка id
submission["Pawpularity"] = test_pred       # колонка с предсказаниями

print(submission.head())

submission.to_csv(submission_path, index=False)
print("Saved submission to:", submission_path)


                                 Id  Pawpularity
0  4128bae22183829d2b5fea10effdb0c3    47.770455
1  43a2262d7738e3d420d453815151079e    46.018941
2  4e429cead1848a298432a0acad014c9d    50.059173
3  80bc3ccafcc51b66303c2c263aa38486    49.549322
4  8f49844c382931444e68dffbe20228f4    48.551166
Saved submission to: submission.csv


----------------------------------------------------------------------------------------------------------------------------------------------------------------

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split

train = pd.read_csv('train.csv')
sample = pd.read_csv('submission.csv')

train_df, test_df = train_test_split(train, test_size=0.25, random_state=42, shuffle=True)

print(train_df.shape, test_df.shape)


# тут оценка метрики подставьте ваши предсказания в sample
rmse = np.sqrt(np.mean((test_df['Pawpularity'] - sample['Pawpularity']) ** 2))
print(f"RMSE на тестовой выборке: {rmse:.4f}")

(7434, 14) (2478, 14)
RMSE на тестовой выборке: 26.6983


In [None]:
train

Unnamed: 0,Id,Subject Focus,Eyes,Face,Near,Action,Accessory,Group,Collage,Human,Occlusion,Info,Blur,Pawpularity
0,0007de18844b0dbbb5e1f607da0606e0,0,1,1,1,0,0,1,0,0,0,0,0,63
1,0009c66b9439883ba2750fb825e1d7db,0,1,1,0,0,0,0,0,0,0,0,0,42
2,0013fd999caf9a3efe1352ca1b0d937e,0,1,1,1,0,0,0,0,1,1,0,0,28
3,0018df346ac9c1d8413cfcc888ca8246,0,1,1,1,0,0,0,0,0,0,0,0,15
4,001dc955e10590d3ca4673f034feeef2,0,0,0,1,0,0,1,0,0,0,0,0,72
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9907,ffbfa0383c34dc513c95560d6e1fdb57,0,0,0,1,0,0,0,0,0,0,0,1,15
9908,ffcc8532d76436fc79e50eb2e5238e45,0,1,1,1,0,0,0,0,0,0,0,0,70
9909,ffdf2e8673a1da6fb80342fa3b119a20,0,1,1,1,0,0,0,0,1,1,0,0,20
9910,fff19e2ce11718548fa1c5d039a5192a,0,1,1,1,0,0,0,0,1,0,0,0,20


In [None]:
sample

Unnamed: 0,Id,Pawpularity
0,4128bae22183829d2b5fea10effdb0c3,47.770455
1,43a2262d7738e3d420d453815151079e,46.018941
2,4e429cead1848a298432a0acad014c9d,50.059173
3,80bc3ccafcc51b66303c2c263aa38486,49.549322
4,8f49844c382931444e68dffbe20228f4,48.551166
5,b03f7041962238a7c9d6537e22f9b017,50.513183
6,c978013571258ed6d4637f6e8cc9d6a3,49.052743
7,e0de453c1bffc20c22b072b34b54e50f,50.560605


**Блок: Optuna + KFold/StratifiedKFold + сабмит**

2.1. Подвыборка + сплит для Optuna

In [15]:
from sklearn.model_selection import train_test_split
from catboost import CatBoostRegressor, Pool
from sklearn.metrics import mean_squared_error
import optuna
import gc
import numpy as np

# Берём подвыборку для Optuna (чтобы быстрее и безопаснее)
X_sample, _, y_sample, _ = train_test_split(
    X,
    y,
    train_size=3000,      # 3k строк вполне достаточно для подбора
    random_state=42,
    shuffle=True
)

X_train_opt, X_valid_opt, y_train_opt, y_valid_opt = train_test_split(
    X_sample,
    y_sample,
    test_size=0.2,
    random_state=42,
    shuffle=True
)

train_pool_opt = Pool(
    data=X_train_opt,
    label=y_train_opt,
    cat_features=cat_feature_indices
)

valid_pool_opt = Pool(
    data=X_valid_opt,
    label=y_valid_opt,
    cat_features=cat_feature_indices
)

print("Optuna train shape:", X_train_opt.shape)
print("Optuna valid shape:", X_valid_opt.shape)

optuna.logging.set_verbosity(optuna.logging.INFO)


Optuna train shape: (2400, 2061)
Optuna valid shape: (600, 2061)


2.2. Objective на CPU + прогресс

In [18]:
def objective(trial):
    depth = trial.suggest_int("depth", 4, 9)
    learning_rate = trial.suggest_float("learning_rate", 0.01, 0.1, log=True)
    l2_leaf_reg = trial.suggest_float("l2_leaf_reg", 1e-3, 10.0, log=True)
    random_strength = trial.suggest_float("random_strength", 0.1, 2.0)
    bagging_temperature = trial.suggest_float("bagging_temperature", 0.0, 1.0)

    model = CatBoostRegressor(
        iterations=400,               # на CPU + маленькая выборка это нормально
        depth=depth,
        learning_rate=learning_rate,
        l2_leaf_reg=l2_leaf_reg,
        random_strength=random_strength,
        bagging_temperature=bagging_temperature,
        loss_function="RMSE",
        eval_metric="RMSE",
        task_type="CPU",             # <<< ВАЖНО: CPU, НЕ GPU
        verbose=100,
        od_type="Iter",
        od_wait=40
    )

    model.fit(train_pool_opt, eval_set=valid_pool_opt)

    y_pred_valid = model.predict(valid_pool_opt)
    rmse = mean_squared_error(y_valid_opt, y_pred_valid, squared=False)

    print(f"[Trial {trial.number}] RMSE = {rmse:.4f}")

    del model
    gc.collect()

    return rmse


2.3. Запускаем Optuna (ограничение по времени и количеству трейлов)

In [None]:
study = optuna.create_study(direction="minimize")

study.optimize(
    objective,
    n_trials=15,        # начни с 10–15, потом можно поднять
    timeout=1200        # максимум ~20 минут
)

print("Лучшее значение RMSE:", study.best_value)
print("Лучшие параметры:", study.best_trial.params)

best_params = study.best_trial.params


3. K-Fold уже на GPU с лучшими параметрами
Теперь, когда best_params есть, делаем K-Fold по всему X, y.
ЭТОТ блок ставим после Optuna:

In [None]:
from sklearn.model_selection import KFold

n_splits = 5
kf = KFold(n_splits=n_splits, shuffle=True, random_state=42)

oof_preds = np.zeros(len(train_full))
test_preds = np.zeros(len(test_full))

fold = 0

for train_idx, valid_idx in kf.split(X, y):
    fold += 1
    print(f"\n===== Fold {fold}/{n_splits} =====")

    X_tr = X.iloc[train_idx]
    y_tr = y.iloc[train_idx]
    X_val = X.iloc[valid_idx]
    y_val = y.iloc[valid_idx]

    train_pool_fold = Pool(
        data=X_tr,
        label=y_tr,
        cat_features=cat_feature_indices
    )

    valid_pool_fold = Pool(
        data=X_val,
        label=y_val,
        cat_features=cat_feature_indices
    )

    model_fold = CatBoostRegressor(
        iterations=1000,                             # побольше, финальная модель
        depth=best_params["depth"],
        learning_rate=best_params["learning_rate"],
        l2_leaf_reg=best_params["l2_leaf_reg"],
        random_strength=best_params["random_strength"],
        bagging_temperature=best_params["bagging_temperature"],
        loss_function="RMSE",
        eval_metric="RMSE",
        task_type="GPU",                             # <<< здесь уже GPU
        devices="0",
        verbose=100,
        od_type="Iter",
        od_wait=60
    )

    model_fold.fit(train_pool_fold, eval_set=valid_pool_fold)

    oof_fold = model_fold.predict(valid_pool_fold)
    oof_preds[valid_idx] = oof_fold

    test_pool = Pool(X_test, cat_features=cat_feature_indices)
    test_fold = model_fold.predict(test_pool)
    test_preds += test_fold / n_splits

    del model_fold
    gc.collect()


OOF RMSE + сабмит

In [None]:
from sklearn.metrics import mean_squared_error

rmse_oof = mean_squared_error(y, oof_preds, squared=False)
print(f"\nOOF RMSE (по {n_splits} фолдам): {rmse_oof:.4f}")

submission = pd.DataFrame()
submission["Id"] = test_full["Id"]      # проверь имя колонки
submission["Pawpularity"] = test_preds

submission.to_csv("submission_optuna_cpu_kfold_gpu.csv", index=False)
print("Saved submission_optuna_cpu_kfold_gpu.csv")
