In [None]:
!pip install lightautoml

In [None]:
# =======================================================
# 1. Imports
# =======================================================

import os
import gc
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

import torch
import torch.nn as nn
from PIL import Image
import timm
import torchvision.transforms as T

from lightautoml.tasks import Task
from lightautoml.automl.presets.tabular_presets import TabularAutoML

import warnings
warnings.filterwarnings("ignore")

In [None]:
# =======================================================
# 2. Paths and tabular data loading
# =======================================================

# Image folders of the competition dataset
img_train_dir = '/kaggle/input/petfinder-pawpularity-score/train'
img_test_dir = '/kaggle/input/petfinder-pawpularity-score/test'

# Tabular metadata files
train_file = '/kaggle/input/petfinder-pawpularity-score/train.csv'
test_file = '/kaggle/input/petfinder-pawpularity-score/test.csv'

# Read both CSV files in one pass (train first, then test)
train, test = map(pd.read_csv, (train_file, test_file))

print(train.head())
print('train shape:', train.shape, 'test shape:', test.shape)

In [None]:
# =======================================================
# 3. Model used to extract image embeddings
# =======================================================

device = 'cuda' if torch.cuda.is_available() else 'cpu'
print('Using device:', device)

# num_classes=0 removes the classifier head, so the forward pass returns
# pooled features (the embedding) instead of class logits.
model_name = 'eva02_base_patch14_448.mim_in22k_ft_in22k_in1k'
model = timm.create_model(model_name, pretrained=True, num_classes=0, global_pool='avg')
model.to(device)
model.eval()

# Take the preprocessing parameters from the checkpoint's own pretrained
# config instead of hard-coding ImageNet statistics: the mean/std a model was
# trained with need not equal the ImageNet defaults, and feeding differently
# normalised inputs degrades the embeddings.
data_config = timm.data.resolve_model_data_config(model)
img_size = data_config['input_size'][-1]  # input_size is (channels, height, width)
transform = T.Compose([
    T.Resize((img_size, img_size)),
    T.ToTensor(),
    T.Normalize(
        mean=data_config['mean'],
        std=data_config['std']
    ),
])

In [None]:
# =======================================================
# 4. Функция для генерации эмбеддингов для DataFrame
# =======================================================

def get_image_embeddings(df, img_dir, batch_size=32):
    """
    Compute an image embedding for every row of ``df``.

    Uses the notebook-level globals ``model``, ``transform`` and ``device``.

    Args:
        df: DataFrame with an 'Id' column; each Id names the file
            ``<img_dir>/<Id>.jpg``.
        img_dir: directory that contains the .jpg images.
        batch_size: number of images sent through the model per forward pass.

    Returns:
        DataFrame with the column 'Id' followed by img_emb_0..img_emb_(D-1),
        one row per row of ``df`` in the same order. For an empty ``df`` only
        the 'Id' column is returned, because the embedding width D is not
        known without running the model.
    """
    ids = df['Id'].values
    n = len(ids)

    # Nothing to embed: np.vstack would raise on an empty list, so return early.
    if n == 0:
        return pd.DataFrame({'Id': ids})

    all_embeddings = []

    # Process the images in batches rather than one forward pass per image
    for i in tqdm(range(0, n, batch_size), desc=f"Image embeddings from {os.path.basename(img_dir)}"):
        batch_ids = ids[i:i+batch_size]
        batch_imgs = []

        for img_id in batch_ids:
            img_path = os.path.join(img_dir, f'{img_id}.jpg')
            # The context manager closes the underlying file as soon as the
            # pixels have been converted, so no file handles pile up.
            with Image.open(img_path) as img:
                batch_imgs.append(transform(img.convert('RGB')))

        batch_tensor = torch.stack(batch_imgs).to(device)

        with torch.no_grad():
            emb = model(batch_tensor)  # shape: (batch, D)

        # Keep only the CPU/NumPy copy; the per-batch tensors are released
        # when their names are rebound on the next iteration.
        all_embeddings.append(emb.cpu().numpy())

    all_embeddings = np.vstack(all_embeddings)  # (N, D)
    emb_dim = all_embeddings.shape[1]

    # One column per embedding dimension
    emb_cols = [f'img_emb_{j}' for j in range(emb_dim)]

    emb_df = pd.DataFrame(all_embeddings, columns=emb_cols)
    emb_df.insert(0, 'Id', ids)

    return emb_df

In [None]:
# =======================================================
# 5. Compute embeddings for the train and test images
# =======================================================

embedding_batch_size = 32

# Evaluated lazily in order: train images first, then test images
train_emb, test_emb = (
    get_image_embeddings(frame, folder, batch_size=embedding_batch_size)
    for frame, folder in ((train, img_train_dir), (test, img_test_dir))
)

print('train_emb shape:', train_emb.shape)
print('test_emb shape:', test_emb.shape)

In [None]:
#test_full.to_csv('test_full_petfinder.csv', index=False)

In [None]:
# =======================================================
# 6. Join the image embeddings onto the tabular data
# =======================================================

# Inner join on 'Id': only rows present in both frames are kept
train_full = pd.merge(train, train_emb, how='inner', on='Id')
test_full = pd.merge(test, test_emb, how='inner', on='Id')

print('train_full shape:', train_full.shape)
print('test_full shape:', test_full.shape)
train_full.head()

In [None]:
# Independent snapshot of the merged training frame (DataFrame.copy is deep by default)
tr_f = train_full.copy()

In [None]:
# Local hold-out split: rows whose Id is listed in the sample file form the
# test set, all remaining rows form the training set.
# errors='ignore' keeps the cell working if the CSV was saved without the
# stray index column.
sample = pd.read_csv('/kaggle/input/smeshnoi-dataset/sample_submission_new_test.csv').drop(columns=['Unnamed: 0'], errors='ignore')

test_ids = set(sample['Id'])

# Split from the untouched snapshot `tr_f` rather than from `train_full`
# itself: `train_full` is overwritten below, so splitting it in place would
# produce an empty hold-out set the second time this cell is run.
is_holdout = tr_f['Id'].isin(test_ids)

# test - every row whose Id is in sample
test_full = tr_f[is_holdout].reset_index(drop=True)

# train - everything else
train_full = tr_f[~is_holdout].reset_index(drop=True)

In [None]:
# =======================================================
# 7. Task and AutoML configuration
# =======================================================

TARGET = 'Pawpularity'
task = Task('reg', metric='mse')

roles = {
    'target': TARGET,
    'drop': ['Id'],  # Id is an identifier, not a feature
}

RANDOM_STATE = 42

# os.cpu_count() may return None when the count cannot be determined;
# fall back to a single core instead of failing inside min().
cpu_limit = min(os.cpu_count() or 1, 4)

automl = TabularAutoML(
    task=task,
    timeout=10 * 60,      # seconds; raise it (e.g. to 1800) for a longer search
    cpu_limit=cpu_limit,
    reader_params={'random_state': RANDOM_STATE},  # make the seed explicit
)

# =======================================================
# 8. Training
# =======================================================

oof_pred = automl.fit_predict(train_full, roles=roles, verbose=1)

from sklearn.metrics import mean_squared_error

# Take the square root explicitly: the `squared=False` argument is deprecated
# and absent from newer scikit-learn releases, where it raises a TypeError.
rmse = np.sqrt(
    mean_squared_error(
        train_full[TARGET].values,
        oof_pred.data[:, 0]
    )
)

print(f"Train OOF RMSE (tabular + image embeddings): {rmse:.4f}")

In [None]:
# =======================================================
# 9. Predict on the hold-out set and score it
#    (this cell does not write a submission file)
# =======================================================

test_pred = automl.predict(test_full)
test_pawpularity = test_pred.data[:, 0]

# RMSE = square root of the mean squared residual (true target minus prediction)
residual = test_full['Pawpularity'] - test_pawpularity
rmse = np.sqrt(np.mean(residual ** 2))
print(f"RMSE на тестовой выборке: {rmse:.4f}")