In [2]:
def custom_epsilon_loss_grad_hess(y_raw, tr_data=None, y_true=None, alpha=1.2, beta=0.9, epsilon=1, p=2):
    """
    Улучшенная версия epsilon-insensitive asymmetric loss с диагностикой.
    """
    # Получаем истинные значения
    if y_true is None:
        y_true = tr_data.get_label()
    
    # Проверка размерностей
    assert len(y_raw) == len(y_true), f"Размеры не совпадают: y_raw {len(y_raw)}, y_true {len(y_true)}"
    
    residuals = y_raw - y_true
    
    # Диагностика: выводим статистику по остаткам
    #print("\nДиагностика лосса:")
    #print(f"Сырые предсказания: min={np.min(y_raw):.2f}, max={np.max(y_raw):.2f}")
    #print(f"Истинные значения: min={np.min(y_true):.2f}, max={np.max(y_true):.2f}")
    #print(f"Остатки: min={np.min(residuals):.2f}, max={np.max(residuals):.2f}")
    
    # Инициализация градиента и гессиана
    grad = np.zeros_like(residuals)
    hess = np.zeros_like(residuals)
    
    # Маски для разных случаев
    within_epsilon = np.abs(residuals) <= epsilon
    overestimation = residuals > epsilon
    underestimation = residuals < -epsilon
    
    #print(f"Точек в epsilon: {np.sum(within_epsilon)}")
    #print(f"Переоценок: {np.sum(overestimation)}")
    #print(f"Недооценок: {np.sum(underestimation)}")
    
    # Вычисление градиента
    grad[overestimation] = alpha * p * (residuals[overestimation] - epsilon)**(p-1)
    grad[underestimation] = -beta * p * (-residuals[underestimation] - epsilon)**(p-1)
    
    # Вычисление гессиана
    if p > 1:
        hess[overestimation] = alpha * p * (p-1) * (residuals[overestimation] - epsilon)**(p-2)
        hess[underestimation] = beta * p * (p-1) * (-residuals[underestimation] - epsilon)**(p-2)
    
    # Защита от численной нестабильности
    grad = np.nan_to_num(grad, nan=0.0, posinf=0.0, neginf=0.0)
    hess = np.nan_to_num(hess, nan=0.0, posinf=0.0, neginf=0.0)
    
    #print(f"Градиент: min={np.min(grad):.2f}, max={np.max(grad):.2f}")
    #print(f"Гессиан: min={np.min(hess):.2f}, max={np.max(hess):.2f}")
    
    return grad, hess

In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import KFold
from sklearn.preprocessing import TargetEncoder
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import lightgbm as lgb
import seaborn as sns
import optuna
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold
from functools import partial


import warnings
warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

# Загрузка данных
train = pd.read_csv('train_contest.csv')
test = pd.read_csv('test_contest.csv')

# Подготовка данных
features = list(train.drop('target', axis=1).columns)
cat_features = train.select_dtypes(include=['object', 'category']).columns.tolist()

for feat in cat_features:
    train[feat] = train[feat].astype('category')
    test[feat] = test[feat].astype('category')

# Разделение данных для валидации
X_train, X_val, y_train, y_val = train_test_split(
    train[features], train['target'], test_size=0.2, random_state=42
)

# Создание Dataset для LightGBM
train_dataset = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_features)
val_dataset = lgb.Dataset(X_val, label=y_val, categorical_feature=cat_features, reference=train_dataset)

# Функция для оптимизации через Optuna
def objective(trial):
    params = {
        'objective': partial(custom_epsilon_loss_grad_hess,
                       alpha=1.4,     # штраф за переоценку
                       beta=1.55,      # УВЕЛИЧЕННЫЙ штраф за недооценку
                       epsilon=25,
                       p=1.83          # Более близко к L2
    	),
        'metric': 'rmse',
        'verbosity': -1,
        'boosting_type': 'gbdt',
        'feature_pre_filter': False,
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
    }
    
    model = lgb.train(
        params,
        train_dataset,
        valid_sets=[val_dataset],
        callbacks=[lgb.early_stopping(stopping_rounds=50, verbose=False)]
    )
    
    preds = model.predict(X_val)
    mse = mean_squared_error(y_val, preds)
    rmse = np.sqrt(mse)  # Вручную вычисляем RMSE, если параметр squared не поддерживается
    
    return rmse

# Оптимизация гиперпараметров
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=150, timeout=2*60*60)

print('Best trial:')
trial = study.best_trial
print(f'  RMSE: {trial.value}')
print('  Params: ')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

# Обучение модели с лучшими параметрами на всех данных
best_params = trial.params
best_params['objective'] = 'regression'

full_train_dataset = lgb.Dataset(train[features], label=train['target'], categorical_feature=cat_features)
final_model = lgb.train(best_params, full_train_dataset)

# Предсказание на тестовых данных
test['target'] = final_model.predict(test[features])
test[['index', 'target']].to_csv('test.csv', index=False)

[I 2025-05-06 17:49:55,272] A new study created in memory with name: no-name-90f54a2c-cb4f-4d4d-8768-4e650d583fc3
[I 2025-05-06 17:49:57,282] Trial 0 finished with value: 1877.229658514508 and parameters: {'num_leaves': 181, 'learning_rate': 0.07545449094242128, 'feature_fraction': 0.5193764937602081, 'bagging_fraction': 0.9637123953212995, 'bagging_freq': 7, 'min_child_samples': 99, 'lambda_l1': 0.003977933352850825, 'lambda_l2': 9.416876841992265e-08}. Best is trial 0 with value: 1877.229658514508.
[I 2025-05-06 17:49:58,958] Trial 1 finished with value: 1884.1471299442544 and parameters: {'num_leaves': 212, 'learning_rate': 0.08869632352789596, 'feature_fraction': 0.5842007441742708, 'bagging_fraction': 0.7034361308411357, 'bagging_freq': 1, 'min_child_samples': 31, 'lambda_l1': 8.339236350051185e-07, 'lambda_l2': 0.00474674985130095}. Best is trial 0 with value: 1877.229658514508.
[I 2025-05-06 17:50:01,111] Trial 2 finished with value: 1891.9130494705055 and parameters: {'num_leav

Best trial:
  RMSE: 1866.089164255119
  Params: 
    num_leaves: 146
    learning_rate: 0.07470350805520398
    feature_fraction: 0.411448374406298
    bagging_fraction: 0.9740684045160495
    bagging_freq: 2
    min_child_samples: 35
    lambda_l1: 1.844912451192401e-05
    lambda_l2: 0.031981947771846216
