In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
import optuna
from catboost import CatBoostRegressor, Pool
import warnings

warnings.filterwarnings('ignore')

# Загрузка данных
train = pd.read_csv('train_contest.csv')
test = pd.read_csv('test_contest.csv')

# Подготовка данных
features = list(train.drop('target', axis=1).columns)
cat_features = train.select_dtypes(include=['object', 'category']).columns.tolist()

'''for col in cat_features:
    freq = train[col].value_counts(normalize=True)
    rare_categories = freq[freq < 0.01].index
    train[col] = train[col].replace(rare_categories, 'Other')
    test[col] = test[col].replace(rare_categories, 'Other')'''
    
# Разделение данных для валидации
X_train, X_val, y_train, y_val = train_test_split(
    train[features], train['target'], test_size=0.2, random_state=42
)

# Создание Pool для CatBoost
train_pool = Pool(X_train, y_train, cat_features=cat_features)
val_pool = Pool(X_val, y_val, cat_features=cat_features)

# Функция для оптимизации через Optuna
def objective(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 500, 2000),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'depth': trial.suggest_int('depth', 4, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-8, 10.0, log=True),
        'random_strength': trial.suggest_float('random_strength', 1e-8, 10.0, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 1.0),
        'leaf_estimation_iterations': trial.suggest_int('leaf_estimation_iterations', 1, 10),
        'verbose': False,
        'loss_function': 'RMSE',
        'eval_metric': 'RMSE',
        'random_seed': 42,
        'early_stopping_rounds': 50
    }
    
    model = CatBoostRegressor(**params)
    
    model.fit(
        train_pool,
        eval_set=val_pool,
        early_stopping_rounds=50,
        verbose=False
    )
    
    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    
    return rmse

# Оптимизация гиперпараметров
study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50, timeout=3*60*60)

print('Best trial:')
trial = study.best_trial
print(f'  RMSE: {trial.value}')
print('  Params: ')
for key, value in trial.params.items():
    print(f'    {key}: {value}')

# Обучение модели с лучшими параметрами на всех данных
best_params = study.best_params
best_params.update({
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': 42,
    'verbose': False
})

full_train_pool = Pool(train[features], train['target'], cat_features=cat_features)
final_model = CatBoostRegressor(**best_params)
final_model.fit(full_train_pool)

# Предсказание на тестовых данных
test['target'] = final_model.predict(test[features])
test[['index', 'target']].to_csv('test_catboost.csv', index=False)

[I 2025-05-05 18:13:18,952] A new study created in memory with name: no-name-8d0dd8e9-a967-4392-ad9a-acd96288cab9
[I 2025-05-05 18:22:25,464] Trial 0 finished with value: 1877.7178036674095 and parameters: {'iterations': 1865, 'learning_rate': 0.04649971541220742, 'depth': 4, 'l2_leaf_reg': 3.2072468235372445e-08, 'random_strength': 9.238766794434823e-06, 'bagging_temperature': 0.49890105258536466, 'leaf_estimation_iterations': 2}. Best is trial 0 with value: 1877.7178036674095.
[I 2025-05-05 18:29:56,924] Trial 1 finished with value: 1853.280729030385 and parameters: {'iterations': 1463, 'learning_rate': 0.0898269619399365, 'depth': 7, 'l2_leaf_reg': 8.201151975959542e-07, 'random_strength': 0.028882804806115072, 'bagging_temperature': 0.7976115324392776, 'leaf_estimation_iterations': 8}. Best is trial 1 with value: 1853.280729030385.
[I 2025-05-05 18:41:37,438] Trial 2 finished with value: 1886.042235987376 and parameters: {'iterations': 1604, 'learning_rate': 0.02432171327452264, 'd

Best trial:
  RMSE: 1853.280729030385
  Params: 
    iterations: 1463
    learning_rate: 0.0898269619399365
    depth: 7
    l2_leaf_reg: 8.201151975959542e-07
    random_strength: 0.028882804806115072
    bagging_temperature: 0.7976115324392776
    leaf_estimation_iterations: 8
