In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder, TargetEncoder
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostRegressor
import optuna
from optuna.samplers import TPESampler
import seaborn as sns
import warnings
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Ridge

warnings.filterwarnings('ignore', category=pd.errors.SettingWithCopyWarning)

# Загрузка данных
train = pd.read_csv('train_contest.csv')
test = pd.read_csv('test_contest.csv')
features = list(train.drop('target', axis=1).columns)
cat_features = train.select_dtypes(include=['object', 'category']).columns.tolist()

In [27]:
def label_encode_data(train_df, test_df, cat_features):
    le = LabelEncoder()
    for feat in cat_features:
        combined = pd.concat([train_df[feat], test_df[feat]], axis=0)
        le.fit(combined)
        train_df[feat] = le.transform(train_df[feat])
        test_df[feat] = le.transform(test_df[feat])
    return train_df, test_df

# Применяем Label Encoding
train, test = label_encode_data(train, test, cat_features)

# Разделение данных для валидации
X_train, X_val, y_train, y_val = train_test_split(
    train[features], train['target'], test_size=0.2, random_state=42
)

# Функции для обучения отдельных моделей
def train_lightgbm(X_train, y_train, X_val, y_val, params):
    train_data = lgb.Dataset(X_train, label=y_train)
    val_data = lgb.Dataset(X_val, label=y_val, reference=train_data)
    
    # Callback для ранней остановки
    callbacks = [
        lgb.early_stopping(stopping_rounds=50, verbose=False),
        lgb.log_evaluation(period=0)  # Отключаем логирование
    ]
    
    model = lgb.train(
        params,
        train_data,
        valid_sets=[val_data],
        callbacks=callbacks
    )
    return model

def train_xgboost(X_train, y_train, X_val, y_val, params):
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dval = xgb.DMatrix(X_val, label=y_val)
    
    # Callback для ранней остановки
    watchlist = [(dval, 'eval')]
    
    model = xgb.train(
        params,
        dtrain,
        evals=watchlist,
        early_stopping_rounds=50,
        verbose_eval=False
    )
    return model

def train_catboost(X_train, y_train, X_val, y_val, params):
    # Callback для CatBoost (уже встроен в метод fit)
    model = CatBoostRegressor(**params)
    model.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        early_stopping_rounds=50,
        verbose=False
    )
    return model

def train_catboost(X_train, y_train, X_val, y_val, params):
    model = CatBoostRegressor(**params)
    model.fit(
        X_train, y_train,
        eval_set=(X_val, y_val),
        early_stopping_rounds=50,
        verbose=False
    )
    return model

def train_random_forest(X_train, y_train, params):
    model = RandomForestRegressor(**params)
    model.fit(X_train, y_train)
    return model

def train_ridge(X_train, y_train, params):
    model = Ridge(**params)
    model.fit(X_train, y_train)
    return model

# Функции для оптимизации гиперпараметров с помощью Optuna
def optimize_lightgbm(trial):
    params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'num_leaves': trial.suggest_int('num_leaves', 20, 300),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1.0),
        'bagging_freq': trial.suggest_int('bagging_freq', 1, 7),
        'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),
        'verbose': -1
    }
    
    model = train_lightgbm(X_train, y_train, X_val, y_val, params)
    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))  # Вместо squared=False вычисляем корень вручную
    return rmse

def optimize_xgboost(trial):
    params = {
        'objective': 'reg:squarederror',
        'eval_metric': 'rmse',
        'eta': trial.suggest_float('eta', 0.001, 0.1),
        'max_depth': trial.suggest_int('max_depth', 3, 10),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
    }
    
    model = train_xgboost(X_train, y_train, X_val, y_val, params)
    preds = model.predict(xgb.DMatrix(X_val))
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    return rmse

def optimize_catboost(trial):
    params = {
        'iterations': trial.suggest_int('iterations', 100, 1000),
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
        'random_strength': trial.suggest_int('random_strength', 0, 100),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.0, 10.0),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-5, 10.0, log=True),
        'verbose': False
    }
    
    model = train_catboost(X_train, y_train, X_val, y_val, params)
    preds = model.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    return rmse

# Оптимизация моделей
def optimize_models():
    study_lgb = optuna.create_study(direction='minimize', sampler=TPESampler())
    study_lgb.optimize(optimize_lightgbm, n_trials=20, show_progress_bar=True)
    
    study_xgb = optuna.create_study(direction='minimize', sampler=TPESampler())
    study_xgb.optimize(optimize_xgboost, n_trials=20, show_progress_bar=True)
    
    study_cb = optuna.create_study(direction='minimize', sampler=TPESampler())
    study_cb.optimize(optimize_catboost, n_trials=20, show_progress_bar=True)
    
    return {
        'lightgbm': study_lgb.best_params,
        'xgboost': study_xgb.best_params,
        'catboost': study_cb.best_params
    }

In [28]:
# Получаем оптимальные параметры
best_params = optimize_models()

# Добавляем фиксированные параметры для каждого алгоритма
best_params['lightgbm'].update({
    'objective': 'regression',
    'metric': 'rmse',
    'verbose': -1
})

best_params['xgboost'].update({
    'objective': 'reg:squarederror',
    'eval_metric': 'rmse'
})

best_params['catboost'].update({
    'loss_function': 'RMSE',
    'verbose': False
})

# Обучение моделей с лучшими параметрами на всех данных
def train_all_models(X, y, params):
    models = {}
    
    # LightGBM
    lgb_params = params['lightgbm'].copy()
    lgb_params['verbose'] = -1
    models['lightgbm'] = train_lightgbm(X, y, X_val, y_val, lgb_params)
    
    # XGBoost
    xgb_params = params['xgboost'].copy()
    models['xgboost'] = train_xgboost(X, y, X_val, y_val, xgb_params)
    
    # CatBoost
    cb_params = params['catboost'].copy()
    cb_params['verbose'] = False
    models['catboost'] = train_catboost(X, y, X_val, y_val, cb_params)
    
    # Random Forest (без оптимизации для примера)
    rf_params = {
        'n_estimators': 100,
        'max_depth': 10,
        'random_state': 42
    }
    models['random_forest'] = train_random_forest(X, y, rf_params)
    
    # Ridge Regression (без оптимизации для примера)
    ridge_params = {
        'alpha': 1.0,
        'random_state': 42
    }
    models['ridge'] = train_ridge(X, y, ridge_params)
    
    return models

# Обучение всех моделей
models = train_all_models(train[features], train['target'], best_params)

# Получение предсказаний от всех моделей
def get_predictions(models, X):
    predictions = {}
    for name, model in models.items():
        if name == 'xgboost':
            predictions[name] = model.predict(xgb.DMatrix(X))
        else:
            predictions[name] = model.predict(X)
    return predictions

# Блендинг предсказаний (простое усреднение)
def blend_predictions(predictions):
    return np.mean(np.array(list(predictions.values())), axis=0)

# Получаем предсказания на тестовых данных
test_predictions = get_predictions(models, test[features])
test['target'] = blend_predictions(test_predictions)

# Сохраняем результаты
test[['index', 'target']].to_csv('test_blended.csv', index=False)

print("Blending completed and predictions saved to test_blended.csv")

[I 2025-05-03 20:19:00,482] A new study created in memory with name: no-name-e4e510ad-84e5-4187-ac1f-fe4635202f3a


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-05-03 20:19:01,179] Trial 0 finished with value: 1879.5804103577336 and parameters: {'num_leaves': 74, 'learning_rate': 0.09332547107715061, 'feature_fraction': 0.7607356733695185, 'bagging_fraction': 0.8698128628524322, 'bagging_freq': 4, 'min_child_samples': 96}. Best is trial 0 with value: 1879.5804103577336.
[I 2025-05-03 20:19:02,267] Trial 1 finished with value: 1892.7005896165106 and parameters: {'num_leaves': 209, 'learning_rate': 0.07205300652931702, 'feature_fraction': 0.9926484125049213, 'bagging_fraction': 0.6510508288691708, 'bagging_freq': 3, 'min_child_samples': 44}. Best is trial 0 with value: 1879.5804103577336.
[I 2025-05-03 20:19:03,106] Trial 2 finished with value: 1887.82630461998 and parameters: {'num_leaves': 175, 'learning_rate': 0.054256690027979934, 'feature_fraction': 0.8027060485917454, 'bagging_fraction': 0.409343259958572, 'bagging_freq': 3, 'min_child_samples': 40}. Best is trial 0 with value: 1879.5804103577336.
[I 2025-05-03 20:19:04,029] Trial 

[I 2025-05-03 20:19:17,797] A new study created in memory with name: no-name-f2639e0c-023f-4555-a277-1b50543c537d


[I 2025-05-03 20:19:17,796] Trial 19 finished with value: 1886.5404947185018 and parameters: {'num_leaves': 188, 'learning_rate': 0.0841008743805426, 'feature_fraction': 0.8499679229087533, 'bagging_fraction': 0.830208985162429, 'bagging_freq': 6, 'min_child_samples': 52}. Best is trial 16 with value: 1877.3427505704863.


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-05-03 20:19:18,131] Trial 0 finished with value: 2553.942292171551 and parameters: {'eta': 0.03218689825159262, 'max_depth': 9, 'subsample': 0.6869183440303537, 'colsample_bytree': 0.5658856552042141, 'lambda': 0.0020198586734529265, 'alpha': 0.011869952752399835}. Best is trial 0 with value: 2553.942292171551.
[I 2025-05-03 20:19:18,447] Trial 1 finished with value: 2218.115208904397 and parameters: {'eta': 0.08452061955641244, 'max_depth': 9, 'subsample': 0.7665154696414427, 'colsample_bytree': 0.8801792188768286, 'lambda': 0.0023798714846240696, 'alpha': 1.1055089718149416}. Best is trial 1 with value: 2218.115208904397.
[I 2025-05-03 20:19:18,721] Trial 2 finished with value: 2662.439901854962 and parameters: {'eta': 0.02014894027101138, 'max_depth': 7, 'subsample': 0.9316496018689759, 'colsample_bytree': 0.8829073240294543, 'lambda': 0.04606104112984548, 'alpha': 0.18606429819288478}. Best is trial 1 with value: 2218.115208904397.
[I 2025-05-03 20:19:18,976] Trial 3 finish

[I 2025-05-03 20:19:23,801] A new study created in memory with name: no-name-c331dacb-be65-4dec-9f73-5bf217d7c7d8


[I 2025-05-03 20:19:23,800] Trial 19 finished with value: 2507.002420853856 and parameters: {'eta': 0.037267753153581204, 'max_depth': 10, 'subsample': 0.6392707031578031, 'colsample_bytree': 0.6712645304088417, 'lambda': 1.228582413162995, 'alpha': 2.4036270903479866}. Best is trial 13 with value: 2147.871484834316.


  0%|          | 0/20 [00:00<?, ?it/s]

[I 2025-05-03 20:19:29,811] Trial 0 finished with value: 1872.94472227954 and parameters: {'iterations': 841, 'depth': 8, 'learning_rate': 0.0928695543907521, 'random_strength': 36, 'bagging_temperature': 7.955910895995565, 'l2_leaf_reg': 0.004428900503354116}. Best is trial 0 with value: 1872.94472227954.
[I 2025-05-03 20:19:33,424] Trial 1 finished with value: 1876.7813740673662 and parameters: {'iterations': 546, 'depth': 6, 'learning_rate': 0.09979307652099334, 'random_strength': 26, 'bagging_temperature': 3.8792170783994417, 'l2_leaf_reg': 0.0011850144813360317}. Best is trial 0 with value: 1872.94472227954.
[I 2025-05-03 20:19:41,334] Trial 2 finished with value: 1877.4140694911298 and parameters: {'iterations': 734, 'depth': 8, 'learning_rate': 0.04913549193636657, 'random_strength': 27, 'bagging_temperature': 9.308723265212034, 'l2_leaf_reg': 4.6259003749861795e-05}. Best is trial 0 with value: 1872.94472227954.
[I 2025-05-03 20:19:54,103] Trial 3 finished with value: 1980.1124