In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import Ridge
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
import warnings

# Silence every warning, from any library, for the rest of this kernel session.
warnings.filterwarnings('ignore')

In [7]:
# Load the contest data.
train = pd.read_csv('train_contest.csv')
test = pd.read_csv('test_contest.csv')

# Every column except the target is used as a model input.
# NOTE(review): a later cell writes test['index'] to the submission file; if train
# also has an 'index' column it ends up in `features` -- confirm that is intended.
features = list(train.drop('target', axis=1).columns)

# Scan only the feature columns so the target can never be picked up as categorical.
cat_features = train[features].select_dtypes(include=['object', 'category']).columns.tolist()

# Label-encode every categorical feature.
le = LabelEncoder()
for col in cat_features:
    # Fit on train + test together so both frames share one label -> integer mapping.
    combined = pd.concat([train[col], test[col]], axis=0)
    le.fit(combined)

    # Use ONE categorical dtype, covering every encoded label, for both frames.
    # Casting each frame with astype('category') on its own derives the category set
    # from the values present in that frame only, so the integer category codes can
    # differ between train and test whenever a label is missing from one of them.
    # Consumers that read the raw codes (e.g. XGBoost with enable_categorical=True)
    # would then see inconsistent encodings at prediction time.
    shared_dtype = pd.CategoricalDtype(categories=np.arange(len(le.classes_)))
    train[col] = pd.Series(le.transform(train[col]), index=train.index).astype(shared_dtype)
    test[col] = pd.Series(le.transform(test[col]), index=test.index).astype(shared_dtype)

In [19]:
# Hold out 20% of the training rows as a validation set (fixed seed for repeatability).
X_train, X_val, y_train, y_val = train_test_split(
    train[features],
    train['target'],
    test_size=0.2,
    random_state=42,
)

# Wrap both splits in LightGBM Dataset objects, flagging the categorical columns.
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_features)
val_data = lgb.Dataset(X_val, label=y_val, categorical_feature=cat_features)

# Booster hyper-parameters: RMSE-evaluated regression.
params = dict(
    objective='regression',
    metric='rmse',
    num_leaves=31,
    learning_rate=0.05,
    feature_fraction=0.9,
)

# LightGBM: a fixed 1000 boosting rounds, no early stopping.
lgb_model = lgb.train(params, train_data, num_boost_round=1000, valid_sets=[val_data])

# XGBoost: a fixed 1000 trees, no early stopping.
# enable_categorical=True turns on support for the categorical feature columns.
xgb_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    random_state=42,
    tree_method='hist',
    enable_categorical=True,
).fit(X_train, y_train)

# Random Forest: 300 depth-limited trees, trained on all available cores.
rf_model = RandomForestRegressor(
    n_estimators=300, max_depth=10, random_state=42, n_jobs=-1
).fit(X_train, y_train)

# Ridge regression needs scaled inputs: fit the scaler on the training split only,
# then apply the same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

ridge_model = Ridge(alpha=1.0, random_state=42).fit(X_train_scaled, y_train)

# Meta-features: one column of validation predictions per base model
# (Ridge is fed the scaled validation matrix).
val_preds = np.column_stack([
    model.predict(inputs)
    for model, inputs in (
        (lgb_model, X_val),
        (xgb_model, X_val),
        (rf_model, X_val),
        (ridge_model, X_val_scaled),
    )
])

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.007230 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3609
[LightGBM] [Info] Number of data points in the train set: 96075, number of used features: 128
[LightGBM] [Info] Start training from score 3040.124976


In [20]:

# Meta-model: a linear blend of the base-model predictions, fitted on the validation split.
meta_model = LinearRegression().fit(val_preds, y_val)

# Base-model predictions on the test set, stacked column-wise in the same model order
# used for the validation meta-features (Ridge gets the scaled test matrix).
test_preds = np.column_stack([
    model.predict(inputs)
    for model, inputs in (
        (lgb_model, test[features]),
        (xgb_model, test[features]),
        (rf_model, test[features]),
        (ridge_model, scaler.transform(test[features])),
    )
])

# Final blended prediction, stored on the test frame and exported with its 'index' column.
test['target'] = meta_model.predict(test_preds)
test.to_csv('ensemble_test.csv', columns=['index', 'target'], index=False)

print("Ансамбль успешно обучен! Результаты сохранены в ensemble_test.csv")

Ансамбль успешно обучен! Результаты сохранены в ensemble_test.csv
