In [3]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Модули из других проектов
from HW_2.data_loader import DataLoader

In [4]:
# === 1. Load data ===
# Build the path from separate components so the correct separator is used on
# every OS (a literal backslash is only a path separator on Windows).
cwd = os.getcwd()
file_path = os.path.join(cwd, 'hw_5_data', 'train.csv')  # Replace with the path to your CSV file

df = DataLoader.load_from_csv(file_path)

# Separate the regression target from the predictors.
X = df.drop("SalePrice", axis=1)
y = df["SalePrice"]

In [5]:
# === 2. Feature groups ===
# Column names are split by dtype: 64-bit integer/float columns are treated as
# numeric, object columns as categorical.
numeric_features = list(X.select_dtypes(include=['int64', 'float64']).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

In [6]:
# === 3. Preprocessing for every model except CatBoost ===
# Numeric columns: fill gaps with the median, then standardize.
numeric_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical columns: fill gaps with the most frequent value, then one-hot
# encode into a dense array; categories unseen during fit are ignored.
categorical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore')),
])

# Route each feature group through its own pipeline.
preprocessor = ColumnTransformer([
    ('num', numeric_transformer, numeric_features),
    ('cat', categorical_transformer, categorical_features),
])

In [7]:
# === 4. Train / test split ===
# Hold out 20% of the rows for the final evaluation; the fixed seed keeps the
# split reproducible between runs.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

In [8]:
# === 5. One-off preprocessing ===
# The preprocessor is fitted on the training rows only and then applied
# unchanged to the test rows, so test data never influences the fitted
# imputation, scaling or encoding.
# NOTE(review): the cross-validation further down runs on X_train_processed,
# i.e. on features whose preprocessing was fitted on all training folds at
# once — confirm this mild leakage between CV folds is acceptable.
X_train_processed = preprocessor.fit_transform(X_train_raw)
X_test_processed = preprocessor.transform(X_test_raw)

In [None]:
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.linear_model import Ridge, Lasso
from lightgbm import LGBMRegressor
import numpy as np
import warnings
warnings.filterwarnings("ignore")

# === 6. Model registry and hyperparameter search spaces ===
# Maps a display name to the estimator class ("model") and a hyperopt search
# space ("space"). The keys of each space are passed to the estimator as
# keyword arguments; the first argument of every hp.* call is the hyperopt
# label, which carries a per-model suffix for most parameters.
# Every range(a, b) below excludes b.

regressors = {
    "RandomForest": {
        "model": RandomForestRegressor,
        "space": {
            "n_estimators": hp.choice("n_estimators_rf", range(50, 300, 10)),
            "max_depth": hp.choice("max_depth_rf", range(3, 30)),
            "min_samples_split": hp.choice("min_samples_split_rf", range(2, 10)),
        }
    },
    "GradientBoosting": {
        "model": GradientBoostingRegressor,
        "space": {
            "n_estimators": hp.choice("n_estimators_gb", range(50, 300, 10)),
            "learning_rate": hp.uniform("learning_rate_gb", 0.01, 0.3),
            "max_depth": hp.choice("max_depth_gb", range(3, 10))
        }
    },
    "AdaBoost": {
        "model": AdaBoostRegressor,
        "space": {
            "n_estimators": hp.choice("n_estimators_ab", range(50, 300, 10)),
            "learning_rate": hp.uniform("learning_rate_ab", 0.01, 1.0)
        }
    },
    "ExtraTrees": {
        "model": ExtraTreesRegressor,
        "space": {
            "n_estimators": hp.choice("n_estimators_et", range(50, 300, 10)),
            "max_depth": hp.choice("max_depth_et", range(3, 30))
        }
    },
    "Lasso": {
        "model": Lasso,
        "space": {
            "alpha": hp.loguniform("alpha_lasso", np.log(1e-4), np.log(1.0))
        }
    },
    "Ridge": {
        "model": Ridge,
        "space": {
            "alpha": hp.loguniform("alpha_ridge", np.log(1e-4), np.log(1.0))
        }
    },
    "LGBM": {
        "model": LGBMRegressor,
        "space": {
            "n_estimators": hp.choice("n_estimators_lgb", range(100, 500, 10)),
            "learning_rate": hp.uniform("learning_rate_lgb", 0.01, 0.1),  # Narrower range than the other boosting models
            "num_leaves": hp.choice("num_leaves_lgb", range(20, 100)),  # Bounded range: 20..99
            "max_depth": hp.choice("max_depth_lgb", range(3, 15)),  # Depth limited to 3..14 (upper bound is exclusive)
            "min_data_in_leaf": hp.choice("min_data_in_leaf", range(10, 50)),  # 10..49; included to curb overfitting
            "reg_alpha": hp.uniform("reg_alpha", 0.0, 0.1),  # Regularization
            "reg_lambda": hp.uniform("reg_lambda", 0.0, 0.1)  # Regularization
        }
    }
}

# Per-model search outcome: {model name: {"best_params": ..., "rmse": ...}}.
results = {}

for name, config in regressors.items():
    print(f"\nПоиск для модели: {name}")

    # Objective for hyperopt: mean 3-fold cross-validated RMSE of the current
    # model class built with the sampled parameters.
    # NOTE(review): X_train_processed was preprocessed once on the whole
    # training split, so each fold's validation rows took part in fitting the
    # preprocessing — confirm this is acceptable.
    def objective(params):
        model = config["model"](**params, random_state=42)
        score = cross_val_score(model, X_train_processed, y_train,
                                scoring="neg_root_mean_squared_error", cv=3).mean()
        # The scorer returns negated RMSE; flip the sign so lower is better.
        return {"loss": -score, "status": STATUS_OK}

    trials = Trials()
    best = fmin(
        fn=objective,
        space=config["space"],
        algo=tpe.suggest,
        max_evals=30,
        trials=trials,
        rstate=np.random.default_rng(42)
    )

    best_score = min(trials.losses())
    # `best` is stored exactly as returned by fmin; it is decoded into
    # estimator keyword arguments before the final refit.
    results[name] = {"best_params": best, "rmse": round(best_score, 4)}

# Show the results
df_results = pd.DataFrame.from_dict(results, orient="index")
# RMSE is an error metric, so sort ascending to put the best model first.
df_results = df_results.sort_values(by="rmse", ascending=True)
print("\nФинальные результаты:")
display(df_results)


Поиск для модели: RandomForest
 33%|███▎      | 10/30 [01:19<03:08,  9.42s/trial, best loss: 29532.675634852727]

In [None]:
# hp.choice parameters come back from fmin as option indices and need decoding
def decode_hyperopt_params(name, raw_params):
    """Convert an ``fmin`` result into keyword arguments for the estimator.

    ``fmin`` returns a dict keyed by hyperopt *label* (e.g. ``"max_depth_rf"``)
    in which every ``hp.choice`` entry holds the index of the chosen option
    rather than the option itself. ``space_eval`` substitutes those values back
    into the search space, producing a dict keyed by the space's own keys
    (the estimator parameter names) with the real values.

    The previous implementation looked labels up directly in the space
    (``KeyError``) and used ``isinstance(..., hp.choice)`` although
    ``hp.choice`` is a function, not a type (``TypeError``).

    Args:
        name: Key of the model in the module-level ``regressors`` registry.
        raw_params: The dict returned by ``fmin`` for that model's space.

    Returns:
        dict: Parameter name -> decoded value.

    Raises:
        KeyError: If ``name`` is not present in ``regressors``.
    """
    # Local import keeps this cell runnable on its own.
    from hyperopt import space_eval

    space = regressors[name]["space"]
    return dict(space_eval(space, raw_params))

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Refit every tuned model on the processed training split and score it once
# on the held-out test split; one metrics record per model.
final_results = []

for model_name in df_results.index:
    print(f"Обработка модели: {model_name}")

    # Turn the stored fmin result into real estimator keyword arguments.
    best_params = df_results.loc[model_name, "best_params"]
    decoded_params = decode_hyperopt_params(model_name, best_params)

    ModelClass = regressors[model_name]["model"]
    model = ModelClass(**decoded_params, random_state=42)
    model.fit(X_train_processed, y_train)

    y_pred = model.predict(X_test_processed)

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    # Derive RMSE from MSE directly: the `squared=False` argument of
    # mean_squared_error is deprecated and absent from recent scikit-learn
    # releases, while the square root works on every version.
    rmse = float(np.sqrt(mse))
    r2 = r2_score(y_test, y_pred)

    final_results.append({
        "Model": model_name,
        "MAE": round(mae, 2),
        "MSE": round(mse, 2),
        "RMSE": round(rmse, 2),
        "R²": round(r2, 2)
    })

# RMSE is an error metric, so sort ascending to list the best model first.
final_results_df = pd.DataFrame(final_results).sort_values("RMSE", ascending=True)
print("\nИтоговое сравнение моделей:")
display(final_results_df)