In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import PowerTransformer, StandardScaler
from sklearn.feature_selection import SelectFromModel
from scipy import stats
import joblib

# Загрузка данных
data = pd.read_csv("../data/processed/train.csv")

# Очистка названий столбцов
data.columns = data.columns.str.strip()

# Проверка наличия необходимых столбцов
required_cols = ['area', 'price', 'rooms_1', 'rooms_2', 'rooms_3', 'first_floor', 'last_floor']
missing_cols = [col for col in required_cols if col not in data.columns]
if missing_cols:
    raise KeyError(f"Отсутствуют необходимые столбцы: {missing_cols}")

# Продвинутая предобработка
pt = PowerTransformer(method='yeo-johnson')
data['area_scaled'] = pt.fit_transform(data[['area']])

# Генерация новых признаков
data['area_squared'] = data['area_scaled'] ** 2
data['area_rooms1'] = data['area_scaled'] * data['rooms_1']
data['area_rooms2'] = data['area_scaled'] * data['rooms_2']
data['floor_combo'] = data['first_floor'] & data['last_floor']

# Формирование матрицы признаков
features = [
    'area_scaled', 'area_squared', 'area_rooms1', 'area_rooms2',
    'rooms_1', 'rooms_2', 'rooms_3', 'first_floor', 'last_floor', 'floor_combo'
]
X = data[features].astype(float)
y = data['price']

# Разделение данных
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Масштабирование признаков
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Отбор признаков
selector = SelectFromModel(
    estimator=RandomForestRegressor(n_estimators=100),
    threshold="1.25*median"
)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

# Определение моделей и параметров для поиска
models = {
    'GradientBoosting': {
        'model': GradientBoostingRegressor(),
        'params': {
            'n_estimators': [100, 200],
            'learning_rate': [0.05, 0.1],
            'max_depth': [3, 5]
        }
    },
    'RandomForest': {
        'model': RandomForestRegressor(),
        'params': {
            'n_estimators': [100, 200],
            'max_depth': [None, 5]
        }
    },
    'SVR': {
        'model': SVR(),
        'params': {
            'C': [0.1, 1, 10],
            'kernel': ['linear', 'rbf']
        }
    }
}

best_score = np.inf
best_model = None

# Поиск лучшей модели
for name, mp in models.items():
    grid = GridSearchCV(
        mp['model'],
        mp['params'],
        cv=5,
        scoring='neg_mean_squared_error',
        n_jobs=-1
    )
    grid.fit(X_train_selected, y_train)
    
    y_pred = grid.predict(X_test_selected)
    mse = mean_squared_error(y_test, y_pred)
    
    if mse < best_score:
        best_score = mse
        best_model = grid.best_estimator_
        best_model_name = name

# Обучение лучшей модели
best_model.fit(X_train_selected, y_train)
y_pred = best_model.predict(X_test_selected)

# Вывод информации о лучшей модели
print(f"Лучшая модель: {best_model_name}")
print("Параметры модели:")
print(best_model.get_params())

# Оценка метрик
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
r2 = r2_score(y_test, y_pred)
mae = np.mean(np.abs(y_test - y_pred))

print(f"\nMSE: {mse:.2f}")
print(f"RMSE: {rmse:.2f}")
print(f"R²: {r2:.4f}")
print(f"MAE: {mae:.2f} руб.")

# Сохранение артефактов
joblib.dump(best_model, '../models/best_model.pkl')
joblib.dump(pt, '../models/power_transformer.pkl')
joblib.dump(scaler, '../models/scaler.pkl')
joblib.dump(selector, '../models/feature_selector.pkl')

print("\nМодель и препроцессоры успешно сохранены")


Лучшая модель: SVR
Параметры модели:
{'C': 10, 'cache_size': 200, 'coef0': 0.0, 'degree': 3, 'epsilon': 0.1, 'gamma': 'scale', 'kernel': 'linear', 'max_iter': -1, 'shrinking': True, 'tol': 0.001, 'verbose': False}

MSE: 36478729055543.43
RMSE: 6039762.33
R²: -0.2480
MAE: 5192330.52 руб.

Модель и препроцессоры успешно сохранены
