### ПРЕДОБРАБОТКА И ГЕНЕРАЦИЯ НОВЫХ ПРИЗНАКОВ

Необходимые импорты

In [25]:
from dotenv import load_dotenv
import os
load_dotenv()
os.environ["MLFLOW_S3_ENDPOINT_URL"] = "https://storage.yandexcloud.net"
os.environ["AWS_ACCESS_KEY_ID"] = os.getenv("AWS_ACCESS_KEY_ID")
os.environ["AWS_SECRET_ACCESS_KEY"] = os.getenv("AWS_SECRET_ACCESS_KEY")


In [29]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import RobustScaler, KBinsDiscretizer, PolynomialFeatures, OrdinalEncoder
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from autofeat import AutoFeatRegressor
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import mlflow
import mlflow.catboost
import pandas as pd
from mlflow.models.signature import infer_signature
from catboost import Pool

In [2]:
df_path = "/home/mle-user/mle_projects/mle-project-sprint-2-v001/df_filtered.csv"
df = pd.read_csv(df_path)
df.head(5)

Unnamed: 0,id,flat_id,building_id,build_year,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,...,is_apartment,studio,total_area,price,decade,distance_from_moscow_center,district,okrug,okrug_num,log_price
0,2,1,18012,2001,2,55.794849,37.608013,3.0,97,10,...,False,False,43.0,13500000.0,2000,4.623059,Марьина Роща,Северо-Восточный,0,16.4182
1,3,2,17821,2000,4,55.74004,37.761742,2.7,80,10,...,False,False,56.0,13500000.0,2000,8.954583,Перово,Восточный,1,16.4182
2,4,3,18579,2002,4,55.672016,37.570877,2.64,771,17,...,False,False,76.0,20000000.0,2000,9.62713,Черёмушки,Юго-Западный,2,16.811243
3,5,4,9293,1971,1,55.808807,37.707306,2.6,208,9,...,False,False,24.0,5200000.0,1970,8.156988,Богородское,Восточный,1,15.464169
4,6,5,23964,2017,4,55.724728,37.743069,2.7,192,17,...,False,False,51.009998,8490104.0,2010,8.312821,Рязанский,Юго-Восточный,3,15.954412


In [3]:
# чек на всякий случай
df.dtypes

id                               int64
flat_id                          int64
building_id                      int64
build_year                       int64
building_type_int                int64
latitude                       float64
longitude                      float64
ceiling_height                 float64
flats_count                      int64
floors_total                     int64
has_elevator                      bool
floor                            int64
kitchen_area                   float64
living_area                    float64
rooms                            int64
is_apartment                      bool
studio                            bool
total_area                     float64
price                          float64
decade                           int64
distance_from_moscow_center    float64
district                        object
okrug                           object
okrug_num                        int64
log_price                      float64
dtype: object

### Скейлинг признаков

Перейдем к автогенерации признаков. Используем:
- RobustScaler и PolynomialFeatures для колонок с комната, этажами и площадями
- KBinsDiscretizer для колонки расстоянием до центра Москвы
- OrdinalEncoder возьмем для колонки с десятилетием построения, т.к. нам надо, чтобы порядок (=новизна) учитывался моделью.
- OnehotEncoder не будем использовать ни для округа, ни для района, ни для типа здания. Мы их передадим в catboost как есть.

In [20]:
#  Колонки 
poly_cols = ['ceiling_height', 'flats_count', 'floors_total', 'floor',
             'kitchen_area', 'living_area', 'rooms', 'total_area']
kbins_cols = ['distance_from_moscow_center']
ordinal_cols = ['decade']
bool_cols = ['has_elevator', 'is_apartment', 'studio']

passthrough_cols = [
    'id', 'flat_id', 'building_id', 'build_year',
    'latitude', 'longitude',  'okrug_num', 'log_price',
    'okrug', 'district', 'building_type_int'
]

target_col = 'price'

# X и y 
X = df.drop(columns=[target_col])
y = df[target_col]

#  Сохраняем passthrough_cols отдельно 
passthrough_df = X[passthrough_cols].copy()

#  Остальные колонки для трансформаций 
X_to_transform = X.drop(columns=passthrough_cols)

# --- ColumnTransformer ---
preprocessor = ColumnTransformer(transformers=[
    ('poly', PolynomialFeatures(degree=2, include_bias=False), poly_cols + kbins_cols),
    ('robust', RobustScaler(), poly_cols + kbins_cols),
    ('kbins', KBinsDiscretizer(n_bins=5, encode='onehot-dense', strategy='quantile'), kbins_cols),
    ('ordinal', OrdinalEncoder(), ordinal_cols),
    ('bool', 'passthrough', bool_cols)
], remainder='drop')

pipeline = Pipeline([('preprocessor', preprocessor)])

#  Фитим и трансформируем 
X_transformed = pipeline.fit_transform(X_to_transform, y)

#  Имена колонок 
try:
    transformed_names = pipeline.get_feature_names_out()
except:
    transformed_names = pipeline.named_steps['preprocessor'].get_feature_names_out()

X_transformed_df = pd.DataFrame(X_transformed, columns=transformed_names, index=X.index)

print("Форма финального X_transformed_df:", X_transformed_df.shape)



Форма финального X_transformed_df: (127711, 72)


### Автогенерация признаков

Переходим к автогенерации признаков через AutoFeatRegressor. Чтобы не плодить 500 колонок, выберем самые распространенные трансформации: ['log', '1/', 'sqrt', куб, квадрат].

In [19]:
#  AutoFeatRegressor на числовых исходных колонках 
af_reg = AutoFeatRegressor(
    verbose=1,
    feateng_steps=1,
    transformations = ["1/", "exp", "log", "sqrt", "^2", "^3"] 

)

numeric_for_autofeat = poly_cols + kbins_cols
X_autofeat = af_reg.fit_transform(X[numeric_for_autofeat], y)
X_autofeat_new = X_autofeat.drop(columns=numeric_for_autofeat, errors='ignore')

# --- Финальный датасет: passthrough + трансформированные + autofeat ---
X_final = pd.concat([passthrough_df, X_transformed_df, X_autofeat_new], axis=1)

print("Форма финального X_final:", X_final.shape)
print("Колонки passthrough на месте:", all(c in X_final.columns for c in passthrough_cols))

2025-09-07 21:59:05,209 INFO: [AutoFeat] The 1 step feature engineering process could generate up to 54 features.
2025-09-07 21:59:05,209 INFO: [AutoFeat] With 127711 data points this new feature matrix would use about 0.03 gb of space.
2025-09-07 21:59:05,221 INFO: [feateng] Step 1: transformation of original features


[feateng]               0/              9 features transformed

2025-09-07 21:59:06,586 INFO: [feateng] Generated 47 transformed features from 9 original features - done.
2025-09-07 21:59:06,618 INFO: [feateng] Generated altogether 47 new features in 1 steps
2025-09-07 21:59:06,618 INFO: [feateng] Removing correlated features, as well as additions at the highest level
2025-09-07 21:59:06,681 INFO: [feateng] Generated a total of 25 additional features
2025-09-07 21:59:06,747 INFO: [featsel] Feature selection run 1/5


[featsel] Scaling data...done.


2025-09-07 21:59:11,084 INFO: [featsel] Feature selection run 2/5
2025-09-07 21:59:15,608 INFO: [featsel] Feature selection run 3/5
2025-09-07 21:59:20,014 INFO: [featsel] Feature selection run 4/5
2025-09-07 21:59:24,220 INFO: [featsel] Feature selection run 5/5
2025-09-07 21:59:28,571 INFO: [featsel] 23 features after 5 feature selection runs
2025-09-07 21:59:28,812 INFO: [featsel] 23 features after correlation filtering
2025-09-07 21:59:30,203 INFO: [featsel] 23 features after noise filtering
2025-09-07 21:59:30,204 INFO: [AutoFeat] Computing 15 new features.


[AutoFeat]    12/   15 new features

2025-09-07 21:59:31,885 INFO: [AutoFeat]    15/   15 new features ...done.
2025-09-07 21:59:31,897 INFO: [AutoFeat] Final dataframe with 24 feature columns (15 new).


[AutoFeat]    14/   15 new features

2025-09-07 21:59:31,898 INFO: [AutoFeat] Training final regression model.
2025-09-07 21:59:32,709 INFO: [AutoFeat] Trained model: largest coefficients:
2025-09-07 21:59:32,710 INFO: -30208983.86746934
2025-09-07 21:59:32,710 INFO: 220023440.678809 * 1/total_area
2025-09-07 21:59:32,711 INFO: 27746544.056481 * 1/kitchen_area
2025-09-07 21:59:32,714 INFO: 26723379.176289 * 1/distance_from_moscow_center
2025-09-07 21:59:32,715 INFO: 17085750.235476 * 1/living_area
2025-09-07 21:59:32,715 INFO: -9658145.541523 * 1/floors_total
2025-09-07 21:59:32,717 INFO: 5328753.236285 * ceiling_height
2025-09-07 21:59:32,717 INFO: 2461026.463136 * 1/flats_count
2025-09-07 21:59:32,719 INFO: -1686146.296427 * 1/floor
2025-09-07 21:59:32,719 INFO: -569392.507101 * rooms
2025-09-07 21:59:32,720 INFO: 387432.887176 * total_area
2025-09-07 21:59:32,722 INFO: 378445.671934 * kitchen_area
2025-09-07 21:59:32,723 INFO: -288342.332404 * distance_from_moscow_center
2025-09-07 21:59:32,723 INFO: 91710.438880 * log

Форма финального X_final: (127711, 98)
Колонки passthrough на месте: True


Сохраняем это для дальнейшего логирования

In [27]:
# Создаем папку для артефактов, если ее нет
ARTIFACTS_DIR = "feature_generation_artifacts"
os.makedirs(ARTIFACTS_DIR, exist_ok=True)

# Путь к CSV файлу
X_FINAL_PATH = os.path.join(ARTIFACTS_DIR, "X_final.csv")

# Сохраняем DataFrame
X_final.to_csv(X_FINAL_PATH, index=False)

print(f"X_final сохранен по пути: {X_FINAL_PATH}")


X_final сохранен по пути: feature_generation_artifacts/X_final.csv


### Обучение и оценка качества модели

In [22]:
X_model = X_final.drop(columns=['id', 'flat_id', 'building_id', 'latitude', 'longitude', 'okrug_num', 'log_price', 'build_year'])

# Категориальные признаки для CatBoost
cat_features = ['okrug', 'district', 'building_type_int']

# Разбивка данных

X_train, X_test, y_train, y_test = train_test_split(
    X_model, y, test_size=0.2, random_state=42
)


# CatBoostRegressor 

model_params = {
    'depth': 3,
    'learning_rate': 0.01,
    'iterations': 1000,
    'l2_leaf_reg': 1,
    'eval_metric': 'RMSE',
    'random_seed': 42,
    'verbose': 100
}

catboost_model = CatBoostRegressor(**model_params)

catboost_model.fit(
    X_train, y_train,
    eval_set=(X_test, y_test),
    use_best_model=True,
    cat_features=cat_features
)

# Обучение и метрики

y_pred = catboost_model.predict(X_test)

rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"RMSE: {rmse:.2f}")
print(f"MAE: {mae:.2f}")
print(f"R2: {r2:.3f}")

0:	learn: 11168135.7547952	test: 11164745.1438300	best: 11164745.1438300 (0)	total: 49.6ms	remaining: 49.6s
100:	learn: 6958441.4458984	test: 6975137.4663734	best: 6975137.4663734 (100)	total: 4.34s	remaining: 38.6s
200:	learn: 5622255.3118469	test: 5671764.5352352	best: 5671764.5352352 (200)	total: 8.72s	remaining: 34.7s
300:	learn: 5118447.3888229	test: 5192864.4388674	best: 5192864.4388674 (300)	total: 12.9s	remaining: 30.1s
400:	learn: 4918557.9280155	test: 5001245.1309304	best: 5001245.1309304 (400)	total: 17.1s	remaining: 25.5s
500:	learn: 4820682.7852472	test: 4910251.8737149	best: 4910251.8737149 (500)	total: 21.9s	remaining: 21.8s
600:	learn: 4759987.8146356	test: 4854372.9664018	best: 4854372.9664018 (600)	total: 26s	remaining: 17.3s
700:	learn: 4713214.2071112	test: 4812201.4743504	best: 4812201.4743504 (700)	total: 30.1s	remaining: 12.8s
800:	learn: 4674995.7763295	test: 4776396.7567846	best: 4776396.7567846 (800)	total: 34.2s	remaining: 8.49s
900:	learn: 4642952.7938629	te



### Логирование

In [33]:

# Настройка MLflow 
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("feature_generation")

#  Пути к артефактам 
X_FINAL_PATH = "X_final.csv"
NOTEBOOK_PATH = "feature_generation.ipynb"


#  Категориальные признаки 
cat_features = ['okrug', 'district', 'building_type_int']

#  Для сигнатуры преобразуем категориальные колонки в строки 
X_sig = X_final.copy()
X_sig[cat_features] = X_sig[cat_features].astype(str)

#  Получение сигнатуры
signature = infer_signature(X_sig, catboost_model.predict(Pool(X_final, cat_features=cat_features)))

#  Логирование в MLflow 
with mlflow.start_run(run_name="feature_generation_run"):

    # 1. Параметры модели (если есть словарь с ними)
    mlflow.log_params(model_params)

    # 2. Метрики (если они уже посчитаны)
    mlflow.log_metric("RMSE", rmse)
    mlflow.log_metric("MAE", mae)
    mlflow.log_metric("R2", r2)

    # 3. Модель с сигнатурой и регистрация в Model Registry
    mlflow.catboost.log_model(
        catboost_model,
        artifact_path="catboost_model",
        signature=signature,
        registered_model_name="FeatureGen_CatBoost_Model"
    )

    # 4. Артефакты: датасет и ноутбук
    mlflow.log_artifact(X_FINAL_PATH, artifact_path="data")
    mlflow.log_artifact(NOTEBOOK_PATH, artifact_path="notebook")

print("Модель, сигнатура и артефакты успешно залогированы в MLflow!")

  inputs = _infer_schema(model_input) if model_input is not None else None
Registered model 'FeatureGen_CatBoost_Model' already exists. Creating a new version of this model...
2025/09/07 22:44:10 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: FeatureGen_CatBoost_Model, version 2
Created version '2' of model 'FeatureGen_CatBoost_Model'.


Модель, сигнатура и артефакты успешно залогированы в MLflow!
