### ПРЕДОБРАБОТКА И ГЕНЕРАЦИЯ НОВЫХ ПРИЗНАКОВ

In [1]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, RobustScaler, FunctionTransformer, KBinsDiscretizer, PolynomialFeatures, OrdinalEncoder, OneHotEncoder
from sklearn.impute import SimpleImputer
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import FunctionTransformer
from autofeat import AutoFeatRegressor


In [2]:
df_path = "/home/mle-user/mle_projects/mle-project-sprint-2-v001/df_filtered.csv"
df = pd.read_csv(df_path)
df.head(5)

Unnamed: 0,id,flat_id,building_id,build_year,building_type_int,latitude,longitude,ceiling_height,flats_count,floors_total,...,is_apartment,studio,total_area,price,decade,distance_from_moscow_center,district,okrug,okrug_num,log_price
0,2,1,18012,2001,2,55.794849,37.608013,3.0,97,10,...,False,False,43.0,13500000.0,2000,4.623059,Марьина Роща,Северо-Восточный,0,16.4182
1,3,2,17821,2000,4,55.74004,37.761742,2.7,80,10,...,False,False,56.0,13500000.0,2000,8.954583,Перово,Восточный,1,16.4182
2,4,3,18579,2002,4,55.672016,37.570877,2.64,771,17,...,False,False,76.0,20000000.0,2000,9.62713,Черёмушки,Юго-Западный,2,16.811243
3,5,4,9293,1971,1,55.808807,37.707306,2.6,208,9,...,False,False,24.0,5200000.0,1970,8.156988,Богородское,Восточный,1,15.464169
4,6,5,23964,2017,4,55.724728,37.743069,2.7,192,17,...,False,False,51.009998,8490104.0,2010,8.312821,Рязанский,Юго-Восточный,3,15.954412


In [3]:
# чек на всякий случай
df.dtypes

id                               int64
flat_id                          int64
building_id                      int64
build_year                       int64
building_type_int                int64
latitude                       float64
longitude                      float64
ceiling_height                 float64
flats_count                      int64
floors_total                     int64
has_elevator                      bool
floor                            int64
kitchen_area                   float64
living_area                    float64
rooms                            int64
is_apartment                      bool
studio                            bool
total_area                     float64
price                          float64
decade                           int64
distance_from_moscow_center    float64
district                        object
okrug                           object
okrug_num                        int64
log_price                      float64
dtype: object

In [17]:
# --- Определяем колонки ---
poly_cols = ['ceiling_height', 'flats_count', 'floors_total', 'floor',
             'kitchen_area', 'living_area', 'rooms', 'total_area']

kbins_cols = ['distance_from_moscow_center']

ordinal_cols = ['decade']

onehot_cols = ['okrug']

bool_cols = ['has_elevator', 'is_apartment', 'studio']

numeric_cols = poly_cols + kbins_cols

leave_cols = ['building_type_int']  # просто оставляем без изменений

# --- ColumnTransformer ---
preprocessor = ColumnTransformer(transformers=[
    ('poly', PolynomialFeatures(degree=2, include_bias=False), poly_cols),
    ('robust', RobustScaler(), numeric_cols),
    ('kbins', KBinsDiscretizer(n_bins=5, encode='onehot-dense', strategy='quantile'), kbins_cols),
    ('ordinal', OrdinalEncoder(), ordinal_cols),
    ('onehot', OneHotEncoder(sparse=False, handle_unknown='ignore'), onehot_cols),
    ('passthrough_bool', 'passthrough', bool_cols)
])

# --- Pipeline ---
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor)
])

# --- Разделяем X и y ---
X = df.drop(columns=['price'])
y = df['price']

# --- Фитим и трансформируем ---
pipeline.fit(X, y)
X_transformed = pipeline.transform(X)

# --- Преобразуем в DataFrame ---
X_transformed_df = pd.DataFrame(
    X_transformed,
    columns=pipeline.get_feature_names_out(),
    index=X.index
)
print("Форма после Pipeline:", X_transformed_df.shape)

# --- Настройка Autofeat на оригинальные числовые колонки ---
numeric_for_autofeat = numeric_cols  # только исходные числовые колонки

af_reg = AutoFeatRegressor(
    verbose=1,
    feateng_steps=1,  # один шаг генерации
    transformations=['log', '1/', 'sqrt']  # log1p, 1/x и sqrt
)

# --- Фитим Autofeat ---
af_reg.fit(X[numeric_for_autofeat], y)

# --- Генерируем новые признаки ---
X_autofeat = af_reg.transform(X[numeric_for_autofeat])

# --- Объединяем с трансформированным DataFrame ---
X_final = pd.concat([X_transformed_df, X_autofeat.drop(columns=numeric_for_autofeat)], axis=1)

print("Форма после генерации новых признаков:", X_final.shape)


2025-09-05 21:49:47,388 INFO: [AutoFeat] It is much more efficient to call fit_transform() instead of fit() and transform()!
2025-09-05 21:49:47,393 INFO: [AutoFeat] The 1 step feature engineering process could generate up to 27 features.
2025-09-05 21:49:47,394 INFO: [AutoFeat] With 127711 data points this new feature matrix would use about 0.01 gb of space.
2025-09-05 21:49:47,398 INFO: [feateng] Step 1: transformation of original features


Форма после Pipeline: (127711, 74)
[feateng]               0/              9 features transformed

2025-09-05 21:49:48,204 INFO: [feateng] Generated 27 transformed features from 9 original features - done.
2025-09-05 21:49:48,217 INFO: [feateng] Generated altogether 27 new features in 1 steps
2025-09-05 21:49:48,218 INFO: [feateng] Removing correlated features, as well as additions at the highest level
2025-09-05 21:49:48,252 INFO: [feateng] Generated a total of 11 additional features
2025-09-05 21:49:48,273 INFO: [featsel] Feature selection run 1/5


[featsel] Scaling data...done.


2025-09-05 21:49:50,820 INFO: [featsel] Feature selection run 2/5
2025-09-05 21:49:54,375 INFO: [featsel] Feature selection run 3/5
2025-09-05 21:49:57,284 INFO: [featsel] Feature selection run 4/5
2025-09-05 21:49:59,815 INFO: [featsel] Feature selection run 5/5
2025-09-05 21:50:02,345 INFO: [featsel] 17 features after 5 feature selection runs
2025-09-05 21:50:02,491 INFO: [featsel] 17 features after correlation filtering
2025-09-05 21:50:03,452 INFO: [featsel] 16 features after noise filtering
2025-09-05 21:50:03,453 INFO: [AutoFeat] Computing 7 new features.


[AutoFeat]     4/    7 new features

2025-09-05 21:50:04,113 INFO: [AutoFeat]     7/    7 new features ...done.
2025-09-05 21:50:04,119 INFO: [AutoFeat] Final dataframe with 16 feature columns (7 new).
2025-09-05 21:50:04,119 INFO: [AutoFeat] Training final regression model.


[AutoFeat]     6/    7 new features

2025-09-05 21:50:04,413 INFO: [AutoFeat] Trained model: largest coefficients:
2025-09-05 21:50:04,414 INFO: -27398362.907787353
2025-09-05 21:50:04,414 INFO: 136502481.811261 * 1/total_area
2025-09-05 21:50:04,415 INFO: 28868569.081994 * 1/distance_from_moscow_center
2025-09-05 21:50:04,416 INFO: 28331216.861449 * 1/living_area
2025-09-05 21:50:04,417 INFO: 27235538.931887 * 1/kitchen_area
2025-09-05 21:50:04,418 INFO: -9675958.514971 * 1/floors_total
2025-09-05 21:50:04,419 INFO: 5422611.594548 * ceiling_height
2025-09-05 21:50:04,419 INFO: 1984721.204774 * 1/flats_count
2025-09-05 21:50:04,420 INFO: -1758618.920005 * rooms
2025-09-05 21:50:04,422 INFO: -1685312.291948 * 1/floor
2025-09-05 21:50:04,422 INFO: 351515.742925 * total_area
2025-09-05 21:50:04,423 INFO: 306512.222809 * kitchen_area
2025-09-05 21:50:04,424 INFO: -191786.572307 * distance_from_moscow_center
2025-09-05 21:50:04,424 INFO: 49304.571785 * living_area
2025-09-05 21:50:04,425 INFO: 42064.349090 * floors_total
2025-

Форма после генерации новых признаков: (127711, 81)
