In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.base import BaseEstimator, TransformerMixin

# Трансформер для преобразования категориальных признаков

In [5]:
class TypeConverter(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X = X.copy()
        for col in self.columns:
            X[col] = X[col].astype(str)
        return X

In [6]:
df = pd.read_excel("df_filtered.xlsx")

current_year = pd.Timestamp.now().year
df_cleaned = df[
    (df['Общая площадь'] <= 200) &
    (df['Этаж'] <= 50) &
    (df['Год постройки'] >= (current_year - 100)) &
    (df['Высота потолков'] <= 5) &
    (df['Цена'] <= 100000000) &
    (df['Площадь кухни'] <= 50)
]

X = df_cleaned.drop(columns=['Цена'])
y = df_cleaned['Цена']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

categorical_features = X_train.select_dtypes(include=['object', 'category']).columns.tolist()

# Пайплайн

In [7]:
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(
            drop='first',
            handle_unknown='ignore',
            sparse_output=False
        ), categorical_features)
    ],
    remainder='passthrough'
)

pipeline = Pipeline([
    ('type_converter', TypeConverter(columns=categorical_features)),
    ('preprocessor', preprocessor),
    ('model', RandomForestRegressor(
        random_state=42,
        n_estimators=100,
        n_jobs=-1
    ))
])

In [8]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

r2 = r2_score(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mape = np.mean(np.abs((y_test - y_pred) / y_test)) * 100

print(f"R^2: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAPE: {mape:.2f}%")



R^2: 0.9603
RMSE: 2186516.0462
MAPE: 4.75%
