In [131]:
# Split into train/test before any preprocessing.
# The Pipeline guarantees that StandardScaler and OneHotEncoder are fit on the training set only.
# Test data only goes through transform → this prevents data leakage.
# After training, the whole Pipeline can be saved and reused on new data.

In [132]:
import joblib
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor

In [133]:
# Load the prepared dataset and separate the regression target from the features.
# NOTE(review): the displayed frame shows an "Unnamed: 0" column (looks like a
# saved index). It stays in X, but it is in neither feature list used later.
TARGET_COLUMN = "price"

df = pd.read_csv("./data/Input_data_filled.csv")
y = df[TARGET_COLUMN]
X = df.drop(columns=[TARGET_COLUMN])

# Preview the first three rows.
df.head(3)

Unnamed: 0,postal_code,price,number_of_bedrooms,living_area,equiped_kitchen,furnished,terrace,garden,swimming_pool,property_type
0,2800,329000,3.0,104,0,0,1,0,0,Apartment_Apartment
1,2200,425000,3.0,378,0,0,1,1,0,House_Residence
2,2840,264700,1.0,69,0,0,1,0,0,Apartment_Apartment


In [134]:
def evaluate_model(model, X_test, y_test):
    """Score a fitted model on held-out data.

    Args:
        model: Object exposing ``predict(X)``.
        X_test: Features passed to ``model.predict``.
        y_test: True target values the predictions are compared against.

    Returns:
        dict: A single-entry mapping ``{"r2": <R² of the predictions>}``.
    """
    return {"r2": r2_score(y_test, model.predict(X_test))}

In [135]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Columns that will be one-hot encoded.
categorical_features = ["postal_code", "property_type"]

# Columns that will be standardised (counts, area and 0/1 flags).
numeric_features = [
    "number_of_bedrooms",
    "living_area",
    "equiped_kitchen",
    "furnished",
    "terrace",
    "garden",
    "swimming_pool",
]

In [136]:
# Column-wise preprocessing. Nothing is fit here: the transformers are fit
# later, when the enclosing pipeline is fit on the training split only.
numeric_step = ("num", StandardScaler(), numeric_features)
categorical_step = (
    "cat",
    # "ignore": categories not seen during fit do not raise at transform time.
    OneHotEncoder(handle_unknown="ignore"),
    categorical_features,
)

preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [137]:
# Gradient-boosted regressor; the seed is fixed for reproducibility.
xgb_model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=750,
    learning_rate=0.0125,
    max_depth=6,
    random_state=42,
)

# Preprocessing and model chained into one estimator, so that fit/predict
# always apply the same transformations.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("model", xgb_model),
    ]
)

In [138]:
# Fit the preprocessing and the model on the training split only.
pipeline.fit(X=X_train, y=y_train)

In [139]:
# Predict prices for the held-out rows.
y_pred = pipeline.predict(X=X_test)

In [140]:
# Coefficient of determination on the held-out split.
r2 = r2_score(y_true=y_test, y_pred=y_pred)
print("R²:", r2)

R²: 0.6698138117790222


In [141]:
# Persist the whole fitted pipeline (preprocessing + model) as one artifact,
# so new data can be scored without re-fitting the scaler/encoder.
# `joblib` is imported in the notebook's import cell with the other dependencies.
joblib.dump(pipeline, "xgb_pipeline.pkl")

['xgb_pipeline.pkl']

In [142]:
# Round-trip check: reload the saved pipeline and score the same test rows.
# joblib.load unpickles the file, so only load artifacts from a trusted source.
loaded_pipeline = joblib.load(filename="xgb_pipeline.pkl")

# NOTE: this overwrites the y_pred produced by the in-memory pipeline.
y_pred = loaded_pipeline.predict(X=X_test)