In [136]:
import pandas as pd
import numpy as np
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
import joblib

# 1. Load the dataset and separate the target column from the features.
#    NOTE(review): the file name suggests missing values are already filled and
#    categorical columns already encoded -- confirm against the preprocessing step.
df = pd.read_csv("./data/Input_data_filled_encoded.csv")

y = df["price"]
X = df.drop(columns=["price"])

# Keep 20% of the rows as a hold-out set; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# 2. Feature scaling.
#    The scaler is fitted on the training split only and then applied to both
#    splits, so no statistics from the test rows enter the transformation.
scaler = StandardScaler().fit(X_train)

X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# 3. Build the XGBoost regressor.
#    The ranges in the trailing comments are the original author's notes
#    (presumably the intervals considered during tuning).
model = XGBRegressor(
    objective="reg:squarederror",
    n_estimators=650,     # 100 - 1000
    learning_rate=0.013,  # 0.01 - 0.03
    max_depth=6,          # 3 - 10
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
)

# 4. Train the model on the scaled training split.
model.fit(X_train_scaled, y_train)

# 5. Model quality evaluation
def evaluate(model, X, y):
    """Score a fitted model on one dataset.

    Args:
        model: Object exposing ``predict(X)``.
        X: Feature matrix passed to ``model.predict``.
        y: True target values matching the rows of ``X``.

    Returns:
        dict: A single-entry mapping ``{"R2": <coefficient of determination>}``.
        Only R2 is reported; MAE and RMSE are not computed here.
    """
    predictions = model.predict(X)
    metrics = {"R2": r2_score(y, predictions)}
    return metrics

# Report R2 on the training split first, then on the held-out test split.
print("Train metrics:", evaluate(model=model, X=X_train_scaled, y=y_train))
print("Test metrics:", evaluate(model=model, X=X_test_scaled, y=y_test))

# 6. Overfitting check
def overfitting(model, X_train, y_train, X_test, y_test):
    """Print the gap between training R2 and test R2.

    Args:
        model: Object exposing ``predict(X)``.
        X_train: Training features passed to ``model.predict``.
        y_train: True targets for ``X_train``.
        X_test: Test features passed to ``model.predict``.
        y_test: True targets for ``X_test``.

    Returns:
        None. The difference ``R2(train) - R2(test)`` is printed with four
        decimal places; a larger positive value means the model scores
        better on the data it was trained on than on the test data.
    """
    train_score = r2_score(y_train, model.predict(X_train))
    test_score = r2_score(y_test, model.predict(X_test))
    gap = train_score - test_score
    print(f"R2 train - R2 test: {gap:.4f}")

overfitting(model, X_train_scaled, y_train, X_test_scaled, y_test)

# Persist the evaluated model and the scaler that was fitted on the training split.
joblib.dump(model, "xgb_model.pkl")
joblib.dump(scaler, "scaler.pkl")

# Reload both artifacts and verify the round trip: the reloaded model must
# reproduce the in-memory model's predictions on the test split.
loaded_model = joblib.load("xgb_model.pkl")
loaded_scaler = joblib.load("scaler.pkl")
assert np.allclose(
    loaded_model.predict(X_test_scaled), model.predict(X_test_scaled)
), "reloaded model does not reproduce the predictions of the saved model"

# Final training of the model on the entire sample.
# The reloaded scaler is refitted on all rows; `scaler` (train-only statistics)
# is a separate object and is left untouched.
loaded_scaler.fit(X)
X_full_scaled = loaded_scaler.transform(X)

# Take the hyperparameters from the evaluated model instead of repeating them,
# so the final model cannot silently drift from the configuration that was scored.
final_model = XGBRegressor(**model.get_params())

final_model.fit(X_full_scaled, y)

# NOTE(review): these artifacts are pickles; only load them from trusted locations.
joblib.dump(final_model, "xgb_model_final.pkl")
joblib.dump(loaded_scaler, "scaler_final.pkl")

Train metrics: {'R2': 0.7887498140335083}
Test metrics: {'R2': 0.7002766728401184}
R2 train - R2 test: 0.0885


['scaler_final.pkl']