In [7]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load the dataset from the processed-data folder and show it as the cell output
csv_path = "../data/processed/new_df.csv"
df = pd.read_csv(csv_path)
df

Unnamed: 0,total_meters,price
0,32.0,13590000
1,41.0,22600000
2,79.0,75000000
3,40.1,35500000
4,42.5,7150000
...,...,...
1486,90.1,67400000
1487,117.0,65000000
1488,130.0,61900000
1489,67.1,50000000


In [8]:
# Target column.
y = df["price"]

# Features: every column except the target. Columns whose name starts with
# "Unnamed:" are header-less CSV columns as named by pandas; in the frame
# displayed above, "Unnamed: 0" appears to be just a running row counter
# (0, 1, 2, ...), so it is excluded instead of being fed to the models as a
# predictor.
index_like_cols = [col for col in df.columns if str(col).startswith("Unnamed:")]
X = df.drop(columns=["price", *index_like_cols])

In [10]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [11]:
# Standardize the features: the scaler is fitted on the training split only
# and then applied unchanged to the test split.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Candidate regressors, keyed by the label shown in the results table.
models = {
    "Linear Regression": LinearRegression(),
    "Random Forest": RandomForestRegressor(random_state=42),
    "Gradient Boosting": GradientBoostingRegressor(random_state=42),
    "K-Neighbors": KNeighborsRegressor(),
}

# Models listed here are trained and evaluated on the standardized features;
# every other model receives the raw feature frame.
needs_scaling = ("K-Neighbors",)

# Fit each candidate and collect its metrics on the held-out split.
results = []
for name, model in models.items():
    use_scaled = name in needs_scaling
    fit_features = X_train_scaled if use_scaled else X_train
    eval_features = X_test_scaled if use_scaled else X_test

    model.fit(fit_features, y_train)
    y_pred = model.predict(eval_features)

    # Error metrics; RMSE is derived from MSE.
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    results.append(
        {
            "Model": name,
            "MAE": mae,
            "MSE": mse,
            "RMSE": rmse,
            "R2": r2,
        }
    )

# One row per model, printed with the highest R2 first.
results_df = pd.DataFrame(results)
print(results_df.sort_values(by="R2", ascending=False))

               Model           MAE           MSE          RMSE        R2
2  Gradient Boosting  1.056417e+07  2.274623e+14  1.508185e+07  0.564824
0  Linear Regression  1.093351e+07  2.325226e+14  1.524869e+07  0.555143
1      Random Forest  1.112832e+07  2.484030e+14  1.576081e+07  0.524761
3        K-Neighbors  1.138547e+07  2.572431e+14  1.603880e+07  0.507848


In [12]:
from joblib import dump, load
from sklearn.base import clone
from sklearn.pipeline import make_pipeline

# Persist the model that actually ranked first in the comparison above
# (highest R2 on the test split) instead of a hard-coded estimator type.
best_name = results_df.loc[results_df["R2"].idxmax(), "Model"]

# Refit a fresh, unfitted copy so the entries in `models` stay untouched.
best_model = clone(models[best_name])
if best_name == "K-Neighbors":
    # K-Neighbors was evaluated on standardized features, so bundle the
    # scaler with it; the saved artifact then accepts raw features like the
    # other candidates do.
    best_model = make_pipeline(StandardScaler(), best_model)
best_model.fit(X_train, y_train)

# Written to the notebook's working directory; dump() returns the list of
# written file names, which is shown as the cell output.
dump(best_model, "best_model.joblib")

['best_model.joblib']