In [1]:
# Optional one-time setup: uncomment the lines below to install the two
# third-party packages this notebook imports (category_encoders, xgboost).
# %pip install category_encoders
# %pip install xgboost

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [3]:
# Load the cleaned listings table and separate the features from the target.
#
# The CSV location can be overridden through the REAL_ESTATE_DATA_CSV
# environment variable so the notebook is runnable on machines other than the
# one it was written on. When the variable is unset, the original absolute
# path is used unchanged.
import os
from pathlib import Path

DATA_CSV = Path(os.environ.get(
    'REAL_ESTATE_DATA_CSV',
    'C:\\Users\\DOR CO\\Desktop\\Real state ML\\Real-Estate-Price-Estimator\\DataSet\\DataCleaned.csv',
))

db = pd.read_csv(DATA_CSV)
x = db.drop(columns=['price'])  # features: every column except the target
y = db['price']                 # regression target

In [4]:
import category_encoders as ce
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

# Column groups consumed by the preprocessing step.
target_encoded_cols = ["Province", "City"]
scaled_cols = [
    'property_quality', 'rooms', 'living_area', 'bathrooms', 'garden_sqm',
    'terrace_sqm', 'land_area', 'distance_from_airport', 'Skiresort_distance',
    'terrace', 'garden', 'pool', 'car_box', 'land',
]

# Target-encode the two location columns and standardize the listed columns.
# Any column not named in either group is dropped (remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('target', ce.TargetEncoder(cols=list(target_encoded_cols)), target_encoded_cols),
        ('scaler', StandardScaler(), scaled_cols),
    ],
    remainder='drop',
)

In [5]:
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
# Gradient-boosted tree regressor used as the final pipeline step.
xgb_regressor = XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
    booster='gbtree',
    tree_method='hist',
    n_jobs=-1,
    random_state=42,
)

# Full estimator: preprocessing ('prep') followed by the regressor ('model').
# The step names matter: the hyper-parameter grid addresses them by name.
pipe = Pipeline([
    ('prep', preprocessor),
    ('model', xgb_regressor),
])



In [6]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [7]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import GridSearchCV
# Hyper-parameter candidates, keyed relative to the pipeline's 'model' step.
# 2 * 3**5 = 486 combinations in total.
param_grid = dict(
    model__n_estimators=[500, 1000],
    model__learning_rate=[0.01, 0.05, 0.1],
    model__max_depth=[4, 6, 8],
    model__min_child_weight=[1, 5, 10],
    model__subsample=[0.7, 0.9, 1.0],
    model__colsample_bytree=[0.7, 0.8, 1.0],
)

# Train on log1p(price) and map predictions back to prices with expm1.
reg = TransformedTargetRegressor(regressor=pipe, func=np.log1p, inverse_func=np.expm1)

# The pipeline sits under the wrapper's `regressor` parameter, so each grid key
# needs that extra prefix before GridSearchCV can route it.
search_space = {
    f'regressor__{name}': candidates
    for name, candidates in param_grid.items()
}

# Exhaustive 3-fold search scored by (negated) RMSE.
# NOTE(review): the XGBRegressor in `pipe` is also configured with n_jobs=-1;
# combined with n_jobs=-1 here this may oversubscribe CPU cores — confirm.
grid = GridSearchCV(
    estimator=reg,
    param_grid=search_space,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)


In [8]:
# Run the grid search on the training split and keep the best estimator it
# exposes after fitting.
best_model = grid.fit(x_train, y_train).best_estimator_

# Predictions for the held-out rows, reused by the evaluation cells below.
y_pred = best_model.predict(x_test)

Fitting 3 folds for each of 486 candidates, totalling 1458 fits


In [9]:
# Show predictions (left column) next to the actual prices (right column),
# printed with two decimals.
np.set_printoptions(precision=2)
pred_vs_actual = np.column_stack((y_pred, y_test.values))
print(pred_vs_actual)

[[1043026.75  780000.  ]
 [ 832320.62  480000.  ]
 [ 114952.47   83000.  ]
 ...
 [ 406776.    320000.  ]
 [  76214.76   33000.  ]
 [ 100944.78  300000.  ]]


In [10]:
# Per-row comparison table on the test split:
#   Error          signed difference, predicted minus actual
#   AbsPctError_%  absolute error as a percentage of the actual value
#                  (an actual value of 0 becomes NaN to avoid dividing by zero)
comp = (
    pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}, index=y_test.index)
    .assign(
        Error=lambda frame: frame['Predicted'] - frame['Actual'],
        **{
            'AbsPctError_%': lambda frame: (
                frame['Error'].abs() / frame['Actual'].replace(0, np.nan)
            ) * 100
        },
    )
)
print(comp.head(10))

        Actual     Predicted         Error  AbsPctError_%
8413    780000  1.043027e+06  2.630268e+05      33.721378
15648   480000  8.323206e+05  3.523206e+05      73.400130
5194     83000  1.149525e+05  3.195247e+04      38.496950
7534    395000  4.080734e+05  1.307338e+04       3.309715
16106   740000  7.417624e+05  1.762438e+03       0.238167
13897   310000  5.440004e+05  2.340004e+05      75.483992
3021    550000  3.664768e+05 -1.835232e+05      33.367858
1962    980000  5.115086e+05 -4.684914e+05      47.805249
6330    125000  5.540290e+04 -6.959710e+04      55.677681
4580   2358600  2.930375e+05 -2.065562e+06      87.575785


In [11]:
from sklearn.metrics import r2_score
# Coefficient of determination on the held-out split, shown to 4 decimals.
test_r2 = r2_score(y_test, y_pred)
print(f"R²: {test_r2:.4f}")

R²: 0.4270


In [12]:
from sklearn.metrics import r2_score
import numpy as np

# 1) Baseline: predict the training-set mean price for every test row.
train_mean_price = y_train.mean()
y_base = np.full(y_test.shape, train_mean_price, dtype=float)
baseline_r2 = r2_score(y_test, y_base)
print("Baseline R² (train mean):", baseline_r2)

# 2) R² of best_model on the training split versus the test split; a large
#    gap between the two indicates overfitting.
y_pred_train = best_model.predict(x_train)
train_r2 = r2_score(y_train, y_pred_train)
print("Train R²:", train_r2)
test_split_r2 = r2_score(y_test, best_model.predict(x_test))
print("Test  R²:", test_split_r2)


Baseline R² (train mean): -1.747626554582382e-08
Train R²: 0.6901198625564575
Test  R²: 0.4270426630973816
