In [1]:
# %pip install category_encoders
# %pip install xgboost

In [2]:
import os
from pathlib import Path

import joblib
import numpy as np
import pandas as pd


In [3]:
# Location of the cleaned dataset. Set the REAL_ESTATE_DATA_PATH environment
# variable to run the notebook on another machine; when it is unset, the
# original local path is used so existing behaviour is unchanged.
DATA_PATH = Path(os.environ.get(
    'REAL_ESTATE_DATA_PATH',
    'C:\\Users\\DOR CO\\Desktop\\Real state ML\\Real-Estate-Price-Estimator\\DataSet\\DataCleaned.csv',
))

# Name of the column the model is trained to predict.
TARGET_COLUMN = 'price'

# Fail early with an actionable message instead of a bare read error.
if not DATA_PATH.is_file():
    raise FileNotFoundError(
        f"Dataset not found at '{DATA_PATH}'. "
        "Set the REAL_ESTATE_DATA_PATH environment variable to the location of DataCleaned.csv."
    )

db = pd.read_csv(DATA_PATH)

# Features are every column except the target; the target is kept separately.
x = db.drop(columns=[TARGET_COLUMN])
y = db[TARGET_COLUMN]

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler

# Columns passed to the scaler, in the order they are handed to the model.
# NOTE(review): `city_median` / `province_median` look like price-derived
# aggregates; if they were computed on the full dataset before the train/test
# split they would leak the target -- confirm how DataCleaned.csv builds them.
SCALED_FEATURES = [
    'property_quality', 'rooms', 'living_area', 'bathrooms', 'garden_sqm',
    'terrace_sqm', 'land_area', 'distance_from_airport', 'Skiresort_distance',
    'terrace', 'garden', 'pool', 'car_box', 'land', 'property_type',
    'city_median', 'City', 'province_median', 'Province',
]

# Standardise the listed columns; any column that is not listed is dropped.
preprocessor = ColumnTransformer(
    transformers=[('scaler', StandardScaler(), SCALED_FEATURES)],
    remainder='drop',
)

In [5]:
from sklearn.pipeline import Pipeline
from xgboost import XGBRegressor

# Fixed XGBoost settings; the remaining hyper-parameters are tuned later by
# the grid search through the 'model' step name.
xgb_settings = dict(
    booster='gbtree',
    tree_method='hist',
    random_state=42,
    n_jobs=-1,
    objective='reg:squarederror',
    eval_metric='rmse',
)

# Two-step pipeline: column preprocessing ('prep') followed by the regressor ('model').
pipe = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', XGBRegressor(**xgb_settings)),
])



In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
split_parts = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)
x_train, x_test, y_train, y_test = split_parts

In [7]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters, keyed relative to the pipeline's 'model' step.
# 2 * 3 * 3 * 3 * 3 * 3 = 486 combinations.
param_grid = dict(
    model__n_estimators=[500, 1000],
    model__learning_rate=[0.01, 0.05, 0.1],
    model__max_depth=[4, 6, 8],
    model__min_child_weight=[1, 5, 10],
    model__subsample=[0.7, 0.9, 1.0],
    model__colsample_bytree=[0.7, 0.8, 1.0],
)

# Fit on log1p(price) and map predictions back with expm1, so scoring and
# predictions are expressed on the original price scale.
reg = TransformedTargetRegressor(regressor=pipe, func=np.log1p, inverse_func=np.expm1)

# The pipeline sits under the wrapper's `regressor` attribute, so every key
# needs the extra 'regressor__' level before GridSearchCV can route it.
nested_param_grid = {
    step_param.replace('model__', 'regressor__model__'): candidates
    for step_param, candidates in param_grid.items()
}

# Exhaustive 3-fold search scored by (negated) RMSE.
# NOTE(review): the XGBRegressor in `pipe` is also created with n_jobs=-1, so
# the search workers and the model threads may compete for the same cores --
# consider parallelising at only one of the two levels.
grid = GridSearchCV(
    estimator=reg,
    param_grid=nested_param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)


In [8]:
# Run the full grid search on the training split (fit returns the fitted
# search object), keep the refitted best estimator, and predict the held-out rows.
best_model = grid.fit(x_train, y_train).best_estimator_
y_pred = best_model.predict(x_test)

Fitting 3 folds for each of 486 candidates, totalling 1458 fits


In [9]:
# Show predictions (left column) next to the actual prices (right column),
# rounded to two decimals. Note: set_printoptions changes NumPy's global
# print precision for the rest of the session.
np.set_printoptions(precision=2)
print(np.column_stack((y_pred, y_test.values)))

[[1005516.12  780000.  ]
 [ 883647.31  480000.  ]
 [ 119808.45   83000.  ]
 ...
 [ 359739.59  320000.  ]
 [  55584.96   33000.  ]
 [ 124352.38  300000.  ]]


In [10]:
# Per-row comparison table: actual price, prediction, signed error and
# absolute percentage error. Zero actual prices become NaN so the percentage
# column never divides by zero.
comp = (
    pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}, index=y_test.index)
    .assign(Error=lambda frame: frame['Predicted'] - frame['Actual'])
    .assign(**{
        'AbsPctError_%': lambda frame: (
            frame['Error'].abs() / frame['Actual'].replace(0, np.nan)
        ) * 100
    })
)
print(comp.head(10))

        Actual     Predicted         Error  AbsPctError_%
8413    780000  1.005516e+06  2.255161e+05      28.912324
15648   480000  8.836473e+05  4.036473e+05      84.093190
5194     83000  1.198085e+05  3.680845e+04      44.347534
7534    395000  5.871779e+05  1.921779e+05      48.652642
16106   740000  7.783062e+05  3.830619e+04       5.176512
13897   310000  5.586713e+05  2.486713e+05      80.216552
3021    550000  2.625661e+05 -2.874339e+05      52.260710
1962    980000  6.711145e+05 -3.088855e+05      31.518929
6330    125000  5.991307e+04 -6.508693e+04      52.069547
4580   2358600  1.078047e+06 -1.280553e+06      54.292944


In [11]:
from sklearn.metrics import r2_score

# Coefficient of determination of the best model on the held-out split.
holdout_r2 = r2_score(y_test, y_pred)
print(f"R²: {holdout_r2:.4f}")

R²: 0.4525


In [12]:
from sklearn.metrics import r2_score
import numpy as np

# 1) Baseline: predict the training-set mean price for every test row.
train_mean_price = y_train.mean()
y_base = np.full_like(y_test, fill_value=train_mean_price, dtype=float)
print("Baseline R² (train mean):", r2_score(y_test, y_base))

# 2) R² of best_model on the training split versus the held-out split;
#    a large gap between the two indicates overfitting.
y_pred_train = best_model.predict(x_train)
r2_by_split = {
    "Train R²:": r2_score(y_train, y_pred_train),
    "Test  R²:": r2_score(y_test, best_model.predict(x_test)),
}
for split_label, split_r2 in r2_by_split.items():
    print(split_label, split_r2)


Baseline R² (train mean): -1.747626554582382e-08
Train R²: 0.759198784828186
Test  R²: 0.4524901509284973


In [13]:

# joblib.dump(best_model, "real_estate_model.pkl")
# print("Model saved successfully!")

In [14]:

# loaded_model = joblib.load("real_estate_model.pkl")

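# # Example: predict with new input.
# # NOTE: as written, this sample does not match the columns the pipeline
# # selects: it omits `city_median` and `province_median`, and it passes
# # `Province` / `City` as strings although both are fed to StandardScaler,
# # which requires numeric values. Add and encode those columns the same way
# # DataCleaned.csv does before calling predict.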
# sample_input = pd.DataFrame([{
#     "Province": "Lombardy",
#     "City": "Milan",
#     "property_quality": 8,
#     "rooms": 3,
#     "living_area": 120,
#     "bathrooms": 2,
#     "garden_sqm": 30,
#     "terrace_sqm": 15,
#     "land_area": 0,
#     "distance_from_airport": 20,
#     "Skiresort_distance": 150,
#     "terrace": 1,
#     "garden": 1,
#     "pool": 0,
#     "car_box": 1,
#     "land": 0,
#     "property_type": 1
# }])

# predicted_price = loaded_model.predict(sample_input)
# print("Predicted Price:", predicted_price[0])
