In [None]:
# %pip install category_encoders
# %pip install xgboost

In [None]:
import pandas as pd
import numpy as np
import joblib


In [None]:
# Read the DataCleaned.csv table, then separate the target column from the rest.
db = pd.read_csv(
    'C:\\Users\\DOR CO\\Desktop\\Real state ML\\Real-Estate-Price-Estimator\\DataSet\\DataCleaned.csv'
)
y = db['price']                  # regression target
x = db.drop(columns=['price'])   # every column except the target

In [None]:
# Column groups consumed by the preprocessing ColumnTransformer.
# Each group is routed to a different transformer there.
numeric_features = [
    'rooms',
    'bathrooms',
    'property_quality',
    'living_area',
    'garden_sqm',
    'terrace_sqm',
    'land_area',
    'distance_from_airport',
    'Skiresort_distance',
    'city_median',
]
boolean_features = [
    'garden',
    'terrace',
    'land',
    'pool',
    'car_box',
]
categorical_features = [
    'Province',
    'City',
    'property_type',
]

In [None]:
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer

# Per-group preprocessing:
#   * numeric columns are standardised,
#   * boolean flags are forwarded unchanged,
#   * categorical columns are one-hot encoded; categories unseen during fit
#     are ignored at transform time instead of raising.
# Columns that belong to none of the groups are dropped.
column_steps = [
    ('scaler', StandardScaler(), numeric_features),
    ('bool', 'passthrough', boolean_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps, remainder='drop')

In [None]:
from xgboost import XGBRegressor
from sklearn.pipeline import Pipeline

# Fixed XGBoost settings; the remaining hyper-parameters are left at their
# defaults here and searched over by the grid search.
xgb_settings = dict(
    booster='gbtree',
    tree_method='hist',
    random_state=42,
    n_jobs=-1,
    objective='reg:squarederror',
    eval_metric='rmse',
)

# Two-step pipeline: column preprocessing ('prep') feeding the regressor ('model').
# The step names are part of the hyper-parameter keys used by the grid search.
pipe = Pipeline(steps=[
    ('prep', preprocessor),
    ('model', XGBRegressor(**xgb_settings)),
])



In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for final evaluation; the seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
from sklearn.compose import TransformedTargetRegressor
from sklearn.model_selection import GridSearchCV

# Candidate hyper-parameters, keyed relative to the pipeline's 'model' step.
param_grid = {
    'model__n_estimators': [500],
    'model__learning_rate': [0.01, 0.05, 0.1],
    'model__max_depth': [4, 6, 8],
    'model__min_child_weight': [1, 5, 10],
    'model__subsample': [0.7, 0.9, 1.0],
    'model__colsample_bytree': [0.7, 0.8, 1.0],
}

# Train on log1p(price) and map predictions back with expm1, so the wrapper's
# predict() returns values on the original price scale.
reg = TransformedTargetRegressor(regressor=pipe, func=np.log1p, inverse_func=np.expm1)

# The pipeline is nested under the wrapper's `regressor` parameter, so every
# key needs the extra 'regressor__' prefix before it reaches GridSearchCV.
wrapped_param_grid = {
    name.replace('model__', 'regressor__model__'): candidates
    for name, candidates in param_grid.items()
}

# Exhaustive search with 3-fold CV, scored by (negated) RMSE.
grid = GridSearchCV(
    estimator=reg,
    param_grid=wrapped_param_grid,
    scoring='neg_root_mean_squared_error',
    cv=3,
    n_jobs=-1,
    verbose=2,
)


In [None]:
# Run the search on the training split (fit() returns the search object itself),
# keep the winning estimator and predict the held-out rows with it.
best_model = grid.fit(x_train, y_train).best_estimator_
y_pred = best_model.predict(x_test)

In [None]:
# Show predictions (left column) next to the true values (right column).
np.set_printoptions(precision=2)
predicted_column = y_pred.reshape(len(y_pred), 1)
actual_column = y_test.values.reshape(len(y_test), 1)
print(np.concatenate((predicted_column, actual_column), axis=1))

In [None]:
# Per-row comparison table, indexed like y_test.
comp = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}, index=y_test.index)
comp = comp.assign(Error=lambda frame: frame['Predicted'] - frame['Actual'])

# Zero actuals are swapped for NaN first, so those rows get NaN instead of inf.
nonzero_actual = comp['Actual'].replace(0, np.nan)
comp['AbsPctError_%'] = (comp['Error'].abs() / nonzero_actual) * 100

print(comp.head(10))

In [None]:
from sklearn.metrics import r2_score

# Coefficient of determination of the tuned model on the held-out split.
holdout_r2 = r2_score(y_test, y_pred)
print(f"R²: {holdout_r2:.4f}")

In [None]:
from sklearn.metrics import r2_score
import numpy as np

# Reference point: a constant predictor that always outputs the mean of the
# training targets, scored on the test split.
train_mean = y_train.mean()
y_base = np.full_like(y_test, fill_value=train_mean, dtype=float)
baseline_r2 = r2_score(y_test, y_base)
print("Baseline R² (train mean):", baseline_r2)

# Score best_model on the rows it was fitted on and on the held-out rows;
# a large gap between the two suggests over-fitting.
y_pred_train = best_model.predict(x_train)
train_r2 = r2_score(y_train, y_pred_train)
print("Train R²:", train_r2)
heldout_predictions = best_model.predict(x_test)
print("Test  R²:", r2_score(y_test, heldout_predictions))


In [69]:

# Persist the tuned estimator (target transform + preprocessing + model) to disk.
model_path = "C:\\Users\\DOR CO\\Desktop\\Real state ML\\Real-Estate-Price-Estimator\\API\\real_estate_model.pkl"
joblib.dump(best_model, model_path)
print("Model saved successfully!")

Model saved successfully!


In [None]:
# Lookup table read by city_median_lookup, which uses its 'City', 'Province'
# and 'city_median_price' columns.
city_medians_csv = 'C:\\Users\\DOR CO\\Desktop\\Real state ML\\Real-Estate-Price-Estimator\\DataSet\\CityProvinceMedians.csv'
city_db = pd.read_csv(city_medians_csv)

def city_median_lookup(input, lookup=None):
    """Add a ``city_median`` column to ``input`` from the city/province table.

    The city and province are read from the first row of ``input`` and
    compared case-insensitively against the ``City`` and ``Province`` columns
    of the lookup table. Both must match on the same table row; the
    ``city_median_price`` of the first matching row is then written to
    ``input["city_median"]`` (the frame is modified in place).

    Parameters
    ----------
    input : pandas.DataFrame
        Frame with string ``City`` and ``Province`` columns. Only the first
        row is used for the lookup; the value found is assigned to the whole
        ``city_median`` column.
    lookup : pandas.DataFrame, optional
        Table with ``City``, ``Province`` and ``city_median_price`` columns.
        Defaults to the module-level ``city_db``.

    Returns
    -------
    None
        When no row matches the pair, "Not Found" is printed and ``input`` is
        left without a ``city_median`` value.
    """
    table = city_db if lookup is None else lookup
    city = input["City"].values[0].lower()
    province = input["Province"].values[0].lower()

    # Match city and province on the SAME row. Testing each column separately
    # lets a city from one province combine with an unrelated province, after
    # which the selection is empty and indexing its first element fails.
    same_city = table["City"].str.lower() == city
    same_province = table["Province"].str.lower() == province
    medians = table.loc[same_city & same_province, "city_median_price"]

    if medians.empty:
        print("Not Found")
        return

    input["city_median"] = medians.values[0]


In [None]:

# Reload the model from the exact file the notebook wrote with joblib.dump.
# A bare "real_estate_model.pkl" only resolves when the kernel's working
# directory happens to be the API folder; otherwise it fails or picks up a
# stale copy.
# NOTE: joblib.load unpickles the file, so only load model files you produced yourself.
loaded_model = joblib.load("C:\\Users\\DOR CO\\Desktop\\Real state ML\\Real-Estate-Price-Estimator\\API\\real_estate_model.pkl")

# Example: Predict with new input
sample_input = pd.DataFrame([{
    "Province": "Milan",
    "City": "Abbiategrasso",
    "property_quality": 8,
    "rooms": 3,
    "living_area": 120,
    "bathrooms": 2,
    "garden_sqm": 30,
    "terrace_sqm": 15,
    "land_area": 0,
    "distance_from_airport": 20,
    "Skiresort_distance": 150,
    "terrace": 1,
    "garden": 1,
    "pool": 0,
    "car_box": 1,
    "land": 0,
    "property_type": 1
}])

# Fills in sample_input["city_median"], which the pipeline expects as one of
# its numeric features.
city_median_lookup(sample_input)
if "city_median" not in sample_input.columns:
    # The lookup found no matching city/province; fail here with a clear
    # message rather than inside the pipeline's column selection.
    raise ValueError("city_median could not be determined for the given City/Province")

predicted_price = loaded_model.predict(sample_input)
print("Predicted Price:", predicted_price[0])


City median price: 518000.0
Predicted Price: 565915.3
