In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Load the property listings and split them into features and target.
#
# The CSV location used to be a single hard-coded absolute path, which only
# resolves on the machine the notebook was written on. That path is kept as
# the default so existing runs behave the same, but it can now be overridden
# by setting the PROPERTIES_CSV environment variable before starting the kernel.
import os

DEFAULT_PROPERTIES_CSV = 'C:\\Users\\DOR CO\\Desktop\\Real state ML\\Real-Estate-Price-Estimator\\gateaway\\properties.csv'

# `or` (rather than a .get default) so an empty PROPERTIES_CSV also falls back.
properties_csv = os.environ.get('PROPERTIES_CSV') or DEFAULT_PROPERTIES_CSV

# Fail early with an actionable message; pd.read_csv would raise the same
# exception type, but without telling the reader how to fix it.
if not os.path.isfile(properties_csv):
    raise FileNotFoundError(
        f"Properties CSV not found at {properties_csv!r}. "
        "Set the PROPERTIES_CSV environment variable to the file's location."
    )

db = pd.read_csv(properties_csv)

# x: every column except the target; y: the 'price' column to be predicted.
x = db.drop(columns=['price'])
y = db['price']

In [None]:
import category_encoders as ce
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer

# Column groups, one per preprocessing strategy.
target_encoded_cols = ["Province", "City"]
one_hot_cols = ["property_type"]
scaled_cols = [
    'property_quality', 'rooms', 'living_area', 'bathrooms',
    'garden_sqm', 'terrace_sqm', 'land_area',
    'distance_from_airport', 'Skiresort_distance',
    'terrace', 'garden', 'pool', 'car_box',
]

# Each entry is (step name, transformer, columns the transformer receives).
# The encoder gets its own copy of the column list so the two arguments
# remain independent list objects.
preprocessor = ColumnTransformer(
    transformers=[
        ('target', ce.TargetEncoder(cols=list(target_encoded_cols)), target_encoded_cols),
        ('onehot', OneHotEncoder(sparse_output=False, handle_unknown='ignore'), one_hot_cols),
        ('scaler', StandardScaler(), scaled_cols),
    ],
    remainder='drop',  # columns not named in any group are discarded
)

In [None]:
from xgboost import XGBRegressor
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
# XGBoost regressor configured with the 'gblinear' booster and a fixed seed.
gblinear_regressor = XGBRegressor(random_state=42, booster='gblinear')

# Two-step pipeline: 'prep' runs `preprocessor`, 'model' is the regressor.
# The step names matter: the grid search addresses the regressor's
# parameters through the 'model__' prefix.
pipe = Pipeline(
    steps=[
        ('prep', preprocessor),
        ('model', gblinear_regressor),
    ]
)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows as a test set; random_state=42 pins the shuffle
# so the same split is produced on every run.
(x_train, x_test,
 y_train, y_test) = train_test_split(
    x,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:

from sklearn.model_selection import GridSearchCV
# Candidate values for the regressor, keyed by bare parameter name.
# 2 * 2 * 3 * 3 * 2 = 72 combinations in total.
_model_search_space = {
    'booster': ['gblinear'],
    'n_estimators': [300, 700],
    'eta': [0.05, 0.1],
    'reg_alpha': [0.0, 0.1, 1.0],
    'reg_lambda': [0.1, 1.0, 10.0],
    'updater': ['coord_descent'],
    'feature_selector': ['thrifty'],
    # NOTE(review): the preprocessor looks like it emits well under 100
    # columns, so both top_k values may behave the same — confirm.
    'top_k': [100, 250],
}

# Prefix every key with 'model__' so the search targets the pipeline step
# named 'model'.
param_grid = {
    f'model__{name}': candidates
    for name, candidates in _model_search_space.items()
}
# Grid search over `param_grid` on the whole pipeline (preprocessing included),
# using 3-fold cross-validation and the 'neg_root_mean_squared_error' scorer.
# n_jobs=-1 and verbose=2 are passed through unchanged.
grid = GridSearchCV(
    pipe,
    param_grid,
    cv=3,
    scoring='neg_root_mean_squared_error',
    verbose=2,
    n_jobs=-1,
)

In [None]:
# Run the search on the training split and keep the best pipeline it found.
# `fit` returns the search object itself, so `grid` is fitted as well.
best_model = grid.fit(x_train, y_train).best_estimator_

# Predictions for the held-out test features.
y_pred = best_model.predict(x_test)

In [None]:
# Show floats with two decimals in every NumPy printout from here on.
np.set_printoptions(precision=2)

# Two-column array: predicted value on the left, actual value on the right.
predicted_vs_actual = np.column_stack((y_pred, y_test.values))
print(predicted_vs_actual)

In [None]:
# Per-row comparison of actual and predicted values on the test split,
# indexed like y_test.
comp = (
    pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}, index=y_test.index)
    # Signed error: positive when the model over-predicts.
    .assign(Error=lambda frame: frame['Predicted'] - frame['Actual'])
    # Absolute percentage error; a zero actual becomes NaN so the division
    # yields NaN rather than inf.
    .assign(**{
        'AbsPctError_%': lambda frame: (
            frame['Error'].abs() / frame['Actual'].replace(0, np.nan)
        ) * 100
    })
)
print(comp.head(10))

In [None]:
from sklearn.metrics import r2_score
# R² of the predictions against the actual test-set values.
holdout_r2 = r2_score(y_test, y_pred)
print(f"R²: {holdout_r2:.4f}")