In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import shap

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from sqlalchemy import create_engine
import pandas as pd

engine = create_engine('mssql+pyodbc://armin:123456@HP-VICTUS-16\\SQLEXPRESS/housing_price_data?driver=ODBC+Driver+18+for+SQL+Server&TrustServerCertificate=yes')

df = pd.read_sql("SELECT * FROM housing_price_data", engine)

In [3]:
X = df.drop("price", axis=1)
y = df["price"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [4]:
pipe = Pipeline([("scaler", StandardScaler()),
                 ("regressor", SVR())])

param_grid = [
    {
        "regressor": [SVR()],
        "regressor__C": [0.1, 1, 10],
        "regressor__kernel": ["linear", "rbf"],
        "regressor__gamma": ["scale", "auto"] 
    },
    {
        "regressor": [RandomForestRegressor()],
        "regressor__n_estimators": [50, 100, 150, 200],
        "regressor__max_depth": [None, 5, 10],
        "regressor__min_samples_split": [2, 5, 7] 
    },
    {
        "regressor": [KNeighborsRegressor()],
        "regressor__n_neighbors": [3, 5, 7, 10],
        "regressor__weights": ["uniform", "distance"],
        "regressor__p": [1, 2] 
    },
    {
        "regressor": [LinearRegression()],
        "regressor__fit_intercept": [True, False]
    }
]

In [5]:
grid = GridSearchCV(pipe, param_grid, cv=5)
grid.fit(X_train, y_train)

y_pred = grid.best_estimator_.predict(X_test)

r2 = r2_score(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)

print("Najbolji model:", grid.best_estimator_)
print("Najbolji parametri:", grid.best_params_)
print("Najbolji CV score (R2):", grid.best_score_)
print("R2 na test skupu:", r2)
print("MSE na test skupu:", mse)

Najbolji model: Pipeline(steps=[('scaler', StandardScaler()),
                ('regressor',
                 RandomForestRegressor(max_depth=10, min_samples_split=5))])
Najbolji parametri: {'regressor': RandomForestRegressor(), 'regressor__max_depth': 10, 'regressor__min_samples_split': 5, 'regressor__n_estimators': 100}
Najbolji CV score (R2): 0.6536948627372182
R2 na test skupu: 0.7225871833695061
MSE na test skupu: 31551790034.879593


In [6]:
regressor = RandomForestRegressor(min_samples_split=7, n_estimators=200)

In [7]:
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)

In [8]:
new_house = pd.DataFrame([{
    "bedrooms": 3,
    "bathrooms": 2,
    "floors": 1,
    "waterfront": 0,
    "view": 0,
    "condition": 3,
    "grade": 7,
    "yr_built": 1995,
    "yr_renovated": 0,
    "sqm_living": 150,
    "sqm_lot": 400,
    "sqm_above": 150,
    "sqm_basement": 0
}])

predicted_price = regressor.predict(new_house)
print(f"Price: {predicted_price[0]:,.2f}")

Price: 334,209.33


In [9]:
new_house_2 = pd.DataFrame([{
    "bedrooms": 4,
    "bathrooms": 2,
    "floors": 2,
    "waterfront": 1,
    "view": 3,
    "condition": 4,
    "grade": 9,
    "yr_built": 2005,
    "yr_renovated": 2015,
    "sqm_living": 220,
    "sqm_lot": 500,
    "sqm_above": 180,
    "sqm_basement": 40
}])

predicted_price_2 = regressor.predict(new_house_2)
print(f"Price: {predicted_price_2[0]:,.2f}")


Price: 861,631.54
