In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_absolute_error, r2_score
import lightgbm as lgb
import numpy as np

#####################
# DataBase
#####################

df = pd.read_csv("cities_data/otodom_apartments_demo.csv", sep=";", encoding="utf-8-sig")
df = df[df["city"] != "nieznane"].dropna()

#####################
# Model Details
#####################

X = df.drop(columns=["price"])
y = df["price"]

categorical_cols = X.select_dtypes(include=["object"]).columns

# Prosty LabelEncoder — działa dobrze dla LightGBM
le = LabelEncoder()
for col in categorical_cols:
    X[col] = le.fit_transform(X[col])
# For CatBoost

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Initialization

model = lgb.LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.05,
    max_depth=-1,
    num_leaves=31,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42
)

model.fit(
    X_train, y_train,
    eval_set=[(X_test, y_test)],
    eval_metric="mae",
    callbacks=[lgb.early_stopping(stopping_rounds=50)],
)


y_pred = model.predict(X_test)

mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"MAE: {mae}")
print(f"R²: {r2}")

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002416 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1273
[LightGBM] [Info] Number of data points in the train set: 27488, number of used features: 10
[LightGBM] [Info] Start training from score 840413.006185
Training until validation scores don't improve for 50 rounds
Did not meet early stopping. Best iteration is:
[1000]	valid_0's l1: 9306.6	valid_0's l2: 1.05484e+09
MAE: 9306.598372223509
R²: 0.9971270447419397
