In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.neural_network import MLPRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
import numpy as np
import pandas as pd


In [2]:
df = pd.read_csv('price_list.csv')

In [3]:
df['Gu_encoding'] = df['Gu'].astype('category').cat.codes + 1
df['Dong_encoding'] = df.groupby('Gu').cumcount() + 1
df['Gu_encoding'] = df['Gu_encoding'].astype('int64')
df['Dong'] = df['Gu_encoding'] * 100 + df['Dong_encoding']

#동을 구에 맞게 인코딩합니다.

In [4]:
X = df[['Dong', 'Building Year', 'Floor', 'Actual Price Index', 'Living Price Index','Interest Rate','Distance to MC','Distance to NS','Bus Station Within 500m']]
y = df['Price per Area Log Scale']

In [7]:
X_train_test, X_val_test, y_train_test, y_val_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 그 다음, 훈련 세트를 다시 훈련 세트와 검증 세트로 분할
X_train, X_val, y_train, y_val = train_test_split(X_train_test, y_train_test, test_size=0.25, random_state=42)

In [8]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

In [9]:
base_models = [
    ('mlp', MLPRegressor(random_state=42, max_iter=500)),
    ('lgbm', LGBMRegressor(random_state=42))
]

In [10]:
stacking_regressor = StackingRegressor(
    estimators=base_models,
    final_estimator=LinearRegression()
)

In [11]:
stacking_regressor.fit(X_train_scaled, y_train)
y_pred = stacking_regressor.predict(X_val_scaled)
r2 = r2_score(y_val, y_pred)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_val, y_pred)

print(f'R^2: {r2:.4f}')
print(f'MSE: {mse:.4f}')
print(f'RMSE: {rmse:.4f}')
print(f'MAE: {mae:.4f}')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002120 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1060
[LightGBM] [Info] Number of data points in the train set: 106622, number of used features: 9
[LightGBM] [Info] Start training from score 7.030842
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001896 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1057
[LightGBM] [Info] Number of data points in the train set: 85297, number of used features: 9
[LightGBM] [Info] Start training from score 7.030435
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000336 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1057
[LightGBM] [Info] Number of data points in the train set: