In [119]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from util.module import *
from util.utility import *
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

df = pd.read_csv('./Data/house-prices-advanced-regression-techniques/train.csv')
numeric_df = df.select_dtypes(include=['int', 'float'])

numeric_df = numeric_df.dropna(axis=0)
numeric_df.reset_index(inplace=True, drop=True)
test_df_x = numeric_df.iloc[:, 1:23]
test_df_y = numeric_df.iloc[:, -1]

array_df = np.array(test_df_x)
fa = Factor_attention(array_df)
fa.col_to_vec(threshold = 0.35)

In [120]:
model = Attention(n_factor= fa.n_factors, info_dim = fa.dim_info)
optimizer = torch.optim.Adam(model.parameters(), lr=0.035)

num_epochs = 500
for epoch in range(num_epochs):
    reconstructed = model(fa)
    loss = pearson_correlation_coefficient_loss_function(reconstructed)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    torch.autograd.set_detect_anomaly(True)

    if (epoch+1) % 5 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch [5/500], Loss: 35.1009
Epoch [10/500], Loss: 42.3433
Epoch [15/500], Loss: 39.1725
Epoch [20/500], Loss: 37.6794
Epoch [25/500], Loss: 36.4373
Epoch [30/500], Loss: 35.4437
Epoch [35/500], Loss: 34.6289
Epoch [40/500], Loss: 34.2486
Epoch [45/500], Loss: 35.3511
Epoch [50/500], Loss: 35.4140
Epoch [55/500], Loss: 34.7932
Epoch [60/500], Loss: 33.9506
Epoch [65/500], Loss: 36.1903
Epoch [70/500], Loss: 35.8456
Epoch [75/500], Loss: 33.8609
Epoch [80/500], Loss: 34.3106
Epoch [85/500], Loss: 33.6806
Epoch [90/500], Loss: 33.7153
Epoch [95/500], Loss: 33.7242
Epoch [100/500], Loss: 34.6471
Epoch [105/500], Loss: 33.9718
Epoch [110/500], Loss: 33.5875
Epoch [115/500], Loss: 33.7005
Epoch [120/500], Loss: 33.5594
Epoch [125/500], Loss: 33.5203
Epoch [130/500], Loss: 33.4525
Epoch [135/500], Loss: 33.2413
Epoch [140/500], Loss: 32.5767
Epoch [145/500], Loss: 32.1069
Epoch [150/500], Loss: 31.7273
Epoch [155/500], Loss: 31.5004
Epoch [160/500], Loss: 31.0869
Epoch [165/500], Loss: 30.96

In [121]:
model.total_result.shape


torch.Size([1121, 12])

In [122]:
# 임의의 데이터 생성
# X = np.array(numeric_df.iloc[:, 1:23])
X = torch.Tensor.cpu(model.total_result).detach().numpy()
Y = np.array(numeric_df.iloc[:, -1])  # 임의의 레이블 생성

# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Min-Max 정규화
scaler_X = MinMaxScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

scaler_Y = MinMaxScaler()
y_train = scaler_Y.fit_transform(y_train.reshape(-1, 1)).ravel()
y_test = scaler_Y.transform(y_test.reshape(-1, 1)).ravel()


In [128]:
Y.mean()

185506.15254237287

In [125]:
# LightGBM 회귀 모델 객체 생성
lgb_model = lgb.LGBMRegressor()

# 탐색할 파라미터 설정
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [10,13,15,17,20,25,30],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'subsample': [0.5, 0.7, 1.0]
}

# GridSearchCV 객체 생성
grid_search = GridSearchCV(lgb_model, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# 최적의 파라미터 출력
print("Best parameters found: ", grid_search.best_params_)

# 최적의 파라미터로 모델 훈련
best_lgb_model = grid_search.best_estimator_

# 예측 수행
y_pred = best_lgb_model.predict(X_test)

# 예측값을 원래의 스케일로 변환
y_pred_original = scaler_Y.inverse_transform(y_pred.reshape(-1, 1)).ravel()

# 원래 스케일의 y_test 값도 변환
y_test_original = scaler_Y.inverse_transform(y_test.reshape(-1, 1)).ravel()

# RMSE 계산 및 출력
rmse = mean_squared_error(y_test_original, y_pred_original, squared=False)
print(f"RMSE with best parameters: {rmse}")

Fitting 3 folds for each of 567 candidates, totalling 1701 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000189 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2070
[LightGBM] [Info] Number of data points in the train set: 896, number of used features: 11
[LightGBM] [Info] Start training from score 0.208582
Best parameters found:  {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 13, 'n_estimators': 100, 'subsample': 0.5}
RMSE with best parameters: 45198.330806550206


In [126]:
# XGBoost 회귀 모델 객체 생성
xgb_model = xgb.XGBRegressor()

# 탐색할 파라미터 설정
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [10,13,15,17,20,25,30],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'subsample': [0.5, 0.7, 1.0]
}

# GridSearchCV 객체 생성
grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# 최적의 파라미터 출력
print("Best parameters found: ", grid_search.best_params_)

# 최적의 파라미터로 모델 훈련
best_xgb_model = grid_search.best_estimator_

# 예측 수행
y_pred = best_xgb_model.predict(X_test)

# 예측값을 원래의 스케일로 변환
y_pred_original = scaler_Y.inverse_transform(y_pred.reshape(-1, 1)).ravel()

# 원래 스케일의 y_test 값도 변환
y_test_original = scaler_Y.inverse_transform(y_test.reshape(-1, 1)).ravel()

# RMSE 계산 및 출력
rmse = mean_squared_error(y_test_original, y_pred_original, squared=False)
print(f"RMSE with best parameters: {rmse}")


Fitting 3 folds for each of 567 candidates, totalling 1701 fits
Best parameters found:  {'colsample_bytree': 0.7, 'learning_rate': 0.05, 'max_depth': 15, 'n_estimators': 100, 'subsample': 0.5}
RMSE with best parameters: 47120.18161661042
