In [17]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from util.module import *
from util.utility import *
import lightgbm as lgb
import xgboost as xgb
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
data_url = "http://lib.stat.cmu.edu/datasets/boston"
raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

data

# array_df = np.array(test_df_x)
# fa = Factor_attention(array_df, dim_information=512)
# fa.col_to_vec(threshold = 0.5)

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [24]:
raw_x = data
fa = Factor_attention(raw_x, dim_information=512)
fa.col_to_vec(threshold = 0.5)

In [23]:
raw_x = data
fa = Factor_attention(raw_x, dim_information=512)
fa.col_to_vec(threshold = 0.5)


model = Attention(n_factor= fa.n_factors, info_dim = fa.dim_info)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)
# print(model.total_result.shape)
num_epochs = 500
for epoch in range(num_epochs):
    reconstructed = model(fa)
    loss = pearson_correlation_coefficient_loss_function(reconstructed)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    torch.autograd.set_detect_anomaly(True)

    if (epoch+1) % 5 == 0:
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

LinAlgError: Singular matrix

In [7]:
data.shape

(506, 13)

In [8]:
model.total_result.shape

torch.Size([506, 5])

In [10]:
data_combine = np.hstack([data, torch.Tensor.cpu(model.total_result).detach().numpy()])
data_combine.shape

(506, 18)

In [14]:
# 임의의 데이터 생성
# X = data
# X = torch.Tensor.cpu(model.total_result).detach().numpy()
X = data_combine
Y = target


# 데이터 분할
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Min-Max 정규화
scaler_X = MinMaxScaler()
X_train = scaler_X.fit_transform(X_train)
X_test = scaler_X.transform(X_test)

scaler_Y = MinMaxScaler()
y_train = scaler_Y.fit_transform(y_train.reshape(-1, 1)).ravel()
y_test = scaler_Y.transform(y_test.reshape(-1, 1)).ravel()


In [15]:
# LightGBM 회귀 모델 객체 생성
lgb_model = lgb.LGBMRegressor()

# 탐색할 파라미터 설정
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [10,13,15,17,20,25,30],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'subsample': [0.5, 0.7, 1.0]
}

# GridSearchCV 객체 생성
grid_search = GridSearchCV(lgb_model, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# 최적의 파라미터 출력
print("Best parameters found: ", grid_search.best_params_)

# 최적의 파라미터로 모델 훈련
best_lgb_model = grid_search.best_estimator_

# 예측 수행
y_pred = best_lgb_model.predict(X_test)

# 예측값을 원래의 스케일로 변환
y_pred_original = scaler_Y.inverse_transform(y_pred.reshape(-1, 1)).ravel()

# 원래 스케일의 y_test 값도 변환
y_test_original = scaler_Y.inverse_transform(y_test.reshape(-1, 1)).ravel()

# RMSE 계산 및 출력
rmse = mean_squared_error(y_test_original, y_pred_original, squared=False)
print(f"RMSE with best parameters: {rmse}")

Fitting 3 folds for each of 567 candidates, totalling 1701 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000318 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1567
[LightGBM] [Info] Number of data points in the train set: 404, number of used features: 18
[LightGBM] [Info] Start training from score 0.395479
Best parameters found:  {'colsample_bytree': 0.5, 'learning_rate': 0.05, 'max_depth': 10, 'n_estimators': 200, 'subsample': 0.5}
RMSE with best parameters: 3.044963212311396


In [16]:
# XGBoost 회귀 모델 객체 생성
xgb_model = xgb.XGBRegressor()

# 탐색할 파라미터 설정
param_grid = {
    'n_estimators': [50, 100, 200],
    'learning_rate': [0.01, 0.05, 0.1],
    'max_depth': [10,13,15,17,20,25,30],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'subsample': [0.5, 0.7, 1.0]
}

# GridSearchCV 객체 생성
grid_search = GridSearchCV(xgb_model, param_grid, cv=3, scoring='neg_mean_squared_error', verbose=1, n_jobs=-1)
grid_search.fit(X_train, y_train)

# 최적의 파라미터 출력
print("Best parameters found: ", grid_search.best_params_)

# 최적의 파라미터로 모델 훈련
best_xgb_model = grid_search.best_estimator_

# 예측 수행
y_pred = best_xgb_model.predict(X_test)

# 예측값을 원래의 스케일로 변환
y_pred_original = scaler_Y.inverse_transform(y_pred.reshape(-1, 1)).ravel()

# 원래 스케일의 y_test 값도 변환
y_test_original = scaler_Y.inverse_transform(y_test.reshape(-1, 1)).ravel()

# RMSE 계산 및 출력
rmse = mean_squared_error(y_test_original, y_pred_original, squared=False)
print(f"RMSE with best parameters: {rmse}")


Fitting 3 folds for each of 567 candidates, totalling 1701 fits
Best parameters found:  {'colsample_bytree': 0.7, 'learning_rate': 0.1, 'max_depth': 13, 'n_estimators': 100, 'subsample': 0.5}
RMSE with best parameters: 2.8479682298968427
