In [2]:
# 모듈 로딩
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
import numpy as np
import matplotlib as plt
import seaborn as sns
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV
from bayes_opt import BayesianOptimization
from sklearn.metrics import mean_absolute_error, mean_absolute_percentage_error

In [3]:
# 데이터 로딩
DATA_PATH = r"C:\Users\KDP-14\Desktop\VSCode\KDT6\기업 프로젝트\데이터 전처리 2차\Linear 전처리2(2.7~3.3).csv"
dataDF = pd.read_csv(DATA_PATH)

### train_test 나누기

In [4]:
dataDF

Unnamed: 0,c_temp_pv,k_rpm_pv,n_temp_pv,scale_pv,s_temp_pv
0,69.6,189,67.2,3.010000,67.1
1,69.8,189,67.2,3.010000,67.0
2,69.7,189,67.9,3.080000,65.9
3,69.7,189,67.8,3.080000,65.9
4,69.7,189,67.8,3.080000,65.9
...,...,...,...,...,...
232022,69.7,191,67.6,3.037853,67.3
232023,69.8,191,67.5,3.037462,67.0
232024,69.7,191,67.4,3.037426,66.8
232025,69.7,191,67.3,3.036981,66.7


In [5]:
target = dataDF['scale_pv']
feature = dataDF.drop(columns='scale_pv')

In [6]:
target

0         3.010000
1         3.010000
2         3.080000
3         3.080000
4         3.080000
            ...   
232022    3.037853
232023    3.037462
232024    3.037426
232025    3.036981
232026    3.036280
Name: scale_pv, Length: 232027, dtype: float64

In [7]:
feature

Unnamed: 0,c_temp_pv,k_rpm_pv,n_temp_pv,s_temp_pv
0,69.6,189,67.2,67.1
1,69.8,189,67.2,67.0
2,69.7,189,67.9,65.9
3,69.7,189,67.8,65.9
4,69.7,189,67.8,65.9
...,...,...,...,...
232022,69.7,191,67.6,67.3
232023,69.8,191,67.5,67.0
232024,69.7,191,67.4,66.8
232025,69.7,191,67.3,66.7


In [8]:
X_train, X_test, y_train, y_test = train_test_split(feature,target, test_size= 0.2, random_state=77)

In [9]:
# Bayesian Optimization을 위한 목적 함수 정의
def xgb_evaluate(max_depth, n_estimators, learning_rate, colsample_bytree, subsample, min_child_weight):
    params = {
        'max_depth': int(max_depth),  # 정수형으로 변환
        'learning_rate': learning_rate,
        'n_estimators': int(n_estimators),  # 정수형으로 변환
        'colsample_bytree': colsample_bytree,
        'subsample': subsample,
        'min_child_weight': int(min_child_weight),  # 정수형으로 변환
        'objective': 'reg:squarederror',
        'random_state': 2,
        'n_jobs': 4
    }
    model = XGBRegressor(**params)
    cv_scores = cross_val_score(model, X_train.values, y_train, cv=3, scoring='neg_mean_squared_error')
    return -cv_scores.mean()  # MSE를 최소화해야 하므로 부호 반전

## MSE는 낮을수록 좋은 지표이므로, 부호를 반전하여 반환
## Bayesian Optimization은 목적 함수가 최대화하는 방향으로 작동하기 때문에, MSE의 부호를 반전하여 "최적화 대상 점수"로 사용


In [10]:
# Bayesian Optimization 설정
optimizer = BayesianOptimization(
    f=xgb_evaluate,
    pbounds={
        'max_depth': (3, 7),
        'n_estimators': (500, 1000),
        'learning_rate': (0.03, 0.1),
        'colsample_bytree': (0.5, 0.7),
        'subsample': (0.5, 0.8),
        'min_child_weight': (1, 5),
    },
    random_state=2,
)

In [11]:
# 최적화 실행
# init_points는 초기 random으로 몇개의 값을 찾을것인지(베이지안 최적화는 기존 데이터를 기반으로 가우시안 프로세스 모델)
# n_iter는 추가로 조사할 값들을 몇개 찾을지 설정(어디를 샘플링하면 최대값을 잘 찾을 수 있을지를 판단)
optimizer.maximize(init_points=5, n_iter=25)

|   iter    |  target   | colsam... | learni... | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------
| [0m1        [0m | [0m0.0002056[0m | [0m0.5872   [0m | [0m0.03181  [0m | [0m5.199    [0m | [0m2.741    [0m | [0m710.2    [0m | [0m0.5991   [0m |
| [0m2        [0m | [0m0.0002053[0m | [0m0.5409   [0m | [0m0.07335  [0m | [0m4.199    [0m | [0m2.067    [0m | [0m810.6    [0m | [0m0.6587   [0m |
| [95m3        [0m | [95m0.0002081[0m | [95m0.5269   [0m | [95m0.06595  [0m | [95m3.738    [0m | [95m4.141    [0m | [95m927.0    [0m | [95m0.6483   [0m |
| [0m4        [0m | [0m0.0002047[0m | [0m0.6693   [0m | [0m0.03558  [0m | [0m5.021    [0m | [0m1.261    [0m | [0m714.1    [0m | [0m0.529    [0m |
| [95m5        [0m | [95m0.0002087[0m | [95m0.5254   [0m | [95m0.07177  [0m | [95m3.904    [0m | [95m1.428    [0m | [95m610.2    [0m |

In [12]:
# 최적의 하이퍼파라미터 추출
best_params = optimizer.max['params']

# 하이퍼파라미터 정수형 변환 (필요 시)
best_params['max_depth'] = int(best_params['max_depth'])
best_params['n_estimators'] = int(best_params['n_estimators'])
best_params['min_child_weight'] = int(best_params['min_child_weight'])

In [13]:
# 최종 모델 생성 및 학습
final_model = XGBRegressor(
    max_depth=best_params['max_depth'],
    learning_rate=best_params['learning_rate'],
    n_estimators=best_params['n_estimators'],
    colsample_bytree=best_params['colsample_bytree'],
    subsample=best_params['subsample'],
    min_child_weight=best_params['min_child_weight'],
    objective='reg:squarederror',
    random_state=2,
    n_jobs=4
)

final_model.fit(X_train.values, y_train)

In [14]:
# 모델 저장
final_model.save_model('xgb_Lin_2.7 ~ 3.3.json')

In [15]:
loaded_model = XGBRegressor()
loaded_model.load_model('xgb_Lin_2.7 ~ 3.3.json')

# 로드한 모델로 예측
y_pred = loaded_model.predict(X_test)

y_pred = loaded_model.predict(X_test)
MAE = mean_absolute_error(y_test,y_pred)
MAPE = mean_absolute_percentage_error(y_test,y_pred)
Score = loaded_model.score(X_test,y_test)
print(f'MAE => {MAE}     MAPE => {MAPE*100}    R2 => {Score}')

MAE => 0.004725775797804334     MAPE => 0.1554642307823889    R2 => 0.25233797060366514
