In [1]:
import joblib
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
import pandas as pd
import xgboost as xgb
import lightgbm as lgb

In [2]:
# Read the training table from disk and display its (rows, columns) shape.
data = pd.read_csv(filepath_or_buffer='./data/train_std.csv')
data.shape

(391933, 28)

In [3]:
# Columns used as model inputs; everything else in `data` is left out of X.
feature_columns = [
    'ARI_CO', 'ARI_PO', 'SHIP_TYPE_CATEGORY', 'DIST',
    'BREADTH', 'BUILT', 'DEADWEIGHT', 'DEPTH', 'DRAUGHT', 'GT', 'LENGTH',
    'FLAG', 'U_WIND', 'V_WIND', 'AIR_TEMPERATURE', 'BN', 'ATA_LT', 'PORT_SIZE',
    'year', 'month', 'day', 'hour', 'minute', 'weekday',
]

# Feature matrix (24 columns) and regression target.
X = data[feature_columns]
y = data['CI_HOUR']

In [4]:
# Split the data into training and test sets: 20% of the rows are held out
# for testing, and the fixed random_state keeps the split reproducible.
# (train_test_split is already imported in the notebook's import cell, so it
# is not re-imported here.)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Show the shapes of the four resulting pieces as a sanity check.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((313546, 24), (78387, 24), (313546,), (78387,))

In [5]:
from sklearn.preprocessing import StandardScaler

# Create the scaler object.
std_scaler = StandardScaler()

# Learn the scaling statistics from the TRAINING split only.
# Fitting on the full X (train + test) would let test-set statistics leak
# into preprocessing and make the hold-out evaluation optimistic.
std_scaler.fit(X_train)

# Scale the training features with the training-derived statistics.
X_train_scaled = std_scaler.transform(X_train)

# Scale the test features with the same (training-derived) statistics.
X_test_scaled = std_scaler.transform(X_test)

In [6]:
# Restore the six persisted estimators from ./model, in the same order as the
# names on the left-hand side.
# NOTE(review): the minmax_/standard_ prefixes presumably refer to the scaling
# used when each model was originally trained -- confirm against the training
# notebooks.
# NOTE(review): joblib.load unpickles the files, so only load artifacts from a
# trusted source.
(
    minmax_lgbm,
    standard_lgbm,
    minmax_xgb,
    standard_xgb,
    minmax_hgb,
    standard_hgb,
) = map(
    joblib.load,
    (
        './model/D_lgb_MINMAX2.md',
        './model/D_lgb_S2.md',
        './model/D_xgb_M.md',
        './model/D_xgb_S2.md',
        './model/HGB_K.md',
        './model/HGB_Standard.md',
    ),
)

In [7]:
# (name, estimator) pairs handed to the stacking ensemble as base estimators.
regressor = [
    ('minmax_lgbm', minmax_lgbm),
    ('standard_lgbm', standard_lgbm),
    ('standard_xgb', standard_xgb),
    ('minmax_xgb', minmax_xgb),
    ('minmax_hgb', minmax_hgb),
    ('standard_hgb', standard_hgb),
]

In [8]:
# Build the stacking ensemble over the base estimators in `regressor`.
# minmax_xgb is used twice: once among the base estimators and once here as
# the final (meta) estimator that combines their predictions.
stregr = StackingRegressor(
    final_estimator=minmax_xgb,
    estimators=regressor,
)

In [9]:
# Fit the stacking ensemble on the scaled training features and targets.
stregr.fit(X=X_train_scaled, y=y_train)

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002794 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 2375
[LightGBM] [Info] Number of data points in the train set: 313546, number of used features: 23
[LightGBM] [Info] Start training from score 31.607215
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.009512 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2375
[LightGBM] [Info] Number of data points in the train set: 313546, number of used features: 23
[LightGBM] [Info] Start training from score 31.607215
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.002101 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Tota

In [10]:
# Predict the target for the scaled hold-out features.
test_pred = stregr.predict(X=X_test_scaled)



In [11]:
### Evaluate performance on the hold-out set
# Compute MAE, MSE and R^2 (in that order) for the stacked model's predictions.
mae, mse, r2 = (
    metric(y_test, test_pred)
    for metric in (mean_absolute_error, mean_squared_error, r2_score)
)

mae, mse, r2

(16.68494825206109, 754.8674340994389, 0.6006098029474922)

In [26]:
# `stack` is the fitted stacking model that gets persisted below.
# The previous `stregr.set_output()` call (no arguments) left the estimator's
# configuration unchanged and simply returned the estimator itself, so a plain
# alias expresses the same thing without implying any reconfiguration.
stack = stregr

In [27]:
### Save the trained model
# Destination path and file name. joblib does not care about the extension,
# so any extension works here.
save_path = "./model/stacked.md"

# Persist the fitted stacking model; joblib.dump returns the list of files written.
joblib.dump(value=stack, filename=save_path)

['./model/stacked.md']

In [28]:
# Reload the model that was just written to `save_path` to check the round trip.
# NOTE(review): despite its name, `hist_model` holds the stacked model saved above.
hist_model = joblib.load(filename=save_path)
hist_model

In [31]:
# Predict on the scaled hold-out features with the reloaded model.
pred = hist_model.predict(X=X_test_scaled)



In [32]:
# Re-compute the hold-out metrics for the reloaded model's predictions.
# Values equal to the pre-save evaluation indicate the save/load round trip
# preserved the model. Note that this overwrites mae, mse and r2.
mae = mean_absolute_error(y_true=y_test, y_pred=pred)
mse = mean_squared_error(y_true=y_test, y_pred=pred)
r2 = r2_score(y_true=y_test, y_pred=pred)

mae, mse, r2

(16.68494825206109, 754.8674340994389, 0.6006098029474922)