In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import ElasticNetCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingCVRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import multiprocessing
import optuna
import cma

In [30]:
m = pd.read_csv("../../data/processed/processed_3.csv")

In [31]:
m

Unnamed: 0,감독,제작사,배급사,전국 스크린수,전국 관객수,개봉년,국적_기타,국적_독일,국적_러시아,국적_미국,...,장르_어드벤처,장르_전쟁,장르_코미디,장르_판타지,등급_12세이상관람가,등급_15세이상관람가,등급_전체관람가,등급_청소년관람불가,영화구분_독립/예술영화,영화구분_일반영화
0,14.306190,12.886638,13.649190,1587.0,17613682.0,2014,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
1,15.172542,16.604523,13.649190,1978.0,16264944.0,2019,False,False,False,False,...,False,False,True,False,False,True,False,False,False,True
2,15.714698,16.402393,12.997266,1912.0,14410754.0,2017,False,False,False,False,...,False,False,False,True,True,False,False,False,False,True
3,15.908342,15.263750,13.649190,966.0,14245998.0,2014,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
4,14.740589,15.240150,13.427532,2835.0,13934592.0,2019,False,False,False,True,...,False,False,False,False,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2287,9.341284,8.692994,9.155124,166.0,5960.0,2020,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
2288,9.140915,9.140915,9.540295,91.0,6876.0,2021,False,False,False,False,...,False,False,False,False,False,False,True,False,True,False
2289,8.068090,10.300064,8.068090,65.0,3190.0,2021,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
2290,12.487732,12.141428,13.049010,699.0,360873.0,2021,False,False,False,False,...,False,False,False,False,False,False,True,False,False,True


In [32]:
# 변수 제거 함수
def recursive_feature_elimination(X, y, model, min_features=1, verbose=True):
    # 초기 변수 개수
    n_features = X.shape[1]
    
    # 현재 변수들의 성능
    best_score = float("inf")
    best_features = X.columns.tolist()
    
    # 최소 변수 개수 이상인 경우
    while n_features > min_features:
        # 모든 변수에 대해 반복
        scores = []
        for feature in X.columns:
            # 선택한 변수 제외
            features = X.columns.drop(feature)
            X_new = X[features]
            
            # 모델 학습 및 평가
            model.fit(X_new, y)
            y_pred = model.predict(X_new)
            score = mean_squared_error(y, y_pred)
            scores.append(score)
            
        # 가장 성능이 좋은 변수 선택
        idx = pd.Index(scores).argmin()
        worst_feature = X.columns[idx]
        
        # 변수 제거
        X.drop(worst_feature, axis=1, inplace=True)
        n_features = X.shape[1]
        if verbose:
            print(f"Removing {worst_feature}: {n_features} features left")
        
        # 현재 변수 개수에서의 성능이 최선인 경우
        if min(scores) < best_score:
            best_score = min(scores)
            best_features = X.columns.tolist()
        else:
            break
    
    # 최종 선택된 변수들
    return best_features

# ElasticNetCV 모델 객체 생성
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
enet = ElasticNetCV(alphas=alphas, l1_ratio=0.5, cv=5, max_iter=10000)

# 최소 변수 개수 지정
min_features = 5

# 재귀적 변수 제거 수행
X = m.drop("전국 관객수", axis=1)
y = m["전국 관객수"]
selected_features = recursive_feature_elimination(X, y, enet, min_features=min_features)

# 선택된 변수 출력
print(f"{len(selected_features)} features selected:")
print(selected_features)

Removing 국적_프랑스: 41 features left
Removing 국적_중국: 40 features left
41 features selected:
['감독', '제작사', '배급사', '전국 스크린수', '개봉년', '국적_기타', '국적_독일', '국적_러시아', '국적_미국', '국적_스페인', '국적_영국', '국적_일본', '국적_중국', '국적_캐나다', '국적_한국', '장르_SF', '장르_가족', '장르_공연', '장르_공포(호러)', '장르_기타', '장르_다큐멘터리', '장르_드라마', '장르_멜로/로맨스', '장르_뮤지컬', '장르_미스터리', '장르_범죄', '장르_사극', '장르_서부극(웨스턴)', '장르_스릴러', '장르_애니메이션', '장르_액션', '장르_어드벤처', '장르_전쟁', '장르_코미디', '장르_판타지', '등급_12세이상관람가', '등급_15세이상관람가', '등급_전체관람가', '등급_청소년관람불가', '영화구분_독립/예술영화', '영화구분_일반영화']


In [33]:
# 데이터 분할
X = m[selected_features]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [42]:
# KNeighborsRegressor 모델 학습
knn = KNeighborsRegressor()
knn.fit(X_train, y_train)

# GradientBoostingRegressor 모델 예측
y_pred = knn.predict(X_test)

# 평가지표 계산
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 평가지표 출력
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")

RMSE: 1333611.8592063787
MAE: 573433.5639534884
R-squared: 0.3969621029041389


In [35]:
# RandomForestRegressor 모델 학습
rf = RandomForestRegressor(random_state=42)
rf.fit(X_train, y_train)

# GradientBoostingRegressor 모델 예측
y_pred = rf.predict(X_test)

# 평가지표 계산
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 평가지표 출력
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")

RMSE: 710730.9129839207
MAE: 296130.8933946221
R-squared: 0.8287241550573674


In [41]:
# GradientBoostingRegressor 모델 학습
gb = GradientBoostingRegressor(random_state=42)
gb.fit(X_train, y_train)

# GradientBoostingRegressor 모델 예측
y_pred = gb.predict(X_test)

# 평가지표 계산
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 평가지표 출력
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")

RMSE: 737017.6109481017
MAE: 323099.18514413445
R-squared: 0.8158204367431859


In [43]:
# XGBRegressor 모델 학습
xgb = XGBRegressor(random_state=42)
xgb.fit(X_train, y_train)

# XGBRegressor 모델 예측
y_pred = xgb.predict(X_test)

# 평가지표 계산
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 평가지표 출력
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")

RMSE: 788092.1349453557
MAE: 322066.73794990365
R-squared: 0.789409054409296


In [44]:
# LGBMRegressor 모델 학습
lgbm = LGBMRegressor(random_state=42)
lgbm.fit(X_train, y_train)

# LGBMRegressor 모델 예측
y_pred = lgbm.predict(X_test)

# 평가지표 계산
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 평가지표 출력
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")

RMSE: 804103.8163089625
MAE: 325783.2773282595
R-squared: 0.7807649667541882


In [48]:
def optimize_model(model_objective, n_trials=100):
    # Bayesian optimization 방식의 하이퍼파라미터 튜닝을 위해 Optuna 라이브러리 사용
    # 최소화를 목적으로 하기 때문에 'minimize'로 설정
    study = optuna.create_study(direction="minimize", sampler=optuna.samplers.CmaEsSampler())
    
    # 사용 가능한 모든 CPU 코어 수를 사용하여 병렬 처리
    n_jobs = multiprocessing.cpu_count()  
    
    # 모델 하이퍼파라미터 최적화 실행
    study.optimize(model_objective, n_trials=n_trials, n_jobs=n_jobs)
    
    # 최적 하이퍼파라미터와 그 때의 평가지표 출력
    print(f"Best RMSE: {study.best_value:.4f}")
    print(f"Best Parameters: {study.best_params}")
    
    # 최적 하이퍼파라미터 반환
    return study.best_params

In [50]:
# Random Forest 모델의 하이퍼파라미터 탐색 공간과 목적 함수 정의
def rf_objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000, step=100)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    score = cross_val_score(rf, X, y, cv=5, scoring="neg_mean_squared_error")
    rmse = (-1 * score.mean()) ** 0.5
    return rmse

# Random Forest 모델의 하이퍼파라미터 최적화 및 적용
rf_params = optimize_model(rf_objective)
rf.set_params(**rf_params)

[32m[I 2023-05-10 16:01:20,191][0m A new study created in memory with name: no-name-09dcf6d1-358b-4aa8-b605-2782a6cedfc8[0m
[32m[I 2023-05-10 16:01:27,165][0m Trial 0 finished with value: 1835168.064464574 and parameters: {'n_estimators': 100, 'max_depth': 4}. Best is trial 0 with value: 1835168.064464574.[0m
[32m[I 2023-05-10 16:01:28,592][0m Trial 4 finished with value: 1830675.8329831562 and parameters: {'n_estimators': 100, 'max_depth': 5}. Best is trial 4 with value: 1830675.8329831562.[0m
[32m[I 2023-05-10 16:01:30,333][0m Trial 2 finished with value: 1839347.0115390695 and parameters: {'n_estimators': 100, 'max_depth': 6}. Best is trial 4 with value: 1830675.8329831562.[0m
[32m[I 2023-05-10 16:01:41,913][0m Trial 1 finished with value: 1837458.3697059066 and parameters: {'n_estimators': 200, 'max_depth': 7}. Best is trial 4 with value: 1830675.8329831562.[0m
[32m[I 2023-05-10 16:01:47,919][0m Trial 6 finished with value: 1834567.0353109117 and parameters: {'n_es

Best RMSE: 1828067.3425
Best Parameters: {'n_estimators': 700, 'max_depth': 5}


In [54]:
# 최적의 하이퍼 파라미터 적용 후 학습 및 예측
rf.fit(X_train, y_train)
y_pred = rf.predict(X_test)

# 평가지표 계산
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 평가지표 출력
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")

RMSE: 750979.4917089096
MAE: 326022.93331947393
R-squared: 0.8087762360908739


In [51]:
# Gradient Boosting 모델의 하이퍼파라미터 탐색 공간과 목적 함수 정의
def gb_objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000, step=100)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    gb = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate,
                                   max_depth=max_depth, random_state=42)
    score = cross_val_score(gb, X, y, cv=5, scoring="neg_mean_squared_error")
    rmse = (-1 * score.mean()) ** 0.5
    return rmse

# Gradient Boosting 모델의 하이퍼파라미터 최적화 및 적용
gb_params = optimize_model(gb_objective)
gb.set_params(**gb_params)

[32m[I 2023-05-10 16:12:26,274][0m A new study created in memory with name: no-name-4a64efc6-5409-40e3-9634-f5bdf58b92d6[0m
[32m[I 2023-05-10 16:12:59,860][0m Trial 2 finished with value: 1849930.4465880645 and parameters: {'n_estimators': 200, 'learning_rate': 0.07437489351261922, 'max_depth': 10}. Best is trial 2 with value: 1849930.4465880645.[0m
[32m[I 2023-05-10 16:13:06,086][0m Trial 3 finished with value: 1814249.0474291218 and parameters: {'n_estimators': 400, 'learning_rate': 0.09668809585893301, 'max_depth': 6}. Best is trial 3 with value: 1814249.0474291218.[0m
[32m[I 2023-05-10 16:13:06,155][0m Trial 5 finished with value: 1811840.3907666884 and parameters: {'n_estimators': 400, 'learning_rate': 0.022096401369883077, 'max_depth': 6}. Best is trial 5 with value: 1811840.3907666884.[0m
[32m[I 2023-05-10 16:13:13,269][0m Trial 6 finished with value: 1776412.653538969 and parameters: {'n_estimators': 900, 'learning_rate': 0.0052598000804391365, 'max_depth': 3}. Be

Best RMSE: 1758851.1817
Best Parameters: {'n_estimators': 700, 'learning_rate': 0.0516604704946125, 'max_depth': 3}


In [55]:
# 최적의 하이퍼 파라미터 적용 후 학습 및 예측
gb.fit(X_train, y_train)
y_pred = gb.predict(X_test)

# 평가지표 계산
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 평가지표 출력
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")

RMSE: 794358.1277967676
MAE: 331229.15365296666
R-squared: 0.7860469927665912


In [52]:
# XGBoost 모델의 하이퍼파라미터 탐색 공간과 목적 함수 정의
def xgb_objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000, step=100)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    subsample = trial.suggest_float("subsample", 0.5, 1)
    xgb = XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate,
                       max_depth=max_depth, subsample=subsample, random_state=42)
    score = cross_val_score(xgb, X, y, cv=5, scoring="neg_mean_squared_error")
    rmse = (-1 * score.mean()) ** 0.5
    return rmse

# XGBoost 모델의 하이퍼파라미터 최적화 및 적용
xgb_params = optimize_model(xgb_objective)
xgb.set_params(**xgb_params)

[32m[I 2023-05-10 16:20:39,584][0m A new study created in memory with name: no-name-624ef55a-c125-4631-9c95-dc5ba4444f22[0m
[32m[I 2023-05-10 16:20:44,521][0m Trial 3 finished with value: 1861146.221228587 and parameters: {'n_estimators': 100, 'learning_rate': 0.01496046387591667, 'max_depth': 4, 'subsample': 0.6135017184052962}. Best is trial 3 with value: 1861146.221228587.[0m
[32m[I 2023-05-10 16:20:53,380][0m Trial 4 finished with value: 1812417.719986936 and parameters: {'n_estimators': 200, 'learning_rate': 0.03601322373321486, 'max_depth': 6, 'subsample': 0.6471860808814012}. Best is trial 4 with value: 1812417.719986936.[0m
[32m[I 2023-05-10 16:21:13,060][0m Trial 6 finished with value: 1828095.010883553 and parameters: {'n_estimators': 400, 'learning_rate': 0.09191798120450435, 'max_depth': 7, 'subsample': 0.6152280618640915}. Best is trial 4 with value: 1812417.719986936.[0m
[32m[I 2023-05-10 16:21:13,960][0m Trial 1 finished with value: 1848108.7733982475 and p

Best RMSE: 1752952.1614
Best Parameters: {'n_estimators': 400, 'learning_rate': 0.07900094106900973, 'max_depth': 3, 'subsample': 0.8200948387714333}


In [56]:
# 최적의 하이퍼 파라미터 적용 후 학습 및 예측
xgb.fit(X_train, y_train)
y_pred = xgb.predict(X_test)

# 평가지표 계산
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 평가지표 출력
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")

RMSE: 693762.6891566943
MAE: 308841.32489279815
R-squared: 0.8368047224658028


In [53]:
# LightGBM 모델의 하이퍼파라미터 탐색 공간과 목적 함수 정의
def lgbm_objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000, step=100)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    num_leaves = trial.suggest_int("num_leaves", 10, 100)
    lgbm = LGBMRegressor(n_estimators=n_estimators, learning_rate=learning_rate,
    max_depth=max_depth, num_leaves=num_leaves, random_state=42)
    score = cross_val_score(lgbm, X, y, cv=5, scoring="neg_mean_squared_error")
    rmse = (-1 * score.mean()) ** 0.5
    return rmse

# LightGBM 모델의 하이퍼파라미터 최적화 및 적용
lgbm_params = optimize_model(lgbm_objective)
lgbm.set_params(**lgbm_params)

[32m[I 2023-05-10 16:27:54,828][0m A new study created in memory with name: no-name-0ffdf4f8-91c6-4186-aeca-c6210dddd128[0m
[32m[I 2023-05-10 16:27:55,947][0m Trial 3 finished with value: 1778418.6322863745 and parameters: {'n_estimators': 100, 'learning_rate': 0.033545366242691124, 'max_depth': 4, 'num_leaves': 22}. Best is trial 3 with value: 1778418.6322863745.[0m
[32m[I 2023-05-10 16:27:57,309][0m Trial 1 finished with value: 1767952.6632371717 and parameters: {'n_estimators': 100, 'learning_rate': 0.057412765311955404, 'max_depth': 10, 'num_leaves': 33}. Best is trial 1 with value: 1767952.6632371717.[0m
[32m[I 2023-05-10 16:27:57,919][0m Trial 5 finished with value: 1836549.1576433778 and parameters: {'n_estimators': 200, 'learning_rate': 0.007867187536158346, 'max_depth': 5, 'num_leaves': 53}. Best is trial 1 with value: 1767952.6632371717.[0m
[32m[I 2023-05-10 16:27:59,530][0m Trial 4 finished with value: 1766316.212247893 and parameters: {'n_estimators': 400, 'le

Best RMSE: 1751053.9196
Best Parameters: {'n_estimators': 500, 'learning_rate': 0.05284872856438127, 'max_depth': 3, 'num_leaves': 23}


In [57]:
# 최적의 하이퍼 파라미터 적용 후 학습 및 예측
lgbm.fit(X_train, y_train)
y_pred = lgbm.predict(X_test)

# 평가지표 계산
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 평가지표 출력
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")

RMSE: 757807.2748068348
MAE: 330444.4078675656
R-squared: 0.8052832786796792


In [58]:
# 메타 모델 선정
models = [rf, gb, xgb, lgbm]
best_score = float("-inf")
best_model = None

for model in models:
    score = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_mean_squared_error")
    rmse = (-1 * score.mean()) ** 0.5
    if rmse > best_score:
        best_score = rmse
        best_model = model

print("best_model :", best_model)

# 앙상블 모델 정의
stack = StackingCVRegressor(regressors=(rf, gb, xgb, lgbm),
                            meta_regressor=best_model,
                            cv=KFold(n_splits=5, shuffle=True, random_state=42),
                            use_features_in_secondary=True)

# 앙상블 모델 학습
stack.fit(X_train, y_train)

# 앙상블 모델 예측
y_pred = stack.predict(X_test)

# 평가지표 계산
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 평가지표 출력
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")

best_model : LGBMRegressor(learning_rate=0.05284872856438127, max_depth=3, n_estimators=500,
              num_leaves=23, random_state=42)
RMSE: 756331.5232983214
MAE: 314178.7276999512
R-squared: 0.8060409217087647




In [59]:
def stack_objective(trial):
    # 앙상블 모델의 하이퍼파라미터 범위 설정
    params = {
        'meta_regressor__alpha': trial.suggest_loguniform('meta_regressor__alpha', 0.001, 1),
        'meta_regressor__learning_rate': trial.suggest_loguniform('meta_regressor__learning_rate', 0.001, 0.1),
        'meta_regressor__max_depth': trial.suggest_int('meta_regressor__max_depth', 3, 10),
        'meta_regressor__n_estimators': trial.suggest_int('meta_regressor__n_estimators', 50, 1000, step=50),
        'meta_regressor__subsample': trial.suggest_uniform('meta_regressor__subsample', 0.5, 1),
    }

    # 각각의 모델에 대해서 하이퍼파라미터 범위 설정
    for i, regressor in enumerate(stack.regressors):
        prefix = f'regressor_{i}__'
        params.update({
            f'{prefix}max_depth': trial.suggest_int(f'{prefix}max_depth', 3, 10),
            f'{prefix}n_estimators': trial.suggest_int(f'{prefix}n_estimators', 50, 1000, step=50),
            f'{prefix}subsample': trial.suggest_uniform(f'{prefix}subsample', 0.5, 1),
        })

        if isinstance(regressor, stackRegressor):
            params.update({
                f'{prefix}learning_rate': trial.suggest_loguniform(f'{prefix}learning_rate', 0.001, 0.1),
                f'{prefix}num_leaves': trial.suggest_int(f'{prefix}num_leaves', 2, 256),
            })

        elif isinstance(regressor, XGBRegressor):
            params.update({
                f'{prefix}learning_rate': trial.suggest_loguniform(f'{prefix}learning_rate', 0.001, 0.1),
                f'{prefix}booster': trial.suggest_categorical(f'{prefix}booster', ['gbtree', 'dart']),
            })

    # 생성된 하이퍼파라미터를 사용하여 모델 생성 및 평가
    stack.set_params(**params)
    score = cross_val_score(stack, X_train, y_train, cv=5, n_jobs=-1, scoring='neg_mean_squared_error')
    rmse = np.sqrt(-score.mean())
    return rmse

# 앙상블 모델의 하이퍼파라미터 최적화 및 적용
stack_params = optimize_model(stack_objective)
stack.set_params(**stack_params)

[32m[I 2023-05-10 16:30:00,862][0m A new study created in memory with name: no-name-289b64c7-3f29-422c-991d-ed45ae8281a3[0m
  'meta_regressor__alpha': trial.suggest_loguniform('meta_regressor__alpha', 0.001, 1),
  'meta_regressor__learning_rate': trial.suggest_loguniform('meta_regressor__learning_rate', 0.001, 0.1),
  'meta_regressor__subsample': trial.suggest_uniform('meta_regressor__subsample', 0.5, 1),
  f'{prefix}subsample': trial.suggest_uniform(f'{prefix}subsample', 0.5, 1),
[33m[W 2023-05-10 16:30:00,883][0m Trial 1 failed with parameters: {'meta_regressor__alpha': 0.8250858810526385, 'meta_regressor__learning_rate': 0.0037440549318928586, 'meta_regressor__max_depth': 3, 'meta_regressor__n_estimators': 950, 'meta_regressor__subsample': 0.9520557285365834, 'regressor_0__max_depth': 4, 'regressor_0__n_estimators': 500, 'regressor_0__subsample': 0.6203566721814344} because of the following error: NameError("name 'stackRegressor' is not defined").[0m
Traceback (most recent cal

NameError: name 'stackRegressor' is not defined

In [14]:
# 메타 모델 선정
models = [rf, gb, xgb, lgbm]
best_score = float("-inf")
best_model = None

for model in models:
    score = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_mean_squared_error")
    rmse = (-1 * score.mean()) ** 0.5
    if rmse > best_score:
        best_score = rmse
        best_model = model

print("best_model :", best_model)

# 앙상블 모델 정의
stack = StackingCVRegressor(regressors=(rf, gb, xgb, lgbm),
                            meta_regressor=best_model,
                            cv=KFold(n_splits=5, shuffle=True, random_state=42),
                            use_features_in_secondary=True)

# 앙상블 모델 학습
stack.fit(X_train, y_train)

# 앙상블 모델 예측
y_pred = stack.predict(X_test)

# 평가지표 계산
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 평가지표 출력
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")

best_model : KNeighborsRegressor(n_neighbors=2)




In [19]:
# X_train, X_test 스케일링
scaler = MinMaxScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# y_train, y_test 스케일링
scaler_y = MinMaxScaler()
y_train_scaled = scaler_y.fit_transform(y_train.to_numpy().reshape(-1, 1)).flatten()
y_test_scaled = scaler_y.transform(y_test.to_numpy().reshape(-1, 1)).flatten()

In [None]:
# 메타 모델 선정
models = [rf, gb, xgb, lgbm]
best_score = float("-inf")
best_model = None

for model in models:
    score = cross_val_score(model, X_train, y_train, cv=5, scoring="neg_mean_squared_error")
    rmse = (-1 * score.mean()) ** 0.5
    if rmse > best_score:
        best_score = rmse
        best_model = model

print("best_model :", best_model)

# 앙상블 모델 정의
stack = StackingCVRegressor(regressors=(rf, gb, xgb, lgbm),
                            meta_regressor=best_model,
                            cv=KFold(n_splits=5, shuffle=True, random_state=42),
                            use_features_in_secondary=True)

# 앙상블 모델 학습
stack.fit(X_train, y_train)

# 앙상블 모델 예측
y_pred = stack.predict(X_test)

# 평가지표 계산
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 평가지표 출력
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")