In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.linear_model import ElasticNetCV, LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingCVRegressor
from sklearn.neural_network import MLPRegressor

import multiprocessing
import optuna
import cma

Only `cma.purecma` has been imported. Install `numpy` ("pip install numpy") if you want to import the entire `cma` package.


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
m = pd.read_csv("../../data/processed/processed_3.csv")

In [3]:
m

Unnamed: 0,감독,제작사,배급사,전국 스크린수,전국 관객수,국적_기타,국적_독일,국적_러시아,국적_미국,국적_스페인,...,장르_어드벤처,장르_전쟁,장르_코미디,장르_판타지,등급_12세이상관람가,등급_15세이상관람가,등급_전체관람가,등급_청소년관람불가,영화구분_독립/예술영화,영화구분_일반영화
0,14.306190,12.886638,13.607103,1587,17613682,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
1,15.172542,16.604523,13.607103,1978,16264944,False,False,False,False,False,...,False,False,True,False,False,True,False,False,False,True
2,15.714698,16.402393,12.960970,1912,14410754,False,False,False,False,False,...,False,False,False,True,True,False,False,False,False,True
3,15.908342,15.263750,13.607103,966,14245998,False,False,False,False,False,...,False,False,False,False,True,False,False,False,False,True
4,14.740589,15.240150,13.427532,2835,13934592,False,False,False,True,False,...,False,False,False,False,True,False,False,False,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2324,7.217443,7.217443,9.570018,82,1362,True,False,False,False,False,...,False,False,False,False,False,True,False,False,False,True
2325,7.205635,7.205635,7.205635,60,1346,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False
2326,7.156956,8.649415,9.092294,71,1282,False,False,False,True,False,...,False,False,False,False,False,True,False,False,False,True
2327,7.770353,7.134891,7.134891,79,1254,False,False,False,False,False,...,False,False,False,False,False,True,False,False,True,False


In [4]:
# 변수 제거 함수
def recursive_feature_elimination(X, y, model, min_features=1, verbose=True):
    # 초기 변수 개수
    n_features = X.shape[1]
    
    # 현재 변수들의 성능
    best_score = float("inf")
    best_features = X.columns.tolist()
    
    # 최소 변수 개수 이상인 경우
    while n_features > min_features:
        # 모든 변수에 대해 반복
        scores = []
        for feature in X.columns:
            # 선택한 변수 제외
            features = X.columns.drop(feature)
            X_new = X[features]
            
            # 모델 학습 및 평가
            model.fit(X_new, y)
            y_pred = model.predict(X_new)
            score = mean_squared_error(y, y_pred)
            scores.append(score)
            
        # 가장 성능이 좋은 변수 선택
        idx = pd.Index(scores).argmin()
        worst_feature = X.columns[idx]
        
        # 변수 제거
        X.drop(worst_feature, axis=1, inplace=True)
        n_features = X.shape[1]
        if verbose:
            print(f"Removing {worst_feature}: {n_features} features left")
        
        # 현재 변수 개수에서의 성능이 최선인 경우
        if min(scores) < best_score:
            best_score = min(scores)
            best_features = X.columns.tolist()
        else:
            break
    
    # 최종 선택된 변수들
    return best_features

# ElasticNetCV 모델 객체 생성
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
enet = ElasticNetCV(alphas=alphas, l1_ratio=0.5, cv=5, max_iter=10000)

# 최소 변수 개수 지정
min_features = 5

# 재귀적 변수 제거 수행
X = m.drop("전국 관객수", axis=1)
y = m["전국 관객수"]
selected_features = recursive_feature_elimination(X, y, enet, min_features=min_features)

# 선택된 변수 출력
print(f"{len(selected_features)} features selected:")
print(selected_features)

Removing 국적_기타: 40 features left
Removing 국적_프랑스: 39 features left
Removing 국적_홍콩: 38 features left
39 features selected:
['감독', '제작사', '배급사', '전국 스크린수', '국적_독일', '국적_러시아', '국적_미국', '국적_스페인', '국적_영국', '국적_일본', '국적_중국', '국적_한국', '국적_홍콩', '장르_SF', '장르_가족', '장르_공연', '장르_공포(호러)', '장르_기타', '장르_다큐멘터리', '장르_드라마', '장르_멜로/로맨스', '장르_뮤지컬', '장르_미스터리', '장르_범죄', '장르_사극', '장르_서부극(웨스턴)', '장르_스릴러', '장르_애니메이션', '장르_액션', '장르_어드벤처', '장르_전쟁', '장르_코미디', '장르_판타지', '등급_12세이상관람가', '등급_15세이상관람가', '등급_전체관람가', '등급_청소년관람불가', '영화구분_독립/예술영화', '영화구분_일반영화']


In [5]:
# 데이터 분할
X = m[selected_features]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [6]:
# 모델 정의
knn = KNeighborsRegressor()
rf = RandomForestRegressor(random_state=42)
gb = GradientBoostingRegressor(random_state=42)
xgb = XGBRegressor(random_state=42)
lgbm = LGBMRegressor(random_state=42)

# 앙상블 모델 정의
stack = StackingCVRegressor(regressors=(knn, rf, gb, xgb, lgbm),
                            meta_regressor=gb,
                            cv=KFold(n_splits=5, shuffle=True, random_state=42),
                            use_features_in_secondary=True)

# 앙상블 모델 학습
stack.fit(X_train, y_train)

# 앙상블 모델 예측
y_pred = stack.predict(X_test)



In [7]:
# 평가지표 계산
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 평가지표 출력
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")

RMSE: 917808.4689940624
MAE: 380126.92917561776
R-squared: 0.700736788201779


In [8]:
# KNN 모델의 하이퍼파라미터 탐색 공간과 목적 함수 정의
def knn_objective(trial):
    n_neighbors = trial.suggest_int("n_neighbors", 1, 30)
    knn = KNeighborsRegressor(n_neighbors=n_neighbors)
    score = cross_val_score(knn, X, y, cv=5, scoring="neg_mean_squared_error")
    rmse = (-1 * score.mean()) ** 0.5
    return rmse

# Random Forest 모델의 하이퍼파라미터 탐색 공간과 목적 함수 정의
def rf_objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000, step=100)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    rf = RandomForestRegressor(n_estimators=n_estimators, max_depth=max_depth, random_state=42)
    score = cross_val_score(rf, X, y, cv=5, scoring="neg_mean_squared_error")
    rmse = (-1 * score.mean()) ** 0.5
    return rmse

# Gradient Boosting 모델의 하이퍼파라미터 탐색 공간과 목적 함수 정의
def gb_objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000, step=100)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    gb = GradientBoostingRegressor(n_estimators=n_estimators, learning_rate=learning_rate,
                                   max_depth=max_depth, random_state=42)
    score = cross_val_score(gb, X, y, cv=5, scoring="neg_mean_squared_error")
    rmse = (-1 * score.mean()) ** 0.5
    return rmse

# XGBoost 모델의 하이퍼파라미터 탐색 공간과 목적 함수 정의
def xgb_objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000, step=100)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    subsample = trial.suggest_float("subsample", 0.5, 1)
    xgb = XGBRegressor(n_estimators=n_estimators, learning_rate=learning_rate,
                       max_depth=max_depth, subsample=subsample, random_state=42)
    score = cross_val_score(xgb, X, y, cv=5, scoring="neg_mean_squared_error")
    rmse = (-1 * score.mean()) ** 0.5
    return rmse

# LightGBM 모델의 하이퍼파라미터 탐색 공간과 목적 함수 정의
def lgbm_objective(trial):
    n_estimators = trial.suggest_int("n_estimators", 100, 1000, step=100)
    learning_rate = trial.suggest_float("learning_rate", 0.001, 0.1)
    max_depth = trial.suggest_int("max_depth", 3, 10)
    num_leaves = trial.suggest_int("num_leaves", 10, 100)
    lgbm = LGBMRegressor(n_estimators=n_estimators, learning_rate=learning_rate,
    max_depth=max_depth, num_leaves=num_leaves, random_state=42)
    score = cross_val_score(lgbm, X, y, cv=5, scoring="neg_mean_squared_error")
    rmse = (-1 * score.mean()) ** 0.5
    return rmse

def optimize_model(model_objective, n_trials=100):
    study = optuna.create_study(direction="minimize", sampler=optuna.samplers.CmaEsSampler())
    n_jobs = multiprocessing.cpu_count()  # 사용 가능한 모든 CPU 코어 수
    study.optimize(model_objective, n_trials=n_trials, n_jobs=n_jobs)
    print(f"Best RMSE: {study.best_value:.4f}")
    print(f"Best Parameters: {study.best_params}")
    return study.best_params

# KNN 모델의 하이퍼파라미터 최적화
knn_params = optimize_model(knn_objective)

# Random Forest 모델의 하이퍼파라미터 최적화
rf_params = optimize_model(rf_objective)

# Gradient Boosting 모델의 하이퍼파라미터 최적화
gb_params = optimize_model(gb_objective)

# XGBoost 모델의 하이퍼파라미터 최적화
xgb_params = optimize_model(xgb_objective)

# LightGBM 모델의 하이퍼파라미터 최적화
lgbm_params = optimize_model(lgbm_objective)

[32m[I 2023-05-09 09:36:11,911][0m A new study created in memory with name: no-name-e79ef900-ccd3-4c48-83d8-eb35d6027139[0m
[32m[I 2023-05-09 09:36:12,510][0m Trial 2 finished with value: 1992476.764600211 and parameters: {'n_neighbors': 12}. Best is trial 2 with value: 1992476.764600211.[0m
[32m[I 2023-05-09 09:36:12,516][0m Trial 0 finished with value: 1991694.826169131 and parameters: {'n_neighbors': 18}. Best is trial 0 with value: 1991694.826169131.[0m
[32m[I 2023-05-09 09:36:12,519][0m Trial 3 finished with value: 1995772.5178442544 and parameters: {'n_neighbors': 25}. Best is trial 0 with value: 1991694.826169131.[0m
[32m[I 2023-05-09 09:36:12,523][0m Trial 7 finished with value: 1992476.764600211 and parameters: {'n_neighbors': 12}. Best is trial 0 with value: 1991694.826169131.[0m
[32m[I 2023-05-09 09:36:12,524][0m Trial 5 finished with value: 1991694.826169131 and parameters: {'n_neighbors': 18}. Best is trial 0 with value: 1991694.826169131.[0m
[32m[I 2023

Best RMSE: 1980672.0451
Best Parameters: {'n_neighbors': 3}


[32m[I 2023-05-09 09:36:27,277][0m Trial 3 finished with value: 1832739.4748937823 and parameters: {'n_estimators': 100, 'max_depth': 6}. Best is trial 3 with value: 1832739.4748937823.[0m
[32m[I 2023-05-09 09:36:43,175][0m Trial 1 finished with value: 1826493.2854095607 and parameters: {'n_estimators': 400, 'max_depth': 4}. Best is trial 1 with value: 1826493.2854095607.[0m
[32m[I 2023-05-09 09:37:03,356][0m Trial 7 finished with value: 1824615.5518334548 and parameters: {'n_estimators': 600, 'max_depth': 5}. Best is trial 7 with value: 1824615.5518334548.[0m
[32m[I 2023-05-09 09:37:10,092][0m Trial 0 finished with value: 1839288.7601912012 and parameters: {'n_estimators': 500, 'max_depth': 7}. Best is trial 7 with value: 1824615.5518334548.[0m
[32m[I 2023-05-09 09:37:17,725][0m Trial 5 finished with value: 1826980.9033946367 and parameters: {'n_estimators': 900, 'max_depth': 4}. Best is trial 7 with value: 1824615.5518334548.[0m
[32m[I 2023-05-09 09:37:19,395][0m Tri

Best RMSE: 1824480.6960
Best Parameters: {'n_estimators': 500, 'max_depth': 5}


[32m[I 2023-05-09 09:45:35,115][0m Trial 1 finished with value: 1859271.1948923394 and parameters: {'n_estimators': 100, 'learning_rate': 0.055382910059533275, 'max_depth': 10}. Best is trial 1 with value: 1859271.1948923394.[0m
[32m[I 2023-05-09 09:45:48,528][0m Trial 2 finished with value: 1817866.8666010725 and parameters: {'n_estimators': 600, 'learning_rate': 0.04593918936549182, 'max_depth': 3}. Best is trial 2 with value: 1817866.8666010725.[0m
[32m[I 2023-05-09 09:45:57,868][0m Trial 5 finished with value: 1835434.1376810307 and parameters: {'n_estimators': 500, 'learning_rate': 0.06778527192839176, 'max_depth': 5}. Best is trial 2 with value: 1817866.8666010725.[0m
[32m[I 2023-05-09 09:46:12,706][0m Trial 4 finished with value: 1857365.6414105725 and parameters: {'n_estimators': 400, 'learning_rate': 0.05332408154950612, 'max_depth': 9}. Best is trial 2 with value: 1817866.8666010725.[0m
[32m[I 2023-05-09 09:46:20,812][0m Trial 8 finished with value: 1825957.7290

Best RMSE: 1807363.1939
Best Parameters: {'n_estimators': 900, 'learning_rate': 0.08322528723646312, 'max_depth': 3}


[32m[I 2023-05-09 09:55:16,893][0m Trial 6 finished with value: 1843508.5622719254 and parameters: {'n_estimators': 100, 'learning_rate': 0.04496558705677152, 'max_depth': 5, 'subsample': 0.9424700969970036}. Best is trial 6 with value: 1843508.5622719254.[0m
[32m[I 2023-05-09 09:55:33,495][0m Trial 5 finished with value: 1839546.6894726986 and parameters: {'n_estimators': 400, 'learning_rate': 0.03139831051955276, 'max_depth': 5, 'subsample': 0.6109475615989499}. Best is trial 5 with value: 1839546.6894726986.[0m
[32m[I 2023-05-09 09:55:33,853][0m Trial 7 finished with value: 1835998.7767041947 and parameters: {'n_estimators': 400, 'learning_rate': 0.02386563753132556, 'max_depth': 5, 'subsample': 0.8586112648405853}. Best is trial 7 with value: 1835998.7767041947.[0m
[32m[I 2023-05-09 09:55:40,868][0m Trial 2 finished with value: 1820137.140159307 and parameters: {'n_estimators': 800, 'learning_rate': 0.057775267121728945, 'max_depth': 3, 'subsample': 0.984715462979125}. B

Best RMSE: 1811474.7673
Best Parameters: {'n_estimators': 100, 'learning_rate': 0.04708715141709794, 'max_depth': 3, 'subsample': 0.8258520600116113}


[32m[I 2023-05-09 10:01:02,742][0m Trial 0 finished with value: 1840517.3637899002 and parameters: {'n_estimators': 200, 'learning_rate': 0.008714073648800195, 'max_depth': 4, 'num_leaves': 19}. Best is trial 0 with value: 1840517.3637899002.[0m
[32m[I 2023-05-09 10:01:03,496][0m Trial 2 finished with value: 1823639.3516184764 and parameters: {'n_estimators': 200, 'learning_rate': 0.049288408337355566, 'max_depth': 7, 'num_leaves': 18}. Best is trial 2 with value: 1823639.3516184764.[0m
[32m[I 2023-05-09 10:01:05,453][0m Trial 6 finished with value: 1825022.6129001523 and parameters: {'n_estimators': 300, 'learning_rate': 0.06472975330853399, 'max_depth': 8, 'num_leaves': 27}. Best is trial 2 with value: 1823639.3516184764.[0m
[32m[I 2023-05-09 10:01:05,909][0m Trial 3 finished with value: 1823389.4077521684 and parameters: {'n_estimators': 900, 'learning_rate': 0.024737119017531495, 'max_depth': 3, 'num_leaves': 26}. Best is trial 3 with value: 1823389.4077521684.[0m
[32m

Best RMSE: 1807076.9120
Best Parameters: {'n_estimators': 100, 'learning_rate': 0.043753404921026987, 'max_depth': 6, 'num_leaves': 41}


In [9]:
# 앙상블 모델 정의
stack = StackingCVRegressor(regressors=(knn, rf, gb, xgb, lgbm),
                            meta_regressor=gb,
                            cv=KFold(n_splits=5, shuffle=True, random_state=42),
                            use_features_in_secondary=True)

# 앙상블 모델 학습
stack.fit(X_train, y_train)

# 앙상블 모델 예측
y_pred = stack.predict(X_test)



In [10]:
# 평가지표 계산
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 평가지표 출력
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")

RMSE: 917808.4689940624
MAE: 380126.92917561776
R-squared: 0.700736788201779


In [11]:
# Base Model 정의
lr = LinearRegression()
rf = RandomForestRegressor(n_estimators=100, random_state=42)
xgb = XGBRegressor(n_estimators=100, random_state=42)
lgbm = LGBMRegressor(n_estimators=100, random_state=42)
mlp = MLPRegressor(random_state=42)

# Meta Model 정의
meta_model = LinearRegression()

# Stacking 알고리즘 적용하기
stack = StackingCVRegressor(regressors=(lr, rf, xgb, lgbm, mlp),
                            meta_regressor=meta_model,
                            cv=KFold(n_splits=5, shuffle=True, random_state=42),
                            use_features_in_secondary=True)

# Stacking 알고리즘 적용하기
stack.fit(X_train, y_train)

# Test 데이터로 예측하기
y_pred = stack.predict(X_test)



In [12]:
# 평가지표 계산
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# 평가지표 출력
print(f"RMSE: {rmse}")
print(f"MAE: {mae}")
print(f"R-squared: {r2}")

RMSE: 845260.8873338506
MAE: 465832.1508821438
R-squared: 0.7461771256791856
