# 모델 선택.ipynb에서 선택한 RandomForestRegressor의 최적 파라미터 찾기

# 라이브러리

In [None]:
# Mount Google Drive at /content/drive (Colab runtime); the dataset is read
# from a path under this mount point later in the notebook.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the third-party packages used by this notebook into the runtime.
!pip install pycaret
!pip install scikit-learn
# joblib is requested explicitly at version 1.3.
!pip install joblib==1.3
!pip install -q catboost
!pip install --upgrade -q xgboost
# Pinned release of the package that provides the `bayes_opt` module.
!pip install bayesian-optimization==1.4.2

Collecting pycaret
  Downloading pycaret-3.3.2-py3-none-any.whl (486 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m486.1/486.1 kB[0m [31m7.6 MB/s[0m eta [36m0:00:00[0m
Collecting joblib<1.4,>=1.2.0 (from pycaret)
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.2/302.2 kB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn>1.4.0 (from pycaret)
  Downloading scikit_learn-1.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (13.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.3/13.3 MB[0m [31m20.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pyod>=1.1.3 (from pycaret)
  Downloading pyod-2.0.1.tar.gz (163 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m163.8/163.8 kB[0m [31m17.7 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting imbalanced-learn>=0.12.0 (from p

In [None]:
import pandas as pd
import numpy as np
from pycaret.regression import setup, compare_models
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
import pycaret
from itertools import permutations
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
import itertools
from sklearn.ensemble import RandomForestRegressor

# 데이터 불러오기 및 전처리

In [None]:
# Load the KBO top-5 clubs dataset (the file is cp949-encoded).
df = pd.read_csv("/content/drive/MyDrive/DATATHON/data/@KBO_5개구단/KBO_top5.csv", encoding = 'cp949')

# Player name and year are dropped before modelling.
df = df.drop(columns=['선수명', '년도'])

# Drop rows whose batting average ('타율') is the placeholder '-'.
# .copy() gives an independent frame, so the assignments below never act on a
# slice of the original and cannot trigger chained-assignment warnings.
df = df[df['타율'] != '-'].copy()

# In the remaining columns, treat the '-' placeholder and NaN as 0.
df = df.replace('-', 0).fillna(0)

# Features and target used for modelling.
features = ['출루율', '장타율', '홈런', '수비 승리 기여도', '득점권타율', '도루허용']
target = '추정득점'

# Coerce every modelling column to a numeric dtype. A column that contained
# '-' was read as strings and would otherwise stay object dtype; columns that
# are already numeric pass through unchanged.
model_columns = features + [target]
df[model_columns] = df[model_columns].apply(pd.to_numeric)

X = df[features]
y = df[target]

# Hold out 20% of the rows for testing; fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=100)

# 1차시도

In [None]:
# Bayesian optimisation of the RandomForestRegressor hyperparameters.

# `bayes_opt` is the module provided by the bayesian-optimization package.
from bayes_opt import BayesianOptimization, UtilityFunction


def rf_cv(n_estimators, max_depth, max_features):
    """Return the negated mean 5-fold cross-validated RMSE of a random forest.

    BayesianOptimization can only *maximise* its objective, so the error is
    returned with a negative sign: the larger the returned value, the smaller
    the RMSE.

    Args:
        n_estimators: Number of trees. The optimiser samples floats, so the
            value is truncated to an int.
        max_depth: Maximum tree depth; truncated to an int for the same reason.
        max_features: Fraction of features considered at each split.

    Returns:
        Mean of the 'neg_root_mean_squared_error' scores over the 5 folds
        (a value <= 0).
    """
    model = RandomForestRegressor(n_estimators=int(n_estimators),
                                  max_depth=int(max_depth),
                                  max_features=max_features,
                                  random_state=0)

    # scikit-learn already reports this metric negated ("higher is better"),
    # which is exactly the orientation the optimiser needs.
    neg_rmse = cross_val_score(model, X_train, y_train,
                               scoring='neg_root_mean_squared_error',
                               cv=5, n_jobs=-1)
    return neg_rmse.mean()


# Search bounds for each hyperparameter.
pbounds_rf = {'n_estimators': (50, 200),
           'max_depth': (1, 20),
           'max_features': (0.1, 1.0)}

# Create the optimiser.
optimizer = BayesianOptimization(f=rf_cv, pbounds=pbounds_rf, random_state=0)

# Run the optimisation with the expected-improvement acquisition function.
# The acquisition settings are passed as a UtilityFunction instance, which is
# the supported way to configure them in bayesian-optimization 1.4.x.
acquisition = UtilityFunction(kind='ei', xi=0.01)
optimizer.maximize(init_points=10, n_iter=100, acquisition_function=acquisition)

# Report the best parameters as they were actually fitted (integers where
# rf_cv truncates) and flip the sign back so the score is a plain RMSE.
best_rf_params = dict(optimizer.max['params'])
best_rf_params['n_estimators'] = int(best_rf_params['n_estimators'])
best_rf_params['max_depth'] = int(best_rf_params['max_depth'])
print("Best Parameters: ", best_rf_params)
print("Best Score (RMSE): ", -optimizer.max['target'])

|   iter    |  target   | max_depth | max_fe... | n_esti... |
-------------------------------------------------------------
| [0m1        [0m | [0m9.347    [0m | [0m11.43    [0m | [0m0.7437   [0m | [0m140.4    [0m |
| [95m2        [0m | [95m9.549    [0m | [95m11.35    [0m | [95m0.4813   [0m | [95m146.9    [0m |
| [0m3        [0m | [0m9.336    [0m | [0m9.314    [0m | [0m0.9026   [0m | [0m194.5    [0m |
| [0m4        [0m | [0m9.308    [0m | [0m8.285    [0m | [0m0.8126   [0m | [0m129.3    [0m |
| [0m5        [0m | [0m9.349    [0m | [0m11.79    [0m | [0m0.933    [0m | [0m60.66    [0m |
| [95m6        [0m | [95m16.57    [0m | [95m2.655    [0m | [95m0.1182   [0m | [95m174.9    [0m |
| [0m7        [0m | [0m9.32     [0m | [0m15.78    [0m | [0m0.883    [0m | [0m196.8    [0m |
| [0m8        [0m | [0m9.182    [0m | [0m16.18    [0m | [0m0.5153   [0m | [0m167.1    [0m |
| [0m9        [0m | [0m11.09    [0m | [0m3.24

# 2차시도

In [None]:
# Hyperparameter grid for the exhaustive search.
# max_features=1.0 (use all features) replaces the former 'auto' option, which
# meant the same thing for regressors but was removed in scikit-learn 1.3 and
# makes every candidate that uses it fail to fit on newer versions.
param_grid = {
    'n_estimators': [50, 100, 150],
    'max_depth': [None, 10, 20, 30],
    'max_features': [1.0, 'sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Base estimator; fixed seed so the search is reproducible.
rf = RandomForestRegressor(random_state=42)

# 5-fold grid search over every combination, using all CPU cores.
# No `scoring` is given, so candidates are ranked by the estimator's own
# score method (R^2 for regressors).
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Run the search on the training split.
grid_search.fit(X_train, y_train)

# Best hyperparameters and their mean cross-validated score.
best_params = grid_search.best_params_
best_score = grid_search.best_score_

best_params, best_score

Fitting 5 folds for each of 324 candidates, totalling 1620 fits


({'max_depth': None,
  'max_features': 'sqrt',
  'min_samples_leaf': 1,
  'min_samples_split': 2,
  'n_estimators': 50},
 0.9056737398515633)

In [None]:
# Models and their hyperparameter search spaces (RandomForest only).
models = {
    'RandomForest': (RandomForestRegressor(random_state=42), {
        'n_estimators': [50, 100, 150, 200],
        'max_depth': [10, 20, 30, 40],
        'max_features': [0.3, 0.5, 0.7, 1.0]
    })
}

# Run a randomized search and a grid search per model, then evaluate the
# grid-search winner on the held-out test split.
results = {}

# `search_space` is named so that it does not shadow the `param_grid` global
# defined by the earlier grid-search cell.
for name, (model, search_space) in models.items():
    random_search = RandomizedSearchCV(model, param_distributions=search_space, n_iter=10, cv=5, random_state=42, n_jobs=-1)
    random_search.fit(X_train, y_train)

    grid_search = GridSearchCV(model, search_space, cv=5, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    # All metrics below are computed from the grid search's refit best estimator.
    predictions = grid_search.predict(X_test)
    errors = predictions - y_test
    mae = np.mean(np.abs(errors))
    mse = np.mean(errors ** 2)
    rmse = np.sqrt(mse)
    r2 = grid_search.score(X_test, y_test)

    results[name] = {
        # Parameters of the model that produced the metrics in this row.
        'Best Params': grid_search.best_params_,
        # Kept separately for comparison; these did not produce the metrics.
        'Random Search Params': random_search.best_params_,
        'MAE': mae,
        'MSE': mse,
        'RMSE': rmse,
        'R2': r2
    }

# One row per model, sorted by test RMSE (lowest first).
results_df = pd.DataFrame(results).T
results_df_sorted = results_df.sort_values(by='RMSE')

# Display the final results table.
results_df_sorted

Unnamed: 0,Best Params,MAE,MSE,R2,RMSE
RandomForest,"{'n_estimators': 100, 'max_features': 0.7, 'ma...",6.729294,129.551769,0.860595,11.382081


{'n_estimators': 100, 'max_features': 0.7, 'max_depth': 20}

# 결론

- n_estimators=50, max_depth=20, max_features=0.3, random_state=42 로 최종 결정