# OPTUNA를 이용한 하이퍼파라미터 튜닝
- xgboost
- lightgbm
- randomforest

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from functools import partial
import optuna
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Load the training set from its relative path and discard every row that
# contains a missing value.
df = pd.read_csv('../../data/train_V2.csv').dropna()

In [None]:
def feature_engineering(df):
    """Collapse the rows of each group into a single row of mean features.

    ``matchType`` is label-encoded to integers first so that it can be
    averaged together with the numeric columns.

    Args:
        df: DataFrame that contains ``groupId``, ``matchType`` and every
            column named in ``feature_cols`` below. It is not modified.

    Returns:
        DataFrame with one row per ``groupId``: the ``groupId`` column
        followed by the per-group mean of each feature column, in the order
        of ``feature_cols``.
    """
    # Columns to aggregate (numeric columns plus the label-encoded matchType).
    feature_cols = ['assists', 'boosts', 'damageDealt', 'DBNOs',
                    'headshotKills', 'heals', 'killPlace', 'killPoints', 'kills', 'matchType',
                    'killStreaks', 'longestKill', 'matchDuration', 'maxPlace',
                    'numGroups', 'rankPoints', 'revives', 'rideDistance', 'roadKills',
                    'swimDistance', 'teamKills', 'vehicleDestroys', 'walkDistance',
                    'weaponsAcquired', 'winPoints']

    # Copy only the columns that are actually used: the caller's frame stays
    # untouched and the columns that the aggregation would discard anyway are
    # never duplicated in memory.
    working = df[['groupId'] + feature_cols].copy()

    # Encode the match type as integer labels so it can be averaged.
    working['matchType'] = LabelEncoder().fit_transform(working['matchType'])

    # Average every feature column within each group.
    return working.groupby(['groupId'])[feature_cols].agg('mean').reset_index()


# Per-group mean features computed by feature_engineering().
train02 = feature_engineering(df)

# Start from groupId and the target of the original frame so that the original
# row order and row count are kept, then attach the engineered features.
# how='left' makes the original rows the reference side of the merge, so the
# result follows their order and has the same number of rows.
train03 = df[['groupId', 'winPlacePerc']].merge(train02, how='left', on='groupId')

# Drop two columns before training: killPlace carries a very large share of the
# signal and is left out for study purposes; groupId is dropped so that only
# numeric columns remain.
train03 = train03.drop(columns=['killPlace', 'groupId'])

# Target (kept as a single-column DataFrame) and feature matrix.
y = train03[['winPlacePerc']]
X = train03.drop(columns='winPlacePerc')

In [None]:
def _fit_accepts(estimator, keyword):
    """Return True if ``estimator.fit`` declares a parameter named ``keyword``."""
    import inspect  # local import: only this helper needs it

    return keyword in inspect.signature(estimator.fit).parameters


def optimizer(trial, X, y, model):
    """Optuna objective: fit one candidate regressor and return its hold-out MAE.

    The data is split 80/20 with a fixed seed, so every trial sees the same
    split. For the boosted models the validation part is used both for early
    stopping and for the returned score, so that score may be slightly
    optimistic.

    Args:
        trial: Optuna trial used to sample the hyperparameters.
        X: Feature matrix.
        y: Regression target aligned with ``X``.
        model: Key of the estimator to tune: ``'xgb'`` (XGBRegressor),
            ``'lgb'`` (LGBMRegressor) or ``'rf'`` (RandomForestRegressor).

    Returns:
        Mean absolute error of the fitted estimator on the validation split.

    Raises:
        ValueError: If ``model`` is not one of the supported keys.
    """
    supported = ('xgb', 'lgb', 'rf')
    # Fail fast with a clear message instead of reaching the scoring line with
    # no predictions defined.
    if model not in supported:
        raise ValueError(f"Unsupported model {model!r}; expected one of {supported}")

    patience = 200  # early-stopping rounds for the boosted models

    X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, random_state=42)
    eval_set = [(X_valid, y_valid)]

    if model == 'xgb':
        param = {
            'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
            'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
            'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3,0.4,0.5,0.6,0.7,0.8,0.9, 1.0]),
            'subsample': trial.suggest_categorical('subsample', [0.4,0.5,0.6,0.7,0.8,1.0]),
            'learning_rate': trial.suggest_categorical('learning_rate', [0.01,0.014, 0.02, 0.05]),
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_categorical('max_depth', [5,7,9,11,13,15,17]),
            # Single-choice categorical: the seed is fixed at 42.
            'random_state': trial.suggest_categorical('random_state', [42]),
            'min_child_weight': trial.suggest_int('min_child_weight', 0, 300),
            'n_jobs': -1,
        }
        estimator = xgb.XGBRegressor(**param)
        fit_kwargs = {'eval_set': eval_set}
        # Older XGBoost releases take early_stopping_rounds in fit(); newer
        # ones only accept it as an estimator parameter.
        if _fit_accepts(estimator, 'early_stopping_rounds'):
            fit_kwargs['early_stopping_rounds'] = patience
        else:
            estimator.set_params(early_stopping_rounds=patience)
        estimator.fit(X_train, y_train, **fit_kwargs)

    elif model == 'lgb':
        param = {
            'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 1e-1, log=True),
            'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 1e-1, log=True),
            'path_smooth': trial.suggest_float('path_smooth', 1e-8, 1e-3, log=True),
            'learning_rate': trial.suggest_categorical('learning_rate', [0.01,0.014, 0.02, 0.05]),
            'num_leaves': trial.suggest_int('num_leaves', 30, 200),
            'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 10, 100),
            'max_bin': trial.suggest_int('max_bin', 100, 255),
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'feature_fraction': trial.suggest_float('feature_fraction', 0.5, 0.9),
            # NOTE(review): LightGBM documents that bagging_fraction only takes
            # effect when bagging_freq > 0 -- confirm whether bagging_freq
            # should be set as well.
            'bagging_fraction': trial.suggest_float('bagging_fraction', 0.5, 0.9),
            'random_state': trial.suggest_categorical('random_state', [42]),
            'n_jobs': -1,
        }
        estimator = lgb.LGBMRegressor(**param)
        fit_kwargs = {'eval_set': eval_set, 'eval_metric': 'l1'}
        # Older LightGBM releases take early_stopping_rounds in fit(); newer
        # ones expect the early_stopping callback instead.
        if _fit_accepts(estimator, 'early_stopping_rounds'):
            fit_kwargs['early_stopping_rounds'] = patience
        else:
            fit_kwargs['callbacks'] = [lgb.early_stopping(patience)]
        estimator.fit(X_train, y_train, **fit_kwargs)

    else:  # model == 'rf'
        param = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'random_state': trial.suggest_categorical('random_state', [42]),
            'max_depth': trial.suggest_categorical('max_depth', [5,7,9,10]),
            'min_samples_leaf': trial.suggest_categorical('min_samples_leaf', [3, 4, 5]),
            'min_samples_split': trial.suggest_categorical('min_samples_split', [8, 10, 12]),
            'n_jobs': -1,
        }
        estimator = RandomForestRegressor(**param)
        estimator.fit(X_train, y_train)

    preds = estimator.predict(X_valid)
    return mean_absolute_error(y_valid, preds)


In [None]:
# XGBoost: create a study that minimizes the objective (validation MAE) and
# run 10 trials of the objective bound to the 'xgb' model key.
xgb_study = optuna.create_study(direction="minimize")

opt_func = partial(optimizer, model='xgb', X=X, y=y)
xgb_study.optimize(opt_func, n_trials=10)

In [None]:
# Display the trials of the XGBoost study as a DataFrame.
xgb_study.trials_dataframe()

In [None]:
# Hyperparameters sampled in the best trial of the XGBoost study
# (the study minimizes, so "best" is the lowest objective value).
Best_trial = xgb_study.best_trial.params
# Bare expression so the notebook displays the parameters.
Best_trial

In [None]:
# LightGBM: create a study that minimizes the objective (validation MAE) and
# run 10 trials of the objective bound to the 'lgb' model key.
lgb_study = optuna.create_study(direction="minimize")

opt_func = partial(optimizer, model='lgb', X=X, y=y)
lgb_study.optimize(opt_func, n_trials=10)

In [None]:
# Display the trials of the LightGBM study as a DataFrame.
lgb_study.trials_dataframe()

In [None]:
# Hyperparameters sampled in the best trial of the LightGBM study
# (the study minimizes, so "best" is the lowest objective value).
# Note: this rebinds the same Best_trial name used for the other studies.
Best_trial = lgb_study.best_trial.params
# Bare expression so the notebook displays the parameters.
Best_trial

In [None]:
# Random forest: create a study that minimizes the objective (validation MAE)
# and run 10 trials of the objective bound to the 'rf' model key.
rf_study = optuna.create_study(direction="minimize")

opt_func = partial(optimizer, model='rf', X=X, y=y)
rf_study.optimize(opt_func, n_trials=10)

In [None]:
# Display the trials of the random-forest study as a DataFrame.
rf_study.trials_dataframe()

In [None]:
# Hyperparameters sampled in the best trial of the random-forest study
# (the study minimizes, so "best" is the lowest objective value).
# Note: this rebinds the same Best_trial name used for the other studies.
Best_trial = rf_study.best_trial.params
# Bare expression so the notebook displays the parameters.
Best_trial