## 김영준 - RandomForest

In [None]:
# ===== RandomForest (author: Kim Youngjun) =====
# LightGBM in random-forest mode (boosting_type="rf"): grid-search four
# hyper-parameters with a reduced tree count, then fit the final model with
# the full tree budget and early stopping on the validation set.
rnd.seed(334)
ntrees = 500
patientRate = 0.2  # fraction of ntrees used for tuning and as early-stopping patience
eta = 0.01
seed = 9191

# Search space; num_leaves takes the values 2**i - 1, i.e. 3, 15, 63, 255.
tuner_params = {
    "num_leaves": [2 ** i - 1 for i in (2, 4, 6, 8)],
    "subsample": [0.4, 0.6, 0.8],
    "colsample_bytree": [0.6, 0.8, 1],
    "reg_lambda": list(np.linspace(0.1, 10, 10).round(3)),
}

# Tuning estimator: only ntrees * patientRate trees. n_jobs=None here because
# the parallelism is handed to the tuner below.
lgb_model = lgb.LGBMRegressor(
    boosting_type="rf",
    objective="regression",
    n_estimators=int(np.floor(ntrees * patientRate)),
    learning_rate=eta,
    subsample_freq=1,
    random_state=seed,
    n_jobs=None,
    silent=True,
)

# GridTuner is presumably an alias for a scikit-learn style grid search - TODO confirm.
# refit=False: only best_params_ is read from it afterwards.
model_tuner = GridTuner(
    lgb_model,
    param_grid=tuner_params,
    scoring="neg_root_mean_squared_error",
    cv=10,
    refit=False,
    n_jobs=multiprocessing.cpu_count(),
    pre_dispatch=multiprocessing.cpu_count(),
)
model_tuner.fit(
    train_x,
    train_y,
    categorical_feature=findIdx(train_x, cat_vars),
    verbose=False,
)

print("Tuning Result --->", model_tuner.best_params_)
model_rf = {"best_params": model_tuner.best_params_}

# Final estimator: full tree budget, different seed, tuned values unpacked
# straight from the search result (its keys are exactly the four grid keys).
lgb_model = lgb.LGBMRegressor(
    boosting_type="rf",
    objective="regression",
    n_estimators=ntrees,
    learning_rate=eta,
    subsample_freq=1,
    random_state=seed + 9,
    n_jobs=multiprocessing.cpu_count(),
    silent=True,
    **model_rf["best_params"],
)
model_rf["model"] = lgb_model.fit(
    train_x,
    train_y,
    categorical_feature=findIdx(train_x, cat_vars),
    eval_set=[(val_x, val_y)],
    eval_metric="rmse",
    early_stopping_rounds=int(np.floor(ntrees * patientRate)),
    verbose=False,
)

# Validation-set predictions and scores.
model_rf["pred"] = model_rf["model"].predict(val_x)
model_rf["performance"] = {
    "RMSE": np.sqrt(metrics.mean_squared_error(val_y, model_rf["pred"])),
    "R2": metrics.r2_score(val_y, model_rf["pred"]),
}

print(model_rf["model"].best_iteration_)
print(model_rf["best_params"])
print(model_rf["performance"])

## 김남이 - XGBoost

In [None]:
#=======================[ XGBoost (author: Kim Nami) ]=======================#
# Public import path; importing `display` from IPython.core.display is deprecated.
from IPython.display import display, HTML

# One-hot encode the categorical columns; the XGBoost models below are fitted
# on the encoded frame.
oh_encoder = MyOneHotEncoder()
train_x_oh = oh_encoder.fit_transform(train_x, cat_vars)

ntrees = 5000
# The grid search uses 30% of the tree budget per candidate.
model_xgb3 = XGBRegressor(booster="gbtree", n_estimators=int(ntrees * 0.3),
                          objective="reg:squarederror", seed=343)
xgb_param_grid = {
    'learning_rate': [0.01, 0.05],
    'max_depth': [2, 4, 6],
    'reg_lambda': [0.5, 1, 5, 10],
    'subsample': [0.5, 0.6, 0.8],
}

# GridTuner is presumably an alias for a scikit-learn style grid search - TODO confirm.
xgb_grid3 = GridTuner(model_xgb3, param_grid=xgb_param_grid,
                      scoring='neg_root_mean_squared_error',
                      cv=10, n_jobs=-1, refit=False, verbose=1)
xgb_grid3.fit(train_x_oh, train_y)

# Show the best-ranked parameter combinations.
cv_result = pd.DataFrame(xgb_grid3.cv_results_)
cv_result.sort_values(by=['rank_test_score'], inplace=True)
pd.set_option('display.max_colwidth', 500)  # keep the full params dict visible

cv2 = cv_result.loc[:, ['params', 'mean_test_score', 'rank_test_score']]
# A bare `cv2.head()` in the middle of a cell renders nothing; display it explicitly.
display(cv2.head())


# Hold out 20% for validation.
# NOTE: this rebinds train_x / train_x_oh / train_y to the training part, so
# re-running the cell splits the already-split data again.
# NOTE(review): the grid search above ran before this split, i.e. on data that
# includes the validation rows - confirm this is intended.
train_x, val_x, train_x_oh, val_x_oh, train_y, val_y = tts(
    train_x, train_x_oh, train_y, test_size=0.2, random_state=777)

# Final model with hard-coded hyper-parameters (not read from xgb_grid3) and
# early stopping on the validation set.
xgb4 = XGBRegressor(n_estimators=ntrees, learning_rate=0.01, max_depth=4,
                    reg_lambda=5, subsample=0.5, objective="reg:squarederror",
                    colsample_bytree=0.8, seed=343)
evals = [(val_x_oh, val_y)]
xgb4.fit(train_x_oh, train_y, early_stopping_rounds=500, eval_metric='rmse',
         eval_set=evals, verbose=1)

xgb4_pred = xgb4.predict(val_x_oh)
# xgb4_rmse now really holds the RMSE (it used to hold the MSE and was only
# square-rooted inside print, under a "Mean squared error" label).
xgb4_rmse = np.sqrt(mean_squared_error(val_y, xgb4_pred))
xgb4_r2 = r2_score(val_y, xgb4_pred)
print('Root mean squared error: ', xgb4_rmse)
print('R2 score: ', xgb4_r2)

## 이지예 - CatBoost

In [None]:
####------------------------[ CatBoost (author: Lee Jiye) ] -------------------------####
start = time.time()

### 1. Search for the best CatBoost hyper-parameters
ntrees = 3000
# The tuning estimator gets its own name. Binding it to `cb` would replace the
# `cb` module alias with an estimator instance, and the `cb.CatBoostRegressor`
# call in step 2 would then fail with AttributeError.
cb_tuner = cb.CatBoostRegressor(random_state=11, n_estimators=int(ntrees * 0.2),
                                loss_function='RMSE')
param = {
    'learning_rate': [0.05, 0.06, 0.1],
    'max_depth': [2, 5, 8],
    'l2_leaf_reg': [0, 3, 5, 10],
}
grid_cv = GridSearchCV(cb_tuner, param_grid=param,
                       scoring='neg_root_mean_squared_error',
                       cv=10, verbose=1, n_jobs=-1)
grid_cv.fit(train_x_oh, train_y)
print('최적 하이퍼 파라미터: \n', grid_cv.best_params_)
print('최고 예측 정확도(RMSE의 -값): {0:.4f}'.format(grid_cv.best_score_))

### 2. Retrain with the chosen hyper-parameters
# The values are hard-coded here, not read from grid_cv.best_params_.
# Full tree budget with early stopping on the validation set.
cb1 = cb.CatBoostRegressor(l2_leaf_reg=3, learning_rate=0.06, n_estimators=ntrees,
                           max_depth=5, boosting_type='Plain',
                           early_stopping_rounds=500, use_best_model=True,
                           loss_function='RMSE')
cb1_model = cb1.fit(train_x_oh, train_y, eval_set=[(val_x_oh, val_y)])
# Predict on the validation set with the retrained model (cb1), not with the
# grid-search object.
cb1_model_predict = cb1_model.predict(val_x_oh)

print("Time:%.1f" % (time.time() - start), "seconds")  # elapsed time of this cell
print("RMSE: {:.3f}".format(sqrt(mean_squared_error(val_y, cb1_model_predict))))
print("R2: {:.3f}".format(r2_score(val_y, cb1_model_predict)))

## 이예주 - LightGBM

In [None]:
#======================== LightGBM (author: Lee Yeju) ==============================#
start = time.time()

# Hyper-parameter search (20% of the tree budget per candidate)
ntrees = 5000
model_lgb = LGBMRegressor(boosting="goss", n_estimators=int(ntrees * 0.2),
                          objective="regression", seed=525)
lgb_param_grid = {
    'learning_rate': [0.01, 0.05, 0.1],
    'num_leaves': [3, 7, 15, 31],
    'reg_lambda': [0.1, 1, 10],
    'subsample': [0.5, 0.6, 0.7],
}
lgb_grid = GridSearchCV(model_lgb, param_grid=lgb_param_grid,
                        scoring='neg_root_mean_squared_error',
                        cv=10, n_jobs=-1, refit=False, verbose=1)
lgb_grid.fit(train_x_oh, train_y)
print("최적 하이퍼 파라미터:", lgb_grid.best_params_)

# Final training with hard-coded hyper-parameters (not read from lgb_grid).
# The estimator is bound to `lgb_final`, not `lgb`: other cells call
# `lgb.LGBMRegressor(...)` through the `lgb` module alias, and rebinding `lgb`
# to an estimator instance makes those calls fail with AttributeError.
lgb_final = LGBMRegressor(
    boosting="goss", n_estimators=ntrees,
    objective="regression", seed=525,
    learning_rate=0.01, num_leaves=7,
    reg_lambda=1, subsample=0.5,
)
# Train and evaluate on the same one-hot split that the grid search above and
# the other model cells use. The original referenced X_train / y_train /
# X_test / y_test, which are not defined in the visible cells.
# NOTE(review): confirm no separate split was intended here.
evals = [(val_x_oh, val_y)]
lgb_final.fit(train_x_oh, train_y, early_stopping_rounds=100, eval_metric='rmse',
              eval_set=evals, verbose=True)
lgb_pred = lgb_final.predict(val_x_oh)
lgb_rmse = sqrt(mean_squared_error(val_y, lgb_pred))
lgb_r2 = r2_score(val_y, lgb_pred)
# The value printed is the RMSE, so label it as such.
print('Root mean squared error: ', lgb_rmse)
print('R2 score: ', lgb_r2)

## Stacking

In [None]:
#---------------- Stacked Ensemble----------------
# Stack XGBoost, LightGBM and CatBoost under a linear meta-learner and score
# the ensemble on the validation set.
# RandomForest is left out (performance reasons).
rnd.seed(1234)
stacking_base_models = [
    ("XGBoost", xgb.XGBRegressor(booster="gbtree", objective="reg:squarederror",
                                 n_estimators=2478, max_depth=4,
                                 subsample=0.6, colsample_bytree=0.8,
                                 reg_lambda=5, learning_rate=0.01,
                                 verbosity=0, random_state=777)),
    ("LightGBM", lgb.LGBMRegressor(boosting_type="goss", objective="regression",
                                   n_estimators=912, num_leaves=2**3-1,
                                   subsample=0.5, colsample_bytree=0.8,
                                   reg_lambda=11, learning_rate=0.01,
                                   silent=True, random_state=777)),
    ("CatBoost", cat.CatBoostRegressor(boosting_type='Plain', loss_function='RMSE',
                                       n_estimators=535, max_depth=5,
                                       rsm=0.8,  # rsm = colsample_bytree
                                       l2_leaf_reg=3, learning_rate=0.06,
                                       silent=True, random_seed=777))
]

# Meta-learner candidates, with the scores that were pasted after each one in
# the original cell (presumably validation results of earlier runs):
#   lm.ElasticNetCV(l1_ratio=[.1, .3, .5, .6, .7, .75, .8, .85, .9, .95, .99, 1],
#                   n_alphas=1000, random_state=777)
#       -> {'RMSE': 77.21209404644407, 'R2': 0.8712949169257178}
#   lm.LinearRegression()
#       -> {'RMSE': 76.8885220689633, 'R2': 0.8723713829212315}
# Only LinearRegression was ever in effect: the ElasticNetCV object was
# overwritten before use, so it is no longer constructed.
meta_learner_model = lm.LinearRegression()

result_se = {}
# cv=10 is passed to StackingRegressor for generating the meta-learner's
# training inputs from the base models.
result_se["model"] = ensemble.StackingRegressor(estimators=stacking_base_models,
                                                final_estimator=meta_learner_model,
                                                cv=10,
                                                n_jobs=multiprocessing.cpu_count())

result_se["model"].fit(train_x_oh, train_y)
result_se["pred"] = result_se["model"].predict(val_x_oh)
result_se["performance"] = {"RMSE": np.sqrt(metrics.mean_squared_error(val_y, result_se["pred"])),
                             "R2": metrics.r2_score(val_y, result_se["pred"])}
print(result_se["performance"])