### 🚩 Import libraries & packages

In [47]:
# import data tools
import numpy as np
import pandas as pd


# import visual tools
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline
%config InlineBackend.figure_format = 'retina'


# import util tools
import os
from os.path import join    # define route of files

import warnings
warnings.filterwarnings("ignore")


# import data pre-processing tools
import missingno as msno    # check missing data


# import ML tools 1
import sklearn.model_selection as model_selection
import sklearn.metrics as metrics


# import ML tools 2
import sklearn.ensemble as ensemble
import xgboost as xgb
import lightgbm as lgb
    # XGBM (extreme GBM), LGBM (light GBM) :
    # variants of the gradient boosting machine (GBM) family of algorithms
    # each one is available as its own open-source library :D

### 🚩 Define constants (hyper params)

In [48]:
# Seed passed as random_state to the estimators built at notebook level and to the
# train_test_split() call used for the model comparison.
RANDOM_STATE = 2020

TEST_SIZE = 0.2    # fraction of rows held out as test_size in train_test_split()
CV_SIZE = 5        # number of cross validation folds (KFold n_splits / GridSearchCV cv)

# Number of fit/predict rounds passed as `epochs` to getAveragingBlending()
EPOCHS = 5

### 🚩 Load data files

In [49]:
# directory that contains the competition data (.csv) files
data_dir = "~/aiffel/kaggle_kakr_housing/data/"


# read train.csv and test.csv (in that order) -> pd.DataFrame
train_data, test_data = (
    pd.read_csv(join(data_dir, csv_name)) for csv_name in ("train.csv", "test.csv")
)

### 🚩 Data pre-processing

In [50]:
### train data pre-processing
#   - "date"  : keep the first six characters of each string, stored as an integer
#               (presumably YYYYMM -- confirm against the CSV)
#   - "price" : replaced by log1p(price); predictions are mapped back with expm1 later
#   - "id"    : dropped, it is not used as a feature
train_data = train_data.assign(
    date = train_data["date"].str[:6].astype(int),
    price = np.log1p(train_data["price"]),
).drop(columns = ["id"])



### test data pre-processing
#   same "date" and "id" handling as the train data (this block does not touch a "price" column)
test_data = test_data.assign(
    date = test_data["date"].str[:6].astype(int),
).drop(columns = ["id"])

### 🚩 Extract feature matrices (X) & target vectors (y)

In [51]:
# separate the train data into the target vector (y) and the feature matrix (X)
y = train_data["price"]                     # log1p-scaled target
X = train_data.drop("price", axis = 1)      # every remaining column is a feature

### 🚩 Define useful methods (RMSE, cross validation, grid search)

In [52]:
# RMSE between two sets of log1p-scaled values, measured after undoing the log
def getRMSE_log2exp(y_test, y_pred):
    """Return the root mean squared error of `y_pred` against `y_test`.

    Both arguments are passed through np.expm1 (the inverse of np.log1p)
    before the error is computed, so the result is in the un-logged units.
    """
    actual = np.expm1(y_test)
    predicted = np.expm1(y_pred)
    return np.sqrt(metrics.mean_squared_error(actual, predicted))



# get the mean cross validation score of "one" learning model (for model evaluation)
def getCVscore(X, model, target = None):
    """Print and return the mean K-fold cross validation score of `model`.

    Parameters
    ----------
    X : pd.DataFrame
        Feature matrix; its `.values` array is what gets passed to scikit-learn.
    model : estimator
        Unfitted scikit-learn compatible estimator.
    target : array-like, optional
        Target vector. When omitted, the notebook-level `y` is used, which keeps
        existing `getCVscore(X, model)` calls working.

    Returns
    -------
    float
        Mean of the per-fold scores. No `scoring` is passed, so each fold is scored
        with the estimator's own `score` method (R^2 for scikit-learn regressors).
    """
    if target is None:
        target = y    # fall back to the notebook-level target vector

    # Pass the splitter itself. The previous code called .get_n_splits(), which only
    # returns the integer number of folds and threw the KFold object away.
    kfold = model_selection.KFold(n_splits = CV_SIZE)
    fold_scores = model_selection.cross_val_score(model, X = X.values, y = target, cv = kfold)
    score = np.mean(fold_scores)
    print("CV score of", model.__class__.__name__, ":", score)
    return score
        
    
    
# grid-search hyper-parameter values for a learning model
def searchBestParams(model, X, y, param_grid, verbose = 2, n_jobs = 5):
    """Run a grid search over `param_grid` and report the five best combinations.

    The search uses CV_SIZE-fold cross validation scored with
    "neg_mean_squared_error", so scores closer to zero are better.

    Returns a DataFrame with one column per parameter plus "score" (the mean
    test score), sorted best-first and limited to the top five rows. The same
    rows are also printed.
    """
    search = model_selection.GridSearchCV(
        model,
        param_grid = param_grid,
        scoring = "neg_mean_squared_error",
        cv = CV_SIZE,
        verbose = verbose,
        n_jobs = n_jobs,
    )
    search.fit(X, y)

    # one row per tried combination, ranked by mean test score (best first)
    ranking = pd.DataFrame(search.cv_results_["params"])
    ranking["score"] = search.cv_results_["mean_test_score"]
    ranking = ranking.sort_values("score", ascending = False, ignore_index = True)

    top_rows = ranking.head()
    print(top_rows)
    return top_rows



# write predicted "price" values into a submission file
def makeSubmissionFile(y_pred):
    """Store `y_pred` as the "price" column of sample_submission.csv and save the
    result as submission_new.csv in the same data directory (no index column)."""
    data_dir = "~/aiffel/kaggle_kakr_housing/data/"
    template_path = join(data_dir, "sample_submission.csv")
    output_path = join(data_dir, "submission_new.csv")

    pd.read_csv(template_path).assign(price = y_pred).to_csv(output_path, index = False)
    print("The submission file has created succesfully.")

### 🚩 Generate & evaluate models

In [53]:
# Build one regressor per algorithm, every one seeded with RANDOM_STATE.
# The order matters: later cells replace the XGBM / LGBM entries through models[0] and models[1].
models = [
    xgb.XGBRegressor(random_state = RANDOM_STATE),
    lgb.LGBMRegressor(random_state = RANDOM_STATE),
    ensemble.GradientBoostingRegressor(random_state = RANDOM_STATE),
    ensemble.RandomForestRegressor(random_state = RANDOM_STATE),
]

# individual names for the same four model objects
extreme, light, boost, forest = models



# Evaluate model performance (cross validation)
#for model in models:
#    score = getCVscore(X, model)

# Output :
# CV score of XGBRegressor  :  0.8973388661281285
# CV score of LGBMRegressor  :  0.9024911910917768
# CV score of GradientBoostingRegressor  :  0.8796312932769542
# CV score of RandomForestRegressor : 0.8851571351312119

### It seems four models provide sufficiently high performance!
### The cross validation run confirmed that all four learning models perform well enough,
### so all four are used for the training & prediction attempt below.
### The evaluation is finished, so it is commented out (#) and skipped on later runs.

### 🚩 Search best LGBM param values

In [54]:
# Candidate values for each LGBM parameter explored by the grid search
lgbm_param_grid = {
    "max_depth" : [-1],                      # depth of each decision tree (integer)
    "learning_rate" : [0.01, 0.05, 0.1],     # step size per boosting step, usually a float in 0.0001~0.1
    "n_estimators" : [50, 75, 100],          # number of individual models, usually an integer of 50~100 or more
    "num_leaves" : [26, 31, 36],             # maximum number of leaves one LightGBM tree may have
    "boosting_type" : ["gbdt"],              # boosting method, given as a string such as gbdt or rf
    "reg_lambda" : [30, 50, 70],             # L2 regularization term on weights
}


# Search best set of parameter values
#print(searchBestParams(light, X, y, lgbm_param_grid, verbose = 0))

# Output :
#   boosting_type  learning_rate  max_depth  n_estimators  num_leaves  reg_lambda      score
# 0          gbdt            0.1         -1           100          36          30  -0.026989
# 1          gbdt            0.1         -1           100          31          30  -0.027051
# 2          gbdt            0.1         -1           100          36          50  -0.027284
# 3          gbdt            0.1         -1           100          26          30  -0.027552
# 4          gbdt            0.1         -1           100          31          50  -0.027646

### LGBM performed best with learning_rate = 0.1, n_estimators = 100, num_leaves = 36, reg_lambda = 30
### XGBM is a GBM-family algorithm similar to LGBM, so similar values can be plugged in for it
### The parameter search is finished, so it is commented out (#) and skipped on later runs.

### 🚩 Adjust XGBM & LGBM params

In [55]:
# Re-create the XGBM and LGBM regressors with adjusted params
extreme = xgb.XGBRegressor(
    random_state = RANDOM_STATE,
    learning_rate = 0.2,
    n_estimators = 100,
)
light = lgb.LGBMRegressor(
    random_state = RANDOM_STATE,
    learning_rate = 0.1,
    n_estimators = 300,
    num_leaves = 36,
    reg_lambda = 30,
)

# Put the tuned XGBM & LGBM models into their slots of the model collection
models[0], models[1] = extreme, light

### These values were chosen after trying various settings close to the grid search result

### 🚩 Perform fit() & predict() -> Compare Models

In [56]:
# split train data -> for training & for validation
# Done once, outside the loop: random_state is fixed, so splitting again for every
# model only reproduced this exact same partition.
X_train, X_valid, y_train, y_valid = model_selection.train_test_split(
    X, y, test_size = TEST_SIZE, random_state = RANDOM_STATE)

for model in models:
    # model fitting (learning) on the training part only
    model.fit(X_train, y_train)

    # predict log1p-scaled "price" for the held-out validation rows
    y_pred = model.predict(X_valid)

    # get error on the un-logged price scale (author's goal: "less than 1.1M")
    # NOTE(review): the recorded errors are ~105K-128K, so the goal may have meant 110K -- confirm
    error = getRMSE_log2exp(y_valid, y_pred)
    print("RMSE of", model.__class__.__name__, " : ", error)

# Output :
# RMSE of XGBRegressor  :  107509.3104391456
# RMSE of LGBMRegressor  :  104654.07159199048
# RMSE of GradientBoostingRegressor  :  128360.19649691365
# RMSE of RandomForestRegressor  :  125487.07102453562

### The XGBM and LGBM models look promising, so let's try an ensemble approach with these two models

RMSE of XGBRegressor  :  107509.3104391456
RMSE of LGBMRegressor  :  104654.07159199048
RMSE of GradientBoostingRegressor  :  128360.19649691365
RMSE of RandomForestRegressor  :  125487.07102453562


### 🚩 Define ensemble system methods

In [57]:
# fit `epochs` XGBM/LGBM models on different random train splits and average their predictions
def getAveragingBlending(X, y, X_test, epochs, XGBM = False, LGBM = False):
    """Average the predictions of `epochs` independently trained models.

    Every round draws a fresh train_test_split of (X, y) without a random_state,
    fits a new model on the training part of that split and predicts `X_test`.

    Parameters
    ----------
    X, y : training feature matrix and target vector.
    X_test : rows to predict.
    epochs : int
        Number of models to train; must be at least 1.
    XGBM : bool
        If True, each round uses xgb.XGBRegressor; otherwise lgb.LGBMRegressor.
    LGBM : bool
        Kept so existing calls that pass it still work. It is not read:
        LightGBM is used whenever XGBM is False.

    Returns
    -------
    np.ndarray
        Element-wise mean of the `epochs` prediction vectors.

    Raises
    ------
    ValueError
        If `epochs` is smaller than 1.
    """
    if epochs < 1:
        raise ValueError("epochs must be at least 1")

    y_preds = []

    for _ in range(epochs):
        if XGBM:    # XGBM model
            model = xgb.XGBRegressor(learning_rate = 0.2, n_estimators = 100)
        else:       # LGBM model
            model = lgb.LGBMRegressor(learning_rate = 0.1, n_estimators = 300, num_leaves = 36, reg_lambda = 30)

        # only the training part of the random split is used; the held-out part is discarded
        X_train, _X_valid, y_train, _y_valid = model_selection.train_test_split(X, y, test_size = TEST_SIZE)

        model.fit(X_train, y_train)
        y_preds.append(model.predict(X_test))    # save predicted values from each model

    # Return the mean over all rounds. The previous code computed this mean but
    # returned only the last round's prediction.
    return np.mean(np.array(y_preds), axis = 0)

### 🚩 Predict "price" with various learning ideas

In [58]:
# Alternative ways of producing y_pred for the test data; only option 2-3 is active.
# The models are fitted on the log1p-scaled "price", so y_pred is log-scaled as well.

# 1. predict with only one XGBM
#extreme = xgb.XGBRegressor(random_state = RANDOM_STATE, learning_rate = 0.2, n_estimators = 100)
#extreme.fit(X, y)
#y_pred = extreme.predict(test_data)


# 2-1. predict with only one LGBM
#light = lgb.LGBMRegressor(random_state = RANDOM_STATE, learning_rate = 0.1, n_estimators = 300, num_leaves = 36)
#light.fit(X, y)
#y_pred = light.predict(test_data)


# 2-2. predict with only one LGBM & regularization parameter 30
#light_reg = lgb.LGBMRegressor(random_state = RANDOM_STATE, learning_rate = 0.1, n_estimators = 300, num_leaves = 36, reg_lambda = 30)
#light_reg.fit(X, y)
#y_pred = light_reg.predict(test_data)


# 2-3. predict with only one LGBM & regularization parameter 50
light_reg = lgb.LGBMRegressor(
    random_state = RANDOM_STATE,
    learning_rate = 0.1,
    n_estimators = 300,
    num_leaves = 36,
    reg_lambda = 50,
)
light_reg.fit(X, y)    # trained on the complete train data
y_pred = light_reg.predict(test_data)


# 3. predict with ensembled XGBMs
#y_pred = getAveragingBlending(X, y, test_data, EPOCHS, XGBM = True)


# 4. predict with ensembled LGBMs
# NOTE(review): unlike option 3, this call trains on X_train and predicts X_valid rather than
# test_data, so its result does not correspond to the test rows -- confirm before enabling
#y_pred = getAveragingBlending(X_train, y_train, X_valid, EPOCHS, LGBM = True)

### 🚩 Save predicted "price" as submission file

In [43]:
# recover original price value range: the models were fitted on log1p("price"),
# and np.expm1 is the inverse of np.log1p
# NOTE: this rebinds y_pred, so running this cell again without re-running the
# prediction cell applies expm1 a second time
y_pred = np.expm1(y_pred)

# save as file (makeSubmissionFile writes submission_new.csv)
makeSubmissionFile(y_pred)

The submission file has created succesfully.


-----
**회고록** :  
XGBM, LGBM 이 다른 학습 모델에 비해 성능이 좋다던데, 몸소 체험할 수 있었다  
또한 같은 모델이라도 parameter 값에 따라서도 충분히 성능을 조정할 수 있음을 체감했다  
동일한 모델에 서로 다른 random seed를 부여해서 ensemble 을 시도하였는데, 대부분 성능이 좋지 않았다
서로 다른 모델들을 묶어서 ensemble 을 해야 성능이 향상되는걸까?