In [1]:
import os

import pandas as pd
import numpy as np

import missingno as msno

from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import GridSearchCV


from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb


import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import PolynomialFeatures
from utils import get_data

In [2]:
train_data_path = os.path.join('data', 'train.csv')
sub_data_path = os.path.join('data', 'test.csv')

In [3]:
df_train = pd.read_csv(train_data_path)
df_test = pd.read_csv(sub_data_path)
print('train data dim : {}'.format(df_train.shape))
print('sub data dim : {}'.format(df_test.shape))

train data dim : (15035, 21)
sub data dim : (6468, 20)


In [4]:
test_id = df_test.id

In [5]:
X, y = get_data(df_train)

In [6]:
poly = PolynomialFeatures()
X = poly.fit_transform(X)

In [7]:
X_t = get_data(df_test,is_train_data=False)
X_t = poly.transform(X_t)

In [8]:
gboost = GradientBoostingRegressor(random_state=2019, max_depth=4, n_estimators=500)
xgboost = xgb.XGBRegressor(random_state=2019, max_depth=5, n_estimators=500)
lightgbm = lgb.LGBMRegressor(random_state=2019, max_depth=4, n_estimators=500)

models = [{'model':gboost, 'name':'GradientBoosting'}, {'model':xgboost, 'name':'XGBoost'},
          {'model':lightgbm, 'name':'LightGBM'}]

### Cross Validation
교차 검증을 통해 모델의 성능을 간단히 평가하겠습니다.

In [9]:
def get_cv_score(models):
    for m in models:
        print("Model {} CV score : {:.4f}".format(m['name'], np.mean(cross_val_score(m['model'], X, y)), 
                                             cv=5))

In [10]:
get_cv_score(models)



Model GradientBoosting CV score : 0.8729




Model XGBoost CV score : 0.8726




Model LightGBM CV score : 0.8776


## Grid Search

param_grid = {'max_depth':[4,5,6] , 'n_estimators':[250,300]}
grid = GridSearchCV(cv=5, estimator = xgboost, param_grid = param_grid, scoring='r2')
grid.fit(X,y)

grid.best_score_ , grid.best_params_

### Make Submission

회귀 모델의 경우에는 cross_val_score 함수가 R<sup>2</sup>를 반환합니다.<br>
R<sup>2</sup> 값이 1에 가까울수록 모델이 데이터를 잘 표현함을 나타냅니다. 3개 트리 모델이 상당히 훈련 데이터에 대해 괜찮은 성능을 보여주고 있습니다.<br> 훈련 데이터셋으로 3개 모델을 학습시키고, Average Blending을 통해 제출 결과를 만들겠습니다.

In [11]:
def AveragingBlending(models, x, y, sub_x):
    for m in models : 
        m['model'].fit(x, y)
    
    predictions = np.column_stack([
        m['model'].predict(sub_x) for m in models
    ])
    return np.mean(predictions, axis=1)

In [12]:
y_pred = AveragingBlending(models, X, y, X_t)

In [13]:
sub = pd.DataFrame(data={'id':test_id,'price':y_pred})

In [14]:
sub.to_csv('submission23.csv', index=False)