In [13]:
# 관련 라이브러리 import
import warnings
warnings.filterwarnings("ignore")

import os
from os.path import join

import pandas as pd
import numpy as np

import xgboost as xgb
import lightgbm as lgb
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

from sklearn.model_selection import GridSearchCV

In [55]:
# Load the data.
# expanduser always returns a string (on POSIX it falls back to the password
# database when HOME is unset), whereas os.getenv('HOME') + '...' raises a
# TypeError because getenv returns None in that case.
data_dir = join(os.path.expanduser('~'), 'aiffel', 'kaggle_kakr_housing', 'data')

train_data_path = join(data_dir, 'train.csv')
sub_data_path = join(data_dir, 'test.csv')

print(train_data_path)
print(sub_data_path)

# `train` is the training table, `sub` is the table to predict for submission.
train = pd.read_csv(train_data_path)
sub = pd.read_csv(sub_data_path)

print(f'train data dim : {train.shape}')
print(f'sub data dim : {sub.shape}')

/aiffel/aiffel/kaggle_kakr_housing/data/train.csv
/aiffel/aiffel/kaggle_kakr_housing/data/test.csv
train data dim : (15035, 21)
sub data dim : (6468, 20)


In [56]:
# Separate the target column from the training features.
y = train.pop('price')

print(train.columns)

# Preprocessing: keep the first six characters of `date` and store them as int,
# applied identically to the training and submission tables.
for frame in (train, sub):
    frame['date'] = frame['date'].str[:6].astype(int)

Index(['id', 'date', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',
       'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above',
       'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',
       'sqft_living15', 'sqft_lot15'],
      dtype='object')


In [57]:
# Check for missing values across the training and submission rows together.
data = pd.concat([train, sub])

print(len(data))

# Count the nulls of every column in one pass, then report them column by column.
null_counts = data.isnull().sum()
for c, n_missing in null_counts.items():
    print('{} : {}'.format(c, n_missing))

21503
id : 0
date : 0
bedrooms : 0
bathrooms : 0
sqft_living : 0
sqft_lot : 0
floors : 0
waterfront : 0
view : 0
condition : 0
grade : 0
sqft_above : 0
sqft_basement : 0
yr_built : 0
yr_renovated : 0
zipcode : 0
lat : 0
long : 0
sqft_living15 : 0
sqft_lot15 : 0


In [58]:
# Seed shared by the split and by every model in this notebook.
random_state = 250113

# Hold out 10% of the training rows.
# NOTE(review): this cell ran as In [58], before `y = np.log1p(y)` (In [63]),
# so `y_train` / `y_test` here presumably hold raw prices; none of the visible
# later cells read these four variables — confirm whether the split is needed.
X_train, X_test, y_train, y_test = train_test_split(
    train,
    y,
    test_size=0.1,
    random_state=random_state,
)


In [59]:
# Create the candidate models, each seeded with the shared random_state.
models = [
    GradientBoostingRegressor(random_state=random_state),
    xgb.XGBRegressor(random_state=random_state),
    lgb.LGBMRegressor(random_state=random_state),
    RandomForestRegressor(random_state=random_state),
]

# Individual handles to the same estimator objects held in `models`.
gboost, xgboost, lightgbm, rdforest = models


In [60]:
def rmse(y_test, y_pred):
    """Return the root mean squared error after mapping both inputs through expm1.

    Both `y_test` and `y_pred` are passed through ``np.expm1`` before the
    mean squared error is computed, so they are expected on a log1p scale.
    """
    actual = np.expm1(y_test)
    predicted = np.expm1(y_pred)
    return np.sqrt(mean_squared_error(actual, predicted))


In [61]:
# Helper for hyper-parameter optimisation.
def my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5, cv=5):
    """Run a cross-validated grid search and return one row per parameter combination.

    Parameters
    ----------
    model : estimator handed to ``GridSearchCV``.
    train : feature table used for fitting.
    y : target values used for fitting.
    param_grid : dict mapping parameter names to the lists of values to try.
    verbose : verbosity level forwarded to ``GridSearchCV``.
    n_jobs : number of parallel jobs forwarded to ``GridSearchCV``.
    cv : cross-validation setting forwarded to ``GridSearchCV``; defaults to 5,
        the value that was previously hard-coded.

    Returns
    -------
    pandas.DataFrame
        The parameter combinations with two extra columns: ``score`` (mean
        test score, i.e. negative MSE) and ``RMSLE`` (square root of the MSE,
        on whatever scale `y` is given in), sorted by ``RMSLE`` ascending.
    """
    grid_model = GridSearchCV(
        model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=cv,
        verbose=verbose,
        n_jobs=n_jobs,
    )

    # Fit every candidate on every fold.
    grid_model.fit(train, y)

    cv_results = grid_model.cv_results_
    results = pd.DataFrame(cv_results['params'])
    results['score'] = cv_results['mean_test_score']
    # The scorer reports negative MSE, so flip the sign before taking the root.
    results['RMSLE'] = np.sqrt(-1 * results['score'])

    return results.sort_values('RMSLE')

In [75]:
# Hyper-parameter values to explore in the grid search.
param_grid = dict(
    num_leaves=[31, 63],
    max_depth=[10, 30, 50],
    n_estimators=[100, 300, 500],
)

In [63]:
# Feature selection (currently disabled, so every column of `train`,
# including `id`, is passed to the models — NOTE(review): confirm this is intended).
#features = ['date', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above', 'sqft_basement', 'yr_built', 'yr_renovated', 'sqft_living15', 'sqft_lot15']

#train = train[features]
# Log-transform the target with log1p.
# NOTE(review): `y` is reassigned from itself, so running this cell a second
# time would apply log1p twice; re-run from the data-loading cell before
# executing it again.
y = np.log1p(y)

In [48]:
def get_scores(models, train, y):
    """Fit each model on one shared train/validation split and tabulate its RMSE.

    Parameters
    ----------
    models : iterable of estimators exposing ``fit`` and ``predict``.
    train : feature table handed to ``train_test_split``.
    y : target handed to ``train_test_split``; the held-out part is scored
        with ``rmse``, which applies ``np.expm1`` to targets and predictions.

    Returns
    -------
    pandas.DataFrame
        Indexed by model class name with a single ``RMSE`` column, sorted from
        the highest to the lowest value. Empty when `models` is empty.
    """
    # The split depends only on the fixed random_state, so it is computed once
    # rather than being recomputed identically for every model.
    X_train, X_test, y_train, y_test = train_test_split(
        train, y, test_size=0.2, random_state=random_state
    )

    scores = {}
    for model in models:
        # Train on the 80% part, then score the predictions for the held-out 20%.
        model.fit(X_train, y_train)
        scores[model.__class__.__name__] = rmse(y_test, model.predict(X_test))

    # Build the table after the loop: an empty `models` now yields an empty
    # frame instead of failing on a result variable that was never assigned.
    score_df = pd.DataFrame({'RMSE': pd.Series(scores, dtype=float)})
    return score_df.sort_values('RMSE', ascending=False)

get_scores(models, train, y)

Unnamed: 0,RMSE
XGBRegressor,134272.681536
RandomForestRegressor,132491.749357
GradientBoostingRegressor,130132.470157
LGBMRegressor,121313.856049


[CV] END .......................max_depth=1, n_estimators=50; total time=   2.9s
[CV] END ......................max_depth=1, n_estimators=100; total time=   8.5s
[CV] END ......................max_depth=10, n_estimators=50; total time=  40.9s
[CV] END .....................max_depth=10, n_estimators=100; total time= 1.2min
[CV] END .......................max_depth=1, n_estimators=50; total time=   4.3s
[CV] END ......................max_depth=1, n_estimators=100; total time=   8.6s
[CV] END ......................max_depth=10, n_estimators=50; total time=  42.1s
[CV] END .....................max_depth=10, n_estimators=100; total time= 1.2min
[CV] END .......................max_depth=1, n_estimators=50; total time=   7.3s
[CV] END ......................max_depth=1, n_estimators=100; total time=  10.4s
[CV] END ......................max_depth=10, n_estimators=50; total time=  39.7s
[CV] END .....................max_depth=10, n_estimators=100; total time= 1.3min


In [76]:
# Search the hyper-parameter grid for the selected estimator (LightGBM).
# To tune XGBoost instead, use: model = xgb.XGBRegressor(random_state=random_state)
model = lgb.LGBMRegressor(random_state=random_state)
my_GridSearch(
    model=model,
    train=train,
    y=y,
    param_grid=param_grid,
    verbose=2,
    n_jobs=5,
)

Fitting 5 folds for each of 18 candidates, totalling 90 fits


Unnamed: 0,max_depth,n_estimators,num_leaves,score,RMSLE
8,30,300,31,-0.026495,0.162771
14,50,300,31,-0.026495,0.162771
2,10,300,31,-0.026591,0.163068
16,50,500,31,-0.026758,0.163579
10,30,500,31,-0.026758,0.163579
3,10,300,63,-0.026802,0.163713
4,10,500,31,-0.026827,0.163788
1,10,100,63,-0.02687,0.163922
7,30,100,63,-0.027154,0.164784
13,50,100,63,-0.027154,0.164784


In [77]:
# Refit on the full training table with the top-ranked combination from the grid search.
best_params = {'max_depth': 30, 'n_estimators': 300, 'num_leaves': 31}
model = lgb.LGBMRegressor(random_state=random_state, **best_params)
#model = xgb.XGBRegressor(random_state=random_state)

model.fit(train, y)

LGBMRegressor(max_depth=30, n_estimators=300, random_state=250113)

In [78]:
# Prepare the submission predictions.
#sub = sub[features]

# The model was fit on the log1p-transformed target, so expm1 maps its output back.
prediction = np.expm1(model.predict(sub))
prediction

array([ 503177.58369034,  485859.8823204 , 1410212.58179806, ...,
        458023.38064559,  333485.73359231,  441249.29932214])

In [79]:
# Locate the data directory. expanduser always returns a string, whereas
# os.getenv('HOME') + '...' raises a TypeError when HOME is unset.
data_dir = join(os.path.expanduser('~'), 'aiffel', 'kaggle_kakr_housing', 'data')

# Start from the sample submission and overwrite its price column with the
# predictions. The former mid-cell `submission.head()` calls were dropped:
# only a cell's last expression is displayed, so they had no effect.
submission_path = join(data_dir, 'sample_submission.csv')
submission = pd.read_csv(submission_path)
submission['price'] = prediction

# The model tag and the cross-validation score are embedded in the file name.
submission_csv_path = join(data_dir, 'submission_{}_RMSLE_{}.csv'.format('lgbm', '0.162771'))
submission.to_csv(submission_csv_path, index=False)
print(submission_csv_path)

/aiffel/aiffel/kaggle_kakr_housing/data/submission_lgbm_RMSLE_0.162771.csv


In [1]:
# Record the leaderboard result (plain string: there is nothing to interpolate).
print('Kaggle Score: 111858.87392 (Private), 108813.35707 (Public) ')

Kaggle Score: 111858.87392 (Private), 108813.35707 (Public) 
