In [436]:
import matplotlib.pyplot as plt
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import warnings
warnings.filterwarnings("ignore")

import os
from os.path import join

import pandas as pd
import numpy as np

import missingno as msno

from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
import xgboost as xgb
import lightgbm as lgb

import matplotlib.pyplot as plt
import seaborn as sns


# Resolve the dataset directory underneath the current user's home directory.
home_dir = os.getenv('HOME')
data_dir = home_dir + '/Desktop/Changhee/changhee_git/Kaggle_practice/data'

# Build the CSV paths for both splits, then load each into a DataFrame.
train_data_path, test_data_path = (
    join(data_dir, file_name) for file_name in ('train.csv', 'test.csv')
)
train, test = pd.read_csv(train_data_path), pd.read_csv(test_data_path)

In [437]:
# Keep only the first six characters of the date string and store them as an
# integer (presumably YYYYMM — confirm against the raw data).
train['date'] = train['date'].str[:6].astype(int)

# Separate the target from the features and model it on the log1p scale.
y = np.log1p(train.pop('price'))

# The row identifier carries no predictive signal, so it is dropped.
train = train.drop(columns=['id'])

# Columns that get the same log1p transform in both the train and test frames.
skew_columns = ['bedrooms', 'sqft_living', 'sqft_lot', 'sqft_above', 'sqft_basement']

for frame in (train, test):
    for col in skew_columns:
        frame[col] = np.log1p(frame[col].values)


In [438]:
# Apply the same date truncation and identifier removal that the training
# frame received, so both frames expose identical feature columns.
test['date'] = test['date'].str[:6].astype(int)
test = test.drop(columns=['id'])

In [439]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

def rmse(y_test, y_pred):
    """Return the RMSE of log1p-scaled values, measured on the original scale.

    Both arguments are mapped back with ``np.expm1`` before the error is
    computed, so targets and predictions must both be on the log1p scale.
    """
    actual = np.expm1(y_test)
    predicted = np.expm1(y_pred)
    squared_error = mean_squared_error(actual, predicted)
    return np.sqrt(squared_error)

from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor

# Single seed shared by every baseline model below.
random_state = 2020

# Baseline regressors; only the seed is set, every other hyperparameter is
# left at the library default.
rdforest = RandomForestRegressor(random_state=random_state)
lightgbm = LGBMRegressor(random_state=random_state)
xgboost = XGBRegressor(random_state=random_state)
gboost = GradientBoostingRegressor(random_state=random_state)


### 모델 별 RMSE로 성능 평가

In [1]:
# Baseline regressors that get_scores compares on a hold-out split.
models = [
    gboost,
    xgboost,
    lightgbm,
    rdforest,
]


def get_scores(models, train, y):
    """Fit each model on a shared hold-out split and rank the models by RMSE.

    Parameters
    ----------
    models : iterable of estimators
        Objects exposing ``fit(X, y)`` and ``predict(X)``.
    train : feature table accepted by ``train_test_split``
    y : target values aligned with ``train`` (scored through ``rmse``)

    Returns
    -------
    pandas.DataFrame
        One row per model class name with a single ``RMSE`` column, sorted
        from the highest RMSE to the lowest. Empty when ``models`` is empty.

    Notes
    -----
    Results are keyed by class name, so two models of the same class share
    one row and the later one overwrites the earlier one.
    """
    # The split depends only on the data and the module-level seed, so it is
    # computed once and every model is scored on exactly the same hold-out rows.
    X_train, X_test, y_train, y_test = train_test_split(
        train, y, random_state=random_state, test_size=0.2
    )

    scores = {}
    for model in models:
        model.fit(X_train, y_train)
        scores[model.__class__.__name__] = rmse(y_test, model.predict(X_test))

    # Build the table once, after the loop; this also yields a well-formed
    # empty frame when no models were supplied.
    score_df = pd.Series(scores, name='RMSE', dtype=float).to_frame()
    return score_df.sort_values('RMSE', ascending=False)

# Show the hold-out RMSE table for the baseline models.
get_scores(models, train, y)

NameError: name 'gboost' is not defined

### Hyper parameter 조정 : Random search로

In [None]:
from sklearn.model_selection import RandomizedSearchCV

def my_GridSearch(model, train, y, param_grid, verbose=2, n_jobs=5, n_inter=10, random_state=None):
    """Run a randomized hyperparameter search and return all candidates ranked by error.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``RandomizedSearchCV``.
    train, y : training features and targets
    param_grid : dict
        Mapping of parameter name to candidate values / distribution, passed
        as ``param_distributions``.
    verbose, n_jobs : forwarded to ``RandomizedSearchCV``.
    n_inter : int
        Number of parameter settings to sample; forwarded as ``n_iter``.
    random_state : int or None
        Seed for the candidate sampling; ``None`` keeps it unseeded.

    Returns
    -------
    pandas.DataFrame
        One row per sampled candidate: its parameters, ``score`` (mean
        negated MSE over the 5 folds) and ``RMSLE`` (square root of the
        un-negated score), sorted by ``RMSLE`` ascending. The value is an
        RMSLE only when ``y`` is log1p-scaled, as it is for the callers in
        this notebook.
    """
    # Configure the randomized search; n_inter is forwarded so the caller
    # actually controls how many candidates get evaluated.
    random_model = RandomizedSearchCV(
        model,
        param_distributions=param_grid,
        n_iter=n_inter,
        scoring='neg_mean_squared_error',
        cv=5,
        verbose=verbose,
        n_jobs=n_jobs,
        random_state=random_state,
    )

    # Fit every sampled candidate with cross-validation.
    random_model.fit(train, y)

    # Collect the sampled parameter sets and their mean CV scores.
    results = pd.DataFrame(random_model.cv_results_['params'])
    results['score'] = random_model.cv_results_['mean_test_score']

    # The scorer returns negated MSE, so flip the sign before the square root.
    results['RMSLE'] = np.sqrt(-1 * results['score'])
    return results.sort_values('RMSLE')

In [None]:
# Randomized hyperparameter search for LGBMRegressor.
model = LGBMRegressor(random_state=random_state)

# Seed NumPy so the candidate pools drawn below are the same on every run.
np.random.seed(2020)

# Candidate pools. The keyword order is significant: each draw advances the
# seeded generator, so reordering the entries would change every pool.
# NOTE(review): LightGBM appears to apply `subsample` only when
# `subsample_freq` > 0, which is not set here — confirm this pool has an effect.
param_LGBMR = dict(
    num_leaves=np.random.randint(20, 300, size=30),
    max_depth=np.random.randint(20, 60, size=30),
    colsample_bytree=np.random.randint(2, 9, size=10) * 0.1,
    subsample=np.random.randint(2, 9, size=10) * 0.1,
)

my_GridSearch(model, train, y, param_LGBMR)

#### 하이퍼 파라미터에 따른 모델 초기화

In [None]:
# LightGBM rebuilt with hyperparameters taken from the randomized search.
lightgbm = LGBMRegressor(
    subsample=0.3,
    num_leaves=99,
    max_depth=47,
    colsample_bytree=0.8,
)

In [None]:
# Randomized hyperparameter search for XGBRegressor.
model = XGBRegressor(random_state=random_state)

# Seed NumPy so the candidate pools drawn below are the same on every run.
np.random.seed(2020)

# Candidate pools. Entry order is significant: each draw advances the seeded
# generator, so the draws are kept exactly as they were.
param_XGBR = {
    'learning_rate': np.random.randint(10, 90, 40) * 0.01,  # XGBoost's `eta`
    'max_depth': np.random.randint(1, 20, 20),
    'min_child_weight': np.random.randint(1, 20, 20),
    'colsample_bytree': np.random.randint(0, 10, 50) * 0.1,
    'n_estimators': np.random.randint(400, 600, 20),
}

# XGBoost documents colsample_bytree as valid on (0, 1]; the draw above can
# produce 0.0, so those entries are removed after sampling. Filtering (rather
# than changing the randint bounds) leaves the seeded draw sequence untouched.
colsample_pool = param_XGBR['colsample_bytree']
param_XGBR['colsample_bytree'] = colsample_pool[colsample_pool > 0]

my_GridSearch(model, train, y, param_XGBR)

#### 하이퍼 파라미터에 따른 모델 초기화

In [440]:
# XGBoost rebuilt with hyperparameters taken from the randomized search.
xgboost = xgb.XGBRegressor(
    n_estimators=489,
    min_child_weight=1,
    max_depth=9,
    learning_rate=0.13,
    colsample_bytree=0.7,
)

In [427]:
#xgboost.fit(train,y)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.13, max_delta_step=0, max_depth=9,
             min_child_weight=1, missing=nan, monotone_constraints='()',
             n_estimators=489, n_jobs=0, num_parallel_tree=1, random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [429]:
#prediction = xgboost.predict(test)
#prediction=np.expm1(prediction)
#prediction

array([ 510530.47,  438959.03, 1344426.5 , ...,  501507.47,  340366.62,
        424781.06], dtype=float32)

In [441]:
def AveragingBlending(models, x, y, sub_x):
    """Fit every model on (x, y) and average their predictions for sub_x.

    Parameters
    ----------
    models : list of dict
        Each entry provides the estimator under the ``'model'`` key.
    x, sub_x : objects exposing ``.values``
        Training and prediction features; the underlying arrays are used.
    y : training targets on the log1p scale (predictions are mapped back
        with ``np.expm1`` before averaging).

    Returns
    -------
    numpy.ndarray
        Row-wise mean of the back-transformed predictions, one value per
        row of ``sub_x``.
    """
    estimators = [entry['model'] for entry in models]

    # Train all estimators first, then collect one prediction column per estimator.
    for estimator in estimators:
        estimator.fit(x.values, y)

    stacked = np.column_stack(
        [estimator.predict(sub_x.values) for estimator in estimators]
    )

    # Undo the log1p target transform, then average across estimators.
    return np.expm1(stacked).mean(axis=1)

In [442]:
# NOTE(review): every entry below wraps the same `xgboost` object, so the
# labels do not correspond to different algorithms and the averaged output
# equals a single XGBoost prediction — confirm this is the intended blend.
blend_labels = ['GradientBoosting', 'XGBoost', 'LightGBM', 'RandomeForest', 'XGBoost']
models = [{'model': xgboost, 'name': label} for label in blend_labels]

# Fit on the full training set and predict prices for the test set.
y_pred = AveragingBlending(models, train, y, test)
print(len(y_pred))
y_pred

6468


array([ 510530.44,  438959.06, 1344426.5 , ...,  501507.44,  340366.62,
        424781.06], dtype=float32)

In [443]:
# Load the sample submission; its `id` column is reused for the final result.
submission_path = join(data_dir, 'sample_submission.csv')
submission = pd.read_csv(submission_path)

# Preview the first rows.
submission.head()

Unnamed: 0,id,price
0,15035,100000
1,15036,100000
2,15037,100000
3,15038,100000
4,15039,100000


In [444]:
# Pair each submission id with its predicted price.
result = submission[['id']].assign(price=y_pred)

result.head()

Unnamed: 0,id,price
0,15035,510530.4
1,15036,438959.1
2,15037,1344426.0
3,15038,294490.2
4,15039,330819.6


In [445]:
# Write the predictions next to the input data, without the index column.
my_submission_path = join(data_dir, 'submission_hyper_XGB5_2.csv')
result.to_csv(my_submission_path, index=False)

# Echo the output location for reference.
print(my_submission_path)

/home/aiffel/Desktop/Changhee/changhee_git/Kaggle_practice/data/submission_hyper_XGB5_2.csv


# 실험 과정

#### 데이터
 1. Train Data 중 분포가 한쪽으로 치우친 컬럼에 log(x+1) 변환 적용
 2. Price에도 log(x+1) 변환 적용
 
#### 모델
 3. 성능이 가장 좋은 XGBoost 선택 (Model Score 평가)
 4. Random search로 하이퍼파라미터를 탐색해서 적용
 5. Blending (XGBoost 5개로)
 
#### 학습 결과
 6. Kaggle에 제출한 Score가 110900~110930 사이로 나왔다.
 
 ![img](img1.png)