In [62]:
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', None)
import missingno as msno
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats

from sklearn.linear_model import Ridge, RidgeCV, ElasticNet, ElasticNetCV, LassoCV, LassoLarsCV
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler, LabelEncoder
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
import xgboost as xgb
import lightgbm as lgb

import warnings
warnings.filterwarnings("ignore")

# Pristine copies of the raw inputs; nothing below modifies these two frames.
ori_train = pd.read_csv('./Input/train.csv')
ori_test = pd.read_csv('./Input/test.csv')

# Working frames that the feature engineering below mutates.
df_train = pd.read_csv('./Input/train.csv')
df_test = pd.read_csv('./Input/test.csv')

# Outlier removal: drop the training rows the author singled out by id
# (4123 annotated "grade 3", 2775 annotated "grade 11").
# Candidates that were considered but are currently kept in the data:
#   id 2302 (annotated "grade 3"), id 8912 (annotated "sqft_living > 13000").
outlier_ids = [4123, 2775]
df_train = df_train.loc[~df_train['id'].isin(outlier_ids)]

# The id column is dropped from both frames so it is not used as a feature.
df_train = df_train.drop(columns="id")
df_test = df_test.drop(columns="id")

# Dtype change (disabled experiment): cast the discrete columns to object.
#object_feats = ['bedrooms', 'bathrooms', 'floors', 'waterfront', 'view', 'condition', 'grade']
#df_train[object_feats] = df_train[object_feats].astype('object')

def _label_encode_shared(train_frame, test_frame, column):
    """Label-encode ``column`` in both frames with a single shared encoder.

    The encoder is fitted once on the union of the train and test values.
    ``LabelEncoder.fit`` replaces any previously learned classes, so fitting
    on train and then on test would keep only the test labels and make
    ``transform`` raise on any label that occurs in train alone.

    Parameters
    ----------
    train_frame, test_frame : pandas.DataFrame
        Frames that both contain ``column``; the column is overwritten in
        place with its integer codes.
    column : str
        Name of the column to encode.

    Returns
    -------
    LabelEncoder
        The fitted encoder, so the codes can be mapped back later.
    """
    encoder = LabelEncoder()
    encoder.fit(pd.concat([train_frame[column], test_frame[column]], ignore_index=True))
    train_frame[column] = encoder.transform(train_frame[column])
    test_frame[column] = encoder.transform(test_frame[column])
    return encoder


# NOTE(review): the slicing below assumes the raw `date` values are strings that
# start with YYYYMMDD -- confirm against the input files.

# Transaction year: first four characters of the date, then label-encoded.
df_train['year'] = df_train['date'].str[:4].astype(int)
df_test['year'] = df_test['date'].str[:4].astype(int)
le1 = _label_encode_shared(df_train, df_test, 'year')

# Transaction year-month: first six characters of the date, then label-encoded.
df_train['yearmm'] = df_train['date'].str[:6].astype(int)
df_test['yearmm'] = df_test['date'].str[:6].astype(int)
le2 = _label_encode_shared(df_train, df_test, 'yearmm')

# Transaction date: first eight characters kept as a plain integer.
# This overwrites `date`, so it must run after year / yearmm were derived from it.
df_train['date'] = df_train['date'].str[:8].astype(int)
df_test['date'] = df_test['date'].str[:8].astype(int)

# Zipcode: turned into consecutive integer category codes.
le3 = _label_encode_shared(df_train, df_test, 'zipcode')

# Derived features, built the same way for the train and the test frame.
# All of them are computed from the raw columns: the log1p transform of the
# sqft_* columns only happens further down.
for frame in (df_train, df_test):
    # Renovation flag: 1 when yr_renovated is greater than 0, otherwise 0.
    frame['is_renovated'] = (frame['yr_renovated'] > 0).astype('int64')

    # Most recent construction year: the later of the build year and the
    # renovation year (overwrites yr_renovated, hence the flag is taken first).
    frame['yr_renovated'] = np.maximum(frame['yr_built'], frame['yr_renovated'])

    # Total number of rooms: bedrooms plus bathrooms.
    frame['totalrooms'] = frame['bedrooms'] + frame['bathrooms']

    # Lot utilisation: living area divided by lot area.
    frame['living_lot_ratio'] = frame['sqft_living'] / frame['sqft_lot']

    # Living space per floor: above-ground area divided by the floor count.
    frame['sqft_living_floor'] = frame['sqft_above'] / frame['floors']

    # Building footprint relative to the lot: per-floor area divided by lot area.
    frame['sqft_building_ratio'] = frame['sqft_living_floor'] / frame['sqft_lot']

    # Living area versus sqft_living15. Despite the "_ratio" suffix this is a
    # difference (sqft_living - sqft_living15), not a quotient.
    frame['living15_ratio'] = frame['sqft_living'] - frame['sqft_living15']

    # Lot area versus sqft_lot15. Also a difference, not a quotient.
    frame['lot15_ratio'] = frame['sqft_lot'] - frame['sqft_lot15']

# Disabled experiments, kept for reference:
#   sqft_total    = sqft_living + sqft_lot
#   sqft_total15  = sqft_living15 + sqft_lot15
#   total15_ratio = sqft_total / sqft_total15

# Latitude simplification
def category_lat(x):
    """Map a latitude to a coarse band index between 0 and 6.

    The bands are delimited by 47.2, 47.3, 47.4, 47.5, 47.6 and 47.7. The
    result is the position of the first bound that ``x`` is strictly below
    (0 for values under 47.2); a value that is not below any bound gives 6.
    """
    upper_bounds = (47.2, 47.3, 47.4, 47.5, 47.6, 47.7)
    for band, bound in enumerate(upper_bounds):
        if x < bound:
            return band
    return len(upper_bounds)
    
# Add the latitude band to both frames.
for frame in (df_train, df_test):
    frame['lat_cat'] = frame['lat'].apply(category_lat)

# Longitude simplification
def category_long(x):
    """Map a longitude to a coarse band index between 0 and 5.

    The bands are delimited by -122.5, -122.4, -122.3, -122.2 and -122.1. The
    result is the position of the first bound that ``x`` is strictly below
    (0 for values under -122.5); a value that is not below any bound gives 5.
    """
    upper_bounds = (-122.5, -122.4, -122.3, -122.2, -122.1)
    for band, bound in enumerate(upper_bounds):
        if x < bound:
            return band
    return len(upper_bounds)
    
# Add the longitude band to both frames.
for frame in (df_train, df_test):
    frame['long_cat'] = frame['long'].apply(category_long)

# Grade simplification
def category_grade(x):
    """Collapse a grade into one of five tiers numbered 1 to 5.

    Grades below 4 map to 1, below 7 to 2, below 9 to 3, below 11 to 4, and
    everything else to 5.
    """
    upper_bounds = (4, 7, 9, 11)
    for tier, bound in enumerate(upper_bounds, start=1):
        if x < bound:
            return tier
    return len(upper_bounds) + 1
    
# Score features: each one scales a raw column by the simplified grade tier.
for frame in (df_train, df_test):
    grade_tier = frame['grade'].apply(category_grade)
    frame['grade_cat'] = grade_tier

    # Exterior score (cat): (view + 1) times the grade tier.
    frame['out_score_cat'] = (frame['view'] + 1) * grade_tier

    # Interior score (cat): condition times the grade tier.
    frame['in_score_cat'] = frame['condition'] * grade_tier

    # Overall score (cat): exterior score plus interior score.
    frame['total_score_cat'] = frame['out_score_cat'] + frame['in_score_cat']

    # Bedroom score (cat): bedrooms times the grade tier.
    frame['bedrooms_score_cat'] = frame['bedrooms'] * grade_tier

    # Bathroom score (cat): bathrooms times the grade tier.
    frame['bathrooms_score_cat'] = frame['bathrooms'] * grade_tier

    # Room score (cat): total rooms times the grade tier.
    frame['totalrooms_score_cat'] = frame['totalrooms'] * grade_tier

# Disabled experiment, kept for reference:
#   view_condition = view + condition

# Normalisation: log1p-transform the area columns in both frames.
skew_columns = ['sqft_living','sqft_lot', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15']
# Column list of a disabled experiment that also covered the sqft_total* features:
#skew_columns = ['sqft_living','sqft_lot', 'sqft_total', 'sqft_above', 'sqft_basement', 'sqft_living15', 'sqft_lot15', 'sqft_total15', 'living15_ratio', 'lot15_ratio', 'total15_ratio']
for frame in (df_train, df_test):
    for col in skew_columns:
        frame[col] = frame[col].map(np.log1p)

# Price normalisation (np.log1p): the target is modelled on the log scale and
# predictions are mapped back with np.expm1 later on.
df_train['price'] = df_train['price'].map(np.log1p)

# Drop features: these three columns are removed again before modelling.
helper_columns = ['grade_cat', 'bedrooms_score_cat', 'bathrooms_score_cat']
df_train.drop(columns=helper_columns, inplace=True)
df_test.drop(columns=helper_columns, inplace=True)

# Split the data: pull the (log-scale) target out of the training frame.
Y_train = df_train.pop('price')
# Target mapped back to the price scale, used for the price-unit RMSE checks.
Y_check = np.expm1(Y_train)
# The feature matrices are the same frame objects, now without the target.
X_train = df_train
X_test = df_test

In [18]:
# Cross-validation strategy: shuffled K-fold with a fixed seed, so every model
# is scored on the same splits.
n_folds = 5
kf = KFold(n_folds, shuffle=True, random_state=42)

def rmse_cv(model):
    """Return the per-fold cross-validated RMSE of ``model``.

    Scoring runs on the module-level ``X_train`` / ``Y_train`` with the
    module-level splitter ``kf``. ``Y_train`` holds log1p(price), so the
    values are RMSE on the log scale. One value per fold is returned.
    """
    neg_mse = cross_val_score(model, X_train, Y_train, scoring="neg_mean_squared_error", cv=kf)
    # The scorer reports negated MSE; flip the sign before taking the root.
    return np.sqrt(-neg_mse)
    
def rmse(y, y_pred):
    """Return the root-mean-squared error between ``y`` and ``y_pred``."""
    mse = mean_squared_error(y, y_pred)
    return np.sqrt(mse)

class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    """Regressor that averages the predictions of several base models.

    Parameters
    ----------
    models : iterable of estimators
        Prototype estimators. They are cloned in ``fit``, so the objects
        passed in are never fitted themselves.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Clone every prototype, fit each clone on ``(X, y)`` and return ``self``."""
        self.models_ = list(map(clone, self.models))

        for fitted in self.models_:
            fitted.fit(X, y)
        return self

    def predict(self, X):
        """Return the unweighted mean of the fitted clones' predictions for ``X``."""
        per_model = [fitted.predict(X) for fitted in self.models_]
        # One column per model; averaging across columns gives one value per row.
        stacked = np.column_stack(per_model)
        return stacked.mean(axis=1)
    
# Model creation
gboost = GradientBoostingRegressor(
    n_estimators=5000,
    learning_rate=0.05,
    max_depth=4,
    max_features="sqrt",
    min_samples_leaf=15,
    min_samples_split=10,
    loss="huber",
    random_state=64,
)

# NOTE(review): newer XGBoost releases deprecate objective="reg:linear" and
# `silent` (documented replacements: "reg:squarederror" and `verbosity`) --
# confirm against the installed version before changing them.
model_xgb = xgb.XGBRegressor(
    n_estimators=2500,
    learning_rate=0.05,
    max_depth=4,
    objective="reg:linear",
    eval_metric='rmse',
    colsample_bytree=0.8,
    gamma=0.05,
    min_child_weight=1.8,
    reg_alpha=0.5,
    subsample=0.8,
    silent=1,
    random_state=64,
    nthread=-1,
)

# NOTE(review): LightGBM documents min_data_in_leaf and min_child_samples as
# aliases of the same setting, so the values 30 and 20 conflict -- confirm which
# one the installed version actually applies.
model_lgb = lgb.LGBMRegressor(
    n_estimators=3000,
    learning_rate=0.015,
    max_depth=4,
    objective="regression",
    num_leaves=31,
    min_data_in_leaf=30,
    min_child_samples=20,
    boosting="gbdt",
    feature_fraction=0.9,
    bagging_freq=1,
    bagging_fraction=0.9,
    bagging_seed=11,
    metric='rmse',
    lambda_l1=0.1,
    nthread=4,
    random_state=64,
)

# Disabled: equal-weight average of the three models.
#averaged_models = AveragingModels(models = (gboost, model_xgb, model_lgb))

In [58]:
def _report_cv(template, fold_scores):
    """Print the mean and standard deviation of ``fold_scores`` and return them."""
    print(template.format(fold_scores.mean(), fold_scores.std()))
    return fold_scores


# Cross-validated RMSE per model (log-price scale), printed as "mean (std)".
# Each model is scored and reported before the next one starts.
score1 = _report_cv("Gradient Boosting score: {:.4f} ({:.4f})", rmse_cv(gboost))
score2 = _report_cv("XGB score: {:.4f} ({:.4f})", rmse_cv(model_xgb))
score3 = _report_cv("LGBM score: {:.4f} ({:.4f})", rmse_cv(model_lgb))
# Disabled: score of the equal-weight averaging model.
#score4 = rmse_cv(averaged_models)
#print("Averaged base models score: {:.4f} ({:.4f})".format(score4.mean(), score4.std()))

Gradient Boosting score: 0.1603 (0.0032)
XGB score: 0.1591 (0.0028)
LGBM score: 0.1610 (0.0021)


In [63]:
def _fit_and_predict(model):
    """Fit ``model`` on the full training set and predict on the price scale.

    Returns a ``(train_prices, test_prices)`` pair. The model is trained on
    the log-scale target, so both predictions are mapped back with np.expm1.
    """
    model.fit(X_train, Y_train)
    train_prices = np.expm1(model.predict(X_train))
    test_prices = np.expm1(model.predict(X_test))
    return train_prices, test_prices


# The RMSE values printed here are in price units and are measured on the same
# rows each model was fitted on, so they are in-sample errors.
gboost_train_pred, gboost_pred = _fit_and_predict(gboost)
print("RMSE(GradientBoosting): {:.6f}".format(rmse(Y_check, gboost_train_pred)))

xgb_train_pred, xgb_pred = _fit_and_predict(model_xgb)
print("RMSE(XGB):              {:.6f}".format(rmse(Y_check, xgb_train_pred)))

lgb_train_pred, lgb_pred = _fit_and_predict(model_lgb)
print("RMSE(LightGBM):         {:.6f}".format(rmse(Y_check, lgb_train_pred)))

# Disabled: equal-weight averaging model.
#averaged_models.fit(X_train, Y_train)
#avg_train_pred = np.expm1(averaged_models.predict(X_train))
#avg_pred = np.expm1(averaged_models.predict(X_test))
#print("RMSE(AverageModel):     {:.6f}".format(rmse(Y_check, avg_train_pred)))

# Disabled: earlier position of the weighted blend.
#ensemble_train_pred = gboost_train_pred * 0.7 + xgb_train_pred * 0.1 + lgb_train_pred * 0.2
#ensemble_pred = gboost_pred * 0.7 + xgb_pred * 0.1 + lgb_pred * 0.2
#print("RMSE(EnsembleModel):    {:.6f}".format(rmse(Y_check, ensemble_train_pred)))

RMSE(GradientBoosting): 53711.591956
RMSE(XGB):              66158.322772
RMSE(LightGBM):         85059.466246


In [64]:
def _blend(gb_part, xgb_part, lgb_part, weights):
    """Return the weighted sum of the three models' predictions.

    ``weights`` is a (gradient boosting, XGBoost, LightGBM) triple.
    """
    w_gb, w_xgb, w_lgb = weights
    return gb_part * w_gb + xgb_part * w_xgb + lgb_part * w_lgb


# NOTE(review): the training blend uses the same weights as ensemble_pred2
# (0.7 / 0.15 / 0.15), so the RMSE printed below describes that blend and not
# ensemble_pred (0.7 / 0.1 / 0.2) -- confirm this is intended.
ensemble_train_pred = _blend(gboost_train_pred, xgb_train_pred, lgb_train_pred, (0.7, 0.15, 0.15))
ensemble_pred = _blend(gboost_pred, xgb_pred, lgb_pred, (0.7, 0.1, 0.2))
ensemble_pred2 = _blend(gboost_pred, xgb_pred, lgb_pred, (0.7, 0.15, 0.15))
print("RMSE(EnsembleModel):    {:.6f}".format(rmse(Y_check, ensemble_train_pred)))

RMSE(EnsembleModel):    58069.934440


In [65]:
# Distance between each new blend and a previously saved submission. This is an
# RMSE against that file's predicted prices, not against true prices.
best = pd.read_csv('./Output/submission_ensemble_102757.csv')
y_best = best['price']
for template, candidate in (
    ("RMSE(with best_case):  {:.6f}", ensemble_pred),
    ("RMSE2(with best_case): {:.6f}", ensemble_pred2),
):
    print(template.format(rmse(y_best, candidate)))

RMSE(with best_case):  19950.832396
RMSE2(with best_case): 20007.170801


In [66]:
def _write_submission(prices, out_path):
    """Fill the sample submission's price column with ``prices`` and save it.

    Returns the frame that was written to ``out_path``.
    """
    frame = pd.read_csv('./Input/sample_submission.csv')
    frame['price'] = prices
    frame.to_csv(out_path, index=False)
    return frame


# Note the numbering: submission1 carries ensemble_pred2, submission2 carries
# ensemble_pred.
submission1 = _write_submission(ensemble_pred2, './Output/submission_ens.csv')
submission2 = _write_submission(ensemble_pred, './Output/submission_ensemble.csv')