In [95]:
import numpy as np
import pandas as pd
import missingno as msno

import matplotlib.pyplot as plt
import seaborn as sns

from keras import models
from keras import layers

from sklearn.linear_model import ElasticNet
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone

# Origin
ori_train = pd.read_csv('./Input/train.csv')
ori_test = pd.read_csv('./Input/test.csv')

# Data Load
df_train = pd.read_csv('./Input/train.csv')
df_test = pd.read_csv('./Input/test.csv')

# grade, sqft_living, lat, totalrooms, floors, view, waterfront, condition, yr_built_mod, living_ratio_norm
# ID 처리
df_train.drop('id', axis=1, inplace=True)
df_test.drop('id', axis=1, inplace=True)

# 가격 norm
df_train['price'] = df_train['price'].map(lambda x: np.log1p(x))

# 날짜 변경 (년.월.일)
df_train['date'] = df_train['date'].apply(lambda x: str(x[2:8])).astype(int)
df_test['date'] = df_test['date'].apply(lambda x: str(x[2:8])).astype(int)

# grade
df_train = df_train.loc[df_train['grade'] != 3]

# sqft_living norm
df_train = df_train.loc[df_train['sqft_living'] < 13000]
df_train['sqft_living'] = df_train['sqft_living'].map(lambda x: np.log1p(x))
df_test['sqft_living'] = df_test['sqft_living'].map(lambda x: np.log1p(x))

# sqft_lot norm
df_train['sqft_lot'] = df_train['sqft_lot'].map(lambda x: np.log1p(x))
df_test['sqft_lot'] = df_test['sqft_lot'].map(lambda x: np.log1p(x))

# sqft_above
df_train['sqft_above'] = df_train['sqft_above'].map(lambda x: np.log1p(x))
df_test['sqft_above'] = df_test['sqft_above'].map(lambda x: np.log1p(x))

# sqft_basement
df_train['sqft_basement'] = df_train['sqft_basement'].map(lambda x: np.log1p(x))
df_test['sqft_basement'] = df_test['sqft_basement'].map(lambda x: np.log1p(x))

# bedrooms
df_train['bedrooms'] = df_train['bedrooms'].map(lambda x: np.log1p(x))
df_test['bedrooms'] = df_test['bedrooms'].map(lambda x: np.log1p(x))

# Total rooms
df_train['totalrooms'] = df_train['bedrooms'] + df_train['bathrooms']
df_test['totalrooms'] = df_test['bedrooms'] + df_test['bathrooms']

# yr_built_mod
df_train['yr_built'] = df_train['yr_built'] - 1900
df_test['yr_built'] = df_test['yr_built'] - 1900

# yr_renovated_bool
df_train['yr_renovated'] = df_train['yr_renovated'].map(lambda x: 1 if x > 0 else 0)
df_test['yr_renovated'] = df_test['yr_renovated'].map(lambda x: 1 if x > 0 else 0)

## 부지 당 주거 공간 활용 비율
#data['living_ratio'] = (data['sqft_living']/data['sqft_lot'])/data['floors']
#data['living_ratio_norm'] = data['living_ratio'].map(lambda x: np.log1p(x) if x > 0 else 0)

# One-hot encoding
df_train = pd.get_dummies(df_train, columns=['waterfront'], prefix='waterfront')
df_test = pd.get_dummies(df_test, columns=['waterfront'], prefix='waterfront')

# Drop features
df_train.drop(['zipcode', 'long', 'sqft_living15', 'sqft_lot15'], axis=1, inplace=True)
df_test.drop(['zipcode', 'long', 'sqft_living15', 'sqft_lot15'], axis=1, inplace=True)

# 값 나누기
Y_train = df_train['price']
df_train.drop('price', axis=1, inplace=True)
X_train = df_train
X_test = df_test

In [96]:
# Cross validation strategy
n_folds = 4

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=9).get_n_splits(X_train.values)
    rmse = np.sqrt(-cross_val_score(model, X_train.values, Y_train, scoring="neg_mean_squared_error", cv=kf))
    return rmse
    
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        for model in self.models_:
            model.fit(X, y)
        return self
    
    def predict(self, X):
        predictions = np.column_stack([model.predict(X) for model in self.models_])
        return np.mean(predictions, axis=1)

In [98]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=0.9, random_state=9))
gboost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05, max_depth=3, max_features="sqrt", min_samples_leaf=15, min_samples_split=10, loss="huber", random_state=9)
model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=5, learning_rate=0.05, n_estimators=720, random_state=9)
model_xgb = xbg.XGBRegressor(random_state=2019)
averaged_models = AveragingModels(models = (ENet, gboost, model_lgb))

In [99]:
score2 = rmsle_cv(ENet)
print("ENet score: {:.4f} ({:.4f})\n".format(score2.mean(), score2.std()))
score3 = rmsle_cv(gboost)
print("Gradient Boosting score: {:.4f} ({:.4f})\n".format(score3.mean(), score3.std()))
score4 = rmsle_cv(model_lgb)
print("LGBM score: {:.4f} ({:.4f})\n".format(score4.mean(), score4.std()))
score = rmsle_cv(averaged_models)
print("Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

ENet score: 0.2545 (0.0030)

Gradient Boosting score: 0.1872 (0.0044)

LGBM score: 0.1901 (0.0039)

Averaged base models score: 0.1954 (0.0037)



In [100]:
averaged_models.fit(X_train.values, Y_train)
avg_train_pred = averaged_models.predict(X_train.values)
avg_pred = np.expm1(averaged_models.predict(X_test.values))
print(rmsle(Y_train, avg_train_pred))

0.18210413897562874


In [94]:
submission = pd.read_csv('./Input/sample_submission.csv')
submission['price'] = avg_pred
submission.to_csv('./Output/submission_6.csv', index=False)