In [13]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import Lasso, Ridge
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold, cross_val_score
from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.ensemble import GradientBoostingRegressor
import xgboost as xgb
import lightgbm as lgb

In [15]:
import warnings

warnings.filterwarnings('ignore')

In [16]:
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")

In [17]:

print(train.shape, test.shape)

(1460, 81) (1459, 80)


In [18]:
train_ID = train['Id']
test_ID = test['Id']
train.drop("Id", axis=1, inplace=True)
test.drop("Id", axis=1, inplace=True)

In [19]:
# Log-transform the target
train["SalePrice"] = np.log1p(train["SalePrice"])

In [20]:
# Combine datasets
y = train.SalePrice
all_data = pd.concat([train.drop("SalePrice", axis=1), test], axis=0)
all_data.shape

(2919, 79)

In [21]:
# Fill missing values smartly
for col in ("PoolQC", "MiscFeature", "Alley", "Fence", "FireplaceQu", "GarageType", 
            "GarageFinish", "GarageQual", "GarageCond", "BsmtQual", "BsmtCond", 
            "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "MasVnrType"):
    all_data[col] = all_data[col].fillna("None")

for col in ("GarageYrBlt", "GarageArea", "GarageCars", "BsmtFinSF1", "BsmtFinSF2", 
            "BsmtUnfSF", "TotalBsmtSF", "BsmtFullBath", "BsmtHalfBath", "MasVnrArea"):
    all_data[col] = all_data[col].fillna(0)

# LotFrontage by neighborhood median
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median())
)

# Mode for MSZoning, Functional, etc.
for col in ["MSZoning", "Electrical", "KitchenQual", "Exterior1st", "Exterior2nd", 
            "SaleType", "Functional"]:
    all_data[col] = all_data[col].fillna(all_data[col].mode()[0])

# Drop this non-useful column
all_data.drop(['Utilities'], axis=1, inplace=True)

# Convert categorical
all_data = pd.get_dummies(all_data)
print(all_data.shape)


(2919, 300)


In [22]:
# Split back
ntrain = train.shape[0]
X = all_data[:ntrain]
X_test = all_data[ntrain:]

In [23]:
# Scale
scaler = RobustScaler()
X = scaler.fit_transform(X)
X_test = scaler.transform(X_test)

In [24]:
# Define Models
lasso = make_pipeline(RobustScaler(), Lasso(alpha=0.0005, random_state=1))
ridge = make_pipeline(RobustScaler(), Ridge(alpha=10))
gboost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10,
                                   loss='huber', random_state=5)
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, random_state=7, nthread=-1)

model_lgb = lgb.LGBMRegressor(objective='regression', num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin=55, bagging_fraction=0.8,
                              bagging_freq=5, feature_fraction=0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf=6, min_sum_hessian_in_leaf=11)


In [25]:
class AveragingModels(BaseEstimator, RegressorMixin):
    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        for model in self.models_:
            model.fit(X, y)
        return self

    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)

averaged_models = AveragingModels(models=(lasso, ridge, gboost, model_xgb, model_lgb))
averaged_models.fit(X, y)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.006674 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1509
[LightGBM] [Info] Number of data points in the train set: 1460, number of used features: 248
[LightGBM] [Info] Start training from score 12.024057


In [26]:
def rmsle_cv(model):
    kf = KFold(5, shuffle=True, random_state=42)
    rmse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error", cv=kf))
    return rmse

score = rmsle_cv(averaged_models)
print("Stacked model RMSLE: {:.4f} ({:.4f})".format(score.mean(), score.std()))

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002761 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1466
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 240
[LightGBM] [Info] Start training from score 12.030658
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002154 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1458
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 237
[LightGBM] [Info] Start training from score 12.016898
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002160 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1468
[LightGBM] [Info] Number of data points in the train set: 1168, number of used features: 240
[LightGBM] [Info] Start t

In [27]:
X_test = pd.DataFrame(X_test, columns=all_data.columns[:X_test.shape[1]])  # Align just in case
X_test = X_test.reindex(columns=all_data.columns[:X.shape[1]], fill_value=0)

stacked_pred = np.expm1(averaged_models.predict(X_test))

submission = pd.DataFrame({"Id": test_ID, "SalePrice": stacked_pred})
submission.to_csv("submission.csv", index=False)
submission.head()



Unnamed: 0,Id,SalePrice
0,1461,119175.639739
1,1462,153085.241266
2,1463,179998.305201
3,1464,195236.846112
4,1465,192178.621193
