In [1]:
import pandas as pd
import numpy as np

from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from sklearn.cross_validation import cross_val_score
from sklearn.preprocessing import Imputer

from scipy.stats import skew

import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

plt.style.use('ggplot')

data = pd.read_csv("feature_fillna_onehot.csv")



In [2]:
# Adding total sqfootage feature 
data['TotalSF'] = data['TotalBsmtSF'] + data['1stFlrSF'] + data['2ndFlrSF']

In [3]:
# The feature names
data.columns

Index([u'Id', u'SalePrice', u'KitchenAbvGr', u'LotArea', u'LotFrontage',
       u'LowQualFinSF', u'MasVnrArea', u'MiscVal', u'EnclosedPorch',
       u'ExterCond',
       ...
       u'SaleType_ConLw', u'SaleType_New', u'SaleType_Oth', u'SaleType_WD',
       u'Street_Grvl', u'Street_Pave', u'Utilities_AllPub',
       u'Utilities_NoSeWa', u'source', u'TotalSF'],
      dtype='object', length=270)

In [4]:
# Numerical features
feature_num = ["LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", 
               "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "GrLivArea", "TotRmsAbvGrd", 
               "GarageArea", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", "ScreenPorch",
              "PoolArea", "MiscVal", "TotalSF"]

In [5]:
from scipy.special import boxcox1p
from scipy.stats import skew

In [6]:
# Box Cox Transformation of (highly) skewed features
lam = 0.15
for feature in feature_num:
    if skew(data[feature]) > 0.75:
        data[feature] = boxcox1p(data[feature], lam)
        print feature

LotFrontage
LotArea
MasVnrArea
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
TotalBsmtSF
1stFlrSF
2ndFlrSF
LowQualFinSF
GrLivArea
TotRmsAbvGrd
WoodDeckSF
OpenPorchSF
EnclosedPorch
3SsnPorch
ScreenPorch
PoolArea
MiscVal
TotalSF


In [7]:
# Split the data
df_train = data[data["source"] == "train"]
df_test = data[data["source"] == "test"]

In [8]:
# Delete price outlier
df_train = df_train[~(df_train["Id"] == 524)]
df_train = df_train[~(df_train["Id"] == 1299)]

In [9]:
# The target value
price = df_train["SalePrice"]
y_train = np.log(price)

In [10]:
# Drop "Id", "SalePrice", and "source"
X_train = df_train.drop("Id", axis = 1)
X_train = X_train.drop("SalePrice", axis = 1)
X_train = X_train.drop("source", axis = 1)
X_test = df_test.drop("Id", axis = 1)
X_test = X_test.drop("SalePrice", axis = 1)
X_test = X_test.drop("source", axis = 1)

In [11]:
# Library for machine learning
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb

In [12]:
#Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train.values)
    rmse= np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [13]:
# Base models
# Lasso
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
# Gradient boosting
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
# XGboosting
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
# Random forest
model_RF = RandomForestRegressor(n_estimators=1500, n_jobs=-1, max_features = "auto")

In [14]:
# Score for lasso
score = rmsle_cv(lasso)
print "\nLasso score: {:.4f} ({:.4f})\n".format(score.mean(), score.std())


Lasso score: 0.1095 (0.0055)



In [15]:
# Score for gradient boosting
score = rmsle_cv(GBoost)
print "Gradient Boosting score: {:.4f} ({:.4f})\n".format(score.mean(), score.std())

Gradient Boosting score: 0.1153 (0.0062)



In [16]:
# Score for XGBoosting
score = rmsle_cv(model_xgb)
print "Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std())

Xgboost score: 0.1146 (0.0057)



In [17]:
# Score for random forest
score = rmsle_cv(model_RF)
print "Random forest score: {:.4f} ({:.4f})\n".format(score.mean(), score.std())

Random forest score: 0.1363 (0.0032)



In [18]:
# Predict by Lasso
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)
y_pred_lasso = np.exp(y_pred_lasso)

# Submission 
sub = pd.DataFrame()
sub['Id'] = df_test["Id"]
sub['SalePrice'] = y_pred_lasso
sub.to_csv('submission_lasso.csv',index=False)

In [19]:
# Predict by Gradient boosting
GBoost.fit(X_train, y_train)
y_pred_GBoost = GBoost.predict(X_test)
y_pred_GBoost = np.exp(y_pred_GBoost)

# Submission
sub = pd.DataFrame()
sub['Id'] = df_test["Id"]
sub['SalePrice'] = y_pred_GBoost
sub.to_csv('submission_GBoost.csv',index=False)

In [20]:
# Predict by random forest
model_RF.fit(X_train, y_train)
y_pred_RF = model_RF.predict(X_test)
y_pred_RF = np.exp(y_pred_RF)

# Submission
sub = pd.DataFrame()
sub['Id'] = df_test["Id"]
sub['SalePrice'] = y_pred_RF
sub.to_csv('submission_RF.csv',index=False)

In [21]:
# Lasso + gradient boosting
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)
GBoost.fit(X_train, y_train)
y_pred_GBoost = GBoost.predict(X_test)

y_pred_lasso_GBoost = (y_pred_lasso + y_pred_GBoost) / 2
y_pred_lasso_GBoost = np.exp(y_pred_lasso_GBoost)

# Submission
sub = pd.DataFrame()
sub['Id'] = df_test["Id"]
sub['SalePrice'] = y_pred_lasso_GBoost
sub.to_csv('submission_Lasso_GBoost.csv',index=False)

In [22]:
# Xgboosting
model_xgb.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_test)
y_pred_xgb = np.exp(y_pred_xgb)

# Submission
sub = pd.DataFrame()
sub['Id'] = df_test["Id"]
sub['SalePrice'] = y_pred_xgb
sub.to_csv('submission_xgb.csv',index=False)

In [23]:
# Lasso + xgboosting
lasso.fit(X_train, y_train)
y_pred_lasso = lasso.predict(X_test)
model_xgb.fit(X_train, y_train)
y_pred_xgb = model_xgb.predict(X_test)

y_pred_lasso_xgb = (y_pred_lasso + y_pred_xgb) / 2
y_pred_lasso_xgb = np.exp(y_pred_lasso_xgb)

# Submission
sub = pd.DataFrame()
sub['Id'] = df_test["Id"]
sub['SalePrice'] = y_pred_lasso_GBoost
sub.to_csv('submission_Lasso_xgb.csv',index=False)

In [39]:
# Stacking lasso and xgb by linear regression
from sklearn import datasets, linear_model

# Create linear regression object
regr = linear_model.LinearRegression()

# Fit the basic models
lasso.fit(X_train, y_train)
X_pred_lasso = lasso.predict(X_train)
model_xgb.fit(X_train, y_train)
X_pred_xgb = model_xgb.predict(X_train)

In [40]:
X = np.ones((len(X_pred_lasso), 3))

In [41]:
X[:,0] = X_pred_lasso
X[:,1] = X_pred_xgb

In [42]:
regr.fit(X, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [43]:
regr.coef_

array([-0.16750584,  1.17754486,  0.        ])

In [48]:
# Predict the values
y_pred_lasso = lasso.predict(X_test)
y_pred_xgb = model_xgb.predict(X_test)

coef = 0.5
y_pred = coef * y_pred_lasso + (1-coef) * y_pred_xgb
y_pred= np.exp(y_pred)

# Submission
sub = pd.DataFrame()
sub['Id'] = df_test["Id"]
sub['SalePrice'] = y_pred
sub.to_csv('submission_lm_lasso_xgb.csv',index=False)