In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

### Set up the Metric

In [None]:
# Shuffled 8-fold splitter with a fixed seed; it is passed as `cv` to the
# cross-validation calls in this notebook.
kf = KFold(shuffle=True, n_splits=8, random_state=0)
# scikit-learn scorer name. Scores come back as negated MSE, so the code that
# reports results negates them again before taking the square root.
metric = 'neg_mean_squared_error'

In [None]:
def performance(model):
    """Return the mean per-fold RMSE of `model` under cross-validation.

    Reads `X_train`, `y_train`, `kf` and `metric` from the notebook namespace.
    Each fold's negated-MSE score is converted to an RMSE first, and the
    RMSEs are then averaged.
    """
    neg_mse_per_fold = cross_val_score(
        model, X_train, y_train, scoring=metric, cv=kf, n_jobs=4
    )
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    return rmse_per_fold.mean()

In [None]:
def grid_search(model, params):
    """Run a cross-validated grid search for `model` over `params`.

    Reads `X_train`, `y_train`, `kf` and `metric` from the notebook namespace.
    Prints the winning parameter combination and its RMSE (square root of the
    negated best score), then returns the search's `best_estimator_`.
    """
    searcher = GridSearchCV(
        model, params, scoring=metric, cv=kf, n_jobs=4, verbose=True
    )
    searcher.fit(X_train, y_train)
    print(searcher.best_params_)
    print(np.sqrt(-searcher.best_score_))
    return searcher.best_estimator_

### Prepare the Data

In [None]:
# Read the raw training and test tables.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
# Keep the test identifiers under their original 'Id' name for the submission
# file; this must happen before the column names are lower-cased.
test_id = test['Id']
# Lower-case every column name in both frames.
train.columns = train.columns.str.lower()
test.columns = test.columns.str.lower()

In [None]:
# (rows, columns) of the raw training and test frames.
train.shape, test.shape

In [None]:
# Preview the first rows of the training data.
train.head()

### Get Rid of Outliers

In [None]:
# Flag training rows whose lot frontage, lot area or above-ground living area
# exceeds its cut-off. A missing value compares as False, so rows with NaN in
# these columns are not flagged.
is_outlier = (
    (train['lotfrontage'] > 300)
    | (train['lotarea'] > 100000)
    | (train['grlivarea'] > 4000)
)
# Keep everything that was not flagged; the surviving rows keep their labels.
train = train[~is_outlier]

In [None]:
# Shape of the training frame after the outlier rows were removed.
train.shape

In [None]:
# List the (lower-cased) column names of the training frame.
train.columns

### Combine the Data for Processing

In [None]:
# Number of training rows; used later to split the stacked frame back into
# its training and test parts.
sep = len(train)
# Stack the training features (target column removed) on top of the test
# features so both receive identical preprocessing.
# DataFrame.append was removed in pandas 2.0; pd.concat produces the same
# stacked frame and, like append, keeps the original row labels.
combine = pd.concat([train.drop('saleprice', axis=1), test])

In [None]:
# Shape of the stacked train + test feature frame.
combine.shape

### Fill in the Missing Values

In [None]:
# Columns whose gaps are filled with the column's most frequent value.
mode_filled = [
    'mszoning', 'exterior1st', 'exterior2nd', 'electrical',
    'kitchenqual', 'functional', 'saletype',
]
# Columns whose gaps are replaced by the literal label 'None'.
none_filled = [
    'alley', 'masvnrtype', 'bsmtqual', 'bsmtcond', 'bsmtexposure',
    'bsmtfintype1', 'bsmtfintype2', 'fireplacequ', 'garagetype',
    'garagefinish', 'garagequal', 'garagecond', 'poolqc', 'fence',
    'miscfeature',
]
# Columns whose gaps are replaced by 0.0.
zero_filled = [
    'masvnrarea', 'bsmtfinsf1', 'bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf',
    'bsmtfullbath', 'bsmthalfbath', 'garagecars', 'garagearea',
]

for col in mode_filled:
    combine[col] = combine[col].fillna(combine[col].mode()[0])
for col in none_filled:
    combine[col] = combine[col].fillna('None')
for col in zero_filled:
    combine[col] = combine[col].fillna(0.0)

# Lot frontage: fill with the median frontage of the row's neighbourhood.
frontage_by_neighborhood = combine.groupby('neighborhood')['lotfrontage']
combine['lotfrontage'] = frontage_by_neighborhood.transform(
    lambda frontage: frontage.fillna(frontage.median())
)

# Garage build year: fill with one less than the earliest observed year.
earliest_garage_year = combine['garageyrblt'].min()
combine['garageyrblt'] = combine['garageyrblt'].fillna(earliest_garage_year - 1)

# The 'utilities' column is removed rather than imputed.
combine = combine.drop('utilities', axis=1)

In [None]:
# Count missing values per column once, then show only the columns that
# still contain any.
null_counts = combine.isnull().sum()
null_counts[null_counts != 0]

### Feature Engineering

In [None]:
# Columns removed from the stacked frame before encoding.
to_drop = ['id', 'lotshape']
# Columns cast to the pandas 'category' dtype before one-hot encoding.
# Note that this includes several integer-coded columns (e.g. 'mssubclass',
# 'overallqual', 'overallcond', 'mosold', 'yrsold').
to_category = ['mssubclass', 'mszoning', 'street', 'alley',
               'landcontour', 'lotconfig', 'landslope', 'neighborhood',
               'condition1', 'condition2', 'bldgtype', 'housestyle',
               'roofstyle', 'roofmatl', 'overallcond', 'overallqual',
               'exterior1st', 'exterior2nd', 'masvnrtype', 'exterqual',
               'extercond', 'foundation', 'bsmtqual', 'bsmtcond', 'bsmtexposure',
               'bsmtfintype1', 'bsmtfintype2', 'heating', 'heatingqc',
               'centralair', 'electrical', 'kitchenqual', 'functional',
               'fireplacequ', 'garagetype', 'garagefinish', 'garagequal',
               'garagecond', 'paveddrive', 'poolqc', 'fence', 'miscfeature',
               'mosold', 'saletype', 'salecondition', 'yrsold']
# Numeric columns that are checked for skewness and log-transformed when
# strongly skewed. 'totalsf' is not in the raw data: it is an engineered
# column created in a later cell, which must run before this list is used.
num_feats = ['lotfrontage', 'lotarea', 'masvnrarea', 'garageyrblt', 'yearremodadd',
          'bsmtfinsf1', 'bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf', '1stflrsf',
          '2ndflrsf', 'lowqualfinsf', 'grlivarea', 'bsmtfullbath', 'bsmthalfbath',
          'fullbath', 'halfbath', 'bedroomabvgr', 'kitchenabvgr', 'totrmsabvgrd',
          'fireplaces', 'garagecars', 'garagearea', 'wooddecksf',
          'openporchsf', 'enclosedporch', '3ssnporch', 'screenporch', 'poolarea',
          'miscval', 'yearbuilt', 'totalsf']

In [None]:
# Binary flag: 1 where the lot shape is 'Reg', 0 otherwise (including missing).
# The two columns are added in this order on purpose, since it determines the
# column order of the frame.
combine['is_reglotshape'] = (combine['lotshape'] == 'Reg').astype('int64')
# Total floor area: basement plus first and second floors.
combine['totalsf'] = (
    combine['totalbsmtsf'].add(combine['1stflrsf']).add(combine['2ndflrsf'])
)

In [None]:
# Remove the columns listed in `to_drop`.
combine = combine.drop(to_drop, axis=1)
# Cast every column named in `to_category` to the 'category' dtype.
combine = combine.astype({name: 'category' for name in to_category})
# Pick the numeric features whose absolute skewness exceeds 0.75 and replace
# each of them with log(1 + x).
skewed = [name for name in num_feats if abs(combine[name].skew()) > 0.75]
for name in skewed:
    combine[name] = np.log1p(combine[name])
# Expand the categorical columns into indicator columns (pd.get_dummies
# defaults).
combine = pd.get_dummies(combine)

In [None]:
# Standardise every column. The scaler is fitted on the stacked frame, i.e. on
# training and test rows together.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(combine)
# Rebuild a DataFrame with the same column names; no index is passed, so the
# rows get a fresh default index.
combine = pd.DataFrame(scaled_values, columns=combine.columns)

In [None]:
def prepare_data(combine):
    """Split the stacked feature frame and build the training target.

    The first `sep` rows of `combine` are returned as the training features
    and the remaining rows as the test features. The target is the natural
    log of the training sale price. `sep` and `train` are read from the
    notebook namespace.

    Returns:
        Tuple `(X_train, y_train, X_test)`.
    """
    train_part, test_part = combine[:sep], combine[sep:]
    log_price = np.log(train['saleprice'])
    return train_part, log_price, test_part

In [None]:
# Build the modelling matrices from the fully preprocessed frame.
X_train, y_train, X_test = prepare_data(combine)

### Base Model Tuning

In [None]:
# Collects the tuned base models that are later combined by the blender.
base_regs = []

In [None]:
# Tune the Lasso penalty over 20 log-spaced values from 1e-4 to 1e-1.
params = dict(alpha=np.logspace(-4, -1, 20))
reg = grid_search(Lasso(), params)

### Feature Selection based on Lasso Result

In [None]:
# Collect the columns whose coefficient in the tuned Lasso is exactly zero
# and remove them all in one drop.
zeroed = [
    name
    for name, coef in zip(combine.columns.tolist(), reg.coef_)
    if coef == 0
]
combine = combine.drop(zeroed, axis=1)

# Rebuild the modelling matrices from the reduced feature set.
X_train, y_train, X_test = prepare_data(combine)

# Re-tune the Lasso on the selected features, using the same alpha grid.
params = dict(alpha=np.logspace(-4, -1, 20))
reg = grid_search(Lasso(), params)

In [None]:
# Keep the re-tuned Lasso as a base model.
base_regs.append(reg)

In [None]:
# Tune the Ridge penalty over 50 log-spaced values from 1e-1 to 1e3.
params = dict(alpha=np.logspace(-1, 3, 50))
reg = grid_search(Ridge(), params)

In [None]:
# Keep the tuned Ridge as a base model.
base_regs.append(reg)

In [None]:
# Tune ElasticNet over 8 log-spaced penalties (1e-3 .. 1e4) and ten L1/L2
# mixing ratios from 0.1 to 1.0.
# np.linspace is used instead of np.arange with a float step: arange's
# endpoint handling is subject to rounding, while linspace guarantees exactly
# ten values ending at exactly 1.0 (l1_ratio must not exceed 1).
params = {
    'alpha': np.logspace(-3, 4, 8),
    'l1_ratio': np.linspace(0.1, 1.0, 10)
}
reg = grid_search(ElasticNet(), params)

In [None]:
# Keep the tuned ElasticNet as a base model.
base_regs.append(reg)

In [None]:
# First random-forest configuration. Every grid has a single value, so the
# search only cross-validates this one setting and reports its score.
params = dict(
    n_estimators=[200],
    random_state=[0],
    max_depth=[14],
    min_samples_split=[5],
    max_features=[0.5],
)
reg = grid_search(RandomForestRegressor(), params)

In [None]:
# Keep the first random forest as a base model.
base_regs.append(reg)

In [None]:
# Second random-forest configuration (fewer, deeper trees with a smaller
# split threshold); again a single-point grid.
params = dict(
    n_estimators=[100],
    random_state=[0],
    max_depth=[17],
    min_samples_split=[3],
    max_features=[0.5],
)
reg = grid_search(RandomForestRegressor(), params)

In [None]:
# Keep the second random forest as a base model.
base_regs.append(reg)

In [None]:
# Gradient-boosting configuration; a single-point grid, so the search only
# cross-validates this one setting and reports its score.
params = dict(
    n_estimators=[1000],
    learning_rate=[0.05],
    random_state=[0],
    max_depth=[4],
    subsample=[0.3],
    max_features=[0.5],
)
reg = grid_search(GradientBoostingRegressor(), params)

In [None]:
# Keep the gradient-boosting model as a base model.
base_regs.append(reg)

In [None]:
# XGBoost configuration; a single-point grid, so the search only
# cross-validates this one setting and reports its score.
params = dict(
    n_estimators=[1000],
    learning_rate=[0.02],
    random_state=[0],
    max_depth=[7],
    subsample=[0.3],
    colsample_bytree=[1.0],
    reg_alpha=[0.001],
    reg_lambda=[1.0],
)
reg = grid_search(XGBRegressor(), params)

In [None]:
# Keep the XGBoost model as a base model.
base_regs.append(reg)

### Use Ridge as a Blender

In [None]:
# NOTE(review): cross_val_predict is already imported in the notebook's first
# cell; this repeated import is redundant.
from sklearn.model_selection import cross_val_predict

In [None]:
# Cross-validated predictions of every base model on the training set: one
# column per model, named 'model_<position in base_regs>'. These columns are
# the inputs the blender is trained on.
preds = pd.DataFrame({
    'model_' + str(idx): cross_val_predict(base, X_train, y_train, cv=kf, n_jobs=4)
    for idx, base in enumerate(base_regs)
})

In [None]:
# Tune the Ridge blender's penalty on the base models' cross-validated
# predictions, over 7 log-spaced values from 1e-5 to 1e1.
params = dict(alpha=np.logspace(-5, 1, 7))
grid = GridSearchCV(
    Ridge(), params, scoring='neg_mean_squared_error', cv=kf, n_jobs=4
)
grid.fit(preds, y_train)
# Report the blended RMSE first, then the alpha that achieved it.
blend_rmse = np.sqrt(-grid.best_score_)
print(blend_rmse)
print(grid.best_params_)

In [None]:
# Build the blender with the alpha chosen by the grid search above rather than
# a hard-coded number, so it stays in sync when the data or the set of base
# models changes. A fresh, unfitted Ridge is created; it is fitted later.
blender = Ridge(alpha=grid.best_params_['alpha'])

### Make Prediction and Submission

In [None]:
# Refit every base model on the full training set and predict the test set.
# Column names match those of the blender's training inputs
# ('model_<position in base_regs>').
test_columns = {}
for idx, base in enumerate(base_regs):
    test_columns['model_' + str(idx)] = base.fit(X_train, y_train).predict(X_test)
y_preds = pd.DataFrame(test_columns)

In [None]:
# Fit the blender on the base models' cross-validated training predictions
# and apply it to their test-set predictions.
log_pred = blender.fit(preds, y_train).predict(y_preds)
# The models were trained on log(saleprice), so the blended output is a log
# price. Invert the transform so the 'SalePrice' column holds actual prices
# rather than their logarithms.
y_pred = pd.Series(np.exp(log_pred), name='SalePrice')
# Pair each prediction with its test 'Id' and write the submission file.
sub = pd.concat([test_id, y_pred], axis=1)
sub.to_csv('submission.csv', index=False)