## Project Outline
* Set up the metric
* Prepare the data
* Exploratory data analysis (stored in another notebook)
* Get rid of outliers
* Fill in the missing values
* Feature engineering
* Feature selection (based on Lasso)
* Base model tuning
* Blending
* Make predictions

In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
import warnings
warnings.filterwarnings('ignore')

  from numpy.core.umath_tests import inner1d


### Set up the Metric

In [2]:
# Scoring string passed to scikit-learn; the score is the negated MSE, so callers negate it
# again and take the square root to report an RMSE.
metric = 'neg_mean_squared_error'
# Shared 8-fold splitter with a fixed seed, reused by every cross-validation call below so
# that all models are evaluated on the same folds.
kf = KFold(n_splits=8, shuffle=True, random_state=0)

In [3]:
def performance(model):
    """Return the cross-validated RMSE of `model` on the global training data.

    The model is scored with the notebook-level `metric` on the folds defined by `kf`,
    using `X_train` / `y_train` from the enclosing notebook namespace.

    Note: this is the mean of the per-fold RMSE values, which is not the same quantity as
    the square root of the mean MSE printed by `grid_search`.
    """
    neg_mse_per_fold = cross_val_score(model, X_train, y_train, cv=kf, scoring=metric, n_jobs=4)
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    return rmse_per_fold.mean()

In [4]:
def grid_search(model, params):
    """Grid-search `params` for `model` and return the best estimator.

    The search runs on the notebook-level `X_train` / `y_train` with the shared `kf`
    folds and `metric` scorer. The winning parameter set and its RMSE (square root of
    the negated best score) are printed before the best estimator is returned.
    """
    search = GridSearchCV(model, params, cv=kf, scoring=metric, n_jobs=4, verbose=True)
    search.fit(X_train, y_train)
    best_rmse = np.sqrt(-search.best_score_)
    print(search.best_params_)
    print(best_rmse)
    return search.best_estimator_

### Prepare the Data

In [5]:
# Read the raw training and test sets from the working directory.
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')
# Keep the test ids (captured before the columns are renamed) for the submission file.
test_id = test['Id']
# Normalise all column names to lower case in both frames.
train.columns = train.columns.str.lower()
test.columns = test.columns.str.lower()

In [6]:
# Sanity check: row/column counts of both frames (train carries one extra column, the target).
train.shape, test.shape

((1460, 81), (1459, 80))

In [7]:
# Preview the first rows of the training data.
train.head()

Unnamed: 0,id,mssubclass,mszoning,lotfrontage,lotarea,street,alley,lotshape,landcontour,utilities,...,poolarea,poolqc,fence,miscfeature,miscval,mosold,yrsold,saletype,salecondition,saleprice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


### Get Rid of Outliers

In [8]:
# Flag rows that exceed any of the hand-picked size thresholds. Comparisons against
# missing values evaluate to False, so rows with a missing lotfrontage are kept.
is_outlier = (
    (train['lotfrontage'] > 300)
    | (train['lotarea'] > 100000)
    | (train['grlivarea'] > 4000)
)
# Keep everything else; the original row labels are preserved.
train = train.loc[~is_outlier]

In [9]:
# Confirm how many rows remain after the outlier filter.
train.shape

(1451, 81)

In [10]:
# List the (lower-cased) column names for reference.
train.columns

Index(['id', 'mssubclass', 'mszoning', 'lotfrontage', 'lotarea', 'street',
       'alley', 'lotshape', 'landcontour', 'utilities', 'lotconfig',
       'landslope', 'neighborhood', 'condition1', 'condition2', 'bldgtype',
       'housestyle', 'overallqual', 'overallcond', 'yearbuilt', 'yearremodadd',
       'roofstyle', 'roofmatl', 'exterior1st', 'exterior2nd', 'masvnrtype',
       'masvnrarea', 'exterqual', 'extercond', 'foundation', 'bsmtqual',
       'bsmtcond', 'bsmtexposure', 'bsmtfintype1', 'bsmtfinsf1',
       'bsmtfintype2', 'bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf', 'heating',
       'heatingqc', 'centralair', 'electrical', '1stflrsf', '2ndflrsf',
       'lowqualfinsf', 'grlivarea', 'bsmtfullbath', 'bsmthalfbath', 'fullbath',
       'halfbath', 'bedroomabvgr', 'kitchenabvgr', 'kitchenqual',
       'totrmsabvgrd', 'functional', 'fireplaces', 'fireplacequ', 'garagetype',
       'garageyrblt', 'garagefinish', 'garagecars', 'garagearea', 'garagequal',
       'garagecond', 'paveddrive

### Combine the Data for Processing

In [11]:
# Number of training rows; used later to split the stacked frame back into train/test.
sep = train.shape[0]
# Stack the training features (target removed) on top of the test features so both get
# identical preprocessing. DataFrame.append was deprecated in pandas 1.4 and removed in
# 2.0; pd.concat is the supported equivalent and, like append, keeps the original row labels.
combine = pd.concat([train.drop('saleprice', axis=1), test])

In [12]:
# Sanity check: rows should equal train + test, with the target column no longer present.
combine.shape

(2910, 80)

### Fill in the Missing Values

In [13]:
# Categorical columns whose missing entries are filled with the literal label 'None'.
fill_with_none = [
    'alley', 'masvnrtype', 'bsmtqual', 'bsmtcond', 'bsmtexposure',
    'bsmtfintype1', 'bsmtfintype2', 'fireplacequ', 'garagetype',
    'garagefinish', 'garagequal', 'garagecond', 'poolqc', 'fence',
    'miscfeature',
]
# Numeric columns whose missing entries are filled with zero.
fill_with_zero = [
    'masvnrarea', 'bsmtfinsf1', 'bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf',
    'bsmtfullbath', 'bsmthalfbath', 'garagecars', 'garagearea',
]
# Categorical columns whose missing entries are filled with the most frequent value.
fill_with_mode = [
    'mszoning', 'exterior1st', 'exterior2nd', 'electrical',
    'kitchenqual', 'functional', 'saletype',
]

for col in fill_with_none:
    combine[col] = combine[col].fillna('None')
for col in fill_with_zero:
    combine[col] = combine[col].fillna(0.0)
for col in fill_with_mode:
    combine[col] = combine[col].fillna(combine[col].mode()[0])

# Lot frontage: use the median of the house's own neighborhood.
combine['lotfrontage'] = (
    combine.groupby('neighborhood')['lotfrontage']
    .transform(lambda grp: grp.fillna(grp.median()))
)
# Garage year: use a sentinel one year before the earliest observed garage year.
combine['garageyrblt'] = combine['garageyrblt'].fillna(combine['garageyrblt'].min() - 1)
# 'utilities' is dropped entirely rather than imputed.
combine = combine.drop(columns='utilities')

In [14]:
# Count missing values per column once and show only the columns that still have any;
# an empty Series means the imputation above covered everything.
null_counts = combine.isnull().sum()
null_counts[null_counts != 0]

Series([], dtype: int64)

### Feature Engineering

In [15]:
# Columns removed before modelling. 'lotshape' is replaced by the engineered
# 'is_reglotshape' flag, so it must only be dropped after that flag is built.
to_drop = ['id', 'lotshape']
# Columns cast to pandas 'category' dtype so that get_dummies one-hot encodes them.
# This deliberately includes number-coded fields (e.g. mssubclass, overallqual,
# overallcond, mosold, yrsold) so they are treated as labels rather than magnitudes.
to_category = ['mssubclass', 'mszoning', 'street', 'alley',
               'landcontour', 'lotconfig', 'landslope', 'neighborhood',
               'condition1', 'condition2', 'bldgtype', 'housestyle',
               'roofstyle', 'roofmatl', 'overallcond', 'overallqual',
               'exterior1st', 'exterior2nd', 'masvnrtype', 'exterqual',
               'extercond', 'foundation', 'bsmtqual', 'bsmtcond', 'bsmtexposure',
               'bsmtfintype1', 'bsmtfintype2', 'heating', 'heatingqc',
               'centralair', 'electrical', 'kitchenqual', 'functional',
               'fireplacequ', 'garagetype', 'garagefinish', 'garagequal',
               'garagecond', 'paveddrive', 'poolqc', 'fence', 'miscfeature',
               'mosold', 'saletype', 'salecondition', 'yrsold']
# Numeric columns that are checked for skew and log-transformed when strongly skewed.
# 'totalsf' does not exist in the raw data; it has to be created before this list is used.
num_feats = ['lotfrontage', 'lotarea', 'masvnrarea', 'garageyrblt', 'yearremodadd',
          'bsmtfinsf1', 'bsmtfinsf2', 'bsmtunfsf', 'totalbsmtsf', '1stflrsf',
          '2ndflrsf', 'lowqualfinsf', 'grlivarea', 'bsmtfullbath', 'bsmthalfbath',
          'fullbath', 'halfbath', 'bedroomabvgr', 'kitchenabvgr', 'totrmsabvgrd',
          'fireplaces', 'garagecars', 'garagearea', 'wooddecksf',
          'openporchsf', 'enclosedporch', '3ssnporch', 'screenporch', 'poolarea',
          'miscval', 'yearbuilt', 'totalsf']

In [16]:
# Binary flag: 1 when the lot shape is 'Reg', 0 for every other value.
combine['is_reglotshape'] = (combine['lotshape'] == 'Reg').astype('int64')
# Total floor area: basement + first floor + second floor.
combine['totalsf'] = combine['totalbsmtsf'].add(combine['1stflrsf']).add(combine['2ndflrsf'])

In [17]:
# Remove the columns that are no longer needed.
combine = combine.drop(columns=to_drop)
# Cast every label-like column to 'category' in one call so get_dummies encodes it.
combine = combine.astype({feat: 'category' for feat in to_category})
# Log-transform (log(1 + x)) the numeric features whose absolute skew exceeds 0.75.
skewed_feats = [feat for feat in num_feats if abs(combine[feat].skew()) > 0.75]
for feat in skewed_feats:
    combine[feat] = np.log1p(combine[feat])
# One-hot encode the categorical columns.
combine = pd.get_dummies(combine)

In [18]:
# Standardise every column to zero mean / unit variance. The scaler is fit on the
# stacked train+test frame. Rebuilding the DataFrame from the returned array also
# replaces the row labels with a fresh 0..n-1 index.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(combine)
combine = pd.DataFrame(scaled_values, columns=combine.columns)

In [19]:
def prepare_data(combine):
    """Split the stacked feature frame back into train/test parts.

    The first `sep` rows (notebook-level row count of the training set) become
    `X_train`, the rest `X_test`. The target is the natural log of the sale price
    taken from the notebook-level `train` frame.

    Returns:
        (X_train, y_train, X_test)
    """
    train_features = combine.iloc[:sep]
    test_features = combine.iloc[sep:]
    log_price = np.log(train['saleprice'])
    return train_features, log_price, test_features

In [20]:
# Materialise the modelling matrices; these globals are what performance() and
# grid_search() read.
X_train, y_train, X_test = prepare_data(combine)

### Feature Selection based on Lasso Result

In [21]:
# Collects the tuned base models, in order, for the blending stage.
base_regs = []

In [22]:
# Search the L1 penalty over 20 log-spaced values between 1e-4 and 1e-1.
params = {
    'alpha': np.logspace(-4, -1, 20)
}
# This first Lasso is used for feature selection: columns whose coefficient it sets
# to zero are discarded afterwards.
reg = grid_search(Lasso(), params)

Fitting 8 folds for each of 20 candidates, totalling 160 fits


[Parallel(n_jobs=4)]: Done  42 tasks      | elapsed:    5.7s
[Parallel(n_jobs=4)]: Done 153 out of 160 | elapsed:   10.5s remaining:    0.5s
[Parallel(n_jobs=4)]: Done 160 out of 160 | elapsed:   10.6s finished


{'alpha': 0.0026366508987303583}
0.11130984834575268


In [23]:
# Coefficients of the tuned Lasso, in the same order as the columns it was fit on.
coefs = reg.coef_
features = combine.columns.tolist()

# Collect every column whose coefficient is exactly zero and drop them in one call.
zero_weight_feats = [features[i] for i, weight in enumerate(coefs) if weight == 0]
combine = combine.drop(columns=zero_weight_feats)

# Rebuild the modelling matrices from the reduced feature set.
X_train, y_train, X_test = prepare_data(combine)

# Re-tune the Lasso on the surviving features over the same alpha grid.
params = {'alpha': np.logspace(-4, -1, 20)}
reg = grid_search(Lasso(), params)

Fitting 8 folds for each of 20 candidates, totalling 160 fits


[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    3.0s


{'alpha': 0.0008858667904100823}
0.10761658095500862


[Parallel(n_jobs=4)]: Done 160 out of 160 | elapsed:    4.5s finished


### Tune and Keep the Base Models

In [24]:
# Base model 0: the Lasso re-tuned on the selected features.
base_regs.append(reg)

In [25]:
# Search the L2 penalty over 50 log-spaced values between 1e-1 and 1e3.
params = {
    'alpha': np.logspace(-1, 3, 50)
}
reg = grid_search(Ridge(), params)

Fitting 8 folds for each of 50 candidates, totalling 400 fits


[Parallel(n_jobs=4)]: Done 300 tasks      | elapsed:    1.9s


{'alpha': 126.48552168552958}
0.10710308769916094


[Parallel(n_jobs=4)]: Done 400 out of 400 | elapsed:    2.4s finished


In [26]:
# Base model 1: the tuned Ridge.
base_regs.append(reg)

In [27]:
# Elastic-net grid: 8 penalty strengths (1e-3 ... 1e4) crossed with 10 L1/L2 mixing
# ratios (0.1 ... 1.0), i.e. 80 candidates.
params = {
    'alpha': np.logspace(-3, 4, 8),
    # linspace pins both endpoints. np.arange with a float step has an unreliable
    # endpoint and can produce a value slightly above 1.0, which is not a valid l1_ratio.
    'l1_ratio': np.linspace(0.1, 1.0, 10)
}
reg = grid_search(ElasticNet(), params)

Fitting 8 folds for each of 80 candidates, totalling 640 fits


[Parallel(n_jobs=4)]: Done  76 tasks      | elapsed:    4.1s


{'alpha': 0.01, 'l1_ratio': 0.1}
0.10746445363814439


[Parallel(n_jobs=4)]: Done 640 out of 640 | elapsed:    9.4s finished


In [28]:
# Base model 2: the tuned ElasticNet.
base_regs.append(reg)

In [29]:
# Random forest, configuration A. Every list holds a single value, so the "search"
# evaluates exactly one candidate; grid_search is used here only to cross-validate it
# and print its RMSE in the same format as the other models.
params = {
    'n_estimators': [200],
    'random_state': [0],
    'max_depth': [14],
    'min_samples_split': [5],
    'max_features': [0.5]
}
reg = grid_search(RandomForestRegressor(), params)

Fitting 8 folds for each of 1 candidates, totalling 8 fits


[Parallel(n_jobs=4)]: Done   8 out of   8 | elapsed:    9.4s finished


{'max_depth': 14, 'max_features': 0.5, 'min_samples_split': 5, 'n_estimators': 200, 'random_state': 0}
0.13508738934910777


In [30]:
# Base model 3: random forest with 200 trees and max_depth 14.
base_regs.append(reg)

In [31]:
# Random forest, configuration B: fewer but deeper trees with a smaller
# min_samples_split than the previous forest. Single-candidate grid.
params = {
    'n_estimators': [100],
    'random_state': [0],
    'max_depth': [17],
    'min_samples_split': [3],
    'max_features': [0.5]
}
reg = grid_search(RandomForestRegressor(), params)

Fitting 8 folds for each of 1 candidates, totalling 8 fits


[Parallel(n_jobs=4)]: Done   8 out of   8 | elapsed:    5.4s finished


{'max_depth': 17, 'max_features': 0.5, 'min_samples_split': 3, 'n_estimators': 100, 'random_state': 0}
0.13485883632237716


In [32]:
# Base model 4: random forest with 100 trees and max_depth 17.
base_regs.append(reg)

In [33]:
# Gradient boosting with row subsampling (30% of rows per tree) and feature
# subsampling (50% of features per split). Single-candidate grid.
params = {
    'n_estimators': [1000],
    'learning_rate': [0.05],
    'random_state': [0],
    'max_depth': [4],
    'subsample': [0.3],
    'max_features': [0.5]
}
reg = grid_search(GradientBoostingRegressor(), params)

Fitting 8 folds for each of 1 candidates, totalling 8 fits


[Parallel(n_jobs=4)]: Done   8 out of   8 | elapsed:   16.4s finished


{'learning_rate': 0.05, 'max_depth': 4, 'max_features': 0.5, 'n_estimators': 1000, 'random_state': 0, 'subsample': 0.3}
0.11422744775507698


In [34]:
# Base model 5: the gradient boosting regressor.
base_regs.append(reg)

In [35]:
# XGBoost with a small learning rate, 30% row subsampling and light L1/L2
# regularisation. Single-candidate grid.
params = {
    'n_estimators': [1000],
    'learning_rate': [0.02],
    'random_state': [0],
    'max_depth': [7],
    'subsample': [0.3],
    'colsample_bytree': [1.0],
    'reg_alpha': [0.001],
    'reg_lambda': [1.0]
}
reg = grid_search(XGBRegressor(), params)

Fitting 8 folds for each of 1 candidates, totalling 8 fits


[Parallel(n_jobs=4)]: Done   8 out of   8 | elapsed:   47.7s finished


{'colsample_bytree': 1.0, 'learning_rate': 0.02, 'max_depth': 7, 'n_estimators': 1000, 'random_state': 0, 'reg_alpha': 0.001, 'reg_lambda': 1.0, 'subsample': 0.3}
0.11470753366792613


In [36]:
# Base model 6: the XGBoost regressor.
base_regs.append(reg)

### Use Ridge as a Blender

In [37]:
from sklearn.model_selection import cross_val_predict

In [38]:
# Out-of-fold predictions of every base model on the training set, one column per
# model ('model_0', 'model_1', ...). These are the inputs the blender is trained on.
preds = pd.DataFrame({
    'model_'+str(i): cross_val_predict(model, X_train, y_train, cv=kf, n_jobs=4)
    for i, model in enumerate(base_regs)
})

In [39]:
# Tune the Ridge blender's penalty over 1e-5 ... 1e1 (7 log-spaced values), using the
# base models' out-of-fold predictions as features.
params = {'alpha': np.logspace(-5, 1, 7)}
grid = GridSearchCV(Ridge(), params, cv=kf, scoring='neg_mean_squared_error', n_jobs=4).fit(preds, y_train)
# Report the blended RMSE first, then the alpha that achieved it.
blend_rmse = np.sqrt(-grid.best_score_)
print(blend_rmse)
print(grid.best_params_)

0.10534820357355877
{'alpha': 0.001}


In [40]:
# Build a fresh (unfitted) Ridge blender from the parameters the grid search selected.
# Reading them from `grid` instead of hard-coding alpha keeps the blender in sync
# with the search whenever the notebook is re-run on changed data or models.
blender = Ridge(**grid.best_params_)

### Make Predictions and Write the Submission

In [42]:
# Refit every base model on the full training set and predict the test set; the
# column names match those of the out-of-fold frame the blender is trained on.
y_preds = pd.DataFrame({
    'model_'+str(i): model.fit(X_train, y_train).predict(X_test)
    for i, model in enumerate(base_regs)
})

In [44]:
# Train the blender on the out-of-fold predictions, then blend the test-set predictions.
blender.fit(preds, y_train)
log_price_pred = blender.predict(y_preds)
# The models were trained on log(price), so exponentiate to get back to the price scale.
y_pred = pd.Series(np.exp(log_price_pred), name='SalePrice')
# Pair each prediction with its test id and write the submission file.
sub = pd.concat([test_id, y_pred], axis=1)
sub.to_csv('submission.csv', index=False)