In [273]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from scipy import stats
from scipy.stats import norm, skew

train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [274]:
raw_train = train.copy()
raw_test = test.copy()

train.drop('Id',axis=1,inplace=True)
test.drop('Id',axis=1,inplace=True)

In [275]:
train.drop( train[train['GrLivArea']>4500].index,inplace=True)

In [276]:
train['SalePrice'] = np.log1p( train['SalePrice'])

In [277]:
y = train['SalePrice']
all_data = pd.concat([train,test],ignore_index=True)
all_data.drop('SalePrice',axis=1,inplace=True)

In [278]:
all_data.shape

(2917, 79)

In [279]:
train.dtypes

MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
Street            object
                  ...   
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
SalePrice        float64
Length: 80, dtype: object

In [280]:
all_data['LotFrontage'].value_counts().sort_values(ascending=False).values[0]

276

In [281]:
missing_count = all_data.isnull().sum()
missing_count = missing_count[missing_count > 0]
missing_percent = missing_count / len(all_data) * 100
missing_info = pd.concat([missing_count,missing_percent],axis=1,keys=['Count','Percent']).sort_values(by='Percent',ascending=False)
missing_info['dtype'] = all_data.dtypes[  missing_info.index ]
missing_info

Unnamed: 0,Count,Percent,dtype
PoolQC,2908,99.691464,object
MiscFeature,2812,96.400411,object
Alley,2719,93.212204,object
Fence,2346,80.425094,object
FireplaceQu,1420,48.680151,object
LotFrontage,486,16.660953,float64
GarageFinish,159,5.450806,object
GarageQual,159,5.450806,object
GarageCond,159,5.450806,object
GarageYrBlt,159,5.450806,float64


In [282]:
for col in missing_info.index:
    missing_info.loc[col,'mode_percent'] = all_data[col].value_counts().sort_values(ascending=False).values[0] / len(all_data) * 100

In [283]:
missing_info

Unnamed: 0,Count,Percent,dtype,mode_percent
PoolQC,2908,99.691464,object,0.137127
MiscFeature,2812,96.400411,object,3.256771
Alley,2719,93.212204,object,4.113816
Fence,2346,80.425094,object,11.278711
FireplaceQu,1420,48.680151,object,25.437093
LotFrontage,486,16.660953,float64,9.461776
GarageFinish,159,5.450806,object,42.16661
GarageQual,159,5.450806,object,89.201234
GarageCond,159,5.450806,object,90.915324
GarageYrBlt,159,5.450806,float64,4.868015


In [284]:
num_feats = all_data.select_dtypes(exclude='object').columns
cat_feats = all_data.select_dtypes(include='object').columns

In [285]:
missing_num_feats = []
missing_cat_feats = []
for col in missing_info.index:
    if col in num_feats:
        missing_num_feats.append(col)
    if col in cat_feats:
        missing_cat_feats.append(col)

In [286]:
missing_num_feats

['LotFrontage',
 'GarageYrBlt',
 'MasVnrArea',
 'BsmtFullBath',
 'BsmtHalfBath',
 'GarageArea',
 'GarageCars',
 'TotalBsmtSF',
 'BsmtUnfSF',
 'BsmtFinSF2',
 'BsmtFinSF1']

In [287]:
missing_cat_feats

['PoolQC',
 'MiscFeature',
 'Alley',
 'Fence',
 'FireplaceQu',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'GarageType',
 'BsmtExposure',
 'BsmtCond',
 'BsmtQual',
 'BsmtFinType2',
 'BsmtFinType1',
 'MasVnrType',
 'MSZoning',
 'Functional',
 'Utilities',
 'Electrical',
 'KitchenQual',
 'Exterior2nd',
 'Exterior1st',
 'SaleType']

In [288]:
all_data["PoolQC"] = all_data["PoolQC"].fillna("None")
all_data["MiscFeature"] = all_data["MiscFeature"].fillna("None")
all_data["Alley"] = all_data["Alley"].fillna("None")
all_data["Fence"] = all_data["Fence"].fillna("None")
all_data["FireplaceQu"] = all_data["FireplaceQu"].fillna("None")

In [289]:
all_data.groupby('Neighborhood')['LotFrontage'].median()

Neighborhood
Blmngtn    43.0
Blueste    24.0
BrDale     21.0
BrkSide    51.0
ClearCr    80.5
CollgCr    70.0
Crawfor    70.0
Edwards    64.5
Gilbert    64.0
IDOTRR     60.0
MeadowV    21.0
Mitchel    74.0
NAmes      73.0
NPkVill    24.0
NWAmes     80.0
NoRidge    89.0
NridgHt    92.0
OldTown    60.0
SWISU      60.0
Sawyer     72.0
SawyerW    67.0
Somerst    72.5
StoneBr    60.0
Timber     82.0
Veenker    80.0
Name: LotFrontage, dtype: float64

In [290]:
all_data.loc[all_data['LotFrontage'].isnull() , 'LotFrontage'] = all_data[all_data['LotFrontage'].isnull()].apply(lambda row : all_data[all_data['Neighborhood'] == row['Neighborhood']].median() , axis=1)

In [291]:
all_data['LotFrontage'].isnull().sum()

0

In [292]:
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    all_data[col] = all_data[col].fillna('None')

In [293]:
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    all_data[col] = all_data[col].fillna(0)

In [294]:
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    all_data[col] = all_data[col].fillna(0)

In [295]:
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    all_data[col] = all_data[col].fillna('None')

In [296]:
all_data["MasVnrType"] = all_data["MasVnrType"].fillna("None")
all_data["MasVnrArea"] = all_data["MasVnrArea"].fillna(0)

In [297]:
all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])

In [298]:
all_data = all_data.drop(['Utilities'], axis=1)

In [299]:
all_data["Functional"] = all_data["Functional"].fillna("Typ")

In [300]:
all_data['Electrical'] = all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])

In [301]:
all_data['KitchenQual'] = all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])

In [302]:
all_data['Exterior1st'] = all_data['Exterior1st'].fillna(all_data['Exterior1st'].mode()[0])
all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0])

In [303]:
all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])

In [304]:
all_data.isnull().sum().sum()

0

In [305]:
num_feats

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')

In [307]:
num_actuallyCat_feats = ['MSSubClass', 'OverallQual','OverallCond', 'MoSold', 'YrSold']

In [308]:
all_data[num_actuallyCat_feats].dtypes

MSSubClass     int64
OverallQual    int64
OverallCond    int64
MoSold         int64
YrSold         int64
dtype: object

In [310]:
all_data[num_actuallyCat_feats] = all_data[num_actuallyCat_feats].astype(str)

In [311]:
all_data[num_actuallyCat_feats].dtypes

MSSubClass     object
OverallQual    object
OverallCond    object
MoSold         object
YrSold         object
dtype: object

In [312]:
all_data.shape

(2917, 78)

- ordinal들을 lebel encoding 해 준후 더미 변환해주는 것과, 그냥 바로 categorical 전체를 더미 변환 해주는 것이 차이가 있을까?

In [313]:
nominal_feats = []
for feat in cat_feats:
    if feat not in ('FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold'):
        nominal_feats.append(feat)

In [314]:
ordinal_feats = ['FireplaceQu', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond', 
        'ExterQual', 'ExterCond','HeatingQC', 'PoolQC', 'KitchenQual', 'BsmtFinType1', 
        'BsmtFinType2', 'Functional', 'Fence', 'BsmtExposure', 'GarageFinish', 'LandSlope',
        'LotShape', 'PavedDrive', 'Street', 'Alley', 'CentralAir', 'MSSubClass', 'OverallCond', 
        'YrSold', 'MoSold']

In [315]:
pd.get_dummies(all_data['FireplaceQu'])

Unnamed: 0,Ex,Fa,Gd,None,Po,TA
0,0,0,0,1,0,0
1,0,0,0,0,0,1
2,0,0,0,0,0,1
3,0,0,1,0,0,0
4,0,0,0,0,0,1
...,...,...,...,...,...,...
2912,0,0,0,1,0,0
2913,0,0,0,1,0,0
2914,0,0,0,0,0,1
2915,0,0,0,1,0,0


In [316]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()
all_data['FireplaceQu'] = encoder.fit_transform(all_data['FireplaceQu'])

In [317]:
all_data[['FireplaceQu']]

Unnamed: 0,FireplaceQu
0,3
1,5
2,5
3,2
4,5
...,...
2912,3
2913,3
2914,5
2915,3


In [318]:
pd.get_dummies(all_data['FireplaceQu'])

Unnamed: 0,0,1,2,3,4,5
0,0,0,0,1,0,0
1,0,0,0,0,0,1
2,0,0,0,0,0,1
3,0,0,1,0,0,0
4,0,0,0,0,0,1
...,...,...,...,...,...,...
2912,0,0,0,1,0,0
2913,0,0,0,1,0,0
2914,0,0,0,0,0,1
2915,0,0,0,1,0,0


In [319]:
ordinal_feats.remove('FireplaceQu')

In [320]:
for feat in ordinal_feats:
    all_data[feat] = encoder.fit_transform(all_data[feat])

In [321]:
ordinal_feats.append('FireplaceQu')

In [322]:
all_data[ordinal_feats]

Unnamed: 0,BsmtQual,BsmtCond,GarageQual,GarageCond,ExterQual,ExterCond,HeatingQC,PoolQC,KitchenQual,BsmtFinType1,...,LotShape,PavedDrive,Street,Alley,CentralAir,MSSubClass,OverallCond,YrSold,MoSold,FireplaceQu
0,2,4,5,5,2,4,0,3,2,2,...,3,2,1,1,1,10,4,2,4,3
1,2,4,5,5,3,4,0,3,3,0,...,3,2,1,1,1,5,7,1,7,5
2,2,4,5,5,2,4,0,3,2,2,...,0,2,1,1,1,10,4,2,11,5
3,4,1,5,5,3,4,2,3,2,0,...,0,2,1,1,1,11,4,0,4,2
4,2,4,5,5,2,4,0,3,2,2,...,0,2,1,1,1,10,4,2,3,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2912,4,4,3,3,3,4,2,3,3,6,...,3,2,1,1,1,2,6,0,8,3
2913,4,4,5,5,3,4,4,3,3,5,...,3,2,1,1,1,2,4,0,6,3
2914,4,4,5,5,3,4,0,3,3,0,...,3,2,1,1,1,5,6,0,11,5
2915,2,4,3,3,3,4,4,3,3,2,...,3,2,1,1,1,14,4,0,9,3


In [323]:
all_data.shape

(2917, 78)

In [324]:
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

In [326]:
num_feats = all_data.select_dtypes(exclude='object').columns
num_feats

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'Street', 'Alley', 'LotShape',
       'LandSlope', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF',
       'TotalBsmtSF', 'HeatingQC', 'CentralAir', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu',
       'GarageYrBlt', 'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual',
       'GarageCond', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscVal', 'MoSold', 'YrSold', 'TotalSF'],
      dtype='object')

In [329]:
skewness = all_data[num_feats].apply(lambda col : skew(col) ).sort_values(ascending=False) 

In [330]:
from scipy.special import boxcox1p
skewed_features = skewness.index
lam = 0.15
for feat in skewed_features:
    all_data[feat] = boxcox1p(all_data[feat], lam)

In [331]:
all_data = pd.get_dummies(all_data)
all_data.shape

(2917, 229)

In [335]:
train = all_data.iloc[:train.shape[0],]
test = all_data.iloc[train.shape[0]:,]

In [343]:
x = train
y = y 

In [352]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
#import xgboost as xgb
#import lightgbm as lgb

In [355]:
def rmsle_cv(model):
    kf = KFold(n_splits=5, shuffle=True, random_state=42)
    rmse = np.sqrt( -cross_val_score(model,x,y,scoring='neg_mean_squared_error',cv=kf) )
    return rmse

In [357]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)
'''model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)'''

"model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, \n                             learning_rate=0.05, max_depth=3, \n                             min_child_weight=1.7817, n_estimators=2200,\n                             reg_alpha=0.4640, reg_lambda=0.8571,\n                             subsample=0.5213, silent=1,\n                             random_state =7, nthread = -1)\nmodel_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,\n                              learning_rate=0.05, n_estimators=720,\n                              max_bin = 55, bagging_fraction = 0.8,\n                              bagging_freq = 5, feature_fraction = 0.2319,\n                              feature_fraction_seed=9, bagging_seed=9,\n                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)"

In [368]:
models = [lasso,ENet,KRR,GBoost]
scores = []
for model in models:
    scores.append( rmsle_cv(model).mean() )

In [369]:
scores

[0.11166603058143276,
 0.11157515128335264,
 0.11588217025761831,
 0.11796618489487734]

In [377]:
pd.DataFrame(scores,index=['lasso','ENet','KRR','GBoost'],columns=['Score'])

Unnamed: 0,Score
lasso,0.111666
ENet,0.111575
KRR,0.115882
GBoost,0.117966


In [427]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self,models):
        self.models = models
        
    def fit(self,x,y):
        self.models_ = [ clone(model) for model in self.models ]
        for model in self.models_:
            model.fit(x,y)
        return self
        
    def predict(self,x):
        predictions = np.column_stack([
            model.predict(x) for model in self.models_
        ])
        return np.mean(predictions, axis=1)   

In [428]:
AveragingModels = AveragingModels(models = (ENet, GBoost, KRR, lasso))

In [429]:
rmsle_cv(AveragingModels).mean()

0.10940504584871699

In [489]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
        
    def fit(self,x,y):
        self.base_models_ = [ list() for model in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kf = KFold(n_splits=self.n_folds, shuffle=True, random_state=42)
        
        fold_preds = np.zeros(( len(y) , len(self.base_models_)))
        
        for i, model in enumerate(self.base_models):
            for train_idx, holdout_idx in kf.split(x,y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(x.iloc[train_idx],y.iloc[train_idx])
                y_pred = instance.predict(x.iloc[holdout_idx])
                fold_preds[holdout_idx,i] = y_pred
                
        self.meta_model_.fit(fold_preds,y)
        return self
    
    def predict(self,x):
        meta_feats = np.column_stack( [ np.column_stack([instance.predict(x) for instance in instance_list] ).mean(axis=1) for instance_list in self.base_models_ ] )
        return self.meta_model_.predict(meta_feats)

In [492]:
stacked_averaged_models = StackingAveragedModels(base_models = (ENet, GBoost, KRR),
                                                 meta_model = lasso)

rmsle_cv(stacked_averaged_models).mean()

0.10932746211725297

In [None]:
import li

In [495]:
submission = pd.read_csv('sample_submission.csv')
stacked_averaged_models.fit(x,y)
submission['SalePrice'] = np.expm1( stacked_averaged_models.predict(test) ) 
submission.to_csv('submission_stacked_blended.csv',index=False)