In [303]:
import pandas as pd
import numpy as np
from scipy import stats
from scipy.stats import norm,skew
from sklearn import preprocessing

In [260]:
train=pd.read_csv('train.csv')
test=pd.read_csv('testy.csv')
print train.shape
print test.shape

(1460, 81)
(1459, 80)


# Imputing the missing features

### Lets Get the missing features

In [194]:
miss=train.isnull().sum().sort_values(ascending=False).reset_index()
miss.columns=['Features','Percent']
miss=miss[miss['Percent']>0]
miss

Unnamed: 0,Features,Percent
0,PoolQC,1453
1,MiscFeature,1406
2,Alley,1369
3,Fence,1179
4,FireplaceQu,690
5,LotFrontage,259
6,GarageCond,81
7,GarageType,81
8,GarageYrBlt,81
9,GarageFinish,81


In [195]:
miss=test.isnull().sum().sort_values(ascending=False).reset_index()
miss.columns=['Features','Percent']
miss=miss[miss['Percent']>0]
miss

Unnamed: 0,Features,Percent
0,PoolQC,1456
1,MiscFeature,1408
2,Alley,1352
3,Fence,1169
4,FireplaceQu,730
5,LotFrontage,227
6,GarageCond,78
7,GarageQual,78
8,GarageYrBlt,78
9,GarageFinish,78


Step 1:Fill those null values

Missing Features which are of top importance are  
1.LotFrontage  
2.MSZoning

In [261]:
train_num=train.shape[0]
all_data=pd.concat([train,test])
print data.shape


(2919, 81)


In [262]:
all_data['PoolQC']=all_data['PoolQC'].fillna('None')
all_data['MiscFeature']=all_data['MiscFeature'].fillna('None')
all_data['Alley']=all_data['Alley'].fillna('None')
all_data['Fence']=all_data['Fence'].fillna('None')
all_data['FireplaceQu']=all_data['FireplaceQu'].fillna('None')

In [263]:
all_data['LotFrontage']=all_data.groupby('Neighborhood')['LotFrontage'].transform(lambda x:x.fillna(x.median()))

In [264]:
for col in ('GarageType', 'GarageFinish', 'GarageQual', 'GarageCond'):
    all_data[col] = all_data[col].fillna('None')

In [265]:
for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    all_data[col] = all_data[col].fillna(0)

In [266]:
for col in ('BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF','TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath'):
    all_data[col] = all_data[col].fillna(0)

In [267]:
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    all_data[col] = all_data[col].fillna('None')

In [268]:
all_data["MasVnrType"] = all_data["MasVnrType"].fillna("None")
all_data["MasVnrArea"] = all_data["MasVnrArea"].fillna(0)

MSZoning (The general zoning classification) : 'RL' is by far the most common value. So we can fill in missing values with 'RL'trainMSZoning (The general zoning classification) : 'RL' is by far the most common value. So we can fill in missing values with 'RL'

In [269]:
all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])

In [270]:
all_data['Utilities'].value_counts()

AllPub    2916
NoSeWa       1
Name: Utilities, dtype: int64

Utilities : For this categorical feature all records are "AllPub", except for one "NoSeWa" and 2 NA . Since the house with 'NoSewa' is in the training set, this feature won't help in predictive modelling. We can then safely remove it.

In [271]:
all_data = all_data.drop(['Utilities'], axis=1)

In [272]:
print all_data['Functional'].value_counts()
print all_data['Functional'].isnull().sum()

Typ     2717
Min2      70
Min1      65
Mod       35
Maj1      19
Maj2       9
Sev        2
Name: Functional, dtype: int64
2


Functional : data description says NA means typical

In [273]:
all_data['Functional']=all_data['Functional'].fillna('Typ')

In [274]:
print all_data['Electrical'].value_counts()
print all_data['Electrical'].isnull().sum()

SBrkr    2671
FuseA     188
FuseF      50
FuseP       8
Mix         1
Name: Electrical, dtype: int64
1


In [275]:
all_data['Electrical']=all_data['Electrical'].fillna('SBrkr')

KitchenQual: Only one NA value, and same as Electrical, we set 'TA' (which is the most frequent) for the missing value in KitchenQual.

In [276]:
all_data['KitchenQual'] = all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])

In [277]:

all_data['Exterior1st'] = all_data['Exterior1st'].fillna(all_data['Exterior1st'].mode()[0])
all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0])

In [278]:
all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])

In [279]:
all_data['MSSubClass'] = all_data['MSSubClass'].fillna("None")

In [280]:
#Check remaining missing values if any 
all_data_na = (all_data.isnull().sum() / len(all_data)) * 100
all_data_na = all_data_na.drop(all_data_na[all_data_na == 0].index).sort_values(ascending=False)
missing_data = pd.DataFrame({'Missing Ratio' :all_data_na})
missing_data.head()

Unnamed: 0,Missing Ratio
SalePrice,49.982871


### Transform Numerical features to Categorical

In [281]:
all_data['MSSubClass']=all_data['MSSubClass'].apply(str)
all_data['OverallCond']=all_data['OverallCond'].astype(str)

all_data['YrSold']=all_data['YrSold'].astype(str)
all_data['MoSold']=all_data['MoSold'].astype(str)


## Add New Feature

In [282]:
all_data['TotalSF']=all_data['TotalBsmtSF']+all_data['1stFlrSF']+all_data['2ndFlrSF']

In [283]:
all_data.shape

(2919, 81)

## Split all_data in train and test

In [285]:
train=all_data.head(train_num)
test=all_data.iloc[1460:]
test.drop('SalePrice',axis=1,inplace=True)

print train.shape
print test.shape

(1460, 81)
(1459, 80)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


## StandardScale the Numerical Features

In [246]:
col=[x for x in train.columns if train[x].dtype!='object']
for x in col:
    scale=preprocessing.StandardScaler()
    scale.fit(list(train[x].values))
    train[x]=scale.transform(list(train[x].values))
    print x

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


1stFlrSF
2ndFlrSF
3SsnPorch
BedroomAbvGr
BsmtFinSF1




BsmtFinSF2
BsmtFullBath
BsmtHalfBath
BsmtUnfSF
EnclosedPorch
Fireplaces
FullBath




GarageArea
GarageCars
GarageYrBlt
GrLivArea
HalfBath
Id
KitchenAbvGr
LotArea




LotFrontage
LowQualFinSF
MasVnrArea
MiscVal
OpenPorchSF
OverallQual
PoolArea




SalePrice
ScreenPorch
TotRmsAbvGrd
TotalBsmtSF
WoodDeckSF
YearBuilt
YearRemodAdd
TotalSF


In [247]:
col=[x for x in test.columns if train[x].dtype!='object']
for x in col:
    scale=preprocessing.StandardScaler()
    scale.fit(list(test[x].values))
    test[x]=scale.transform(list(test[x].values))
    print x

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


1stFlrSF
2ndFlrSF
3SsnPorch
BedroomAbvGr




BsmtFinSF1
BsmtFinSF2
BsmtFullBath
BsmtHalfBath




BsmtUnfSF
EnclosedPorch
Fireplaces
FullBath
GarageArea
GarageCars
GarageYrBlt




GrLivArea
HalfBath
Id
KitchenAbvGr
LotArea
LotFrontage
LowQualFinSF
MasVnrArea




MiscVal
OpenPorchSF
OverallQual
PoolArea
ScreenPorch
TotRmsAbvGrd
TotalBsmtSF
WoodDeckSF
YearBuilt
YearRemodAdd
TotalSF




## Box-Cox transform the numerical features

### Separate target variable from train data

In [286]:
target=train['SalePrice']
train=train.drop(['SalePrice'],axis=1)
print target.shape
print train.shape

(1460,)
(1460, 80)


### Print skewness of Numerical Features

In [287]:
numeric_feature=train.dtypes[train.dtypes!='object'].index
skew_feats=train[numeric_feature].apply(lambda x:skew(x.dropna())).sort_values(ascending=False)

skewness=pd.DataFrame({'Skew':skew_feats})
skewness.head(100)

Unnamed: 0,Skew
MiscVal,24.45164
PoolArea,14.813135
LotArea,12.195142
3SsnPorch,10.293752
LowQualFinSF,9.00208
KitchenAbvGr,4.483784
BsmtFinSF2,4.250888
ScreenPorch,4.117977
BsmtHalfBath,4.099186
EnclosedPorch,3.086696


In [288]:
skewness=skewness[abs(skewness)>0.75]

from scipy.special import boxcox1p
skewed_features=skewness.index
lam=0.15
for feat in skewed_features:
    train[feat]=boxcox1p(train[feat],lam)

In [289]:
train.columns[train.isnull().any()]

Index([], dtype='object')

### Skewness of test Features

In [290]:
numeric_feature=test.dtypes[train.dtypes!='object'].index
skew_feats=train[numeric_feature].apply(lambda x:skew(x.dropna())).sort_values(ascending=False)

skewness=pd.DataFrame({'Skew':skew_feats})
skewness.head(100)

Unnamed: 0,Skew
PoolArea,14.363418
3SsnPorch,7.827826
LowQualFinSF,7.593301
MiscVal,5.50553
KitchenAbvGr,3.995746
BsmtHalfBath,3.943685
ScreenPorch,3.184431
BsmtFinSF2,2.61935
EnclosedPorch,2.160966
LotArea,0.751948


In [291]:
skewness=skewness[abs(skewness)>0.75]

from scipy.special import boxcox1p
skewed_features=skewness.index
lam=0.15
for feat in skewed_features:
    test[feat]=boxcox1p(test[feat],lam)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


In [292]:
test.columns[train.isnull().any()]

Index([], dtype='object')

## LabelEncode the categorical features

In [310]:
Col=[f for f in train.columns if train[f].dtype=='object']
for x in Col:
    lbl=preprocessing.LabelEncoder()
    lbl.fit(train[x].values.tolist()+test[x].values.tolist())
    train[x]=lbl.transform(train[x].values.tolist())
    test[x]=lbl.transform(test[x].values.tolist())

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [314]:
target=np.log1p(target)

In [329]:
train=train.drop('Id',axis=1)

In [351]:
test=test.drop('Id',axis=1)

## Modelling

In [298]:
from sklearn.linear_model import ElasticNet, Lasso,  BayesianRidge, LassoLarsIC
from sklearn.ensemble import RandomForestRegressor,  GradientBoostingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.base import BaseEstimator, TransformerMixin, RegressorMixin, clone
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb

In [300]:
#Validation function
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmse= np.sqrt(-cross_val_score(model, train.values, target, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

In [317]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))

In [330]:
score = rmsle_cv(lasso)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Xgboost score: 0.1354 (0.0163)



In [318]:
ENet = make_pipeline(RobustScaler(), ElasticNet(alpha=0.0005, l1_ratio=.9, random_state=3))

In [331]:
score = rmsle_cv(ENet)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Xgboost score: 0.1354 (0.0163)



In [319]:
KRR = KernelRidge(alpha=0.6, kernel='polynomial', degree=2, coef0=2.5)

In [332]:
score = rmsle_cv(KRR)
print("KernelRidge score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Xgboost score: 0.1569 (0.0091)



In [320]:
GBoost = GradientBoostingRegressor(n_estimators=3000, learning_rate=0.05,
                                   max_depth=4, max_features='sqrt',
                                   min_samples_leaf=15, min_samples_split=10, 
                                   loss='huber', random_state =5)

In [333]:
score = rmsle_cv(GBoost)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Xgboost score: 0.1247 (0.0122)



In [312]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, gamma=0.0468, 
                             learning_rate=0.05, max_depth=3, 
                             min_child_weight=1.7817, n_estimators=2200,
                             reg_alpha=0.4640, reg_lambda=0.8571,
                             subsample=0.5213, silent=1,
                             random_state =7, nthread = -1)

In [334]:
score = rmsle_cv(model_xgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Xgboost score: 0.1222 (0.0097)



In [321]:
model_lgb = lgb.LGBMRegressor(objective='regression',num_leaves=5,
                              learning_rate=0.05, n_estimators=720,
                              max_bin = 55, bagging_fraction = 0.8,
                              bagging_freq = 5, feature_fraction = 0.2319,
                              feature_fraction_seed=9, bagging_seed=9,
                              min_data_in_leaf =6, min_sum_hessian_in_leaf = 11)

In [335]:
score = rmsle_cv(model_lgb)
print("Xgboost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Xgboost score: 0.1245 (0.0102)



## Stacked Modelling

In [336]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1)  

In [338]:
averaged_models = AveragingModels(models = (model_xgb, GBoost,model_lgb))

score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

 Averaged base models score: 0.1205 (0.0107)



In [347]:
t

False

In [340]:
class StackingAveragedModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, base_models, meta_model, n_folds=5):
        self.base_models = base_models
        self.meta_model = meta_model
        self.n_folds = n_folds
   
    # We again fit the data on clones of the original models
    def fit(self, X, y):
        self.base_models_ = [list() for x in self.base_models]
        self.meta_model_ = clone(self.meta_model)
        kfold = KFold(n_splits=self.n_folds, shuffle=True, random_state=156)
        
        # Train cloned base models then create out-of-fold predictions
        # that are needed to train the cloned meta-model
        out_of_fold_predictions = np.zeros((X.shape[0], len(self.base_models)))
        for i, model in enumerate(self.base_models):
            for train_index, holdout_index in kfold.split(X, y):
                instance = clone(model)
                self.base_models_[i].append(instance)
                instance.fit(X[train_index], y[train_index])
                y_pred = instance.predict(X[holdout_index])
                out_of_fold_predictions[holdout_index, i] = y_pred
                
        # Now train the cloned  meta-model using the out-of-fold predictions as new feature
        self.meta_model_.fit(out_of_fold_predictions, y)
        return self
   
    #Do the predictions of all base models on the test data and use the averaged predictions as 
    #meta-features for the final prediction which is done by the meta-model
    def predict(self, X):
        meta_features = np.column_stack([
            np.column_stack([model.predict(X) for model in base_models]).mean(axis=1)
            for base_models in self.base_models_ ])
        return self.meta_model_.predict(meta_features)

EnSembling Atscked

In [None]:
stacked_averaged_models = StackingAveragedModels(base_models = (ENet, GBoost, KRR),
                                                 meta_model = lasso)

score = rmsle_cv(stacked_averaged_models)
print("Stacking Averaged models score: {:.4f} ({:.4f})".format(score.mean(), score.std()))

In [348]:
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

In [352]:
stacked_averaged_models.fit(train.values, target)
stacked_train_pred = stacked_averaged_models.predict(train.values)
stacked_pred = np.expm1(stacked_averaged_models.predict(test.values))
print(rmsle(target, stacked_train_pred))

0.0709090418868


In [354]:
model_xgb.fit(train, target)
xgb_train_pred = model_xgb.predict(train)
xgb_pred = np.expm1(model_xgb.predict(test))
print(rmsle(target, xgb_train_pred))

0.07919608061


In [355]:
model_lgb.fit(train, target)
lgb_train_pred = model_lgb.predict(train)
lgb_pred = np.expm1(model_lgb.predict(test.values))
print(rmsle(target, lgb_train_pred))

0.0751596141362


In [358]:
print('RMSLE score on train data:')
print(rmsle(target,stacked_train_pred*0.70 +
               xgb_train_pred*0.15 + lgb_train_pred*0.15 ))

RMSLE score on train data:
0.0699153786537


In [359]:
ensemble = stacked_pred*0.70 + xgb_pred*0.15 + lgb_pred*0.15

In [360]:
test=pd.read_csv('testy.csv')
test_ID=test['Id']

In [362]:

sub = pd.DataFrame()
sub['Id'] = test_ID
sub['SalePrice'] = ensemble
sub.to_csv('submission12.csv',index=False)