In [1]:
from datetime import datetime

import numpy as np
import pandas as pd
import warnings
pd.set_option('display.max_columns', 500)
warnings.filterwarnings('ignore')

from scipy.stats import skew  # for some statistics
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

from sklearn.linear_model import ElasticNetCV, LassoCV, RidgeCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from mlxtend.regressor import StackingCVRegressor

train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
print("Train set size:", train.shape)
print("Test set size:", test.shape)
print('START data processing', datetime.now(), )

Train set size: (1460, 81)
Test set size: (1459, 80)
START data processing 2020-05-27 23:27:06.254191


In [2]:
train_ID = train['Id']
test_ID = test['Id']
# Now drop the  'Id' colum since it's unnecessary for  the prediction process.
train.drop(['Id'], axis=1, inplace=True)
test.drop(['Id'], axis=1, inplace=True)

In [3]:
# Deleting outliers
train = train[train.GrLivArea < 4500]
train.reset_index(drop=True, inplace=True)

In [4]:
# We use the numpy fuction log1p which  applies log(1+x) to all elements of the column
train["SalePrice"] = np.log1p(train["SalePrice"])
y = train.SalePrice.reset_index(drop=True)
train_features = train.drop(['SalePrice'], axis=1)
test_features = test

features = pd.concat([train_features, test_features]).reset_index(drop=True)
print(features.shape)

(2917, 79)


In [5]:
# Some of the non-numeric predictors are stored as numbers; we convert them into strings 
features['MSSubClass'] = features['MSSubClass'].apply(str)
features['YrSold'] = features['YrSold'].astype(str)
features['MoSold'] = features['MoSold'].astype(str)


features['Functional'] = features['Functional'].fillna('Typ')
features['Electrical'] = features['Electrical'].fillna("SBrkr")
features['KitchenQual'] = features['KitchenQual'].fillna("TA")
features['Exterior1st'] = features['Exterior1st'].fillna(features['Exterior1st'].mode()[0])
features['Exterior2nd'] = features['Exterior2nd'].fillna(features['Exterior2nd'].mode()[0])
features['SaleType'] = features['SaleType'].fillna(features['SaleType'].mode()[0])

features["PoolQC"] = features["PoolQC"].fillna("None")

for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
    features[col] = features[col].fillna(0)
for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
    features[col] = features[col].fillna('None')
for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
    features[col] = features[col].fillna('None')

features['MSZoning'] = features.groupby('MSSubClass')['MSZoning'].transform(lambda x: x.fillna(x.mode()[0]))



objects = []
for i in features.columns:
    if features[i].dtype == object:
        objects.append(i)

features.update(features[objects].fillna('None'))

features['LotFrontage'] = features.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))


In [6]:
# Filling in the rest of the NA's

numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics = []
for i in features.columns:
    if features[i].dtype in numeric_dtypes:
        numerics.append(i)
features.update(features[numerics].fillna(0))

numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numerics2 = []
for i in features.columns:
    if features[i].dtype in numeric_dtypes:
        numerics2.append(i)

skew_features = features[numerics2].apply(lambda x: skew(x)).sort_values(ascending=False)

high_skew = skew_features[skew_features > 0.5]
skew_index = high_skew.index

In [7]:
for i in skew_index:
    features[i] = boxcox1p(features[i], boxcox_normmax(features[i] + 1))

features = features.drop(['Utilities', 'Street', 'PoolQC',], axis=1)

features['YrBltAndRemod']=features['YearBuilt']+features['YearRemodAdd']
features['TotalSF']=features['TotalBsmtSF'] + features['1stFlrSF'] + features['2ndFlrSF']

features['Total_sqr_footage'] = (features['BsmtFinSF1'] + features['BsmtFinSF2'] +
                                 features['1stFlrSF'] + features['2ndFlrSF'])

features['Total_Bathrooms'] = (features['FullBath'] + (0.5 * features['HalfBath']) +
                               features['BsmtFullBath'] + (0.5 * features['BsmtHalfBath']))

features['Total_porch_sf'] = (features['OpenPorchSF'] + features['3SsnPorch'] +
                              features['EnclosedPorch'] + features['ScreenPorch'] +
                              features['WoodDeckSF'])
                              

# simplified features
features['haspool'] = features['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
features['has2ndfloor'] = features['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
features['hasgarage'] = features['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
features['hasbsmt'] = features['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
features['hasfireplace'] = features['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

In [8]:
print(features.shape)
final_features = pd.get_dummies(features).reset_index(drop=True)
print(final_features.shape)

X = final_features.iloc[:len(y), :]
X_sub = final_features.iloc[len(X):, :]

print('X', X.shape, 'y', y.shape, 'X_sub', X_sub.shape)

(2917, 86)
(2917, 333)
X (1458, 333) y (1458,) X_sub (1459, 333)


In [9]:
X.head()

Unnamed: 0,LotFrontage,LotArea,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,BsmtFinSF1,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,TotRmsAbvGrd,Fireplaces,GarageYrBlt,GarageCars,GarageArea,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,MiscVal,YrBltAndRemod,TotalSF,Total_sqr_footage,Total_Bathrooms,Total_porch_sf,haspool,has2ndfloor,hasgarage,hasbsmt,hasfireplace,MSSubClass_120,MSSubClass_150,MSSubClass_160,MSSubClass_180,MSSubClass_190,MSSubClass_20,MSSubClass_30,MSSubClass_40,MSSubClass_45,MSSubClass_50,MSSubClass_60,MSSubClass_70,MSSubClass_75,MSSubClass_80,MSSubClass_85,MSSubClass_90,MSZoning_C (all),MSZoning_FV,MSZoning_RH,MSZoning_RL,MSZoning_RM,Alley_Grvl,Alley_None,Alley_Pave,LotShape_IR1,LotShape_IR2,LotShape_IR3,LotShape_Reg,LandContour_Bnk,LandContour_HLS,LandContour_Low,LandContour_Lvl,LotConfig_Corner,LotConfig_CulDSac,LotConfig_FR2,LotConfig_FR3,LotConfig_Inside,LandSlope_Gtl,LandSlope_Mod,LandSlope_Sev,Neighborhood_Blmngtn,Neighborhood_Blueste,Neighborhood_BrDale,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,Condition1_Artery,Condition1_Feedr,Condition1_Norm,Condition1_PosA,Condition1_PosN,Condition1_RRAe,Condition1_RRAn,Condition1_RRNe,Condition1_RRNn,Condition2_Artery,Condition2_Feedr,Condition2_Norm,Condition2_PosA,Condition2_PosN,Condition2_RRAe,Condition2_RRAn,Condition2_RRNn,BldgType_1Fam,BldgType_2fmCon,BldgType_Duplex,BldgType_Twnhs,BldgType_TwnhsE,HouseStyle_1.5Fin,HouseStyle_1.5Unf,HouseStyle_1Story,HouseStyle_2.5Fin,HouseStyle_2.5Unf,HouseStyle_2Story,HouseStyle_SFoyer,HouseStyle_SLvl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,Exterior1st_AsbShng,Exterior1st_AsphShn,Exterior1st_BrkComm,Exterior1st_BrkFace,Exterior1st_CBlock,Exterior1st_CemntBd,Exterior1st_HdBoard,Exterior1st_ImStucc,Exterior1st_MetalSd,Exterior1st_Plywood,Exterior1st_Stone,Exterior1st_Stucco,Exterior1st_VinylSd,Exterior1st_Wd Sdng,Exterior1st_WdShing,Exterior2nd_AsbShng,Exterior2nd_AsphShn,Exterior2nd_Brk Cmn,Exterior2nd_BrkFace,Exterior2nd_CBlock,Exterior2nd_CmentBd,Exterior2nd_HdBoard,Exterior2nd_ImStucc,Exterior2nd_MetalSd,Exterior2nd_Other,Exterior2nd_Plywood,Exterior2nd_Stone,Exterior2nd_Stucco,Exterior2nd_VinylSd,Exterior2nd_Wd Sdng,Exterior2nd_Wd Shng,MasVnrType_BrkCmn,MasVnrType_BrkFace,MasVnrType_None,MasVnrType_Stone,ExterQual_Ex,ExterQual_Fa,ExterQual_Gd,ExterQual_TA,ExterCond_Ex,ExterCond_Fa,ExterCond_Gd,ExterCond_Po,ExterCond_TA,Foundation_BrkTil,Foundation_CBlock,Foundation_PConc,Foundation_Slab,Foundation_Stone,Foundation_Wood,BsmtQual_Ex,BsmtQual_Fa,BsmtQual_Gd,BsmtQual_None,BsmtQual_TA,BsmtCond_Fa,BsmtCond_Gd,BsmtCond_None,BsmtCond_Po,BsmtCond_TA,BsmtExposure_Av,BsmtExposure_Gd,BsmtExposure_Mn,BsmtExposure_No,BsmtExposure_None,BsmtFinType1_ALQ,BsmtFinType1_BLQ,BsmtFinType1_GLQ,BsmtFinType1_LwQ,BsmtFinType1_None,BsmtFinType1_Rec,BsmtFinType1_Unf,BsmtFinType2_ALQ,BsmtFinType2_BLQ,BsmtFinType2_GLQ,BsmtFinType2_LwQ,BsmtFinType2_None,BsmtFinType2_Rec,BsmtFinType2_Unf,Heating_Floor,Heating_GasA,Heating_GasW,Heating_Grav,Heating_OthW,Heating_Wall,HeatingQC_Ex,HeatingQC_Fa,HeatingQC_Gd,HeatingQC_Po,HeatingQC_TA,CentralAir_N,CentralAir_Y,Electrical_FuseA,Electrical_FuseF,Electrical_FuseP,Electrical_Mix,Electrical_SBrkr,KitchenQual_Ex,KitchenQual_Fa,KitchenQual_Gd,KitchenQual_TA,Functional_Maj1,Functional_Maj2,Functional_Min1,Functional_Min2,Functional_Mod,Functional_Sev,Functional_Typ,FireplaceQu_Ex,FireplaceQu_Fa,FireplaceQu_Gd,FireplaceQu_None,FireplaceQu_Po,FireplaceQu_TA,GarageType_2Types,GarageType_Attchd,GarageType_Basment,GarageType_BuiltIn,GarageType_CarPort,GarageType_Detchd,GarageType_None,GarageFinish_Fin,GarageFinish_None,GarageFinish_RFn,GarageFinish_Unf,GarageQual_Ex,GarageQual_Fa,GarageQual_Gd,GarageQual_None,GarageQual_Po,GarageQual_TA,GarageCond_Ex,GarageCond_Fa,GarageCond_Gd,GarageCond_None,GarageCond_Po,GarageCond_TA,PavedDrive_N,PavedDrive_P,PavedDrive_Y,Fence_GdPrv,Fence_GdWo,Fence_MnPrv,Fence_MnWw,Fence_None,MiscFeature_Gar2,MiscFeature_None,MiscFeature_Othr,MiscFeature_Shed,MiscFeature_TenC,MoSold_1,MoSold_10,MoSold_11,MoSold_12,MoSold_2,MoSold_3,MoSold_4,MoSold_5,MoSold_6,MoSold_7,MoSold_8,MoSold_9,YrSold_2006,YrSold_2007,YrSold_2008,YrSold_2009,YrSold_2010,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial
0,18.144564,13.833055,7,3.991519,2003,2003,19.433175,144.117863,0.0,29.991052,422.488379,5.93903,1025.651937,0.0,8.353542,0.99344,0.0,2,1.068837,3,0.750957,2.261968,0.0,2003.0,2.0,548.0,0.0,12.08031,0.0,0.0,0.0,0.0,0.0,4006,1454.079347,1175.70883,3.527858,12.08031,0,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
1,20.673615,14.117918,6,6.000036,1976,1976,0.0,181.719186,0.0,44.13541,593.887983,6.234986,0.0,0.0,7.974692,0.0,0.710895,2,0.0,3,0.750957,1.996577,0.903334,1976.0,2.0,460.0,56.184232,0.0,0.0,0.0,0.0,0.0,0.0,3952,600.122969,187.954172,2.355448,56.184232,0,0,1,1,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
2,18.668038,14.476513,7,3.991519,2001,2002,17.768841,110.441033,0.0,56.896528,450.079575,5.994332,1040.521017,0.0,8.408063,0.99344,0.0,2,1.068837,3,0.750957,1.996577,0.903334,2001.0,2.0,608.0,0.0,9.901082,0.0,0.0,0.0,0.0,0.0,4003,1496.594925,1156.956383,3.527858,9.901082,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0
3,17.249643,14.106197,7,3.991519,1915,1970,0.0,61.795315,0.0,64.808848,378.854453,6.0277,904.477386,0.0,8.358661,0.99344,0.0,1,0.0,3,0.750957,2.137369,0.903334,1998.0,3.0,642.0,0.0,8.966116,16.020713,0.0,0.0,0.0,0.0,3885,1289.359539,972.300401,1.99344,24.98683,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0
4,21.314272,15.022008,8,3.991519,2000,2000,25.404165,136.624601,0.0,61.166371,545.30975,6.161217,1273.024809,0.0,8.66932,0.99344,0.0,2,1.068837,4,0.750957,2.373753,0.903334,2000.0,3.0,836.0,42.245708,14.27157,0.0,0.0,0.0,0.0,0.0,4000,1824.495777,1415.810628,3.527858,56.517278,0,1,1,1,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,1,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,1,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0


In [10]:
outliers = [30, 88, 462, 631, 1322]
X = X.drop(X.index[outliers])
y = y.drop(y.index[outliers])

overfit = []
for i in X.columns:
    counts = X[i].value_counts()
    zeros = counts.iloc[0]
    if zeros / len(X) * 100 > 99.94:
        overfit.append(i)
        
overfit = list(overfit)
overfit.append('MSZoning_C (all)')

X = X.drop(overfit, axis=1).copy()
X_sub = X_sub.drop(overfit, axis=1).copy()

print('X', X.shape, 'y', y.shape, 'X_sub', X_sub.shape)

X (1453, 331) y (1453,) X_sub (1459, 331)


In [11]:
from sklearn.model_selection import cross_val_score, GridSearchCV, KFold
from sklearn.metrics import mean_squared_error, mean_squared_log_error
from sklearn.preprocessing import RobustScaler
# used for stacking regessors models
from sklearn.ensemble import StackingRegressor

from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor

# 1. ElasticNet 2. Lasso 3. Ridge 4. LGBMRegressor 5. XGBRegressor 6. SVR 7.GradientBoostingRegressor
from sklearn.linear_model import ElasticNet, Lasso, Ridge, ElasticNetCV, LassoCV, RidgeCV
from sklearn.svm import LinearSVR, SVR
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

from sklearn.pipeline import Pipeline, make_pipeline
from mlxtend.regressor import StackingCVRegressor

In [12]:
kfolds = KFold(n_splits=5, shuffle=True, random_state=2020)

def rmsle(ytest, ypred):
    return np.sqrt(mean_squared_log_error(ytest, ypred))

def rmse_cv(model):
    mse = np.sqrt(-cross_val_score(model, X, y, scoring="neg_mean_squared_error" ,cv=kfolds))
    print(f'{model.__class__.__name__} score : {mse.mean():.4f}, {mse.std():.4f}')

In [13]:
alphas = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008, 0.0009]
lasso = Lasso()
lasso_search = GridSearchCV(lasso, {'alpha' : alphas} , cv=5, scoring="neg_mean_squared_error")
lasso_search.fit(X, y)
lasso_search.best_estimator_

Lasso(alpha=0.0002, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [14]:
lasso_model = make_pipeline(RobustScaler(),
                            Lasso(alpha=0.0002, random_state=2020)
                           )
rmse_cv(lasso_model)

Pipeline score : 0.1032, 0.0060


In [15]:
ridge = Ridge()
ridge_search = GridSearchCV(ridge, {'alpha':np.linspace(10,30,10)}, 
                            cv=5,scoring="neg_mean_squared_error")
ridge_search.fit(X,y)
ridge_search.best_estimator_

Ridge(alpha=12.222222222222221, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [22]:
ridge_model = make_pipeline(RobustScaler(),
                            Ridge(alpha=12.22, random_state=2020)
                           )
rmse_cv(lasso_model)

Pipeline score : 0.1032, 0.0060


In [16]:
elastic_model = make_pipeline(RobustScaler(),
                            ElasticNet(alpha=0.000199, l1_ratio = 1.2778,random_state=2020))
rmse_cv(elastic_model)

Pipeline score : 0.1036, 0.0065


In [17]:
gbR_model = GradientBoostingRegressor(learning_rate = 0.05, n_estimators = 300,
                                random_state=2020)
rmse_cv(gbR_model)

GradientBoostingRegressor score : 0.1146, 0.0052


In [18]:
rf_model = RandomForestRegressor(min_samples_leaf=4, min_samples_split=8)
rmse_cv(rf_model)

RandomForestRegressor score : 0.1346, 0.0046


In [19]:
svr_model = make_pipeline(RobustScaler(), SVR(C=20, epsilon=0.005, gamma=0.0003))
rmse_cv(svr_model)

Pipeline score : 0.1038, 0.0069


In [20]:
xgb_model = make_pipeline(RobustScaler(), 
                           XGBRegressor(learning_rate=0.02, max_depth=3,
                                       n_estimators=2000, objective='reg:squarederror',
                                       random_state=2020))
rmse_cv(xgb_model)

Pipeline score : 0.1133, 0.0059


In [21]:
lgbm_model = make_pipeline(RobustScaler(),
                          LGBMRegressor(learning_rate=0.02,
                                       max_depth=3,
                                       objective='regression',
                                       n_estimators=2400,
                                       num_leaves=3,
                                       random_state=2020))
rmse_cv(lgbm_model)

Pipeline score : 0.1123, 0.0086


In [23]:
estimators = [lasso_model, ridge_model, elastic_model, gbR_model , rf_model,
              svr_model, xgb_model, lgbm_model]
stack_model = StackingCVRegressor(estimators, 
                                  meta_regressor=xgb_model,
                                  n_jobs=-1,
                                  use_features_in_secondary=True)
rmse_cv(stack_model)

StackingCVRegressor score : 0.1089, 0.0060


In [25]:
lasso_model = lasso_model.fit(X, y)
ridge_model = ridge_model.fit(X, y)
elastic_model = elastic_model.fit(X,y)

gbR_model = gbR_model.fit(X,y)
rf_model = rf_model.fit(X,y)

svr_model = svr_model.fit(X,y)
xgb_model = xgb_model.fit(X,y)
lgbm_model = lgbm_model.fit(X,y)

stack_model = stack_model.fit(np.array(X), np.array(y))

In [26]:
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y, y_pred))

In [27]:
ypred = elastic_model.predict(X)
print("RMSLE score for Elastic model !!!")
print(rmsle(y, ypred))

RMSLE score for Elastic model !!!
0.08780323753980492


In [28]:
ypred = lasso_model.predict(X)
print("RMSLE score for Lasso model !!!")
print(rmsle(y, ypred))

RMSLE score for Lasso model !!!
0.08626907599737016


In [29]:
ypred = ridge_model.predict(X)
print("RMSLE score for Ridge model !!!")
print(rmsle(y, ypred))

RMSLE score for Ridge model !!!
0.08793788951399924


In [30]:
ypred = gbR_model.predict(X)
print("RMSLE score for GBR model !!!")
print(rmsle(y, ypred))

RMSLE score for GBR model !!!
0.0669122652119969


In [31]:
ypred = rf_model.predict(X)
print("RMSLE score for RF model !!!")
print(rmsle(y, ypred))

RMSLE score for RF model !!!
0.07623005326582652


In [32]:
ypred = svr_model.predict(X)
print("RMSLE score for SVR model !!!")
print(rmsle(y, ypred))

RMSLE score for SVR model !!!
0.08145765142710115


In [33]:
ypred = xgb_model.predict(X)
print("RMSLE score for XGB model !!!")
print(rmsle(y, ypred))

RMSLE score for XGB model !!!
0.04909943193904916


In [34]:
ypred = lgbm_model.predict(X)
print("RMSLE score for LGBM model !!!")
print(rmsle(y, ypred))

RMSLE score for LGBM model !!!
0.0793295606003092


In [35]:
ypred = stack_model.predict(X)
print("RMSLE score for Stack model !!!")
print(rmsle(y, ypred))

RMSLE score for Stack model !!!
0.04873763509185985


In [47]:
def combine_model(x):
    return (
        (0.075*lasso_model.predict(x)) +
        (0.075*ridge_model.predict(x))+
        (0.15*elastic_model.predict(x)) + 
        (0.15*gbR_model.predict(x)) +
        (0.075*rf_model.predict(x))+
        (0.075*svr_model.predict(x)) +
        (0.15*xgb_model.predict(x))+
        (0.15*lgbm_model.predict(x))+
        (0.1*stack_model.predict(np.array(x)))
    )

In [48]:
print('RMSLE score on train data:')
print(rmsle(y, combine_model(X)))

RMSLE score on train data:
0.0659869588304416


In [50]:
log_result = combine_model(X_sub)
result = np.expm1(log_result)

In [53]:
test = pd.read_csv("data/test.csv")
sub = pd.DataFrame()
sub['Id'] = test['Id']
sub['SalePrice'] = result
sub.head()

Unnamed: 0,Id,SalePrice
0,1461,124119.756269
1,1462,158226.036886
2,1463,186071.546004
3,1464,196828.026181
4,1465,188788.610615


In [56]:
sub.to_csv("submission.csv", index=False)