# Imports and config

In [None]:
# Essentials
import numpy as np
import scipy
import pandas as pd
import datetime
import random
from scipy import stats
from scipy.stats import kstest
from scipy.stats import boxcox

# Plots
import seaborn as sns
import matplotlib.pyplot as plt

# Models
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, BaggingRegressor
from sklearn.kernel_ridge import KernelRidge
from sklearn.linear_model import Ridge, RidgeCV, LassoCV, ElasticNet, ElasticNetCV
from sklearn.svm import SVR
from mlxtend.regressor import StackingCVRegressor
import lightgbm as lgb
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor

# Stats
from scipy.stats import skew, norm
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax

# Misc
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import scale
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import RobustScaler
from sklearn.decomposition import PCA

pd.set_option('display.max_columns', None)

# Ignore useless warnings
import warnings
warnings.filterwarnings(action="ignore")
pd.options.display.max_seq_items = 8000
pd.options.display.max_rows = 8000

# Data loading

In [None]:
def read_data(data_dir='../input/house-prices-advanced-regression-techniques'):
    """
    Read in train and test data
    
    Args:
        data_dir (str) : directory holding train.csv and test.csv;
            defaults to the Kaggle input directory used by this notebook
        
    Returns:
        train (pandas dataframe) : train data
        test (pandas dataframe) : test data
    """
    import os  # local import keeps this cell runnable on its own

    # Read in the datasets as dataframes
    train = pd.read_csv(os.path.join(data_dir, 'train.csv'))
    test = pd.read_csv(os.path.join(data_dir, 'test.csv'))

    return train, test

# Load both csv files and display their (rows, columns) shapes
train, test = read_data()
train.shape, test.shape

# EDA

Let's first preview the data.

In [None]:
# Show the first rows of the training data
train.head()

## SalePrice : the target

The saleprice is what we are trying to predict, so let's have a look at a plot of this.

In [None]:
# Histogram + KDE of the target. `sns.distplot` is deprecated since
# seaborn 0.11; `histplot(kde=True, stat="density")` is its replacement.
sns.set_style("white")
sns.set_color_codes(palette='deep')
figure, ax = plt.subplots(figsize=(8, 7))
sns.histplot(train["SalePrice"], kde=True, stat="density", ax=ax)
ax.set(ylabel="Frequency")
ax.set(xlabel="SalePrice")
ax.set(title="SalePrice distribution")
sns.despine(trim=True, left=True)  # remove left border from plot
plt.show()

The distribution looks quite skewed, so let's measure its skewness and kurtosis.

In [None]:
# Report the sample skewness and kurtosis of the target
print(f'Skewness: {train["SalePrice"].skew()}')
print(f'Kurtosis: {train["SalePrice"].kurtosis()}')

A relatively skewed distribution with large tails.

## Features

Let's plot the features against the target to get a good feel for them.

In [None]:
def find_numeric_features(features):
    """
    Collect the names of the numerically typed columns of a dataframe
    
    Args:
        features (pandas dataframe) : dataframe to inspect
        
    Returns:
        numeric (string[]) : names of the columns whose dtype is one of
            int16/int32/int64/float16/float32/float64, in column order
    
    """
    numeric_dtypes = ('int16', 'int32', 'int64', 'float16', 'float32', 'float64')
    return [name for name in features.columns
            if features[name].dtype in numeric_dtypes]

In [None]:
def plot_features_against_target(data):
    """
    Draw one scatter plot of every numeric feature against SalePrice
    
    Args:
        data (pandas dataframe) : data whose features we want to plot;
            must contain a 'SalePrice' column
        
    Returns:
        None
    """

    # names of the numeric columns (this includes 'SalePrice' itself)
    numeric_cols = find_numeric_features(data)
    n_rows = len(numeric_cols)

    # define figure and stretch it to make room for the subplots
    fig, axs = plt.subplots(ncols=3, figsize=(12, 120))
    plt.subplots_adjust(right=2)
    plt.subplots_adjust(top=2)

    # NOTE(review): this only builds a palette object; it does not look like
    # it changes the active palette (that would be sns.set_palette) - confirm
    sns.color_palette("husl", 8)

    # NOTE(review): the grid has n_rows x 3 cells but only the first n_rows
    # cells are ever used, so two thirds of the grid stays empty
    for position, feature in enumerate(numeric_cols, 1):
        plt.subplot(n_rows, 3, position)
        sns.scatterplot(x=feature, y='SalePrice', hue='SalePrice',
                        palette='Blues', data=data)

        # axis labels
        plt.xlabel(f'{feature}', size=15, labelpad=12.5)
        plt.ylabel('SalePrice', size=15, labelpad=12.5)

        # tick sizes
        for axis_name in ('x', 'y'):
            plt.tick_params(axis=axis_name, labelsize=16)

        # legend
        plt.legend(loc='best', prop={'size': 12})

    plt.show()
    
# Scatter every numeric training feature against SalePrice
plot_features_against_target(train)

We see that certain features like 'TotalBsmtSF' and 'GrLivArea' are strongly correlated with 'SalePrice'. Other features like 'BsmtFinSF2' have almost no correlation. There are also clearly a lot of outliers and some corrupt data to remove.

Let's now plot a correlation heat map to see how features are correlated to each other and to SalePrice.

In [None]:
# Pairwise correlations between the numeric columns only: pandas >= 2.0
# raises on string columns instead of silently skipping them.
corr = train.select_dtypes(include=[np.number]).corr()
plt.subplots(figsize = (15, 12))
sns.heatmap(corr, vmax = 0.9, cmap = 'Blues', square = True)

We see some strong correlations here, some more useful than others. For example, "GarageYrBlt" being strongly correlated with "YearBuilt" is not a surprising relationship.

Let's look at how some specific features are correlated with the target.

In [None]:
# SalePrice distribution for each OverallQual level
data = pd.concat([train[name] for name in ('SalePrice', 'OverallQual')], axis=1)
f, ax = plt.subplots(figsize=(8, 6))
fig = sns.boxplot(x=train['OverallQual'], y="SalePrice", data=data)  # `fig` is the Axes returned by seaborn
fig.axis(ymin=0, ymax=800000);

In [None]:
# SalePrice distribution for each construction year
data = pd.concat([train[name] for name in ('SalePrice', 'YearBuilt')], axis=1)
f, ax = plt.subplots(figsize=(16, 8))
fig = sns.boxplot(x=train['YearBuilt'], y="SalePrice", data=data)  # `fig` is the Axes returned by seaborn
fig.axis(ymin=0, ymax=800000);
# one tick per year: rotate and shrink the labels so they stay readable
plt.xticks(rotation=45);
plt.tick_params(axis='x', labelsize=7)

In [None]:
# Living area against SalePrice, semi-transparent so dense regions stand out
data = pd.concat([train[name] for name in ('SalePrice', 'GrLivArea')], axis=1)
data.plot.scatter(x='GrLivArea', y='SalePrice', alpha=0.3, ylim=(0, 800000));

# Data Cleaning

First we will remove the Ids from the train and test, as they are unique for each row and hence not useful for the model.

In [None]:
def drop_ID(X):
    """
    Drop the ID column since it is unique and so useless for ML
    
    The column is removed in place. A dataframe that has no 'Id' column
    (e.g. because this cell was already run) is returned unchanged
    instead of raising a KeyError.
    
    Args:
        X (pandas dataframe) : dataframe whose ID col we want to drop
    
    Returns:
        X (pandas dataframe) : the same dataframe with ID's dropped
    """
    X.drop(['Id'], axis = 1, inplace = True, errors = 'ignore')
    return X

# Remove the Id column from both sets and display the resulting shapes
train = drop_ID(train)
test = drop_ID(test)
train.shape, test.shape

Now, let's look at the SalePrice distribution again.

In [None]:
# Histogram + KDE of the target. `sns.distplot` is deprecated since
# seaborn 0.11; `histplot(kde=True, stat="density")` is its replacement.
sns.set_style("white")
sns.set_color_codes(palette='deep')
figure, ax = plt.subplots(figsize=(8, 7))
sns.histplot(train["SalePrice"], kde=True, stat="density", ax=ax)
ax.set(ylabel="Frequency")
ax.set(xlabel="SalePrice")
ax.set(title="SalePrice distribution")
sns.despine(trim=True, left=True)  # remove left border from plot
plt.show()

Notice that it is skewed to the right. This is bad as models do not work well with skewed data. In order to make it more normal, apply a $\ln({1+x})$ transform.

In [None]:
def transform_target(X):
    """
    Transform target to make it closer to normally distributed
    
    Applies log(1 + x) to the 'SalePrice' column of the dataframe that was
    passed in (in place), rather than to whatever global `train` happens
    to exist.
    
    Params:
        X (pandas dataframe) : dataframe whose target we want to transform
        
    Returns:
        X (pandas dataframe) : the same dataframe with 'SalePrice' transformed
    """
    X["SalePrice"] = np.log1p(X["SalePrice"])
    return X

# log(1+x) transform of the training target; from here on train['SalePrice'] is on the log scale
train = transform_target(train)

Now replot the distribution alongside a fitted normal distribution to see how they compare.

In [None]:
sns.set_style("white")
sns.set_color_codes(palette = 'deep')
figure, ax = plt.subplots(figsize = (8, 7))
# NOTE: distplot is deprecated in seaborn >= 0.11; it is kept here because
# its `fit=` overlay has no direct histplot equivalent.
sns.distplot(train["SalePrice"], fit = norm)

# Get the fitted parameters used by the function
(mu, sigma) = norm.fit(train['SalePrice'])
print('mu = {:.2f}'.format(mu))
print('sigma = {:.2f}'.format(sigma))

# Now plot the distribution.
# Raw string: "\m" and "\s" are invalid escape sequences in a normal literal.
plt.legend(["Interpolated data", r'Normal fit ($\mu=$ {:.2f} and $\sigma=$ {:.2f} )'.format(mu, sigma)],
            loc='center left', bbox_to_anchor=(1, 0.5))
ax.set(ylabel = "Frequency")
ax.set(xlabel = "SalePrice")
ax.set(title = "SalePrice distribution")
sns.despine(trim = True, left = True)  # remove left border from plot
plt.show()

The target is no longer skewed.

## Drop outliers

Using the plots we made before of specific features against the SalePrice, we will remove some of the outliers.

In [None]:
def drop_outliers(X, log_target=True):
    """
    Drop any identified outliers
    
    Two kinds of rows are removed (in place):
      * low quality (OverallQual < 5) houses sold for more than $200,000
      * very large (GrLivArea > 4500) houses sold for less than $300,000
    
    The price thresholds are stated in dollars. Every caller in this
    notebook passes a target that has already been log(1+x)-transformed,
    so by default the thresholds are moved onto that scale; comparing raw
    dollar amounts with log prices would make the first rule never fire
    and the second rule ignore the price altogether.
    
    Args:
        X (pandas dataframe) : dataframe whose outliers we want to drop
        log_target (bool) : True if X['SalePrice'] holds log1p(price),
            False if it holds raw prices
        
    Returns:
        X (pandas dataframe) : the same dataframe with outliers dropped
            and its index renumbered from 0
    """
    def to_target_scale(price):
        # express a dollar amount on the scale used by X['SalePrice']
        return np.log1p(price) if log_target else price

    overpriced = (X['OverallQual'] < 5) & (X['SalePrice'] > to_target_scale(200000))
    underpriced = (X['GrLivArea'] > 4500) & (X['SalePrice'] < to_target_scale(300000))

    X.drop(X[overpriced | underpriced].index, inplace=True)
    X.reset_index(drop=True, inplace=True) # restores index after dropping
    return X

# Remove the outlier rows from the training data (SalePrice is log-transformed at this point)
train = drop_outliers(train)

## Prepare dataset for cleaning

Now we split the target and features.  
We also combine the train and test features so that all of the upcoming transformations can be applied to the entire dataset at once.

In [None]:
def prepare_for_cleaning(train, test):
    """
    Split off the target and stack train and test features into one frame
    
    Args:
        train (pandas dataframe) : train dataset, including 'SalePrice'
        test (pandas dataframe) : test dataset
    
    Returns:
        y_train (pandas series) : target, re-indexed from 0
        X (pandas dataframe) : train features followed by test features,
            re-indexed from 0
    """
    target = train['SalePrice'].reset_index(drop=True)
    train_features = train.drop(columns=['SalePrice'])

    # train rows first, then test rows, under one fresh 0..n-1 index
    combined = pd.concat([train_features, test], ignore_index=True)

    return target, combined

# y_train: training target; X: train rows followed by test rows
y_train, X = prepare_for_cleaning(train, test)
X.shape

## Fill missing values

First, let's print the percentage of missing values in each column.

In [None]:
def percent_missing(df):
    """
    Compute, for every column, the percentage of its values that are null
    
    Args:
        df (pandas dataframe) : dataframe we want to inspect
    
    Returns:
        dict_x (dict{}) : column name -> percentage of missing values,
            rounded to two decimals
    """
    return {
        column: round(df[column].isnull().mean() * 100, 2)
        for column in df.columns
    }

# List every column that has missing values, most incomplete first
missing = percent_missing(X)
df_miss = sorted(missing.items(), key=lambda entry: entry[1], reverse=True)
print("Percentage of missing data")
for i in df_miss:
    if i[1] > 0:
        print(i)

We see that there are a few columns with lots of missing data.  
Let's visualise this data.

In [None]:
# set up figure with colour formatting
sns.set_style("white")
fig, ax = plt.subplots(figsize=(8, 7))
sns.set_color_codes(palette='deep')

# percentage of missing values per column; keep only the columns that
# have at least one missing value and sort them in increasing order
missing = (X.isnull().mean() * 100).round(2)
missing = missing[missing > 0].sort_values()
missing.plot.bar(color="b")

# tweak the visual presentation
ax.xaxis.grid(False)
ax.set(ylabel="Percent of missing values",
       xlabel="Features",
       title="Percent missing data by feature")
sns.despine(trim=True, left=True)

Now we want to impute missing values for each of these features.

In [None]:
def handle_missing(features):
    """
    Impute every missing value in the combined feature set
    
    The dataframe is modified in place.
    
    Args:
        features (pandas dataframe) : dataframe whose missing values we want to fill
        
    Returns:
        features (pandas dataframe) : the same dataframe with missing values filled
    """
    # the data description states that NA refers to typical ('Typ') values
    features['Functional'] = features['Functional'].fillna('Typ')
    
    # Replace the missing values in each of the columns below with their mode.
    # `mode()` returns a Series, so its first entry is used: passing the
    # Series itself to fillna aligns on the index and leaves rows unfilled.
    for col in ('Electrical', 'KitchenQual', 'Exterior1st', 'Exterior2nd', 'SaleType'):
        modes = features[col].mode()
        if not modes.empty:
            features[col] = features[col].fillna(modes.iloc[0])
    
    # zoning is likely based on subclass so we fill based on the mode in that subclass
    def _fill_with_group_mode(zoning):
        # a subclass with no known zoning at all has no mode; leave it for
        # the generic 'None' fill below instead of raising
        modes = zoning.mode()
        return zoning if modes.empty else zoning.fillna(modes.iloc[0])

    features['MSZoning'] = features.groupby('MSSubClass')['MSZoning'].transform(_fill_with_group_mode)
    
    # the data description states that NA refers to "No Pool"
    features["PoolQC"] = features["PoolQC"].fillna("None")
    # Replacing the missing values with 0, since no garage = no cars in garage
    for col in ('GarageYrBlt', 'GarageArea', 'GarageCars'):
        features[col] = features[col].fillna(0)
    # Replacing the missing values with None
    for col in ['GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']:
        features[col] = features[col].fillna('None')
    # NaN values for these categorical basement features, means there's no basement
    for col in ('BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2'):
        features[col] = features[col].fillna('None')
        
    # lot frontage likely based on neighbourhood so fill based on median in neighbourhood
    features['LotFrontage'] = features.groupby('Neighborhood')['LotFrontage'].transform(lambda x: x.fillna(x.median()))
    
    # We have no particular intuition around how to fill in the rest of the categorical features
    # So we replace their missing values with None
    objects = [col for col in features.columns if features[col].dtype == object]
    features.update(features[objects].fillna('None'))
    
    # And we do the same thing for numerical features, but this time with 0s
    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric = [col for col in features.columns if features[col].dtype in numeric_dtypes]
    features.update(features[numeric].fillna(0))
    return features
    
# Impute the missing values of the combined train + test features
X = handle_missing(X)

Let's now check that we have handled all the missing features.

In [None]:
# Re-run the report: nothing below the header means no column has gaps left
missing = percent_missing(X)
df_miss = sorted(missing.items(), key=lambda entry: entry[1], reverse=True)
print("Percentage of missing data")
for i in df_miss:
    if i[1] > 0:
        print(i)

As desired, there are no missing values anymore.

## Fix skewed features

Models struggle to deal with features that are not normally distributed, so we want to transform these in some way.  
Let's plot the numeric features.

In [None]:
def plot_numeric_feature_distributions(features):
    """
    Draw horizontal boxplots of all numeric features on a log-scaled axis
    
    Args:
        features (pandas dataframe) : dataframe to inspect
        
    Returns:
        None
    """

    # figure with a logarithmic value axis
    sns.set_style("white")
    fig, ax = plt.subplots(figsize=(8, 7))
    ax.set_xscale("log")

    # one box per numeric column
    numeric_cols = find_numeric_features(features)
    ax = sns.boxplot(data=features[numeric_cols], orient="h", palette="Set1")

    # cosmetics
    ax.xaxis.grid(False)
    ax.set(ylabel="Feature names",
           xlabel="Numeric values",
           title="Numeric Distribution of Features")
    sns.despine(trim=True, left=True)
    
# Boxplots of the numeric features before any skew correction
plot_numeric_feature_distributions(X)

Let's set a cutoff of skew = 0.5 and find features with a high skewness.

In [None]:
def find_skewed_features(features, cutoff):
    """
    Determine all features which are skewed above some threshold
    
    Args:
        features (pandas dataframe) : dataframe to inspect
        cutoff (float) : minimum skew for a feature to count as highly skewed
        
    Returns
        skew_features (pandas series) : skewness of each numeric feature,
            sorted in decreasing order
        high_skew_index (pandas index) : names of the features whose skew
            is above `cutoff`
    
    """
    
    numeric = find_numeric_features(features) # get numeric features
    skew_features = features[numeric].apply(skew).sort_values(ascending = False) # skew per feature, largest first
    # filter by the cutoff that was passed in, not by a global of a similar name
    high_skew_index = skew_features[skew_features > cutoff].index
    return skew_features, high_skew_index

In [None]:
skew_cutoff = 0.5
skew_features, high_skew_index = find_skewed_features(X, skew_cutoff)

# print results: count only the features above the cutoff
# (skew_features holds every numeric feature)
print("There are {} numerical features with a skew > {}:".format(len(high_skew_index), skew_cutoff))
skew_features.head(10)

We can use scipy's `boxcox1p` function to compute the Box-Cox transformation of 1 + x.

In [None]:
def fix_skewed_features(X, skew_cutoff=0.5):
    """
    Find then fix skewed features
    
    Every numeric feature whose skew is above the cutoff is replaced, in
    place, by its Box-Cox transform of (1 + x), with the Box-Cox parameter
    estimated per feature by `boxcox_normmax`.
    
    Args:
        X (pandas dataframe) : dataframe whose skewed features we want to fix
        skew_cutoff (float) : minimum skew for a feature to be transformed
        
    Returns
        X (pandas dataframe) : dataframe with skewed features fixed
    """
    
    # find all skewed features
    skew_features, high_skew_index = find_skewed_features(X, skew_cutoff)
    
    # transform features; the +1 shift keeps zero-valued entries valid
    # input for boxcox_normmax
    for i in high_skew_index:
        X[i] = boxcox1p(X[i], boxcox_normmax(X[i] + 1))
        
    return X

# Box-Cox transform the highly skewed numeric features
X = fix_skewed_features(X)

Let's plot the numeric features again and see if we have corrected the skewness.

In [None]:
# Same boxplots as before, now after the skew correction
plot_numeric_feature_distributions(X)

The features are much less skewed now.

# Feature Engineering

## Creating features

ML models have trouble with complex features, so we will create some simpler ones using intuition.

In [None]:
def create_new_features(X):
    """
    Create new features for the data
    
    Adds, in place:
      * Has* indicator columns (1 if the property has the amenity, else 0)
      * totals that add up related quality, area and bathroom columns
      * two year-based features
    
    Args:
        X (pandas dataframe) : dataframe which we want to determine new features for
    
    Returns:
        X (pandas dataframe) : dataframe with new features    
    """

    # features that determine whether the property possesses something:
    # the amenity is present when its size/count column is greater than zero
    presence_flags = {
        'HasWoodDeck': 'WoodDeckSF',
        'HasOpenPorch': 'OpenPorchSF',
        'HasEnclosedPorch': 'EnclosedPorch',
        'Has3SsnPorch': '3SsnPorch',
        'HasScreenPorch': 'ScreenPorch',
        'HasPool': 'PoolArea',
        'Has2ndFloor': '2ndFlrSF',
        'HasGarage': 'GarageArea',
        'HasBsmt': 'TotalBsmtSF',
        'HasFireplace': 'Fireplaces',
    }
    for flag, source in presence_flags.items():
        X[flag] = (X[source] > 0) * 1

    # add some features together
    X['Total_Home_Quality'] = X['OverallQual'] + X['OverallCond']
    X['TotalSF'] = X['TotalBsmtSF'] + X['1stFlrSF'] + X['2ndFlrSF']
    X['Total_sqr_footage'] = (X['BsmtFinSF1'] + X['BsmtFinSF2'] +
                                     X['1stFlrSF'] + X['2ndFlrSF'])
    # half baths count for half a bathroom
    X['Total_Bathrooms'] = (X['FullBath'] + (0.5 * X['HalfBath']) +
                                   X['BsmtFullBath'] + (0.5 * X['BsmtHalfBath']))
    X['Total_porch_sf'] = (X['OpenPorchSF'] + X['3SsnPorch'] +
                                  X['EnclosedPorch'] + X['ScreenPorch'] +
                                  X['WoodDeckSF'])

    # other interesting things
    X['YearsSinceRemodel'] = X['YrSold'].astype(int) - X['YearRemodAdd'].astype(int)
    X['YrBltAndRemod'] = X['YearBuilt'] + X['YearRemodAdd']
    
    return X

# Add the engineered indicator, total and year-based features
X = create_new_features(X)

## Feature transformations

ML models struggle to tell if log(feature) or (feature)^2 is a good predictor of the target, so we will manually add these.

In [None]:
def log_transform(features, log_features):
    """
    Add a '<name>_log' column holding log(1.01 + x) for each listed feature
    
    The new columns are written into the dataframe that was passed in.
    
    Args:
        features (pandas dataframe) : dataframe where new features will be stored
        log_features (string[]) : names of features to log transform
        
    Returns:
        features (pandas dataframe) : transformed dataframe
    """
    for name in log_features:
        shifted = features[name] + 1.01  # offset keeps zero-valued entries finite
        features[f'{name}_log'] = np.log(shifted)
    return features

# names of the features that get an additional '<name>_log' column
log_features = [
    'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
    'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
    'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
    'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
    'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'YearRemodAdd', 'TotalSF',
]

# add the log columns to the combined feature set
X = log_transform(X, log_features)

In [None]:
def square_transform(features, squ_features):
    """
    Add a '<name>_squ' column holding x * x for each listed feature
    
    The new columns are written into the dataframe that was passed in.
    
    Args:
        features (pandas dataframe) : dataframe where new features will be stored
        squ_features (string[]) : names of features to square transform
        
    Returns:
        features (pandas dataframe) : transformed dataframe
    """
    for name in squ_features:
        column = features[name]
        features[f"{name}_squ"] = column.mul(column)
    return features

# names of the features (mostly the log columns created above) that get
# an additional '<name>_squ' column
squared_features = [
    'YearRemodAdd', 'LotFrontage_log',
    'TotalBsmtSF_log', '1stFlrSF_log', '2ndFlrSF_log', 'GrLivArea_log',
    'GarageCars_log', 'GarageArea_log',
]

# add the squared columns to the combined feature set
X = square_transform(X, squared_features)

## Encode categorical features

Most models can only handle numerical features so we will convert our categorical features.

In [None]:
def encode_categorical(X):
    """
    One-hot encode the categorical features with pandas.get_dummies
    
    Args:
        X (pandas dataframe) : dataframe whose categorical features we want to encode
        
    Returns:
        X (pandas dataframe) : new dataframe with the categorical features
            replaced by dummy columns and the index renumbered from 0
    """
    encoded = pd.get_dummies(X)
    return encoded.reset_index(drop=True)

# One-hot encode the categorical columns and preview the result
X = encode_categorical(X)
X.head()

# Recreate training and test sets

In [None]:
def extract_train_and_test(X, target):
    """
    Extract the train and test sets from the combined data
    
    The combined frame holds the train rows first, so the split point is
    the length of the target that was passed in (not of a global y_train).
    
    Args:
        X (pandas dataframe) : dataframe to split up
        target (pandas series) : target data; one entry per train row
        
    Return:
        X_train (pandas dataframe) : train data (the first len(target) rows)
        X_test (pandas dataframe) : test data (the remaining rows)
    """
    n_train = len(target)
    X_train = X.iloc[:n_train, :]
    X_test = X.iloc[n_train:, :]
    return X_train, X_test

# Split the combined features back into their train and test parts
X_train, X_test = extract_train_and_test(X, y_train)
X_train.shape, X_test.shape

In [None]:
# Preview the final training features
X_train.head()

# Data pipeline

We now want to combine all of the above steps into a single pipeline function.

In [None]:
def data_pipeline():
    """
    Run every data step end to end: read, clean and engineer features
    
    Args:
        None
        
    Returns:
        X_train (pandas dataframe) : train data
        X_test (pandas dataframe) : test data
        y_train (pandas series) : target (log(1+x) of SalePrice)
    """
    # load the raw csv files and discard the row identifiers
    train_df, test_df = read_data()
    train_df = drop_ID(train_df)
    test_df = drop_ID(test_df)

    # log(1+x) the target, done inline rather than through transform_target
    train_df["SalePrice"] = np.log1p(train_df["SalePrice"])

    # remove outlier rows, then split off the target and stack train + test
    train_df = drop_outliers(train_df)
    target, features = prepare_for_cleaning(train_df, test_df)

    # cleaning: impute missing values, then correct skewed features
    features = handle_missing(features)
    features = fix_skewed_features(features)

    # feature engineering: hand-made features first, then log columns
    features = create_new_features(features)
    to_log = [
        'LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF',
        'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
        'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr',
        'TotRmsAbvGrd', 'Fireplaces', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
        'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'YearRemodAdd', 'TotalSF',
    ]
    features = log_transform(features, to_log)

    # squares, mostly of the log columns created just above
    to_square = [
        'YearRemodAdd', 'LotFrontage_log',
        'TotalBsmtSF_log', '1stFlrSF_log', '2ndFlrSF_log', 'GrLivArea_log',
        'GarageCars_log', 'GarageArea_log',
    ]
    features = square_transform(features, to_square)

    # one-hot encode, then separate the train rows from the test rows again
    features = encode_categorical(features)
    train_part, test_part = extract_train_and_test(features, target)

    return train_part, test_part, target

# Rebuild everything from the raw files through the single pipeline function
X_train, X_test, y_train = data_pipeline()
X_train.shape, X_test.shape, y_train.shape

In [None]:
# Sanity check: each entry is True if the corresponding object still contains a null
X_train.isnull().values.any(), X_test.isnull().values.any(), y_train.isnull().values.any()

# Building a model

## Key features of training a model:
- **Cross Validation** Using 12-fold cross-validation
- **Models:** On each run of cross-validation I fit 8 models (ridge, lasso, elastic net, SVR, gradient boosting, random forest, XGBoost and LightGBM regressors)
- **Stacking:** In addition, I trained a meta StackingCVRegressor optimized using xgboost
- **Blending:** All models trained will overfit the training data to varying degrees. Therefore, to make final predictions, I blended their predictions together to get more robust predictions.

## Setup cross validation and define error metrics

We will use a 12 fold validation.

In [None]:
# 12 shuffled folds with a fixed seed, shared by all cross-validation below
kf = KFold(n_splits = 12, random_state = 42, shuffle = True)

The error metric we use is the root mean squared log error (rmsle).

In [None]:
def rmsle(y, y_pred):
    """
    Root mean squared log error for a log-transformed target
    
    Both arguments are expected on the log(1+x) scale, which is how the
    target and the model predictions are kept in this notebook. The RMSE
    of those values is therefore already the RMSLE of the prices, so no
    further logarithm is applied (taking the log again would score the
    log of the log).
    
    Args:
        y (array-like) : true target values, log(1+x)-transformed
        y_pred (array-like) : predicted values on the same scale
        
    Returns:
        (float) : root mean squared error between y and y_pred
    """
    errors = np.asarray(y, dtype=float) - np.asarray(y_pred, dtype=float)
    return np.sqrt(np.mean(errors ** 2))

def cv_rmse(model, X, y):
    """
    Cross-validated RMSE of a model, using the notebook-level `kf` splitter
    
    Args:
        model : estimator to evaluate
        X : feature matrix
        y : target
        
    Returns:
        score_mean (float) : mean of the per-fold RMSE values
        score_std (float) : standard deviation of the per-fold RMSE values
    """
    # the scorer returns negated MSE values, one per fold
    neg_mse_per_fold = cross_val_score(model, X, y, cv=kf,
                                       scoring="neg_mean_squared_error")
    rmse_per_fold = np.sqrt(-neg_mse_per_fold)
    return rmse_per_fold.mean(), rmse_per_fold.std()

## Setup models

Let's set up the models now. Most hyperparameters below are hand-picked starting values (only the random forest uses scikit-learn's defaults); they can be tuned later.

In [None]:
# Light Gradient Boosting Regressor
lightgbm = LGBMRegressor(objective='regression', 
                       num_leaves=4,
                       learning_rate=0.01, 
                       n_estimators=5000,
                       max_bin=200, 
                       bagging_fraction=0.75,
                       bagging_freq=5, 
                       bagging_seed=7,
                       feature_fraction=0.2,
                       feature_fraction_seed=7,
                       verbose=-1)

# XGBoost Regressor
xgboost = XGBRegressor(learning_rate=0.01,
                       n_estimators=3460,
                       max_depth=3, 
                       min_child_weight=0,
                       gamma=0, 
                       subsample=0.7,
                       colsample_bytree=0.7,
                       objective='reg:squarederror', 
                       nthread=-1,
                       scale_pos_weight=1, 
                       seed=27,
                       reg_alpha=0.00006)

# Support Vector Regressor (scaled first, as are the linear models below)
svr = make_pipeline(RobustScaler(), SVR(C= 20, epsilon= 0.008, gamma=0.0003,))

# Ridge Regressor; alpha is picked by cross-validation from this grid
alphas_alt = [14.5, 14.6, 14.7, 14.8, 14.9, 15, 15.1, 15.2, 15.3, 15.4, 15.5]
ridge = make_pipeline(RobustScaler(), RidgeCV(alphas=alphas_alt, cv=kf))

# Lasso
# max_iter must be an integer: recent scikit-learn versions validate
# parameter types and reject the float 1e7
alphas2 = [5e-05, 0.0001, 0.0002, 0.0003, 0.0004, 0.0005, 0.0006, 0.0007, 0.0008]
lasso = make_pipeline(RobustScaler(), LassoCV(max_iter=int(1e7), alphas=alphas2, random_state=42, cv=kf))

# Elasticnet (max_iter as an integer, for the same reason as the lasso)
e_alphas = [0.0001, 0.0002, 0.0003, 0.0004,0.0005, 0.0006, 0.0007]
e_l1ratio = [0.8, 0.85, 0.9, 0.95, 0.99, 1]
elasticnet = make_pipeline(RobustScaler(), ElasticNetCV(max_iter=int(1e7), alphas=e_alphas, cv=kf, l1_ratio=e_l1ratio))   

# Gradient Boosting Regressor
gbr = GradientBoostingRegressor(n_estimators=3000, 
                                learning_rate=0.05, 
                                max_depth=4, 
                                max_features='sqrt', 
                                min_samples_leaf=15, 
                                min_samples_split=10, 
                                loss='huber', 
                                random_state =42)  

# Random Forest Regressor (library defaults, no fixed random_state)
rf = RandomForestRegressor()

# Stack up all the models above, optimized using xgboost
stack_gen = StackingCVRegressor(regressors=(ridge, lasso, elasticnet, gbr, xgboost, lightgbm),
                                meta_regressor=xgboost,
                                use_features_in_secondary=True)

## Get cross validation scores

In [None]:
def score_models_individually(X_train, y_train):
    """
    Cross-validate each individual model and collect its RMSE
    
    Every model gets its own key in the result; the lasso and elastic net
    scores are stored under 'lasso' and 'elasticnet' rather than
    overwriting the 'ridge' entry.
    
    Args:
        X_train (pandas dataframe) : train features
        y_train (pandas series) : train target
        
    Returns:
        scores (dict{}) : model key -> (mean cv rmse, std of cv rmse)
    """
    # (key in the returned dict, label printed to stdout, estimator)
    candidates = [
        ('lgb', 'lightgbm', lightgbm),
        ('xgb', 'xgboost', xgboost),
        ('svr', 'SVR', svr),
        ('ridge', 'ridge', ridge),
        ('lasso', 'lasso', lasso),
        ('elasticnet', 'elasticnet', elasticnet),
        ('gbr', 'gbr', gbr),
        ('rf', 'rf', rf),
    ]

    scores = {}
    for key, label, model in candidates:
        score_mean, score_std = cv_rmse(model, X_train, y_train)
        print("{}: {:.4f} ({:.4f})".format(label, score_mean, score_std))
        scores[key] = (score_mean, score_std)

    return scores

# Cross-validate every individual model on the training data (prints one line per model)
scores = score_models_individually(X_train, y_train)

## Fit the models

Now we fit all the models on the training data.

In [None]:
# Fit every model on the full training set. Each print only announces which
# model is being trained. The fit order is left as is: `rf` is created
# without a random_state, so reordering could change what gets fitted.

# the stacked model is fed plain numpy arrays rather than dataframes
print('stack_gen')
stack_gen_model = stack_gen.fit(np.array(X_train), np.array(y_train))

print('lightgbm')
lgb_model_full_data = lightgbm.fit(X_train, y_train)

print('xgboost')
xgb_model_full_data = xgboost.fit(X_train, y_train)

print('Svr')
svr_model_full_data = svr.fit(X_train, y_train)

print('Ridge')
ridge_model_full_data = ridge.fit(X_train, y_train)

print('elasticnet')
elastic_model_full_data = elasticnet.fit(X_train, y_train)

print('Lasso')
lasso_model_full_data = lasso.fit(X_train, y_train)

print('GradientBoosting')
gbr_model_full_data = gbr.fit(X_train, y_train)

print('RandomForest')
rf_model_full_data = rf.fit(X_train, y_train)

## Blend models and get predictions

Finally, we blend all of the models, which makes the predictions more robust to overfitting.

In [None]:
def blend_models_predict(X):
    """
    Predict with a fixed-weight blend of all fitted models
    
    The weights add up to 1: 0.15 each for elastic net, lasso and ridge,
    0.05 each for random forest, SVR, gradient boosting, xgboost and
    lightgbm, and 0.3 for the stacked model.
    
    Args:
        X (pandas dataframe) : features to predict for
        
    Returns:
        weighted sum of the individual model predictions
    """
    weighted_models = (
        (0.15, elastic_model_full_data),
        (0.15, lasso_model_full_data),
        (0.05, rf_model_full_data),
        (0.15, ridge_model_full_data),
        (0.05, svr_model_full_data),
        (0.05, gbr_model_full_data),
        (0.05, xgb_model_full_data),
        (0.05, lgb_model_full_data),
    )

    blended = None
    for weight, model in weighted_models:
        contribution = weight * model.predict(X)
        blended = contribution if blended is None else blended + contribution

    # the stacked model was fitted on numpy arrays, so it is given one here too
    return blended + (0.3 * stack_gen_model.predict(np.array(X)))

# Get final predictions from the blended model and score them on the training data.
# y_train comes out of data_pipeline already log(1+x)-transformed, and the
# models were fitted on it, so both arguments are on the log scale.
blended_score = rmsle(y_train, blend_models_predict(X_train))
# stored as (score, 0) to match the (mean, std) tuples of the other entries
scores['blended'] = (blended_score, 0)
print('RMSLE score on train data:')
print(blended_score)

Let's plot the scores of each of our models to see how they compare.

In [None]:
# set up figure
sns.set_style("white")
fig = plt.figure(figsize=(24, 12))

# one point per model, positioned at its mean score
ax = sns.pointplot(x=list(scores.keys()),
                   y=[entry[0] for entry in scores.values()],
                   markers=['o'], linestyles=['-'])

# write each score just above its point
for i, score in enumerate(scores.values()):
    ax.text(i, score[0] + 0.002, f'{score[0]:.6f}',
            horizontalalignment='left', size='large', color='black', weight='semibold')

# edit graph
plt.ylabel('Score (RMSE)', size=20, labelpad=12.5)
plt.xlabel('Model', size=20, labelpad=12.5)
plt.tick_params(axis='x', labelsize=13.5)
plt.tick_params(axis='y', labelsize=12.5)
plt.title('Scores of Models', size=20)

plt.show()

# Submitting final predictions

In [None]:
# the Id column was dropped from the working test set, so re-read the raw file for it
test = pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')

# the models predict log(1+price): invert with expm1, then round down
predictions = np.floor(np.expm1(blend_models_predict(X_test)))

output = pd.DataFrame({'Id': test["Id"], 'SalePrice': predictions})
output.to_csv('my_submission.csv', index=False)
print("Your submission was successfully saved!")