# Import packages and Settings

In [1]:
import os

n_core = os.cpu_count()-3
print(n_core)

9


In [30]:
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype

import matplotlib.pyplot as plt

from xgboost.sklearn import XGBRegressor

from catboost import CatBoostRegressor

%matplotlib inline

pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [3]:
from sklearn.base import clone
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import ShuffleSplit, cross_val_score, GridSearchCV

from sklearnex import patch_sklearn

patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


## Load Data and Preprocessing

In [4]:
train_data = pd.read_csv('./Data/train.csv', index_col='Id')
test_data = pd.read_csv('./Data/test.csv', index_col='Id')

feature = 'MSSubClass'
train_data[feature] = train_data[feature].apply(lambda x: str(x))
test_data[feature] = test_data[feature].apply(lambda x: str(x))

assert(not is_numeric_dtype(train_data[feature]))

feature = 'Street'
train_data[feature] = train_data[feature].map({'Pave': 0, 'Grvl':1})
test_data[feature] = test_data[feature].map({'Pave': 0, 'Grvl':1})

assert(is_numeric_dtype(train_data[feature]))

feature = 'Alley'
train_data[feature] = train_data[feature].fillna('None')
test_data[feature] = test_data[feature].fillna('None')

assert(train_data[feature].isna().sum() == 0)

feature = 'MasVnrType'
train_data[feature] = train_data[feature].fillna('None')
test_data[feature] = test_data[feature].fillna('None')

assert(train_data[feature].isna().sum() == 0)

feature = 'MasVnrArea'
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)

assert(train_data[feature].isna().sum() == 0)

quality_map = {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}

feature = 'ExterQual'
train_data[feature] = train_data[feature].map(quality_map)
test_data[feature] = test_data[feature].map(quality_map)

assert(is_numeric_dtype(train_data[feature]))

feature = 'ExterCond'
train_data[feature] = train_data[feature].map(quality_map)
test_data[feature] = test_data[feature].map(quality_map)

assert(is_numeric_dtype(train_data[feature]))

feature = 'BsmtQual'
train_data[feature] = train_data[feature].map(quality_map)
test_data[feature] = test_data[feature].map(quality_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)
train_data[feature] = train_data[feature].astype(int)
test_data[feature] = test_data[feature].astype(int)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

feature = 'BsmtCond'
train_data[feature] = train_data[feature].map(quality_map)
test_data[feature] = test_data[feature].map(quality_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)
train_data[feature] = train_data[feature].astype(int)
test_data[feature] = test_data[feature].astype(int)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

exposure_map = {'No':1, 'Mn':2, 'Av':3, 'Gd':4}

feature = 'BsmtExposure'
train_data[feature] = train_data[feature].map(exposure_map)
test_data[feature] = test_data[feature].map(exposure_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)
train_data[feature] = train_data[feature].astype(int)
test_data[feature] = test_data[feature].astype(int)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

type_map = {'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6}

feature = 'BsmtFinType1'
train_data[feature] = train_data[feature].map(type_map)
test_data[feature] = test_data[feature].map(type_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)
train_data[feature] = train_data[feature].astype(int)
test_data[feature] = test_data[feature].astype(int)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

feature = 'BsmtFinType2'
train_data[feature] = train_data[feature].map(type_map)
test_data[feature] = test_data[feature].map(type_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)
train_data[feature] = train_data[feature].astype(int)
test_data[feature] = test_data[feature].astype(int)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

feature = 'HeatingQC'
train_data[feature] = train_data[feature].map(quality_map)
test_data[feature] = test_data[feature].map(quality_map)

assert(is_numeric_dtype(train_data[feature]))

feature = 'CentralAir'
train_data[feature] = train_data[feature].map({'N': 0, 'Y':1})
test_data[feature] = test_data[feature].map({'N': 0, 'Y':1})

assert(is_numeric_dtype(train_data[feature]))

feature = 'KitchenQual'
train_data[feature] = train_data[feature].map(quality_map)
test_data[feature] = test_data[feature].map(quality_map)

assert(is_numeric_dtype(train_data[feature]))

function_map = {'Sal':1, 'Sev':2, 'Maj2':3, 'Maj1':4, 'Mod':5, 'Min2':6, 'Min1':7, 'Typ':8}

feature = 'Functional'
train_data[feature] = train_data[feature].map(function_map)
test_data[feature] = test_data[feature].map(function_map)
test_data[feature] = test_data[feature].fillna(8)

assert(is_numeric_dtype(train_data[feature]))

feature = 'FireplaceQu'
train_data[feature] = train_data[feature].map(quality_map)
test_data[feature] = test_data[feature].map(quality_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)
train_data[feature] = train_data[feature].astype(int)
test_data[feature] = test_data[feature].astype(int)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

feature = 'GarageType'
train_data[feature] = train_data[feature].fillna('None')
test_data[feature] = test_data[feature].fillna('None')

assert(train_data[feature].isna().sum() == 0)

feature = 'GarageYrBlt'
train_data[feature] = train_data[feature].fillna(-1)
test_data[feature] = test_data[feature].fillna(-1)
train_data[feature] = train_data[feature].astype(int)
test_data[feature] = test_data[feature].astype(int)

assert(train_data[feature].isna().sum() == 0)

exposure_map = {'Unf':1, 'RFn':2, 'Fin':3}

feature = 'GarageFinish'
train_data[feature] = train_data[feature].map(exposure_map)
test_data[feature] = test_data[feature].map(exposure_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)
train_data[feature] = train_data[feature].astype(int)
test_data[feature] = test_data[feature].astype(int)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

feature = 'GarageQual'
train_data[feature] = train_data[feature].map(quality_map)
test_data[feature] = test_data[feature].map(quality_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)
train_data[feature] = train_data[feature].astype(int)
test_data[feature] = test_data[feature].astype(int)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

feature = 'GarageCond'
train_data[feature] = train_data[feature].map(quality_map)
test_data[feature] = test_data[feature].map(quality_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)
train_data[feature] = train_data[feature].astype(int)
test_data[feature] = test_data[feature].astype(int)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

feature = 'PoolQC'
train_data[feature] = train_data[feature].map(quality_map)
test_data[feature] = test_data[feature].map(quality_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)
train_data[feature] = train_data[feature].astype(int)
test_data[feature] = test_data[feature].astype(int)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

fence_map = {'MnWw':1, 'GdWo':2, 'MnPrv':3, 'GdPrv':4}

feature = 'Fence'
train_data[feature] = train_data[feature].map(fence_map)
test_data[feature] = test_data[feature].map(fence_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)
train_data[feature] = train_data[feature].astype(int)
test_data[feature] = test_data[feature].astype(int)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

feature = 'MiscFeature'
train_data[feature] = train_data[feature].fillna('None')
test_data[feature] = test_data[feature].fillna('None')

assert(train_data[feature].isna().sum() == 0)

test_data.loc[test_data['GarageCars'].isnull(), ['GarageType', 'GarageCars', 'GarageArea']] = ['None', 0, 0]
test_data.loc[test_data['BsmtFinSF1'].isnull(), ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']] = [0, 0, 0, 0, 0, 0]
test_data.loc[test_data['BsmtFullBath'].isnull(), ['BsmtFullBath', 'BsmtHalfBath']] = [0, 0]

assert(test_data['GarageCars'].isna().sum() == 0)
assert(test_data['BsmtFinSF1'].isna().sum() == 0)
assert(test_data['BsmtFullBath'].isna().sum() == 0)

columns_float = train_data.select_dtypes(include=['float64']).columns
train_data[columns_float] = train_data[columns_float].astype('float32')
test_data[columns_float] = test_data[columns_float].astype('float32')

label = 'SalePrice'

id = test_data.index

In [5]:
models = [
    LinearRegression(),
    Lasso(random_state=0),
    Ridge(random_state=0),
    DecisionTreeRegressor(random_state=0),
    RandomForestRegressor(random_state=0, oob_score=True),
    GradientBoostingRegressor(random_state=0),
    XGBRegressor(seed=0),
    CatBoostRegressor(random_state=0, verbose=0, allow_writing_files=False)
]

# Functions

In [6]:
def prepareDataDrop(columns_to_drop, label, train_data, test_data, islog):
    X_train = train_data.drop([label] + columns_to_drop, axis=1)
    Y_train = np.log(train_data[label]) if islog else train_data[label]
    X_test = test_data.drop(columns_to_drop, axis=1)
    return X_train, Y_train, X_test

In [7]:
def prepareDataSelect(columns_to_select, label, train_data, test_data, islog):
    X_train = train_data[columns_to_select]
    Y_train = np.log(train_data[label]) if islog else train_data[label]
    X_test = test_data[columns_to_select]
    return X_train, Y_train, X_test

In [8]:
def preprocess(X_train, X_test):
    columns_num = X_train.select_dtypes(include='number').columns
    columns_cat = X_train.select_dtypes(include='O').columns

    X_train[columns_num] = X_train[columns_num].fillna(X_train[columns_num].median())
    X_train[columns_cat] = X_train[columns_cat].fillna(X_train[columns_cat].mode().loc[0])

    X_test[columns_num] = X_test[columns_num].fillna(X_train[columns_num].median())
    X_test[columns_cat] = X_test[columns_cat].fillna(X_train[columns_cat].mode().loc[0])

    encoder_onehot = OneHotEncoder(dtype=int, sparse_output=False, handle_unknown='ignore')

    train_data_onehot = encoder_onehot.fit_transform(X_train[columns_cat])
    train_feature_name_onehot = encoder_onehot.get_feature_names_out()
    train_data_onehot = pd.DataFrame(train_data_onehot, columns=train_feature_name_onehot)

    test_data_onehot = encoder_onehot.transform(X_test[columns_cat])
    test_data_onehot = pd.DataFrame(test_data_onehot, columns=train_feature_name_onehot)

    X_train = pd.concat([X_train[columns_num], train_data_onehot], axis=1)
    X_test = pd.concat([X_test[columns_num], test_data_onehot], axis=1)

    return X_train, X_test

In [9]:
def evaluation(models, X_train, Y_train, scoring, n_core):
    model_names = [model.__class__.__name__ for model in models]
    model_scores = []

    cv = ShuffleSplit(random_state=0)

    for model in models:
        model = clone(model)
        cv_scores = cross_val_score(model, X_train, Y_train, cv=cv, scoring=scoring, n_jobs=n_core)
        model_scores.append(-cv_scores.mean())

    cv_results = pd.DataFrame(columns=['name', 'score_mean'])
    cv_results['name'] = model_names
    cv_results['score_mean'] = model_scores

    return cv_results

In [10]:
def tuneParameters(models, tuning_parameters, X_train, Y_train, scoring, n_core):
    model_names = [model.__class__.__name__ for model in models]
    model_scores = []
    model_params = []

    cv = ShuffleSplit(random_state=0)

    for model, params in zip(models, tuning_parameters.items()):
        model = clone(model)
        tune_model = GridSearchCV(model, param_grid=params[1], scoring=scoring, cv=cv, n_jobs=n_core)
        tune_model.fit(X_train, Y_train)
        model_scores.append(-tune_model.best_score_)
        model_params.append(tune_model.best_params_)
    
    tuning_results = pd.DataFrame(columns=['name', 'best_score', 'best_params'])
    tuning_results['name'] = model_names
    tuning_results['best_score'] = model_scores
    tuning_results['best_params'] = model_params

    return tuning_results

In [11]:
def predict(model, X_train, Y_train, X_test, islog, id, label, file_name = 'result.csv'):
    model = clone(model)
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)

    result = pd.DataFrame({
        id.name: id,
        label: np.exp(predictions) if islog else predictions
    })
    result.to_csv('./Result/' + file_name, index=False)

In [84]:
train_data.columns[train_data.isna().any()].tolist()

['LotFrontage', 'Electrical']

In [85]:
test_data.columns[test_data.isna().any()].tolist()

['MSZoning',
 'LotFrontage',
 'Utilities',
 'Exterior1st',
 'Exterior2nd',
 'KitchenQual',
 'SaleType']

# Base Model

In [86]:
columns_to_drop = train_data.select_dtypes(include='O').columns.to_list() + \
    train_data.columns[train_data.isna().any()].tolist()
X_train, Y_train, X_test = prepareDataDrop(columns_to_drop, label, train_data, test_data, False)
print(X_train.columns)

Index(['LotArea', 'Street', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'ExterQual', 'ExterCond', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'HeatingQC',
       'CentralAir', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional',
       'Fireplaces', 'FireplaceQu', 'GarageYrBlt', 'GarageFinish',
       'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'PoolQC', 'Fence', 'MiscVal', 'MoSold', 'YrSold'],
      dtype='object')


In [88]:
cv_results = evaluation(models, X_train, Y_train, 'neg_root_mean_squared_log_error', n_core)
print(cv_results)

                        name  score_mean
0           LinearRegression    0.175144
1                      Lasso    0.175102
2                      Ridge    0.174548
3      DecisionTreeRegressor    0.206714
4      RandomForestRegressor    0.139624
5  GradientBoostingRegressor    0.126800
6               XGBRegressor    0.138030
7          CatBoostRegressor    0.124243


In [13]:
model = DecisionTreeRegressor(random_state=0)
predict(model, X_train, Y_train, X_test, False, id, label, file_name = 'result_basic.csv')

# Improvement

In [64]:
columns_to_drop = []
X_train, Y_train, X_test = prepareDataDrop(columns_to_drop, label, train_data, test_data, False)
print(X_train.columns)

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [65]:
X_train, X_test = preprocess(X_train, X_test)
print(X_train.shape, X_test.shape)

(1460, 245) (1459, 245)


In [68]:
cv_results = evaluation(models, X_train, Y_train, 'neg_root_mean_squared_log_error', n_core)
print(cv_results)

                        name  score_mean
0           LinearRegression         NaN
1                      Lasso         NaN
2                      Ridge         NaN
3      DecisionTreeRegressor    0.204164
4      RandomForestRegressor    0.139421
5  GradientBoostingRegressor    0.122607
6               XGBRegressor    0.136823
7          CatBoostRegressor    0.114957


In [22]:
tuning_parameters = {
        'RandomForestRegressor': {
            'n_estimators': [50, 100, 300],
            'min_samples_split': [2, 4, 6, 8, 10]
        }
    }

model = RandomForestRegressor(random_state=0, oob_score=True)

tuneParameters([model], tuning_parameters, X_train, Y_train, 'neg_root_mean_squared_log_error', n_core)

Unnamed: 0,name,best_score,best_params
0,RandomForestRegressor,-0.138408,"{'min_samples_split': 2, 'n_estimators': 300}"


In [30]:
model = RandomForestRegressor(n_estimators=300, min_samples_split=10, random_state=0, oob_score=True)
predict(model, X_train, Y_train, X_test, False, id, label)

## Take Log of SalePrice

In [89]:
columns_to_drop = []
X_train, Y_train, X_test = prepareDataDrop(columns_to_drop, label, train_data, test_data, True)
print(X_train.columns)

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [90]:
X_train, X_test = preprocess(X_train, X_test)
print(X_train.shape, X_test.shape)

(1460, 245) (1459, 245)


In [27]:
cv_results = evaluation(models, X_train, Y_train, 'neg_root_mean_squared_error', n_core)
print(cv_results)

                        name  score_mean
0           LinearRegression    0.140600
1                      Lasso    0.186531
2                      Ridge    0.133502
3      DecisionTreeRegressor    0.198165
4      RandomForestRegressor    0.136668
5  GradientBoostingRegressor    0.122738
6               XGBRegressor    0.141680
7          CatBoostRegressor    0.113981


In [29]:
tuning_parameters = {
        'RandomForestRegressor': {
            'n_estimators': [50, 100, 300],
            'min_samples_split': [2, 4, 6, 8, 10]
        }
    }

model = RandomForestRegressor(random_state=0, oob_score=True)

tuneParameters([model], tuning_parameters, X_train, Y_train, 'neg_root_mean_squared_error', n_core)

Unnamed: 0,name,best_score,best_params
0,RandomForestRegressor,-0.136071,"{'min_samples_split': 2, 'n_estimators': 300}"


In [21]:
model = RandomForestRegressor(n_estimators=300, min_samples_split=10, random_state=0, oob_score=True)
predict(model, X_train, Y_train, X_test, True, id, label)

## Remove Outliers

In [70]:
train_data_new = train_data.drop(train_data[(train_data['OverallQual']==4) & (train_data['SalePrice']>200000)].index)
train_data_new = train_data_new.drop(train_data_new[(train_data_new['GrLivArea']>4000) & (train_data_new['SalePrice']<300000)].index)

In [71]:
columns_to_drop = []
X_train, Y_train, X_test = prepareDataDrop(columns_to_drop, label, train_data_new, test_data, True)
print(X_train.columns)

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [72]:
X_train, X_test = preprocess(X_train, X_test)
print(X_train.shape, X_test.shape)

(1457, 244) (1459, 244)


In [23]:
cv_results = evaluation(models, X_train, Y_train, 'neg_root_mean_squared_error', n_core)
print(cv_results)

                        name  score_mean
0           LinearRegression    0.122964
1                      Lasso    0.164459
2                      Ridge    0.113877
3      DecisionTreeRegressor    0.195040
4      RandomForestRegressor    0.128458
5  GradientBoostingRegressor    0.117576
6               XGBRegressor    0.126859
7          CatBoostRegressor    0.107263


In [24]:
tuning_parameters = {
        'RandomForestRegressor': {
            'n_estimators': [50, 100, 300],
            'min_samples_split': [2, 4, 6, 8, 10]
        }
    }

model = RandomForestRegressor(random_state=0, oob_score=True)

tuneParameters([model], tuning_parameters, X_train, Y_train, 'neg_root_mean_squared_error', n_core)

Unnamed: 0,name,best_score,best_params
0,RandomForestRegressor,0.127632,"{'min_samples_split': 4, 'n_estimators': 300}"


In [25]:
model = RandomForestRegressor(n_estimators=300, min_samples_split=4, random_state=0, oob_score=True)
predict(model, X_train, Y_train, X_test, True, id, label)

## Top 10 Features

In [91]:
train_data_new = train_data.drop(train_data[(train_data['OverallQual']==4) & (train_data['SalePrice']>200000)].index)
train_data_new = train_data_new.drop(train_data_new[(train_data_new['GrLivArea']>4000) & (train_data_new['SalePrice']<300000)].index).reset_index(drop=True)

test_data_new = test_data.copy().reset_index(drop=True)

columns_to_select = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath']
X_train, Y_train, X_test = prepareDataSelect(columns_to_select, label, train_data_new, test_data_new, True)
print(X_train.columns)

Index(['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath'], dtype='object')


In [92]:
X_train, X_test = preprocess(X_train, X_test)
print(X_train.shape, X_test.shape)

(1457, 5) (1459, 5)


In [30]:
cv_results = evaluation(models, X_train, Y_train, 'neg_root_mean_squared_error', n_core)
print(cv_results)

                        name  score_mean
0           LinearRegression    0.160072
1                      Lasso    0.223392
2                      Ridge    0.160073
3      DecisionTreeRegressor    0.225016
4      RandomForestRegressor    0.165167
5  GradientBoostingRegressor    0.155705
6               XGBRegressor    0.171244
7          CatBoostRegressor    0.157339


In [31]:
tuning_parameters = {
        'RandomForestRegressor': {
            'n_estimators': [50, 100, 300],
            'min_samples_split': [2, 4, 6, 8, 10]
        }
    }

model = RandomForestRegressor(random_state=0, oob_score=True)

tuneParameters([model], tuning_parameters, X_train, Y_train, 'neg_root_mean_squared_error', n_core)

Unnamed: 0,name,best_score,best_params
0,RandomForestRegressor,0.160232,"{'min_samples_split': 10, 'n_estimators': 300}"


## Drop Duplicated Features

In [45]:
train_data_new = train_data.drop(train_data[(train_data['OverallQual']==4) & (train_data['SalePrice']>200000)].index)
train_data_new = train_data_new.drop(train_data_new[(train_data_new['GrLivArea']>4000) & (train_data_new['SalePrice']<300000)].index).reset_index(drop=True)

test_data_new = test_data.copy().reset_index(drop=True)

columns_to_drop = ['GarageArea', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtCond', 'BsmtFinType1', 'BsmtFinType2']
X_train, Y_train, X_test = prepareDataDrop(columns_to_drop, label, train_data_new, test_data_new, True)
print(X_train.columns)

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtExposure',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageCars', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscFeature', 

In [46]:
X_train, X_test = preprocess(X_train, X_test)
print(X_train.shape, X_test.shape)

(1457, 236) (1459, 236)


In [47]:
cv_results = evaluation(models, X_train, Y_train, 'neg_root_mean_squared_error', n_core)
print(cv_results)

                        name  score_mean
0           LinearRegression   43.704947
1                      Lasso    0.171139
2                      Ridge    0.113969
3      DecisionTreeRegressor    0.191408
4      RandomForestRegressor    0.128808
5  GradientBoostingRegressor    0.116933
6               XGBRegressor    0.127917
7          CatBoostRegressor    0.108737


In [35]:
tuning_parameters = {
        'RandomForestRegressor': {
            'n_estimators': [50, 100, 300],
            'min_samples_split': [2, 4, 6, 8, 10]
        }
    }

model = RandomForestRegressor(random_state=0, oob_score=True)

tuneParameters([model], tuning_parameters, X_train, Y_train, 'neg_root_mean_squared_error', n_core)

Unnamed: 0,name,best_score,best_params
0,RandomForestRegressor,0.128318,"{'min_samples_split': 4, 'n_estimators': 300}"


## Take Log of Top 10 Features

In [95]:
train_data_new = train_data.drop(train_data[(train_data['OverallQual']==4) & (train_data['SalePrice']>200000)].index)
train_data_new = train_data_new.drop(train_data_new[(train_data_new['GrLivArea']>4000) & (train_data_new['SalePrice']<300000)].index).reset_index(drop=True)

train_data_new['GrLivArea'] = np.log1p(train_data_new['GrLivArea'])
train_data_new['TotalBsmtSF'] = np.log1p(train_data_new['TotalBsmtSF'])

test_data_new = test_data.copy().reset_index(drop=True)

test_data_new['GrLivArea'] = np.log1p(test_data_new['GrLivArea'])
test_data_new['TotalBsmtSF'] = np.log1p(test_data_new['TotalBsmtSF'])

In [96]:
columns_to_select = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath']
X_train, Y_train, X_test = prepareDataSelect(columns_to_select, label, train_data_new, test_data_new, True)
print(X_train.columns)

Index(['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath'], dtype='object')


In [97]:
X_train, X_test = preprocess(X_train, X_test)
print(X_train.shape, X_test.shape)

(1457, 5) (1459, 5)


In [39]:
cv_results = evaluation(models, X_train, Y_train, 'neg_root_mean_squared_error', n_core)
print(cv_results)

                        name  score_mean
0           LinearRegression    0.170753
1                      Lasso    0.386474
2                      Ridge    0.170686
3      DecisionTreeRegressor    0.223832
4      RandomForestRegressor    0.165115
5  GradientBoostingRegressor    0.155808
6               XGBRegressor    0.171244
7          CatBoostRegressor    0.156945


In [40]:
tuning_parameters = {
        'RandomForestRegressor': {
            'n_estimators': [50, 100, 300],
            'min_samples_split': [2, 4, 6, 8, 10]
        }
    }

model = RandomForestRegressor(random_state=0, oob_score=True)

tuneParameters([model], tuning_parameters, X_train, Y_train, 'neg_root_mean_squared_error', n_core)

Unnamed: 0,name,best_score,best_params
0,RandomForestRegressor,0.160311,"{'min_samples_split': 10, 'n_estimators': 300}"


##  Drop Duplicated Features and Take Log

In [59]:
train_data_new = train_data.drop(train_data[(train_data['OverallQual']==4) & (train_data['SalePrice']>200000)].index)
train_data_new = train_data_new.drop(train_data_new[(train_data_new['GrLivArea']>4000) & (train_data_new['SalePrice']<300000)].index).reset_index(drop=True)

skews = train_data_new.select_dtypes(include='number').skew().sort_values(ascending=False)
skews

MiscVal          24.451726
PoolQC           17.547392
PoolArea         15.943422
Street           15.502652
LotArea          12.769096
3SsnPorch        10.293487
LowQualFinSF      9.001760
KitchenAbvGr      4.483125
BsmtFinSF2        4.250256
ScreenPorch       4.117285
BsmtHalfBath      4.098469
BsmtFinType2      3.292727
EnclosedPorch     3.085809
MasVnrArea        2.695262
OpenPorchSF       2.344707
SalePrice         1.883749
Fence             1.803839
WoodDeckSF        1.548359
LotFrontage       1.543043
ExterCond         1.394587
BsmtExposure      1.109770
GrLivArea         1.011340
BsmtUnfSF         0.919853
1stFlrSF          0.890096
ExterQual         0.818972
2ndFlrSF          0.811915
BsmtFinSF1        0.767050
OverallCond       0.691649
HalfBath          0.678958
TotRmsAbvGrd      0.659714
Fireplaces        0.632378
BsmtFullBath      0.592257
TotalBsmtSF       0.513082
KitchenQual       0.385510
MoSold            0.214440
BedroomAbvGr      0.211133
OverallQual       0.200484
G

In [60]:
columns_to_log = ['LotFrontage', 'LotArea', 'MasVnrArea', 
                  'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 
                  'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 
                  'LowQualFinSF', 'GrLivArea', 'WoodDeckSF', 
                  'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 
                  'ScreenPorch', 'PoolArea', 'MiscVal', 'GarageArea']

columns_to_skew = skews[columns_to_log][skews > 0.75].index
print(columns_to_skew)

train_data_new[columns_to_skew] = np.log1p(train_data_new[columns_to_skew])

test_data_new = test_data.copy().reset_index(drop=True)
test_data_new[columns_to_skew] = np.log1p(test_data_new[columns_to_skew])

Index(['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'MiscVal'],
      dtype='object')


In [61]:
columns_to_drop = ['GarageArea', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtCond', 'BsmtFinType1', 'BsmtFinType2']
X_train, Y_train, X_test = prepareDataDrop(columns_to_drop, label, train_data_new, test_data_new, True)
print(X_train.columns)

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtExposure',
       'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageCars', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscFeature', 

In [62]:
X_train, X_test = preprocess(X_train, X_test)
print(X_train.shape, X_test.shape)

(1457, 236) (1459, 236)


In [63]:
cv_results = evaluation(models, X_train, Y_train, 'neg_root_mean_squared_error', n_core)
print(cv_results)

                        name  score_mean
0           LinearRegression    0.119472
1                      Lasso    0.243360
2                      Ridge    0.112626
3      DecisionTreeRegressor    0.191381
4      RandomForestRegressor    0.128903
5  GradientBoostingRegressor    0.116848
6               XGBRegressor    0.127917
7          CatBoostRegressor    0.108743


In [65]:
tuning_parameters = {
        'RandomForestRegressor': {
            'n_estimators': [50, 100, 300],
            'min_samples_split': [2, 4, 6, 8, 10],
            'max_features': ['sqrt', 'log2', 1]
        }
    }

model = RandomForestRegressor(random_state=0, oob_score=True)

tuneParameters([model], tuning_parameters, X_train, Y_train, 'neg_root_mean_squared_error', n_core)

Unnamed: 0,name,best_score,best_params
0,RandomForestRegressor,0.126241,"{'max_features': 'sqrt', 'min_samples_split': 4, 'n_estimators': 300}"


## New Features

In [71]:
train_data_new, test_data_new = train_data.copy(), test_data.copy().reset_index(drop=True)

train_data_new = train_data_new.drop(train_data_new[(train_data_new['OverallQual']==4) & (train_data_new['SalePrice']>200000)].index)
train_data_new = train_data_new.drop(train_data_new[(train_data_new['GrLivArea']>4000) & (train_data_new['SalePrice']<300000)].index).reset_index(drop=True)

train_data_new['TotalSF'] = train_data_new[['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']].sum(axis=1)
test_data_new['TotalSF'] = test_data_new[['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']].sum(axis=1)

train_data_new['TotalFinSF'] = train_data_new[['BsmtFinSF1', 'BsmtFinSF2', '1stFlrSF', '2ndFlrSF']].sum(axis=1)
test_data_new['TotalFinSF'] = test_data_new[['BsmtFinSF1', 'BsmtFinSF2', '1stFlrSF', '2ndFlrSF']].sum(axis=1)

train_data_new['YearSinceBuilt'] = train_data_new['YrSold'] - train_data_new['YearBuilt']
test_data_new['YearSinceBuilt'] = test_data_new['YrSold'] - test_data_new['YearBuilt']

train_data_new['YearSinceRemod'] = train_data_new['YrSold'] - train_data_new['YearRemodAdd']
test_data_new['YearSinceRemod'] = test_data_new['YrSold'] - test_data_new['YearRemodAdd']

train_data_new['TotalBath'] = train_data_new[['FullBath', 'BsmtFullBath']].sum(axis=1) + 0.5 * train_data_new[['HalfBath', 'BsmtHalfBath']].sum(axis=1)
test_data_new['TotalBath'] = test_data_new[['FullBath', 'BsmtFullBath']].sum(axis=1) + 0.5 * test_data_new[['HalfBath', 'BsmtHalfBath']].sum(axis=1)

train_data_new['TotalPorch'] = train_data_new[['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']].sum(axis=1)
test_data_new['TotalPorch'] = test_data_new[['OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch']].sum(axis=1)

train_data_new['HasPool'] = train_data_new['PoolArea'].apply(lambda x: 1 if x > 0 else 0)
test_data_new['HasPool'] = test_data_new['PoolArea'].apply(lambda x: 1 if x > 0 else 0)

train_data_new['Has2ndFloor'] = train_data_new['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)
test_data_new['Has2ndFloor'] = test_data_new['2ndFlrSF'].apply(lambda x: 1 if x > 0 else 0)

train_data_new['HasBsmt'] = train_data_new['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)
test_data_new['HasBsmt'] = test_data_new['TotalBsmtSF'].apply(lambda x: 1 if x > 0 else 0)

train_data_new['HasFireplace'] = train_data_new['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)
test_data_new['HasFireplace'] = test_data_new['Fireplaces'].apply(lambda x: 1 if x > 0 else 0)

train_data_new['HasGarage'] = train_data_new['GarageArea'].apply(lambda x: 1 if x > 0 else 0)
test_data_new['HasGarage'] = test_data_new['GarageArea'].apply(lambda x: 1 if x > 0 else 0)

train_data_new['HasFence'] = train_data_new['Fence'].apply(lambda x: 1 if x > 0 else 0)
test_data_new['HasFence'] = test_data_new['Fence'].apply(lambda x: 1 if x > 0 else 0)

train_data_new['HasPorch'] = train_data_new['TotalPorch'].apply(lambda x: 1 if x > 0 else 0)
test_data_new['HasPorch'] = test_data_new['TotalPorch'].apply(lambda x: 1 if x > 0 else 0)

train_data_new['HasWoodDeck'] = train_data_new['WoodDeckSF'].apply(lambda x: 1 if x > 0 else 0)
test_data_new['HasWoodDeck'] = test_data_new['WoodDeckSF'].apply(lambda x: 1 if x > 0 else 0)

columns_to_log = ['LotFrontage', 'LotArea', 'MasVnrArea', 
                  'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 
                  'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 
                  'LowQualFinSF', 'GrLivArea', 'WoodDeckSF', 
                  'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 
                  'ScreenPorch', 'PoolArea', 'MiscVal']

skews = train_data_new.select_dtypes(include='number').skew().sort_values(ascending=False)
columns_to_skew = skews[columns_to_log][skews > 0.75].index

train_data_new[columns_to_skew] = np.log1p(train_data_new[columns_to_skew])
test_data_new[columns_to_skew] = np.log1p(test_data_new[columns_to_skew])

In [72]:
columns_to_drop = []
X_train, Y_train, X_test = prepareDataDrop(columns_to_drop, label, train_data_new, test_data_new, True)

print(X_train.columns)

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [73]:
X_train, X_test = preprocess(X_train, X_test)
print(X_train.shape, X_test.shape)

(1457, 258) (1459, 258)


In [74]:
cv_results = evaluation(models, X_train, Y_train, 'neg_root_mean_squared_error', n_core)
print(cv_results)

                        name  score_mean
0           LinearRegression    0.120556
1                      Lasso    0.166238
2                      Ridge    0.111984
3      DecisionTreeRegressor    0.191161
4      RandomForestRegressor    0.129078
5  GradientBoostingRegressor    0.117355
6               XGBRegressor    0.129019
7          CatBoostRegressor    0.104871


In [75]:
cv = ShuffleSplit(random_state=0)
model = RandomForestRegressor(random_state=0, oob_score=True)
selector = RFECV(estimator=model, cv=cv, scoring='neg_root_mean_squared_error', n_jobs=n_core)
selector.fit(X_train, Y_train)

In [76]:
[f for f in X_train.columns if f not in selector.get_feature_names_out()]

['Street',
 'LowQualFinSF',
 'PoolArea',
 'PoolQC',
 'HasPool',
 'Has2ndFloor',
 'HasBsmt',
 'HasGarage',
 'MSSubClass_120',
 'MSSubClass_160',
 'MSSubClass_180',
 'MSSubClass_190',
 'MSSubClass_40',
 'MSSubClass_45',
 'MSSubClass_75',
 'MSSubClass_80',
 'MSSubClass_85',
 'MSSubClass_90',
 'MSZoning_FV',
 'MSZoning_RH',
 'Alley_Grvl',
 'Alley_Pave',
 'LotShape_IR2',
 'LotShape_IR3',
 'LandContour_Low',
 'Utilities_AllPub',
 'Utilities_NoSeWa',
 'LotConfig_CulDSac',
 'LotConfig_FR2',
 'LotConfig_FR3',
 'LandSlope_Sev',
 'Neighborhood_Blmngtn',
 'Neighborhood_Blueste',
 'Neighborhood_BrDale',
 'Neighborhood_BrkSide',
 'Neighborhood_ClearCr',
 'Neighborhood_Gilbert',
 'Neighborhood_MeadowV',
 'Neighborhood_Mitchel',
 'Neighborhood_NPkVill',
 'Neighborhood_NWAmes',
 'Neighborhood_NoRidge',
 'Neighborhood_SWISU',
 'Neighborhood_SawyerW',
 'Neighborhood_Timber',
 'Neighborhood_Veenker',
 'Condition1_Feedr',
 'Condition1_PosA',
 'Condition1_PosN',
 'Condition1_RRAe',
 'Condition1_RRAn',
 'Con

In [77]:
importances = pd.DataFrame(columns=['feature', 'importance'])
importances['feature'] = selector.get_feature_names_out()
importances['importance'] = selector.estimator_.feature_importances_

importances.sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
2,OverallQual,0.45774
49,TotalSF,0.233315
50,TotalFinSF,0.105848
19,CentralAir,0.013032
22,GrLivArea,0.009377
3,OverallCond,0.009364
1,LotArea,0.009257
37,GarageArea,0.00925
51,YearSinceBuilt,0.007473
52,YearSinceRemod,0.007473


### Feature Selection

In [78]:
columns_to_drop = ['GarageArea', 'GarageYrBlt', 'GarageFinish', 
                   'GarageQual', 'GarageCond', 'BsmtCond', 
                   'BsmtFinType1', 'BsmtFinType2', 'Utilities',
                   'Street', 'PoolQC', 'PoolArea',
                   'YearBuilt', 'YearRemodAdd', 'FullBath', 
                   'BsmtFullBath', 'HalfBath', 'TotalBsmtSF',
                   '1stFlrSF', 'GrLivArea']

X_train, Y_train, X_test = prepareDataDrop(columns_to_drop, label, train_data_new, test_data_new, True)

print(X_train.columns)

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Alley', 'LotShape',
       'LandContour', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'OverallQual', 'OverallCond',
       'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType',
       'MasVnrArea', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual',
       'BsmtExposure', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '2ndFlrSF', 'LowQualFinSF',
       'BsmtHalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageCars', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'Fence', 'MiscFeature',
       'MiscVal', 'MoSold', 'YrSold', 'SaleType', 'SaleCondition', 'TotalSF',
       'TotalFinSF', 'YearSinceBuilt', 'YearSinceRemod', 'TotalBath',
       'TotalPorch', 'HasPo

In [79]:
X_train, X_test = preprocess(X_train, X_test)
print(X_train.shape, X_test.shape)

(1457, 237) (1459, 237)


In [80]:
cv_results = evaluation(models, X_train, Y_train, 'neg_root_mean_squared_error', n_core)
print(cv_results)

                        name  score_mean
0           LinearRegression    0.120746
1                      Lasso    0.173150
2                      Ridge    0.112678
3      DecisionTreeRegressor    0.190586
4      RandomForestRegressor    0.128212
5  GradientBoostingRegressor    0.118241
6               XGBRegressor    0.130424
7          CatBoostRegressor    0.106526
