# Import packages and Settings

In [23]:
import os

n_core = os.cpu_count()-3
print(n_core)

3


In [2]:
import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype

import matplotlib.pyplot as plt

from xgboost.sklearn import XGBRegressor

from catboost import CatBoostRegressor

%matplotlib inline

In [3]:
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import ShuffleSplit, cross_val_score, GridSearchCV

from sklearnex import patch_sklearn

patch_sklearn()

Intel(R) Extension for Scikit-learn* enabled (https://github.com/intel/scikit-learn-intelex)


## Load Data and Preprocessing

In [4]:
train_data = pd.read_csv('./Data/train.csv')
test_data = pd.read_csv('./Data/test.csv')

feature = 'MSSubClass'
train_data[feature] = train_data[feature].apply(lambda x: str(x))
test_data[feature] = test_data[feature].apply(lambda x: str(x))

assert(not is_numeric_dtype(train_data[feature]))

feature = 'Street'
train_data[feature] = train_data[feature].map({'Pave': 0, 'Grvl':1})
test_data[feature] = test_data[feature].map({'Pave': 0, 'Grvl':1})

assert(is_numeric_dtype(train_data[feature]))

feature = 'Alley'
train_data[feature] = train_data[feature].fillna('None')
test_data[feature] = test_data[feature].fillna('None')

assert(train_data[feature].isna().sum() == 0)

feature = 'MasVnrType'
train_data[feature] = train_data[feature].fillna('None')
test_data[feature] = test_data[feature].fillna('None')

assert(train_data[feature].isna().sum() == 0)

feature = 'MasVnrArea'
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)

assert(train_data[feature].isna().sum() == 0)

quality_map = {'Po':1, 'Fa':2, 'TA':3, 'Gd':4, 'Ex':5}

feature = 'ExterQual'
train_data[feature] = train_data[feature].map(quality_map)
test_data[feature] = test_data[feature].map(quality_map)

assert(is_numeric_dtype(train_data[feature]))

feature = 'ExterCond'
train_data[feature] = train_data[feature].map(quality_map)
test_data[feature] = test_data[feature].map(quality_map)

assert(is_numeric_dtype(train_data[feature]))

feature = 'BsmtQual'
train_data[feature] = train_data[feature].map(quality_map)
test_data[feature] = test_data[feature].map(quality_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

feature = 'BsmtCond'
train_data[feature] = train_data[feature].map(quality_map)
test_data[feature] = test_data[feature].map(quality_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

exposure_map = {'No':1, 'Mn':2, 'Av':3, 'Gd':4}

feature = 'BsmtExposure'
train_data[feature] = train_data[feature].map(exposure_map)
test_data[feature] = test_data[feature].map(exposure_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

type_map = {'Unf':1, 'LwQ':2, 'Rec':3, 'BLQ':4, 'ALQ':5, 'GLQ':6}

feature = 'BsmtFinType1'
train_data[feature] = train_data[feature].map(type_map)
test_data[feature] = test_data[feature].map(type_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

feature = 'BsmtFinType2'
train_data[feature] = train_data[feature].map(type_map)
test_data[feature] = test_data[feature].map(type_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

feature = 'HeatingQC'
train_data[feature] = train_data[feature].map(quality_map)
test_data[feature] = test_data[feature].map(quality_map)

assert(is_numeric_dtype(train_data[feature]))

feature = 'CentralAir'
train_data[feature] = train_data[feature].map({'N': 0, 'Y':1})
test_data[feature] = test_data[feature].map({'N': 0, 'Y':1})

assert(is_numeric_dtype(train_data[feature]))

feature = 'KitchenQual'
train_data[feature] = train_data[feature].map(quality_map)
test_data[feature] = test_data[feature].map(quality_map)

assert(is_numeric_dtype(train_data[feature]))

function_map = {'Sal':1, 'Sev':2, 'Maj2':3, 'Maj1':4, 'Mod':5, 'Min2':6, 'Min1':7, 'Typ':8}

feature = 'Functional'
train_data[feature] = train_data[feature].map(function_map)
test_data[feature] = test_data[feature].map(function_map)
test_data[feature] = test_data[feature].fillna(8)

assert(is_numeric_dtype(train_data[feature]))

feature = 'FireplaceQu'
train_data[feature] = train_data[feature].map(quality_map)
test_data[feature] = test_data[feature].map(quality_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

feature = 'GarageType'
train_data[feature] = train_data[feature].fillna('None')
test_data[feature] = test_data[feature].fillna('None')

assert(train_data[feature].isna().sum() == 0)

feature = 'GarageYrBlt'
train_data[feature] = train_data[feature].fillna(-1)
test_data[feature] = test_data[feature].fillna(-1)

assert(train_data[feature].isna().sum() == 0)

exposure_map = {'Unf':1, 'RFn':2, 'Fin':3}

feature = 'GarageFinish'
train_data[feature] = train_data[feature].map(exposure_map)
test_data[feature] = test_data[feature].map(exposure_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

feature = 'GarageQual'
train_data[feature] = train_data[feature].map(quality_map)
test_data[feature] = test_data[feature].map(quality_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

feature = 'GarageCond'
train_data[feature] = train_data[feature].map(quality_map)
test_data[feature] = test_data[feature].map(quality_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

feature = 'PoolQC'
train_data[feature] = train_data[feature].map(quality_map)
test_data[feature] = test_data[feature].map(quality_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

fence_map = {'MnWw':1, 'GdWo':2, 'MnPrv':3, 'GdPrv':4}

feature = 'Fence'
train_data[feature] = train_data[feature].map(fence_map)
test_data[feature] = test_data[feature].map(fence_map)
train_data[feature] = train_data[feature].fillna(0)
test_data[feature] = test_data[feature].fillna(0)

assert(is_numeric_dtype(train_data[feature]))
assert(train_data[feature].isna().sum() == 0)

feature = 'MiscFeature'
train_data[feature] = train_data[feature].fillna('None')
test_data[feature] = test_data[feature].fillna('None')

assert(train_data[feature].isna().sum() == 0)

test_data.loc[test_data['GarageCars'].isnull(), ['GarageType', 'GarageCars', 'GarageArea']] = ['None', 0, 0]
test_data.loc[test_data['BsmtFinSF1'].isnull(), ['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'BsmtFullBath', 'BsmtHalfBath']] = [0, 0, 0, 0, 0, 0]
test_data.loc[test_data['BsmtFullBath'].isnull(), ['BsmtFullBath', 'BsmtHalfBath']] = [0, 0]

assert(test_data['GarageCars'].isna().sum() == 0)
assert(test_data['BsmtFinSF1'].isna().sum() == 0)
assert(test_data['BsmtFullBath'].isna().sum() == 0)

columns_float = train_data.select_dtypes(include=['float64']).columns
train_data[columns_float] = train_data[columns_float].astype('float32')
test_data[columns_float] = test_data[columns_float].astype('float32')

label = 'SalePrice'

id = test_data['Id']

In [6]:
models = [
    LinearRegression(),
    Lasso(random_state=0),
    Ridge(random_state=0),
    DecisionTreeRegressor(random_state=0),
    RandomForestRegressor(random_state=0, oob_score=True),
    GradientBoostingRegressor(random_state=0),
    XGBRegressor(random_state=0),
    CatBoostRegressor(random_state=0, verbose=0, allow_writing_files=False)
]

# Functions

In [7]:
def prepareDataDrop(columns_to_drop, label, train_data, test_data, islog):
    X_train = train_data.drop([label] + columns_to_drop, axis=1)
    Y_train = np.log(train_data[label]) if islog else train_data[label]
    X_test = test_data.drop(columns_to_drop, axis=1)
    return X_train, Y_train, X_test

In [43]:
def prepareDataSelect(columns_to_select, label, train_data, test_data, islog):
    X_train = train_data[columns_to_select]
    Y_train = np.log(train_data[label]) if islog else train_data[label]
    X_test = test_data[columns_to_select]
    return X_train, Y_train, X_test

In [9]:
def preprocess(X_train, X_test):
    transformer_num = SimpleImputer(strategy='median')

    transformer_cat = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))
    ])

    cols_num = X_train.select_dtypes(include='number').columns
    cols_cat = X_train.select_dtypes(include='O').columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', transformer_num, cols_num),
            ('cat', transformer_cat, cols_cat)
        ])
    
    X_train = preprocessor.fit_transform(X_train)
    X_test = preprocessor.transform(X_test)

    return X_train, X_test

In [10]:
def evaluation(models, X_train, Y_train, scoring):
    model_names = [model.__class__.__name__ for model in models]
    model_scores = []

    cv = ShuffleSplit(random_state=0)

    for model in models:
        model = clone(model)
        cv_scores = cross_val_score(model, X_train, Y_train, cv=cv, scoring=scoring)
        model_scores.append(-cv_scores.mean())

    cv_results = pd.DataFrame(columns=['name', 'score_mean'])
    cv_results['name'] = model_names
    cv_results['score_mean'] = model_scores

    return cv_results

In [30]:
def tuneParameters(models, tuning_parameters, X_train, Y_train, scoring, n_core):
    model_names = [model.__class__.__name__ for model in models]
    model_scores = []
    model_params = []

    cv = ShuffleSplit(random_state=0)

    for model, params in zip(models, tuning_parameters.items()):
        model = clone(model)
        tune_model = GridSearchCV(model, param_grid=params[1], scoring=scoring, cv=cv, n_jobs=n_core)
        tune_model.fit(X_train, Y_train)
        model_scores.append(-tune_model.best_score_)
        model_params.append(tune_model.best_params_)
    
    tuning_results = pd.DataFrame(columns=['name', 'best_score', 'best_params'])
    tuning_results['name'] = model_names
    tuning_results['best_score'] = model_scores
    tuning_results['best_params'] = model_params

    return tuning_results

In [12]:
def predict(model, X_train, Y_train, X_test, islog, id, label, file_name = 'result.csv'):
    model = clone(model)
    model.fit(X_train, Y_train)
    predictions = model.predict(X_test)

    result = pd.DataFrame({
        id.name: id,
        label: np.exp(predictions) if islog else predictions
    })
    result.to_csv('./Result/' + file_name, index=False)

In [13]:
train_data.columns[train_data.isna().any()].tolist()

['LotFrontage', 'Electrical']

In [14]:
test_data.columns[test_data.isna().any()].tolist()

['MSZoning',
 'LotFrontage',
 'Utilities',
 'Exterior1st',
 'Exterior2nd',
 'KitchenQual',
 'SaleType']

# Base Model

In [15]:
columns_to_drop = ['Id'] + \
    train_data.select_dtypes(include='O').columns.to_list() + \
    train_data.columns[train_data.isna().any()].tolist()
X_train, Y_train, X_test = prepareDataDrop(columns_to_drop, label, train_data, test_data, False)
print(X_train.columns)

Index(['LotArea', 'Street', 'OverallQual', 'OverallCond', 'YearBuilt',
       'YearRemodAdd', 'MasVnrArea', 'ExterQual', 'ExterCond', 'BsmtQual',
       'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1',
       'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'HeatingQC',
       'CentralAir', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea',
       'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
       'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd', 'Functional',
       'Fireplaces', 'FireplaceQu', 'GarageYrBlt', 'GarageFinish',
       'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'PoolQC', 'Fence', 'MiscVal', 'MoSold', 'YrSold'],
      dtype='object')


In [17]:
cv_results = evaluation(models, X_train, Y_train, 'neg_root_mean_squared_log_error')
print(cv_results)

                        name  score_mean
0           LinearRegression    0.175144
1                      Lasso    0.175103
2                      Ridge    0.174548
3      DecisionTreeRegressor    0.206714
4      RandomForestRegressor    0.139200
5  GradientBoostingRegressor    0.126800
6               XGBRegressor    0.138030
7          CatBoostRegressor    0.124243


In [13]:
model = DecisionTreeRegressor(random_state=0)
predict(model, X_train, Y_train, X_test, False, id, label, file_name = 'result_basic.csv')

# Improvement

In [18]:
columns_to_drop = ['Id']
X_train, Y_train, X_test = prepareDataDrop(columns_to_drop, label, train_data, test_data, False)
print(X_train.columns)

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [19]:
X_train, X_test = preprocess(X_train, X_test)

In [20]:
cv_results = evaluation(models, X_train, Y_train, 'neg_root_mean_squared_log_error')
print(cv_results)

Traceback (most recent call last):
  File "c:\Users\User\anaconda3\envs\ml\lib\site-packages\sklearn\metrics\_scorer.py", line 139, in __call__
    score = scorer._score(
  File "c:\Users\User\anaconda3\envs\ml\lib\site-packages\sklearn\metrics\_scorer.py", line 376, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "c:\Users\User\anaconda3\envs\ml\lib\site-packages\sklearn\utils\_param_validation.py", line 213, in wrapper
    return func(*args, **kwargs)
  File "c:\Users\User\anaconda3\envs\ml\lib\site-packages\sklearn\metrics\_regression.py", line 768, in root_mean_squared_log_error
    raise ValueError(
ValueError: Root Mean Squared Logarithmic Error cannot be used when targets contain negative values.

Traceback (most recent call last):
  File "c:\Users\User\anaconda3\envs\ml\lib\site-packages\sklearn\metrics\_scorer.py", line 139, in __call__
    score = scorer._score(
  File "c:\Users\User\anaconda3\envs\ml\lib\site-packages\sklearn\metri

                        name  score_mean
0           LinearRegression         NaN
1                      Lasso         NaN
2                      Ridge         NaN
3      DecisionTreeRegressor    0.204164
4      RandomForestRegressor    0.139421
5  GradientBoostingRegressor    0.122607
6               XGBRegressor    0.136823
7          CatBoostRegressor    0.114957


In [22]:
tuning_parameters = {
        'RandomForestRegressor': {
            'n_estimators': [50, 100, 300],
            'min_samples_split': [2, 4, 6, 8, 10]
        }
    }

model = RandomForestRegressor(random_state=0, oob_score=True)

tuneParameters([model], tuning_parameters, X_train, Y_train, 'neg_root_mean_squared_log_error', n_core)

Unnamed: 0,name,best_score,best_params
0,RandomForestRegressor,-0.138408,"{'min_samples_split': 2, 'n_estimators': 300}"


In [30]:
model = RandomForestRegressor(n_estimators=300, min_samples_split=10, random_state=0, oob_score=True)
predict(model, X_train, Y_train, X_test, False, id, label)

## Take Log of SalePrice

In [24]:
columns_to_drop = ['Id']
X_train, Y_train, X_test = prepareDataDrop(columns_to_drop, label, train_data, test_data, True)
print(X_train.columns)

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating', 'HeatingQC',
       'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType', 'GarageYrBlt',
       'GarageFinish', 'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
       'PavedDrive', 'Wo

In [26]:
X_train, X_test = preprocess(X_train, X_test)

In [27]:
cv_results = evaluation(models, X_train, Y_train, 'neg_root_mean_squared_error')
print(cv_results)

                        name  score_mean
0           LinearRegression    0.140600
1                      Lasso    0.186531
2                      Ridge    0.133502
3      DecisionTreeRegressor    0.198165
4      RandomForestRegressor    0.136668
5  GradientBoostingRegressor    0.122738
6               XGBRegressor    0.141680
7          CatBoostRegressor    0.113981


In [29]:
tuning_parameters = {
        'RandomForestRegressor': {
            'n_estimators': [50, 100, 300],
            'min_samples_split': [2, 4, 6, 8, 10]
        }
    }

model = RandomForestRegressor(random_state=0, oob_score=True)

tuneParameters([model], tuning_parameters, X_train, Y_train, 'neg_root_mean_squared_error', n_core)

Unnamed: 0,name,best_score,best_params
0,RandomForestRegressor,-0.136071,"{'min_samples_split': 2, 'n_estimators': 300}"


In [21]:
model = RandomForestRegressor(n_estimators=300, min_samples_split=10, random_state=0, oob_score=True)
predict(model, X_train, Y_train, X_test, True, id, label)

## Top 10 Features

In [31]:
columns_to_select = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath']
X_train, Y_train, X_test = prepareDataSelect(columns_to_select, label, train_data, test_data, True)
print(X_train.columns)

Index(['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath'], dtype='object')


In [32]:
X_train, X_test = preprocess(X_train, X_test)

In [33]:
cv_results = evaluation(models, X_train, Y_train, 'neg_root_mean_squared_error')
print(cv_results)

                        name  score_mean
0           LinearRegression    0.173850
1                      Lasso    0.247382
2                      Ridge    0.173847
3      DecisionTreeRegressor    0.217094
4      RandomForestRegressor    0.169240
5  GradientBoostingRegressor    0.163544
6               XGBRegressor    0.180380
7          CatBoostRegressor    0.165611


In [34]:
tuning_parameters = {
        'RandomForestRegressor': {
            'n_estimators': [50, 100, 300],
            'min_samples_split': [2, 4, 6, 8, 10]
        }
    }

model = RandomForestRegressor(random_state=0, oob_score=True)

tuneParameters([model], tuning_parameters, X_train, Y_train, 'neg_root_mean_squared_error', n_core)

Unnamed: 0,name,best_score,best_params
0,RandomForestRegressor,0.16701,"{'min_samples_split': 10, 'n_estimators': 100}"


## Drop Duplicated Features

In [35]:
columns_to_drop = ['Id', 'GarageArea', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtCond', 'BsmtFinSF1', 'BsmtFinSF2']
X_train, Y_train, X_test = prepareDataDrop(columns_to_drop, label, train_data, test_data, True)
print(X_train.columns)

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageCars', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscFeatur

In [36]:
X_train, X_test = preprocess(X_train, X_test)

In [37]:
cv_results = evaluation(models, X_train, Y_train, 'neg_root_mean_squared_error')
print(cv_results)

                        name  score_mean
0           LinearRegression    0.137710
1                      Lasso    0.194112
2                      Ridge    0.158227
3      DecisionTreeRegressor    0.204177
4      RandomForestRegressor    0.137304
5  GradientBoostingRegressor    0.124008
6               XGBRegressor    0.136576
7          CatBoostRegressor    0.114739


In [38]:
tuning_parameters = {
        'RandomForestRegressor': {
            'n_estimators': [50, 100, 300],
            'min_samples_split': [2, 4, 6, 8, 10]
        }
    }

model = RandomForestRegressor(random_state=0, oob_score=True)

tuneParameters([model], tuning_parameters, X_train, Y_train, 'neg_root_mean_squared_error', n_core)

Unnamed: 0,name,best_score,best_params
0,RandomForestRegressor,0.136925,"{'min_samples_split': 4, 'n_estimators': 50}"


## Take Log of Top 10 Features

In [48]:
train_data_new, test_data_new = train_data.copy(), test_data.copy()

train_data_new['GrLivArea'] = np.log1p(train_data_new['GrLivArea'])
train_data_new['TotalBsmtSF'] = np.log1p(train_data_new['TotalBsmtSF'])

test_data_new['GrLivArea'] = np.log1p(test_data_new['GrLivArea'])
test_data_new['TotalBsmtSF'] = np.log1p(test_data_new['TotalBsmtSF'])

In [50]:
columns_to_select = ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath']
X_train, Y_train, X_test = prepareDataSelect(columns_to_select, label, train_data_new, test_data_new, True)
print(X_train.columns)

Index(['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath'], dtype='object')


In [51]:
X_train, X_test = preprocess(X_train, X_test)

In [52]:
cv_results = evaluation(models, X_train, Y_train, 'neg_root_mean_squared_error')
print(cv_results)

                        name  score_mean
0           LinearRegression    0.176015
1                      Lasso    0.404022
2                      Ridge    0.176025
3      DecisionTreeRegressor    0.216723
4      RandomForestRegressor    0.169540
5  GradientBoostingRegressor    0.163602
6               XGBRegressor    0.180380
7          CatBoostRegressor    0.165746


In [53]:
tuning_parameters = {
        'RandomForestRegressor': {
            'n_estimators': [50, 100, 300],
            'min_samples_split': [2, 4, 6, 8, 10]
        }
    }

model = RandomForestRegressor(random_state=0, oob_score=True)

tuneParameters([model], tuning_parameters, X_train, Y_train, 'neg_root_mean_squared_error', n_core)

Unnamed: 0,name,best_score,best_params
0,RandomForestRegressor,0.167096,"{'min_samples_split': 10, 'n_estimators': 100}"


##  Drop Duplicated Features and Take Log

In [57]:
train_data_new, test_data_new = train_data.copy(), test_data.copy()

skews = train_data_new.select_dtypes(include='number').skew()
columns_skew = skews[skews > 0.5].index
columns_skew = columns_skew.drop(label)

print(columns_skew)

Index(['LotFrontage', 'LotArea', 'Street', 'OverallCond', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'BsmtExposure', 'BsmtFinSF1', 'BsmtFinType2',
       'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'HalfBath',
       'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'WoodDeckSF',
       'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
       'PoolQC', 'Fence', 'MiscVal'],
      dtype='object')


In [58]:
train_data_new[columns_skew] = np.log1p(train_data_new[columns_skew])
test_data_new[columns_skew] = np.log1p(test_data_new[columns_skew])

In [59]:
columns_to_drop = ['Id', 'GarageArea', 'GarageYrBlt', 'GarageFinish', 'GarageQual', 'GarageCond', 'BsmtCond', 'BsmtFinSF1', 'BsmtFinSF2']
X_train, Y_train, X_test = prepareDataDrop(columns_to_drop, label, train_data_new, test_data_new, True)
print(X_train.columns)

Index(['MSSubClass', 'MSZoning', 'LotFrontage', 'LotArea', 'Street', 'Alley',
       'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope',
       'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle',
       'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 'RoofStyle',
       'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'MasVnrArea',
       'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtExposure',
       'BsmtFinType1', 'BsmtFinType2', 'BsmtUnfSF', 'TotalBsmtSF', 'Heating',
       'HeatingQC', 'CentralAir', 'Electrical', '1stFlrSF', '2ndFlrSF',
       'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath',
       'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual',
       'TotRmsAbvGrd', 'Functional', 'Fireplaces', 'FireplaceQu', 'GarageType',
       'GarageCars', 'PavedDrive', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'PoolQC',
       'Fence', 'MiscFeatur

In [64]:
X_train, X_test = preprocess(X_train, X_test)

In [65]:
cv_results = evaluation(models, X_train, Y_train, 'neg_root_mean_squared_error')
print(cv_results)

                        name  score_mean
0           LinearRegression    0.133716
1                      Lasso    0.312743
2                      Ridge    0.126092
3      DecisionTreeRegressor    0.204854
4      RandomForestRegressor    0.137337
5  GradientBoostingRegressor    0.124076
6               XGBRegressor    0.136576
7          CatBoostRegressor    0.114717


In [66]:
tuning_parameters = {
        'RandomForestRegressor': {
            'n_estimators': [50, 100, 300],
            'min_samples_split': [2, 4, 6, 8, 10]
        }
    }

model = RandomForestRegressor(random_state=0, oob_score=True)

tuneParameters([model], tuning_parameters, X_train, Y_train, 'neg_root_mean_squared_error', n_core)

Unnamed: 0,name,best_score,best_params
0,RandomForestRegressor,0.136919,"{'min_samples_split': 4, 'n_estimators': 50}"
