In [None]:
import numpy as np
import pandas as pd

In [None]:
# TODO: one-hot encode (and otherwise preprocess) both the train and the test data,
# then save them as train and test CSV files
# that can be used for the TensorFlow model.

# Preprocessing of the house prices dataset
In the following notebooks we're going to preprocess the data, that is, handle missing values, transform the variables, and treat outliers. We're also going to build a specialized pipeline for those transformations.

In this notebook specifically, model features will be chosen and transformed.

In [None]:
# import dataset and libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Register pandas' converters with matplotlib.
pd.plotting.register_matplotlib_converters()
# Default size applied to every figure created below.
plt.rc('figure', figsize=(16, 6))

In [None]:
# Load the dataset from CSV, using the "Id" column as the row index.
orig_data = pd.read_csv("data/train_preprocessed.csv", index_col="Id")

In [None]:
# Work on a copy of the dataset so that orig_data is not modified by the analysis.
house_data = orig_data.copy()
# Preview the first rows.
house_data.head()

Let's first divide the features into separate categories depending on whether they are nominal, ordinal, interval or ratio variables:

In [None]:
# Column names treated as categorical features.
categorical_features = ["MSSubClass", "MSZoning", "Street", "Alley", "LotShape", "LandContour", "Utilities", "LotConfig",
                        "LandSlope", "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "OverallQual",
                        "OverallCond", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "ExterQual",
                        "ExterCond", "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
                        "Heating", "HeatingQC", "CentralAir", "Electrical", "KitchenQual", "Functional", "FireplaceQu",
                        "GarageType", "GarageFinish", "GarageQual", "GarageCond", "PavedDrive", 'PoolQC', 'Fence',
                        'MiscFeature', 'SaleType', 'SaleCondition']

# Column names treated as numerical features.
numerical_features = ["LotFrontage", "LotArea", "YearBuilt", "YearRemodAdd", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
                      "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BsmtFullBath",
                      "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", 'Fireplaces',
                      "GarageYrBlt", "GarageCars", "GarageArea", 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
                      'ScreenPorch', 'PoolArea', "MiscVal", 'MoSold', 'YrSold']

def plot_numerical_feature(feature, include_non_positive=True, data=None):
    """Describe a numerical feature and plot its distribution.

    Prints summary statistics for the column, then draws a histogram, a
    kernel density estimate and a box plot, each in its own figure.

    Parameters
    ----------
    feature : str
        Name of the column to describe and plot.
    include_non_positive : bool, default True
        If False, only strictly positive values are described and plotted.
    data : pandas.DataFrame, optional
        Frame to read the column from. When omitted, the notebook-level
        ``X`` is used, so ``X`` must be defined before the call.
    """
    if data is None:
        data = X  # fall back to the notebook-level feature frame

    plot_column = data[feature]
    if not include_non_positive:
        plot_column = plot_column[plot_column > 0]

    print(f"Description of {feature}")
    print(plot_column.describe(), "\n")
    print(f"Histogram of {feature}")
    plt.figure()
    sns.histplot(data=plot_column)
    plt.show()
    print(f"Kernel density estimation plot of {feature}")
    plt.figure()
    # `fill` replaces the deprecated `shade` argument of kdeplot.
    sns.kdeplot(data=plot_column, fill=True)
    plt.show()
    print(f"Box plot of {feature}")
    plt.figure()
    sns.boxplot(x=plot_column)
    plt.show()

In [None]:
# Split the features by measurement scale (nominal, ordinal, interval, ratio).
# It might have been beneficial to do this in the first place, but it's never too late.

# Nominal scale: categories without a natural order.
nominal = ["MSSubClass", "MSZoning", "LandContour", "LotConfig", "Neighborhood", "Condition1", "Condition2", "BldgType",
           "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "Heating",
           "GarageType", 'MiscFeature', 'SaleType', 'SaleCondition']
# Ordinal scale: categories that can be ranked.
ordinal = ["Street", "Alley", "LotShape", "Utilities", "LandSlope", "OverallQual", "OverallCond", "ExterQual", "ExterCond",
           "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "HeatingQC", "CentralAir", "Electrical",
           "KitchenQual", "Functional", "FireplaceQu", "GarageFinish", "GarageQual", "GarageCond", "PavedDrive", 'PoolQC',
           'Fence']
# Interval scale: calendar years and months.
interval = ["YearBuilt", "YearRemodAdd", "GarageYrBlt", 'MoSold', 'YrSold']
# Ratio scale: areas, counts and monetary values.
ratio = ["LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF",
         "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr",
         "KitchenAbvGr", "TotRmsAbvGrd", 'Fireplaces', "GarageCars", "GarageArea", 'WoodDeckSF', 'OpenPorchSF',
         'EnclosedPorch', '3SsnPorch','ScreenPorch', 'PoolArea', "MiscVal"]
# Print the total number of classified features across the four lists.
print(len(nominal + ordinal + interval + ratio))

Now for the first model I will choose a subset of those features which represent most of the characteristics of a house. I tried to choose variables with high variety.

In [None]:
# Ordinal features selected for the first model.
ord_model = ["OverallQual", "ExterQual", "BsmtQual", "BsmtExposure", "CentralAir", "KitchenQual", "FireplaceQu", "GarageFinish",
             "GarageCond", "Fence"]
# Interval features selected for the first model.
int_model = ["YearBuilt", 'MoSold', 'YrSold']
# Nominal features selected for the first model.
nom_model = ["MSZoning"]
# Ratio features selected for the first model.
rat_model = ["LotArea", "MasVnrArea", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "BsmtFullBath", "FullBath", "BedroomAbvGr",
             "TotRmsAbvGrd", 'Fireplaces', "GarageCars", "GarageArea", 'WoodDeckSF', 'OpenPorchSF']

# All selected features, in the order nominal, ordinal, interval, ratio.
model_features = nom_model + ord_model + int_model + rat_model

All ratio variables will be standardized with a standard scaler (`StandardScaler`).

All interval variables will be normalized to start from 0.

I will encode ord_model features with ordinal encoding. "MSZoning" will be replaced by one-hot encoding.

Ratio features which are heavily skewed or have a large number of outliers will be treated with a log transformation.

In [None]:
# Feature matrix restricted to the chosen model features.
# Take an explicit copy: later cells add and overwrite columns on X, and doing that
# on a selection of house_data raises pandas' SettingWithCopyWarning.
X = house_data[model_features].copy()
# Prediction target.
y = house_data["SalePrice"]

In [None]:
# Features that will be log-transformed in the cells below.
log_features = ["LotArea", "MasVnrArea", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "GarageArea", 'WoodDeckSF',
                'OpenPorchSF', "TotRmsAbvGrd"]

In [None]:
# One histogram per feature on its original (untransformed) scale.
for feature in log_features:
    raw_values = X[feature]
    plt.figure()
    sns.histplot(data=raw_values)
    plt.show()

For comparison, the same variables after taking the logarithm:

In [None]:
# One histogram per feature after the log transformation.
for feature in log_features:
    plt.figure()
    # log1p(x) = log(1 + x), so 0 is mapped to itself; it is the dedicated
    # NumPy routine for this transform.
    sns.histplot(data=np.log1p(X[feature]))
    plt.show()

We see that the number of outliers decreased significantly. A couple of notes:
- LotArea still has a lot of outliers; it might need adjusting in future models.
- Many variables have an accumulation at one point plus a special value of 0, which indicates that the house didn't have the specific feature; these variables might need adjusting to remove the special 0 values.

In [None]:
# Add a log-transformed version of every feature in log_features as "log_<name>".
# log1p(x) = log(1 + x), so 0 is mapped to itself.
log_columns = {}
for feature in log_features:
    log_columns['log_' + feature] = np.log1p(X[feature])
# assign() returns a new frame instead of writing column by column into X,
# which avoids SettingWithCopyWarning when X is a selection of another frame.
X = X.assign(**log_columns)

In [None]:
# The raw columns are no longer needed once their log_ versions exist.
X = X.drop(columns=log_features)

In [None]:
# Shift every interval feature so that its smallest observed value becomes 0.
for feature in int_model:
    minimum = X[feature].min()
    shifted = X[feature] - minimum
    X[feature] = shifted

In [None]:
# Next step: change the ordinal variables to their ordinal encodings.
# Display the list of ordinal model features as a reminder of what has to be encoded.
ord_model

In [None]:
# Lookup tables that turn the quality / condition labels into integer ranks
# (a higher number means a better or more complete feature; 0 means the feature is absent).
exter_dict = {"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1}
bsmt_qual_dict = {"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "NoBsmt":0}
bsmt_exp_dict = {"Gd":4, "Av":3, "Mn":2, "No":1, "NoBsmt":0}
cent_dict = {"Y":1, "N":0}
kitch_dict = exter_dict  # KitchenQual uses the same scale as ExterQual
fire_dict = {"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "NoFireplace":0}
garg_fin_dict = {"Fin":3, "RFn":2, "Unf":1, "NoGarage":0}
garg_cond_dict = {"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "NoGarage":0}
# NOTE(review): the two "Gd*" labels share rank 2 and the two "Mn*" labels share rank 1;
# this looks deliberate - confirm.
fence_dict = {"GdPrv":2, "GdWo":2, "MnPrv":1, "MnWw":1, "NoFence":0}

# Column -> lookup table, spelled out explicitly so that reordering or extending
# ord_model can never silently pair a column with the wrong table.
# "OverallQual" is intentionally absent: it is already numeric.
replacement_dict = {
    "ExterQual": exter_dict,
    "BsmtQual": bsmt_qual_dict,
    "BsmtExposure": bsmt_exp_dict,
    "CentralAir": cent_dict,
    "KitchenQual": kitch_dict,
    "FireplaceQu": fire_dict,
    "GarageFinish": garg_fin_dict,
    "GarageCond": garg_cond_dict,
    "Fence": fence_dict,
}

# Kept for backward compatibility: the tables in the same order as the columns above.
dict_list = list(replacement_dict.values())

In [None]:
# Apply the per-column lookup tables: each key of replacement_dict names a column,
# and its value maps that column's labels to integers.
X = X.replace(to_replace=replacement_dict)

In [None]:
# One-hot encode MSZoning: one 0/1 integer column per zoning category, with the
# columns named after the categories in sorted order.
column = X["MSZoning"]
# pd.get_dummies is vectorized and skips missing values, unlike building a
# Series per row from an identity matrix.
one_hot_encoding = pd.get_dummies(column, dtype=int)
# Kept for backward compatibility: 1-based position of each row's category in the sorted list.
ordinal_encoding = column.map(
    {zone: code for code, zone in enumerate(one_hot_encoding.columns, start=1)}
)
# Append the indicator columns to the feature frame.
X_one_coded = pd.concat([X, one_hot_encoding], axis=1)

In [None]:
# Preview the frame with the one-hot columns appended.
X_one_coded.head()

In [None]:
# The original label column is redundant now that its indicator columns exist.
X_one_coded = X_one_coded.drop(columns="MSZoning")

In [None]:
# Preview the frame after dropping the "MSZoning" column.
X_one_coded.head()

In [None]:
# Hold out 20% of the rows for testing; the fixed random_state keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X_one_coded,
    y,
    test_size=0.2,
    random_state=37,
)

Finally, let's rescale the ratio variables:

In [None]:
# Ratio features (log-transformed where applicable) that get standardized.
ratio_log_features = ["log_LotArea", "log_MasVnrArea", "log_TotalBsmtSF", "log_1stFlrSF", "log_2ndFlrSF", "BsmtFullBath",
                      "FullBath", "BedroomAbvGr", "log_TotRmsAbvGrd", 'Fireplaces', "GarageCars", "log_GarageArea",
                      'log_WoodDeckSF', 'log_OpenPorchSF']

# train_test_split returns selections of the source frame; copy before assigning
# columns so that pandas does not raise SettingWithCopyWarning.
X_train = X_train.copy()

# Learn the per-feature mean and standard deviation from the training rows only,
# then standardize the training features with them.
scaler = StandardScaler()
scaler.fit(X_train[ratio_log_features])
X_train[ratio_log_features] = scaler.transform(X_train[ratio_log_features])

In [None]:
# Preview the training features after rescaling.
X_train.head()

And the same with X_test:

In [None]:
# Standardize the test features with the scaler that was fitted on X_train.
# Fitting a second scaler on X_test would scale train and test with different
# means / standard deviations and leak test-set statistics into preprocessing,
# so the test rows would no longer be comparable to what the model was trained on.
X_test = X_test.copy()  # split outputs are selections; copy before assigning columns
X_test[ratio_log_features] = scaler.transform(X_test[ratio_log_features])

In [None]:
# Preview the test features after rescaling.
X_test.head()

In [None]:
# Persist the prepared splits as CSV files for the model-building step.
for split, csv_path in (
    (X_train, "data/X_train.csv"),
    (X_test, "data/X_test.csv"),
    (y_train, "data/y_train.csv"),
    (y_test, "data/y_test.csv"),
):
    split.to_csv(csv_path)