In [28]:
# need to hot one encode and other stuff both train and test data
# the save it as test and train data in csv
# That I can use for tensorflow model

# Preprocess of house prices dataset

In this notebook, model features will be chosen and transformed.
Final datasets for tensorflow are gonna be prepared both from test and train sets

In [29]:
# import dataset and libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
%matplotlib inline
pd.plotting.register_matplotlib_converters()
plt.rc('figure', figsize=(16, 6))

In [30]:
orig_data = pd.read_csv("data/train_preprocessed.csv", index_col="Id")
orig_test_data = pd.read_csv("data/test_preprocessed.csv", index_col="Id")

In [31]:
# copying the dataset for analysis
house_test_data = orig_test_data.copy()
house_data = orig_data.copy()
house_data.head()

Unnamed: 0_level_0,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,60,RL,65.0,8450,Pave,NoAlley,Reg,Lvl,AllPub,Inside,...,0,NoPool,NoFence,NoFeature,0,2,2008,WD,Normal,208500
2,20,RL,80.0,9600,Pave,NoAlley,Reg,Lvl,AllPub,FR2,...,0,NoPool,NoFence,NoFeature,0,5,2007,WD,Normal,181500
3,60,RL,68.0,11250,Pave,NoAlley,IR1,Lvl,AllPub,Inside,...,0,NoPool,NoFence,NoFeature,0,9,2008,WD,Normal,223500
4,70,RL,60.0,9550,Pave,NoAlley,IR1,Lvl,AllPub,Corner,...,0,NoPool,NoFence,NoFeature,0,2,2006,WD,Abnorml,140000
5,60,RL,84.0,14260,Pave,NoAlley,IR1,Lvl,AllPub,FR2,...,0,NoPool,NoFence,NoFeature,0,12,2008,WD,Normal,250000


Let's first divide the features into separate categories depending if they are nominal, ordinal, interval or ratio variables:

In [32]:
categorical_features = ["MSSubClass", "MSZoning", "Street", "Alley", "LotShape", "LandContour", "Utilities", "LotConfig",
                        "LandSlope", "Neighborhood", "Condition1", "Condition2", "BldgType", "HouseStyle", "OverallQual",
                        "OverallCond", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "ExterQual",
                        "ExterCond", "Foundation", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2",
                        "Heating", "HeatingQC", "CentralAir", "Electrical", "KitchenQual", "Functional", "FireplaceQu",
                        "GarageType", "GarageFinish", "GarageQual", "GarageCond", "PavedDrive", 'PoolQC', 'Fence',
                        'MiscFeature', 'SaleType', 'SaleCondition']

numerical_features = ["LotFrontage", "LotArea", "YearBuilt", "YearRemodAdd", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2",
                      "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BsmtFullBath",
                      "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr", "KitchenAbvGr", "TotRmsAbvGrd", 'Fireplaces',
                      "GarageYrBlt", "GarageCars", "GarageArea", 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
                      'ScreenPorch', 'PoolArea', "MiscVal", 'MoSold', 'YrSold']

def plot_numerical_feature(feature, include_non_positive=True):
    plot_column = X[feature]
    if not include_non_positive:
        plot_column = plot_column[plot_column > 0]

    print(f"Description of {feature}")
    print(plot_column.describe(), "\n")
    print(f"Histogram of {feature}")
    plt.figure()
    sns.histplot(data=plot_column)
    plt.show()
    print(f"Kernel density estimation plot of {feature}")
    plt.figure()
    sns.kdeplot(data=plot_column, shade=True)
    plt.show()
    print(f"Box plot of {feature}")
    plt.figure()
    sns.boxplot(x=plot_column)
    plt.show()

In [33]:
# It might have been beneficial to perform this in the first place, but it's never too late.

nominal = ["MSSubClass", "MSZoning", "LandContour", "LotConfig", "Neighborhood", "Condition1", "Condition2", "BldgType",
           "HouseStyle", "RoofStyle", "RoofMatl", "Exterior1st", "Exterior2nd", "MasVnrType", "Foundation", "Heating",
           "GarageType", 'MiscFeature', 'SaleType', 'SaleCondition']
ordinal = ["Street", "Alley", "LotShape", "Utilities", "LandSlope", "OverallQual", "OverallCond", "ExterQual", "ExterCond",
           "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "HeatingQC", "CentralAir", "Electrical",
           "KitchenQual", "Functional", "FireplaceQu", "GarageFinish", "GarageQual", "GarageCond", "PavedDrive", 'PoolQC',
           'Fence']
interval = ["YearBuilt", "YearRemodAdd", "GarageYrBlt", 'MoSold', 'YrSold']
ratio = ["LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF",
         "2ndFlrSF", "LowQualFinSF", "GrLivArea", "BsmtFullBath", "BsmtHalfBath", "FullBath", "HalfBath", "BedroomAbvGr",
         "KitchenAbvGr", "TotRmsAbvGrd", 'Fireplaces', "GarageCars", "GarageArea", 'WoodDeckSF', 'OpenPorchSF',
         'EnclosedPorch', '3SsnPorch','ScreenPorch', 'PoolArea', "MiscVal"]
print(len(nominal + ordinal + interval + ratio))

79


Now for the first model I will choose a subset of those features which represent most of the characteristics of a house. I tried to choose variables with high variety.

In [34]:
ord_model = ["OverallQual", "ExterQual", "BsmtQual", "BsmtExposure", "CentralAir", "KitchenQual", "FireplaceQu", "GarageFinish",
             "GarageCond", "Fence"]
int_model = ["YearBuilt", 'MoSold', 'YrSold']
nom_model = ["MSZoning"]
rat_model = ["LotArea", "MasVnrArea", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "BsmtFullBath", "FullBath", "BedroomAbvGr",
             "TotRmsAbvGrd", 'Fireplaces', "GarageCars", "GarageArea", 'WoodDeckSF', 'OpenPorchSF']

model_features = nom_model + ord_model + int_model + rat_model

All ratio variables will be treated with standards scaler.

All interval variables will be normalized to start from 0.

I will encode ord_model features with ordinal encoding. "MSZoning" will be replaced by one-hot encoding.

Ratio features which are heavily skewed or have large number of outliers will be treated with log transformation.

In [35]:
test_X = house_test_data[model_features]
X = house_data[model_features]
y = house_data["SalePrice"]

In [36]:
log_features = ["LotArea", "MasVnrArea", "TotalBsmtSF", "1stFlrSF", "2ndFlrSF", "GarageArea", 'WoodDeckSF',
                'OpenPorchSF', "TotRmsAbvGrd"]

In [37]:
for feature in log_features:
    transformed_column = np.log(X[feature] + 1)
    X['log_' + feature] = transformed_column

for feature in log_features:
    transformed_column = np.log(test_X[feature] + 1)
    test_X['log_' + feature] = transformed_column

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['log_' + feature] = transformed_column
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_X['log_' + feature] = transformed_column


In [38]:
test_X = test_X.drop(log_features, axis=1)
X = X.drop(log_features, axis=1)

In [39]:
for feature in int_model:
    minimum = X[feature].min()
    X[feature] = X[feature] - minimum

for feature in int_model:
    minimum = test_X[feature].min()
    test_X[feature] = test_X[feature] - minimum

In [40]:
# changing ordinal variables to their ordinal encodings:
ord_model

['OverallQual',
 'ExterQual',
 'BsmtQual',
 'BsmtExposure',
 'CentralAir',
 'KitchenQual',
 'FireplaceQu',
 'GarageFinish',
 'GarageCond',
 'Fence']

In [41]:
exter_dict = {"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1}
bsmt_qual_dict = {"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "NoBsmt":0}
bsmt_exp_dict = {"Gd":4, "Av":3, "Mn":2, "No":1, "NoBsmt":0}
cent_dict = {"Y":1, "N":0}
kitch_dict = exter_dict
fire_dict = {"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "NoFireplace":0}
garg_fin_dict = {"Fin":3, "RFn":2, "Unf":1, "NoGarage":0}
garg_cond_dict = {"Ex":5, "Gd":4, "TA":3, "Fa":2, "Po":1, "NoGarage":0}
fence_dict = {"GdPrv":2, "GdWo":2, "MnPrv":1, "MnWw":1, "NoFence":0}

dict_list = [exter_dict, bsmt_qual_dict, bsmt_exp_dict, cent_dict, kitch_dict, fire_dict,
             garg_fin_dict, garg_cond_dict, fence_dict]
replacement_dict = dict(zip(ord_model[1:], dict_list))

In [42]:
test_X = test_X.replace(replacement_dict)
X = X.replace(replacement_dict)

In [43]:
# changing MSZoning with one_hot_encoding
column = X["MSZoning"]
ordinal_encoding = column.replace(dict(zip(sorted(set(column)), range(1, len(sorted(set(column))) + 1))))
one_hot_encoding = ordinal_encoding.apply(
    lambda oe: pd.Series(np.diag(np.ones(len(set(column))))[oe - 1].astype(int))
)
one_hot_encoding.columns = sorted(set(column))
X_one_coded = pd.concat([X, one_hot_encoding], axis=1)


# changing MSZoning with one_hot_encoding for test_X
column = test_X["MSZoning"]
ordinal_encoding = column.replace(dict(zip(sorted(set(column)), range(1, len(sorted(set(column))) + 1))))
one_hot_encoding = ordinal_encoding.apply(
    lambda oe: pd.Series(np.diag(np.ones(len(set(column))))[oe - 1].astype(int))
)
one_hot_encoding.columns = sorted(set(column))
test_X_one_coded = pd.concat([test_X, one_hot_encoding], axis=1)

In [44]:
test_X_one_coded = test_X_one_coded.drop("MSZoning", axis=1)
X_one_coded = X_one_coded.drop("MSZoning", axis=1)

In [45]:
X_one_coded.head()

Unnamed: 0_level_0,OverallQual,ExterQual,BsmtQual,BsmtExposure,CentralAir,KitchenQual,FireplaceQu,GarageFinish,GarageCond,Fence,...,log_2ndFlrSF,log_GarageArea,log_WoodDeckSF,log_OpenPorchSF,log_TotRmsAbvGrd,C (all),FV,RH,RL,RM
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,7,4,4,1,1,4,0,2,3,0,...,6.751101,6.308098,0.0,4.127134,2.197225,0,0,0,1,0
2,6,3,4,4,1,3,3,2,3,0,...,0.0,6.133398,5.700444,0.0,1.94591,0,0,0,1,0
3,7,4,4,2,1,4,3,2,3,0,...,6.765039,6.411818,0.0,3.7612,1.94591,0,0,0,1,0
4,7,3,3,1,1,4,4,1,3,0,...,6.629363,6.466145,0.0,3.583519,2.079442,0,0,0,1,0
5,8,4,4,3,1,4,3,2,3,0,...,6.960348,6.729824,5.26269,4.442651,2.302585,0,0,0,1,0


In [46]:
# Split for train and validation data
X_train,X_valid,y_train,y_valid = train_test_split(X_one_coded,y,random_state=37,test_size=0.2)

At last let's rescale ratio variables:

In [47]:
ratio_log_features = ["log_LotArea", "log_MasVnrArea", "log_TotalBsmtSF", "log_1stFlrSF", "log_2ndFlrSF", "BsmtFullBath",
                      "FullBath", "BedroomAbvGr", "log_TotRmsAbvGrd", 'Fireplaces', "GarageCars", "log_GarageArea",
                      'log_WoodDeckSF', 'log_OpenPorchSF']

scaler = StandardScaler()
scaler.fit(X_train[ratio_log_features])
X_train[ratio_log_features] = scaler.transform(X_train[ratio_log_features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_train[ratio_log_features] = scaler.transform(X_train[ratio_log_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


In [48]:
# Let's scale valid and test data in the same way as train data (for consistency)
X_valid[ratio_log_features] = scaler.transform(X_valid[ratio_log_features])
test_X_one_coded[ratio_log_features] = scaler.transform(test_X_one_coded[ratio_log_features])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X_valid[ratio_log_features] = scaler.transform(X_valid[ratio_log_features])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, value[:, i].tolist(), pi)


In [49]:
# Let's save those values for model building:
X_train.to_csv("data/X_train.csv")
X_valid.to_csv("data/X_valid.csv")
y_train.to_csv("data/y_train.csv")
y_valid.to_csv("data/y_valid.csv")
test_X_one_coded.to_csv("data/X_test.csv")