## Advance house price prediction - feature engineering

## Import libraries

In [7]:
# Let's import the required libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

## Import training dataset

In [9]:
# Let's import dataset

df = pd.read_csv(r'E:\Study\Projects\EDA\KirshNaik\train.csv')

In [12]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


## Handling Missing Values of Categorical features

In [33]:
# Let's see the missing values of categorical features

categorical_nan = [feature for feature in df.columns if df[feature].isnull().sum() > 1 and df[feature].dtypes == 'O']
categorical_nan

['Alley',
 'MasVnrType',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'FireplaceQu',
 'GarageType',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [57]:
for feature in categorical_nan:
    data = df.copy()
    #data[feature] = np.round(data[feature].isnull().mean(), 4)
    print(f'{feature} has {np.round(data[feature].isnull().mean(), 4)} % of missing values')
    #print(feature, 'has', np.round(data[feature].isnull().mean(), 4), '% of missing values')

Alley has 0.9377 % of missing values
MasVnrType has 0.5973 % of missing values
BsmtQual has 0.0253 % of missing values
BsmtCond has 0.0253 % of missing values
BsmtExposure has 0.026 % of missing values
BsmtFinType1 has 0.0253 % of missing values
BsmtFinType2 has 0.026 % of missing values
FireplaceQu has 0.4726 % of missing values
GarageType has 0.0555 % of missing values
GarageFinish has 0.0555 % of missing values
GarageQual has 0.0555 % of missing values
GarageCond has 0.0555 % of missing values
PoolQC has 0.9952 % of missing values
Fence has 0.8075 % of missing values
MiscFeature has 0.963 % of missing values


In [93]:
# Fill nan values as 'missing'
# write a definition to fill
# check whether the sum of missing values is > 0.
# Add new column with + nan to fill 1 or 0
# Add median() of those missing values in the respective features
# Do all above steps except creation of a defintion for handling numerical nan values
# check the temporal variables
# Ignore YrSold alone
# Calculate the difference of all feature with 'yrsold' feature so that it displays numbers for those feature
# Finally display those number temporal variables alone

In [75]:
# Let's create a definition to fill those categorical nan values as 'missing'.
# This will create a new label/category as 'missing'

def fill_missing(df, categorical_nan):
    data = df.copy()
    data[categorical_nan] = data[categorical_nan].fillna('missing')
    return data

In [89]:
dataset = fill_missing(df, categorical_nan)
dataset[categorical_nan].isnull().sum()

Alley           0
MasVnrType      0
BsmtQual        0
BsmtCond        0
BsmtExposure    0
BsmtFinType1    0
BsmtFinType2    0
FireplaceQu     0
GarageType      0
GarageFinish    0
GarageQual      0
GarageCond      0
PoolQC          0
Fence           0
MiscFeature     0
dtype: int64

In [91]:
dataset.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,missing,Reg,Lvl,AllPub,...,0,missing,missing,missing,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,missing,Reg,Lvl,AllPub,...,0,missing,missing,missing,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,missing,IR1,Lvl,AllPub,...,0,missing,missing,missing,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,missing,IR1,Lvl,AllPub,...,0,missing,missing,missing,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,missing,IR1,Lvl,AllPub,...,0,missing,missing,missing,0,12,2008,WD,Normal,250000
