In [14]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

# Apply matplotlib's built-in 'ggplot' style sheet to figures drawn after this call.
plt.style.use('ggplot')

In [None]:
# Feature table to analyse; the path is relative to the notebook's working directory.
FEATURE_CSV = "feature_HYJ.csv"
data = pd.read_csv(FEATURE_CSV)

In [3]:
# Show which feature columns are available in the loaded table.
data.columns

Index([u'Id', u'source', u'SalePrice', u'OpenPorchSF', u'OverallCond',
       u'OverallQual', u'PavedDrive', u'PoolArea', u'PoolQC', u'RoofMatl',
       u'RoofStyle', u'SaleCondition', u'SaleType', u'ScreenPorch', u'Street',
       u'TotRmsAbvGrd', u'TotalBsmtSF', u'Utilities', u'WoodDeckSF',
       u'YearBuilt', u'YearRemodAdd', u'YrSold'],
      dtype='object')

# OpenPorchSF
Open porch area in square feet

In [5]:
# Count of missing entries in OpenPorchSF (0 means the column is complete)
data['OpenPorchSF'].isnull().sum()

0

In [6]:
# Summary statistics for the open porch area.
data['OpenPorchSF'].describe()

count    2919.000000
mean       47.486811
std        67.575493
min         0.000000
25%         0.000000
50%        26.000000
75%        70.000000
max       742.000000
Name: OpenPorchSF, dtype: float64

# OverallCond
Rates the overall condition of the house

In [7]:
# Count of missing entries in OverallCond (0 means the column is complete)
data['OverallCond'].isnull().sum()

0

In [8]:
# OverallCond is an ordinal rating (values 1-9 in this data), but pandas
# reads it as an ordinary numeric column, so these are numeric summaries.
data['OverallCond'].describe()

count    2919.000000
mean        5.564577
std         1.113131
min         1.000000
25%         5.000000
50%         5.000000
75%         6.000000
max         9.000000
Name: OverallCond, dtype: float64

# OverallQual
Rates the overall material and finish of the house

In [9]:
# Count of missing entries in OverallQual (0 means the column is complete)
data['OverallQual'].isnull().sum()

0

In [10]:
# OverallQual is an ordinal rating (values 1-10 in this data), but pandas
# reads it as an ordinary numeric column, so these are numeric summaries.
data['OverallQual'].describe()

count    2919.000000
mean        6.089072
std         1.409947
min         1.000000
25%         5.000000
50%         6.000000
75%         7.000000
max        10.000000
Name: OverallQual, dtype: float64

# PavedDrive

In [11]:
# Count of missing entries in PavedDrive (0 means the column is complete)
data['PavedDrive'].isnull().sum()

0

In [12]:
# PavedDrive is read as an object (string) column, so describe() reports
# count / unique / top / freq rather than numeric statistics.
data['PavedDrive'].describe()

count     2919
unique       3
top          Y
freq      2641
Name: PavedDrive, dtype: object

# PoolArea

In [13]:
# Count of missing entries in PoolArea (0 means the column is complete)
data['PoolArea'].isnull().sum()

0

In [16]:
# Summary statistics for the pool area.
data["PoolArea"].describe()

count    2919.000000
mean        2.251799
std        35.663946
min         0.000000
25%         0.000000
50%         0.000000
75%         0.000000
max       800.000000
Name: PoolArea, dtype: float64

In [19]:
# List the houses that actually have a pool (only 13 of them)
has_pool = data['PoolArea'] > 0
data.loc[has_pool, 'PoolArea']

197     512
810     648
1170    576
1182    555
1298    480
1386    519
1423    738
1974    144
2420    368
2503    444
2573    228
2599    561
2710    800
Name: PoolArea, dtype: int64

In [20]:
# Scale the pool area to [0, 1] by dividing by the largest observed area.
# The divisor is read from the column instead of being hard-coded (it was 800),
# which keeps this cell idempotent: on a re-run the maximum is already 1, so
# the values are left unchanged rather than being divided a second time.
pool_area_max = data["PoolArea"].max()
if pool_area_max > 0:  # avoid 0/0 -> NaN if no house has a pool
    data["PoolArea"] = data["PoolArea"] / pool_area_max

# PoolQC

In [25]:
# Show the recorded pool qualities; pools at rows 2420, 2503 and 2599
# do not appear because their quality is missing
data["PoolQC"].dropna()

197     Ex
810     Fa
1170    Gd
1182    Ex
1298    Gd
1386    Fa
1423    Gd
1974    Ex
2573    Ex
2710    Gd
Name: PoolQC, dtype: object

In [34]:
# A house that has a pool (PoolArea > 0) but no recorded pool quality has a
# genuinely missing value: label it "missing". Houses without a pool are left
# as NaN here and are labelled "none" in a later cell.
# The rows are selected with a mask rather than hard-coded row labels, so the
# cell stays correct if the CSV is re-ordered or extended; on the current data
# the mask selects rows 2420, 2503 and 2599.
pool_without_qc = (data["PoolArea"] > 0) & data["PoolQC"].isnull()
data.loc[pool_without_qc, "PoolQC"] = "missing"

In [35]:
# Re-check the non-null PoolQC entries after labelling
data["PoolQC"].dropna()

197          Ex
810          Fa
1170         Gd
1182         Ex
1298         Gd
1386         Fa
1423         Gd
1974         Ex
2420    missing
2503    missing
2573         Ex
2599    missing
2710         Gd
Name: PoolQC, dtype: object

In [37]:
# Every PoolQC entry that is still NA gets the label "none"
qc_is_na = data["PoolQC"].isnull()
data.loc[qc_is_na, "PoolQC"] = "none"

In [40]:
# Confirm that no missing PoolQC entries remain (expected count: 0)
data["PoolQC"].isnull().sum()

0