# EDA [House Prices]

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read both competition files with 'Id' as the index, then stack train on top of test.
train, test = (
    pd.read_csv(path, index_col='Id')
    for path in ('./data/train.csv', './data/test.csv')
)

data = pd.concat(objs=[train, test])

## Numeric features

#### LotFrontage

In [None]:
# Five largest LotFrontage values in train and in test, displayed as a pair.
tuple(df['LotFrontage'].sort_values(ascending=False).head(5) for df in (train, test))

In [None]:
# Drop every row of the combined frame whose LotFrontage is above 300.
front_outliers = data['LotFrontage'] > 300
data.drop(index=front_outliers[front_outliers].index, inplace=True)

#### LotArea

In [None]:
# Five largest LotArea values in train and in test, displayed as a pair.
tuple(df['LotArea'].sort_values(ascending=False).head(5) for df in (train, test))

In [None]:
# Drop every row of the combined frame whose LotArea is above 100000.
area_outliers = data['LotArea'] > 100000
data.drop(index=area_outliers[area_outliers].index, inplace=True)

#### BsmtFinSF1

In [None]:
# Five largest BsmtFinSF1 values in train and in test, displayed as a pair.
tuple(df['BsmtFinSF1'].sort_values(ascending=False).head(5) for df in (train, test))

In [None]:
# Keep the rows but flag them: 100.0 where BsmtFinSF1 is above 3000, 0.0 elsewhere.
bsmt1_outliers = data['BsmtFinSF1'] > 3000
data['BsmtFinSF1_Outliers'] = np.where(bsmt1_outliers, 100.0, 0.0)

#### TotalBsmtSF

In [None]:
# Five largest TotalBsmtSF values in train and in test, displayed as a pair.
tuple(df['TotalBsmtSF'].sort_values(ascending=False).head(5) for df in (train, test))

In [None]:
# Keep the rows but flag them: 100.0 where TotalBsmtSF is above 4000, 0.0 elsewhere.
bsmt_tot_outliers = data['TotalBsmtSF'] > 4000
data['TotalBsmtSF_Outliers'] = np.where(bsmt_tot_outliers, 100.0, 0.0)

#### 1stFlrSF

In [None]:
# Five largest 1stFlrSF values in train and in test, displayed as a pair.
tuple(df['1stFlrSF'].sort_values(ascending=False).head(5) for df in (train, test))

In [None]:
# Keep the rows but flag them: 100.0 where 1stFlrSF is above 4000, 0.0 elsewhere.
f_flr_outliers = data['1stFlrSF'] > 4000
data['1stFlrSF_Outliers'] = np.where(f_flr_outliers, 100.0, 0.0)

#### GrLivArea

In [None]:
# Five largest GrLivArea values in train and in test, displayed as a pair.
tuple(df['GrLivArea'].sort_values(ascending=False).head(5) for df in (train, test))

In [None]:
# Keep the rows but flag them: 100.0 where GrLivArea is above 4500, 0.0 elsewhere.
main_outliers = data['GrLivArea'] > 4500
data['GrLivArea_Outliers'] = np.where(main_outliers, 100.0, 0.0)

#### GarageYrBlt

In [None]:
# Five largest GarageYrBlt values in train and in test, displayed as a pair.
tuple(df['GarageYrBlt'].sort_values(ascending=False).head(5) for df in (train, test))

In [None]:
# Overwrite any garage build year later than 2010 with 2007; other values are kept as they are.
garage_year_outlier = data['GarageYrBlt'] > 2010
data['GarageYrBlt'] = np.where(garage_year_outlier, 2007, data['GarageYrBlt'])

#### WoodDeckSF

In [None]:
# Five largest WoodDeckSF values in train and in test, displayed as a pair.
tuple(df['WoodDeckSF'].sort_values(ascending=False).head(5) for df in (train, test))

In [None]:
# Keep the rows but flag them: 100.0 where WoodDeckSF is above 1000, 0.0 elsewhere.
wood_deck_outliers = data['WoodDeckSF'] > 1000
data['WoodDeckSF_Outliers'] = np.where(wood_deck_outliers, 100.0, 0.0)

#### EnclosedPorch

In [None]:
# Five largest EnclosedPorch values in train and in test, displayed as a pair.
tuple(df['EnclosedPorch'].sort_values(ascending=False).head(5) for df in (train, test))

In [None]:
# Keep the rows but flag them: 100.0 where EnclosedPorch is above 1000, 0.0 elsewhere.
enc_porch_outliers = data['EnclosedPorch'] > 1000
data['EnclosedPorch_Outliers'] = np.where(enc_porch_outliers, 100.0, 0.0)

## Ordinal features


#### MSZoning

In [None]:
# Frequency of each MSZoning label across train and test combined.
data.loc[:, 'MSZoning'].value_counts()

In [None]:
# SalePrice against MSZoning, training rows only.
plt.scatter(x=train['MSZoning'], y=train['SalePrice'])

In [None]:
# Replace each zoning label with its position in the list (0..4); unlisted values become NaN.
data['MSZoning'] = data['MSZoning'].map(
    {zone: rank for rank, zone in enumerate(['C (all)', 'RH', 'RM', 'FV', 'RL'])}
)

#### Street

In [None]:
# SalePrice against Street, training rows only.
plt.scatter(x=train['Street'], y=train['SalePrice'])

In [None]:
# Binary encoding: gravel -> 0, paved -> 1.
data['Street'] = data['Street'].map(dict(Grvl=0, Pave=1))

#### LotShape

In [None]:
# SalePrice against LotShape, training rows only.
plt.scatter(x=train['LotShape'], y=train['SalePrice'])

In [None]:
# Replace each lot-shape label with its position in the list (0..3); unlisted values become NaN.
data['LotShape'] = data['LotShape'].map(
    {shape: rank for rank, shape in enumerate(['IR3', 'IR2', 'Reg', 'IR1'])}
)

#### LandContour

In [None]:
# SalePrice against LandContour, training rows only.
plt.scatter(x=train['LandContour'], y=train['SalePrice'])

In [None]:
# Replace each contour label with its position in the list (0..3); unlisted values become NaN.
data['LandContour'] = data['LandContour'].map(
    {contour: rank for rank, contour in enumerate(['Bnk', 'Low', 'HLS', 'Lvl'])}
)

#### Utilities

In [None]:
# Frequency of each Utilities label across train and test combined.
data.loc[:, 'Utilities'].value_counts()

In [None]:
# Binary encoding: 'AllPub' -> 1, 'NoSeWa' -> 0; any other value becomes NaN.
data['Utilities'] = data['Utilities'].map(dict(AllPub=1, NoSeWa=0))

#### LotConfig

In [None]:
# SalePrice against LotConfig, training rows only.
plt.scatter(x=train['LotConfig'], y=train['SalePrice'])

In [None]:
# Replace each lot-configuration label with its position in the list (0..4); unlisted values become NaN.
data['LotConfig'] = data['LotConfig'].map(
    {config: rank for rank, config in enumerate(['FR3', 'FR2', 'Corner', 'Inside', 'CulDSac'])}
)

#### LandSlope

In [None]:
# SalePrice against LandSlope, training rows only.
plt.scatter(x=train['LandSlope'], y=train['SalePrice'])

In [None]:
# Replace each slope label with its position in the list (0..2); unlisted values become NaN.
data['LandSlope'] = data['LandSlope'].map(
    {slope: rank for rank, slope in enumerate(['Sev', 'Mod', 'Gtl'])}
)

#### Neighborhood

In [None]:
# SalePrice against Neighborhood, training rows only.
plt.scatter(x=train['Neighborhood'], y=train['SalePrice'])

In [None]:
# Distinct neighbourhood labels present in the training set.
train.loc[:, 'Neighborhood'].unique()

In [None]:
# Trial encoding on the training rows only: each neighbourhood gets its position
# in the list below (0 for 'IDOTRR' up to 24 for 'NoRidge').
temp = train['Neighborhood'].map({
    hood: rank
    for rank, hood in enumerate([
        'IDOTRR', 'BrkSide', 'OldTown', 'BrDale', 'MeadowV',
        'Sawyer', 'SWISU', 'Blueste', 'NPkVill', 'Edwards',
        'Mitchel', 'NAmes', 'Blmngtn', 'SawyerW', 'Crawfor',
        'CollgCr', 'ClearCr', 'NWAmes', 'Somerst', 'Gilbert',
        'Veenker', 'Timber', 'StoneBr', 'NridgHt', 'NoRidge',
    ])
})

In [None]:
# SalePrice against the trial neighbourhood ranks held in `temp`.
plt.scatter(x=temp, y=train['SalePrice'])

In [None]:
# Encode Neighborhood on the combined frame: each label gets its position in the
# list below (0 for 'IDOTRR' up to 24 for 'NoRidge'); unlisted values become NaN.
data['Neighborhood'] = data['Neighborhood'].map({
    hood: rank
    for rank, hood in enumerate([
        'IDOTRR', 'BrkSide', 'OldTown', 'BrDale', 'MeadowV',
        'Sawyer', 'SWISU', 'Blueste', 'NPkVill', 'Edwards',
        'Mitchel', 'NAmes', 'Blmngtn', 'SawyerW', 'Crawfor',
        'CollgCr', 'ClearCr', 'NWAmes', 'Somerst', 'Gilbert',
        'Veenker', 'Timber', 'StoneBr', 'NridgHt', 'NoRidge',
    ])
})

#### OverallQual

In [None]:
# SalePrice against OverallQual, training rows only.
plt.scatter(x=train['OverallQual'], y=train['SalePrice'])

#### OverallCond

In [None]:
# SalePrice against OverallCond, training rows only.
plt.scatter(x=train['OverallCond'], y=train['SalePrice'])

#### ExterQual

In [None]:
# SalePrice against ExterQual, training rows only.
plt.scatter(x=train['ExterQual'], y=train['SalePrice'])

In [None]:
# Quality scale Po < Fa < TA < Gd < Ex encoded as 0..4.
# 'Po' ('Poor quality') is listed explicitly so that such a row maps to 0
# rather than silently turning into NaN; values not listed still become NaN.
data['ExterQual'] = data['ExterQual'].map({
    'Po': 0,
    'Fa': 1,
    'TA': 2,
    'Gd': 3,
    'Ex': 4
})

#### ExterCond

In [None]:
# SalePrice against ExterCond, training rows only.
plt.scatter(x=train['ExterCond'], y=train['SalePrice'])

In [None]:
# Replace each condition label with its position in the list (0..4); unlisted values become NaN.
data['ExterCond'] = data['ExterCond'].map(
    {grade: rank for rank, grade in enumerate(['Po', 'Fa', 'TA', 'Gd', 'Ex'])}
)

#### BsmtQual

In [None]:
# SalePrice against BsmtQual (missing shown as 'NoBsmt'), training rows only.
plt.scatter(x=train['BsmtQual'].fillna('NoBsmt'), y=train['SalePrice'])

In [None]:
# Label missing BsmtQual as 'NoBsmt'. The result is assigned back because
# fillna(inplace=True) on a column selection is a chained in-place update:
# pandas 2.x warns about it and it does not modify `data` under Copy-on-Write.
data['BsmtQual'] = data['BsmtQual'].fillna('NoBsmt')

In [None]:
# Replace each basement-quality label with its position in the list (0..4); unlisted values become NaN.
data['BsmtQual'] = data['BsmtQual'].map(
    {grade: rank for rank, grade in enumerate(['NoBsmt', 'Fa', 'TA', 'Gd', 'Ex'])}
)

#### BsmtCond

In [None]:
# SalePrice against BsmtCond (missing shown as 'NoBsmt'), training rows only.
plt.scatter(x=train['BsmtCond'].fillna('NoBsmt'), y=train['SalePrice'])

In [None]:
# Label missing BsmtCond as 'NoBsmt'. The result is assigned back because
# fillna(inplace=True) on a column selection is a chained in-place update:
# pandas 2.x warns about it and it does not modify `data` under Copy-on-Write.
data['BsmtCond'] = data['BsmtCond'].fillna('NoBsmt')

In [None]:
# Replace each basement-condition label with its position in the list (0..5).
# Note that 'NoBsmt' sits between 'Po' and 'Fa'; unlisted values become NaN.
data['BsmtCond'] = data['BsmtCond'].map(
    {grade: rank for rank, grade in enumerate(['Po', 'NoBsmt', 'Fa', 'TA', 'Gd', 'Ex'])}
)

#### BsmtExposure

In [None]:
# SalePrice against BsmtExposure (missing shown as 'NoBsmt'), training rows only.
plt.scatter(x=train['BsmtExposure'].fillna('NoBsmt'), y=train['SalePrice'])

In [None]:
# Label missing BsmtExposure as 'NoBsmt'. The result is assigned back because
# fillna(inplace=True) on a column selection is a chained in-place update:
# pandas 2.x warns about it and it does not modify `data` under Copy-on-Write.
data['BsmtExposure'] = data['BsmtExposure'].fillna('NoBsmt')

In [None]:
# Replace each exposure label with its position in the list (0..4); unlisted values become NaN.
data['BsmtExposure'] = data['BsmtExposure'].map(
    {level: rank for rank, level in enumerate(['NoBsmt', 'No', 'Mn', 'Av', 'Gd'])}
)

#### BsmtFinType1

In [None]:
# SalePrice against BsmtFinType1 (missing shown as 'NoBsmt'), training rows only.
plt.scatter(x=train['BsmtFinType1'].fillna('NoBsmt'), y=train['SalePrice'])

In [None]:
# Label missing BsmtFinType1 as 'NoBsmt'. The result is assigned back because
# fillna(inplace=True) on a column selection is a chained in-place update:
# pandas 2.x warns about it and it does not modify `data` under Copy-on-Write.
data['BsmtFinType1'] = data['BsmtFinType1'].fillna('NoBsmt')

In [None]:
# Replace each finish-type label with its position in the list (0..6); unlisted values become NaN.
data['BsmtFinType1'] = data['BsmtFinType1'].map(
    {
        finish: rank
        for rank, finish in enumerate(['NoBsmt', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'])
    }
)

#### BsmtFinType2

In [None]:
# SalePrice against BsmtFinType2 (missing shown as 'NoBsmt'), training rows only.
plt.scatter(x=train['BsmtFinType2'].fillna('NoBsmt'), y=train['SalePrice'])

In [None]:
# Label missing BsmtFinType2 as 'NoBsmt'. The result is assigned back because
# fillna(inplace=True) on a column selection is a chained in-place update:
# pandas 2.x warns about it and it does not modify `data` under Copy-on-Write.
data['BsmtFinType2'] = data['BsmtFinType2'].fillna('NoBsmt')

In [None]:
# Replace each finish-type label with its position in the list (0..6); unlisted values become NaN.
data['BsmtFinType2'] = data['BsmtFinType2'].map(
    {
        finish: rank
        for rank, finish in enumerate(['NoBsmt', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'])
    }
)

#### Heating

In [None]:
# SalePrice against Heating, training rows only.
plt.scatter(x=train['Heating'], y=train['SalePrice'])

In [None]:
# Frequency of each Heating label across train and test combined.
data.loc[:, 'Heating'].value_counts()

In [None]:
# Heating codes are deliberately uneven: the two gas systems get 10 and 20,
# the remaining types 0..3. Unlisted values become NaN.
data['Heating'] = data['Heating'].map(dict(zip(
    ['Floor', 'Wall', 'OthW', 'Grav', 'GasW', 'GasA'],
    [0, 1, 2, 3, 10, 20],
)))

#### HeatingQC

In [None]:
# SalePrice against HeatingQC, training rows only.
plt.scatter(x=train['HeatingQC'], y=train['SalePrice'])

In [None]:
# Replace each heating-quality label with its position in the list (0..4); unlisted values become NaN.
data['HeatingQC'] = data['HeatingQC'].map(
    {grade: rank for rank, grade in enumerate(['Po', 'Fa', 'TA', 'Gd', 'Ex'])}
)

#### CentralAir

In [None]:
# SalePrice against CentralAir, training rows only.
plt.scatter(x=train['CentralAir'], y=train['SalePrice'])

In [None]:
# Binary encoding: 'N' -> 0, 'Y' -> 1.
data['CentralAir'] = data['CentralAir'].map(dict(N=0, Y=1))

#### Electrical

In [None]:
# SalePrice against Electrical (missing shown as 'No'), training rows only.
plt.scatter(x=train['Electrical'].fillna('No'), y=train['SalePrice'])

In [None]:
# Label missing Electrical as 'No'. The result is assigned back because
# fillna(inplace=True) on a column selection is a chained in-place update:
# pandas 2.x warns about it and it does not modify `data` under Copy-on-Write.
data['Electrical'] = data['Electrical'].fillna('No')

In [None]:
# Replace each electrical-system label with its position in the list (0..5).
# The 'No' placeholder sits between 'FuseA' and 'SBrkr'; unlisted values become NaN.
data['Electrical'] = data['Electrical'].map(
    {
        system: rank
        for rank, system in enumerate(['Mix', 'FuseP', 'FuseF', 'FuseA', 'No', 'SBrkr'])
    }
)

#### FullBath

In [None]:
# Frequency of each FullBath count across train and test combined.
data.loc[:, 'FullBath'].value_counts()

In [None]:
# Cap FullBath at 3: counts 0..3 are kept, 4 is folded into 3; any other value becomes NaN.
data['FullBath'] = data['FullBath'].map({baths: min(baths, 3) for baths in range(5)})

#### BedroomAbvGr

In [None]:
# SalePrice against BedroomAbvGr, training rows only.
plt.scatter(x=train['BedroomAbvGr'], y=train['SalePrice'])

In [None]:
# Fold bedroom counts 5, 6 and 8 into 4; all other counts are left untouched.
# The result is assigned back because replace(inplace=True) on a column selection
# is a chained in-place update: pandas 2.x warns about it and it does not modify
# `data` under Copy-on-Write.
data['BedroomAbvGr'] = data['BedroomAbvGr'].replace({
    5: 4,
    6: 4,
    8: 4
})

#### KitchenAbvGr

In [None]:
# SalePrice against KitchenAbvGr, training rows only.
plt.scatter(x=train['KitchenAbvGr'], y=train['SalePrice'])

In [None]:
# Frequency of each KitchenAbvGr count in train and in test, displayed as a pair.
tuple(df['KitchenAbvGr'].value_counts() for df in (train, test))

In [None]:
# Recode kitchen counts: 1 -> 2, while 2 and 3 -> 1; other counts are left untouched.
# The result is assigned back because replace(inplace=True) on a column selection
# is a chained in-place update: pandas 2.x warns about it and it does not modify
# `data` under Copy-on-Write.
data['KitchenAbvGr'] = data['KitchenAbvGr'].replace({
    1: 2,
    2: 1,
    3: 1
})

#### KitchenQual

In [None]:
# Frequency of each KitchenQual label across train and test combined.
data.loc[:, 'KitchenQual'].value_counts()

In [None]:
# Encode kitchen quality as 1..4 in the order listed; unlisted or missing values become NaN.
data['KitchenQual'] = data['KitchenQual'].map(
    {grade: rank for rank, grade in enumerate(['Fa', 'TA', 'Gd', 'Ex'], start=1)}
)

#### FireplaceQu

In [None]:
# SalePrice against FireplaceQu (missing shown as 'NoFrp'), training rows only.
plt.scatter(x=train['FireplaceQu'].fillna('NoFrp'), y=train['SalePrice'])

In [None]:
# Label missing FireplaceQu as 'NoFrp'. The result is assigned back because
# fillna(inplace=True) on a column selection is a chained in-place update:
# pandas 2.x warns about it and it does not modify `data` under Copy-on-Write.
data['FireplaceQu'] = data['FireplaceQu'].fillna('NoFrp')

In [None]:
# Replace each fireplace-quality label with its position in the list (0..5); unlisted values become NaN.
data['FireplaceQu'] = data['FireplaceQu'].map(
    {grade: rank for rank, grade in enumerate(['NoFrp', 'Po', 'Fa', 'TA', 'Gd', 'Ex'])}
)

#### GarageFinish

In [None]:
# SalePrice against GarageFinish (missing shown as 'No'), training rows only.
plt.scatter(x=train['GarageFinish'].fillna('No'), y=train['SalePrice'])

In [None]:
# Label missing GarageFinish as 'No'. The result is assigned back because
# fillna(inplace=True) on a column selection is a chained in-place update:
# pandas 2.x warns about it and it does not modify `data` under Copy-on-Write.
data['GarageFinish'] = data['GarageFinish'].fillna('No')

In [None]:
# Replace each garage-finish label with its position in the list (0..3); unlisted values become NaN.
data['GarageFinish'] = data['GarageFinish'].map(
    {finish: rank for rank, finish in enumerate(['No', 'Unf', 'RFn', 'Fin'])}
)

#### GarageCars

In [None]:
# Frequency of each GarageCars count across train and test combined.
data.loc[:, 'GarageCars'].value_counts()

In [None]:
# Boolean mask selecting the rows that report a five-car garage.
cars_outlier = (data['GarageCars'] == 5)

In [None]:
# handle outlier
# Indicator column: 1.0 on the rows selected by `cars_outlier`, 0.0 everywhere else.
data['GarageCarsOutlier'] = np.where(cars_outlier, 1.0, 0.0)

In [None]:
# Cap GarageCars at 3: counts 0..3 are kept, 4 and 5 are folded into 3;
# any other value (including missing) becomes NaN.
data['GarageCars'] = data['GarageCars'].map({cars: min(cars, 3) for cars in range(6)})

#### GarageQual

In [None]:
# SalePrice against GarageQual (missing shown as 'No'), training rows only.
plt.scatter(x=train['GarageQual'].fillna('No'), y=train['SalePrice'])

In [None]:
# Label missing GarageQual as 'No'. The result is assigned back because
# fillna(inplace=True) on a column selection is a chained in-place update:
# pandas 2.x warns about it and it does not modify `data` under Copy-on-Write.
data['GarageQual'] = data['GarageQual'].fillna('No')

In [None]:
# Replace each garage-quality label with its position in the list (0..5).
# Note that 'No' sits between 'Po' and 'Fa'; unlisted values become NaN.
data['GarageQual'] = data['GarageQual'].map(
    {grade: rank for rank, grade in enumerate(['Po', 'No', 'Fa', 'TA', 'Gd', 'Ex'])}
)

#### GarageCond

In [None]:
# SalePrice against GarageCond (missing shown as 'No'), training rows only.
plt.scatter(x=train['GarageCond'].fillna('No'), y=train['SalePrice'])

In [None]:
# Label missing GarageCond as 'No'. The result is assigned back because
# fillna(inplace=True) on a column selection is a chained in-place update:
# pandas 2.x warns about it and it does not modify `data` under Copy-on-Write.
data['GarageCond'] = data['GarageCond'].fillna('No')

In [None]:
# Encode GarageCond from its OWN labels. The source column must be 'GarageCond':
# reading 'GarageQual' here (already integer-coded by this point) would match
# none of the string keys and fill GarageCond entirely with NaN.
# Unlisted values become NaN.
data['GarageCond'] = data['GarageCond'].map({
    'Po': 0,
    'No': 1,
    'Fa': 2,
    'TA': 3,
    'Gd': 4,
    'Ex': 5
})

#### PavedDrive

In [None]:
# SalePrice against PavedDrive, training rows only.
plt.scatter(x=train['PavedDrive'], y=train['SalePrice'])

In [None]:
# Encode the driveway label: 'N' -> 0, 'P' -> 1, 'Y' -> 2.
data['PavedDrive'] = data['PavedDrive'].map(dict(N=0, P=1, Y=2))

#### MiscFeature

In [None]:
# SalePrice against MiscFeature (missing shown as 'No'), training rows only.
plt.scatter(x=train['MiscFeature'].fillna('No'), y=train['SalePrice'])

In [None]:
# Label missing MiscFeature as 'No'. The result is assigned back because
# fillna(inplace=True) on a column selection is a chained in-place update:
# pandas 2.x warns about it and it does not modify `data` under Copy-on-Write.
data['MiscFeature'] = data['MiscFeature'].fillna('No')

In [None]:
# Distinct MiscFeature labels in the combined frame.
data.loc[:, 'MiscFeature'].unique()

In [None]:
# Collapse MiscFeature to a binary code: every listed feature type -> 0, the 'No' label -> 1.
# Unlisted values become NaN.
data['MiscFeature'] = data['MiscFeature'].map(
    {**dict.fromkeys(['Shed', 'Gar2', 'Othr', 'TenC'], 0), 'No': 1}
)