# EDA [House Prices]

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Read both splits with their Id column as the index, then stack them
# (train rows first) into the combined frame the cells below operate on.
train, test = (
    pd.read_csv(csv_path, index_col='Id')
    for csv_path in ('./data/train.csv', './data/test.csv')
)

data = pd.concat([train, test])

## Numeric features

#### LotFrontage

In [None]:
# Five largest LotFrontage values in each split (train first, then test).
tuple(
    split['LotFrontage'].sort_values(ascending=False).iloc[:5]
    for split in (train, test)
)

In [None]:
# Rows of the combined frame whose LotFrontage exceeds 300
# (missing values compare False and are never flagged).
front_outliers = (data['LotFrontage'] > 300)

# Only training rows may be removed: `data` also contains the test rows, and a
# threshold filter over the whole frame would silently delete any test row that
# crosses it. Training membership is decided by Id label, so this assumes the
# train and test Ids do not overlap (any label-based drop needs that anyway).
front_droppable = front_outliers & data.index.isin(train.index)

# Rebind instead of mutating in place; re-running the cell is a no-op.
data = data.drop(index=data.loc[front_droppable].index)

#### LotArea

In [None]:
# Five largest LotArea values in each split (train first, then test).
tuple(
    split['LotArea'].sort_values(ascending=False).iloc[:5]
    for split in (train, test)
)

In [None]:
# Rows of the combined frame whose LotArea exceeds 100000.
area_outliers = (data['LotArea'] > 100000)

# Only training rows may be removed: `data` also contains the test rows, and a
# threshold filter over the whole frame would silently delete any test row that
# crosses it. Training membership is decided by Id label, so this assumes the
# train and test Ids do not overlap (any label-based drop needs that anyway).
area_droppable = area_outliers & data.index.isin(train.index)

# Rebind instead of mutating in place; re-running the cell is a no-op.
data = data.drop(index=data.loc[area_droppable].index)

#### BsmtFinSF1

In [None]:
# Five largest BsmtFinSF1 values in each split (train first, then test).
tuple(
    split['BsmtFinSF1'].sort_values(ascending=False).iloc[:5]
    for split in (train, test)
)

In [None]:
# Flag, without removing, rows whose BsmtFinSF1 exceeds 3000. The companion
# column holds 100.0 for flagged rows and 0.0 otherwise (rows with a missing
# BsmtFinSF1 get 0.0 because NaN > 3000 is False).
bsmt1_outliers = data['BsmtFinSF1'] > 3000
data['BsmtFinSF1_Outliers'] = np.where(bsmt1_outliers, 100.0, 0.0)

#### TotalBsmtSF

In [None]:
# Five largest TotalBsmtSF values in each split (train first, then test).
tuple(
    split['TotalBsmtSF'].sort_values(ascending=False).iloc[:5]
    for split in (train, test)
)

In [None]:
# Flag, without removing, rows whose TotalBsmtSF exceeds 4000. The companion
# column holds 100.0 for flagged rows and 0.0 otherwise (rows with a missing
# TotalBsmtSF get 0.0 because NaN > 4000 is False).
bsmt_tot_outliers = data['TotalBsmtSF'] > 4000
data['TotalBsmtSF_Outliers'] = np.where(bsmt_tot_outliers, 100.0, 0.0)

#### 1stFlrSF

In [None]:
# Five largest 1stFlrSF values in each split (train first, then test).
tuple(
    split['1stFlrSF'].sort_values(ascending=False).iloc[:5]
    for split in (train, test)
)

In [None]:
# Flag, without removing, rows whose 1stFlrSF exceeds 4000. The companion
# column holds 100.0 for flagged rows and 0.0 otherwise.
f_flr_outliers = data['1stFlrSF'] > 4000
data['1stFlrSF_Outliers'] = np.where(f_flr_outliers, 100.0, 0.0)

#### GrLivArea

In [None]:
# Five largest GrLivArea values in each split (train first, then test).
tuple(
    split['GrLivArea'].sort_values(ascending=False).iloc[:5]
    for split in (train, test)
)

In [None]:
# Flag, without removing, rows whose GrLivArea exceeds 4500. The companion
# column holds 100.0 for flagged rows and 0.0 otherwise.
main_outliers = data['GrLivArea'] > 4500
data['GrLivArea_Outliers'] = np.where(main_outliers, 100.0, 0.0)

#### GarageYrBlt

In [None]:
# Five largest GarageYrBlt values in each split (train first, then test).
tuple(
    split['GarageYrBlt'].sort_values(ascending=False).iloc[:5]
    for split in (train, test)
)

In [None]:
# Garage build years later than 2010 are overwritten with 2007; every other
# value, including missing ones, is left as it is.
garage_year_outlier = (data['GarageYrBlt'] > 2010)
data['GarageYrBlt'] = data['GarageYrBlt'].mask(garage_year_outlier, 2007)

#### WoodDeckSF

In [None]:
# Five largest WoodDeckSF values in each split (train first, then test).
tuple(
    split['WoodDeckSF'].sort_values(ascending=False).iloc[:5]
    for split in (train, test)
)

In [None]:
# Flag, without removing, rows whose WoodDeckSF exceeds 1000. The companion
# column holds 100.0 for flagged rows and 0.0 otherwise.
wood_deck_outliers = data['WoodDeckSF'] > 1000
data['WoodDeckSF_Outliers'] = np.where(wood_deck_outliers, 100.0, 0.0)

#### EnclosedPorch

In [None]:
# Five largest EnclosedPorch values in each split (train first, then test).
tuple(
    split['EnclosedPorch'].sort_values(ascending=False).iloc[:5]
    for split in (train, test)
)

In [None]:
# Flag, without removing, rows whose EnclosedPorch exceeds 1000. The companion
# column holds 100.0 for flagged rows and 0.0 otherwise.
enc_porch_outliers = data['EnclosedPorch'] > 1000
data['EnclosedPorch_Outliers'] = np.where(enc_porch_outliers, 100.0, 0.0)

## Ordinal features


#### MSZoning

In [None]:
# Row count per MSZoning label in the combined frame, most frequent first;
# missing values are not counted.
data['MSZoning'].value_counts(dropna=True)

In [None]:
# SalePrice against MSZoning label, using the training frame.
plt.scatter(x=train['MSZoning'], y=train['SalePrice'])

In [None]:
# Integer-encode MSZoning by position in this list (first label -> 0).
# Missing values, and any label not listed, come out of `map` as NaN.
zoning_order = ['C (all)', 'RH', 'RM', 'FV', 'RL']
zoning_codes = {label: code for code, label in enumerate(zoning_order)}
data['MSZoning'] = data['MSZoning'].map(zoning_codes)

#### Street

In [None]:
# SalePrice against Street label, using the training frame.
plt.scatter(x=train['Street'], y=train['SalePrice'])

In [None]:
# Integer-encode Street by position in this list (first label -> 0).
# Missing values, and any label not listed, come out of `map` as NaN.
street_order = ['Grvl', 'Pave']
street_codes = {label: code for code, label in enumerate(street_order)}
data['Street'] = data['Street'].map(street_codes)

#### LotShape

In [None]:
# SalePrice against LotShape label, using the training frame.
plt.scatter(x=train['LotShape'], y=train['SalePrice'])

In [None]:
# Integer-encode LotShape by position in this list (first label -> 0).
# Missing values, and any label not listed, come out of `map` as NaN.
lot_shape_order = ['IR3', 'IR2', 'Reg', 'IR1']
lot_shape_codes = {label: code for code, label in enumerate(lot_shape_order)}
data['LotShape'] = data['LotShape'].map(lot_shape_codes)

#### LandContour

In [None]:
# SalePrice against LandContour label, using the training frame.
plt.scatter(x=train['LandContour'], y=train['SalePrice'])

In [None]:
# Integer-encode LandContour by position in this list (first label -> 0).
# Missing values, and any label not listed, come out of `map` as NaN.
contour_order = ['Bnk', 'Low', 'HLS', 'Lvl']
contour_codes = {label: code for code, label in enumerate(contour_order)}
data['LandContour'] = data['LandContour'].map(contour_codes)

#### Utilities

In [None]:
# Row count per Utilities label in the combined frame, most frequent first;
# missing values are not counted.
data['Utilities'].value_counts(dropna=True)

In [None]:
# Integer-encode Utilities by position in this list (first label -> 0).
# Missing values, and any label not listed, come out of `map` as NaN.
utilities_order = ['NoSeWa', 'AllPub']
utilities_codes = {label: code for code, label in enumerate(utilities_order)}
data['Utilities'] = data['Utilities'].map(utilities_codes)