In [None]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

## Load Data

In [None]:
# Read both splits from the local data directory.
train, test = (pd.read_csv(csv_path)
               for csv_path in ('./data/train.csv', './data/test.csv'))

# Keep the target column handy as its own Series (it also stays in `train`).
train_labels = train.loc[:, 'SalePrice']

# Stack the two splits; the outer index level ('train' / 'test') records
# which file every row came from.
data = pd.concat([train, test], keys=['train', 'test'])

## General Analysis

In [None]:
# Show the column labels of the combined train/test frame.
print(data.columns)

In [None]:
# Report the size of the combined frame, then the row count of each split.
total_rows, total_cols = data.shape
print('rows:', total_rows, ', columns: ', total_cols)
print('rows in train dataset: ', len(train))
print('rows in test dataset: ', len(test))

In [None]:
# Missing-value summary per column for both splits: absolute count and share
# of rows expressed in percent.
train_missing = train.isnull().sum()
test_missing = test.isnull().sum()
nans = pd.concat(
    [train_missing, 100 * train_missing / len(train),
     test_missing, 100 * test_missing / len(test)],
    axis=1,
    # Labels must be unique so that every column can be selected by name.
    keys=['Train', 'Train %', 'Test', 'Test %'],
)
# Show only the columns with at least one missing value in either split.
# A column that exists in just one split gets NaN for the other split's
# entries, which sum() skips.
print(nans[nans[['Train', 'Test']].sum(axis=1) > 0])

## Exploring SalePrice

In [None]:
# Summary statistics of the target, followed by its skewness and kurtosis.
price_summary = train_labels.describe()
price_skew = train_labels.skew()
price_kurt = train_labels.kurt()

print(price_summary)
print("Skewness: %f" % price_skew)
print("Kurtosis: %f" % price_kurt)

In [None]:
# Left panel: distribution of SalePrice with a fitted normal curve overlaid.
# Right panel: normal probability plot of the same values.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
price_fig, (hist_ax, qq_ax) = plt.subplots(1, 2)

hist_ax.set_title("Sale Prices Dist")
sns.distplot(train_labels, fit=stats.norm, ax=hist_ax)

stats.probplot(train_labels, plot=qq_ax)

plt.show()

In [None]:
# Same diagnostics after a natural-log transform of the target.
log_prices = np.log(train_labels)

log_fig, (hist_ax, qq_ax) = plt.subplots(1, 2)
hist_ax.set_title("Sale Prices Dist")
sns.distplot(log_prices, fit=stats.norm, ax=hist_ax)
stats.probplot(log_prices, plot=qq_ax)
plt.show()

# Shape measures of the transformed target.
print("Skewness: %f" % log_prices.skew())
print("Kurtosis: %f" % log_prices.kurt())

## Exploring Features

In [None]:
# Remove the Id column plus five feature columns from the training frame.
# NOTE(review): presumably these five were chosen for their high share of
# missing values in the NaN summary — confirm.
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable: a second execution is a no-op rather than a KeyError.
train = train.drop(['Id', 'MiscFeature', 'Fence', 'PoolQC', 'FireplaceQu', 'Alley'],
                   axis=1, errors='ignore')

In [None]:
# Draw the correlation-coefficient matrix of the numeric features.
# The numeric columns are selected explicitly: since pandas 2.0,
# DataFrame.corr() no longer drops non-numeric columns silently and raises
# when it meets string data. (Assumes `train` still holds categorical
# columns at this point.)
corrmat = train.select_dtypes(include=[np.number]).corr()
f, ax = plt.subplots(figsize=(12, 9))
# vmax caps the colour scale at 0.8.
sns.heatmap(corrmat, vmax=.8, square=True, cmap="YlGnBu", ax=ax)
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Annotated heatmap of the features most strongly correlated with SalePrice.
n_top = 10  # number of variables shown in the heatmap (SalePrice included)

# Restrict to numeric columns: pandas >= 2.0 raises in DataFrame.corr() when
# the frame contains string data.
corrmat = train.select_dtypes(include=[np.number]).corr()
cols = corrmat.nlargest(n_top, 'SalePrice')['SalePrice'].index

# Slice the matrix computed above instead of recomputing it with np.corrcoef
# on the raw values: a single missing value in one of the selected columns
# would turn that column's np.corrcoef coefficients into NaN, whereas corr()
# works on pairwise-complete observations.
cm = corrmat.loc[cols, cols].values

sns.set(font_scale=1.25)
hm = sns.heatmap(cm, cbar=True, annot=True, square=True, cmap="YlGnBu", fmt='.2f',
                 annot_kws={'size': 10}, yticklabels=cols.values, xticklabels=cols.values)
plt.yticks(rotation=0)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Basement and floor areas versus sale price: one scatter panel per feature.
f, (ax1, ax2, ax3) = plt.subplots(1, 3)

# Two-column frames (target + one area feature) used for the panels.
data_total_bsmt = train[['SalePrice', 'TotalBsmtSF']]
data1 = train[['SalePrice', '1stFlrSF']]
data2 = train[['SalePrice', '2ndFlrSF']]

panels = (
    (data_total_bsmt, 'TotalBsmtSF', ax1),
    (data1, '1stFlrSF', ax2),
    (data2, '2ndFlrSF', ax3),
)
for panel_frame, feature, panel_ax in panels:
    # The same y-range on every panel keeps them comparable.
    panel_frame.plot.scatter(x=feature, y='SalePrice', ylim=(0, 800000), ax=panel_ax)
plt.show()

In [None]:
# Pairwise scatter plots of SalePrice and a hand-picked set of features.
sns.set()  # reset seaborn styling to its defaults
cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'YearBuilt']
# `height` is the current name of the per-facet size argument; the former
# `size` keyword is deprecated.
sns.pairplot(train[cols], height=2.5)
plt.show()

In [None]:
# Collapse basement, first-floor and second-floor area into one TotalSF
# feature, then discard the columns that are no longer needed.
area_parts = ['TotalBsmtSF', '1stFlrSF', '2ndFlrSF']

# The guard makes the cell safe to re-run: once TotalSF exists, its source
# columns are already gone and must not be read again.
if 'TotalSF' not in train.columns:
    # Missing areas count as 0 so they do not turn the total into NaN.
    train = train.assign(TotalSF=train[area_parts].fillna(0).sum(axis=1))

# NOTE(review): GarageArea is dropped "as analysis before" — presumably
# because of its correlation with GarageCars in the heatmap; confirm.
# errors='ignore' makes a repeated drop a no-op instead of a KeyError.
train = train.drop(['BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'GarageArea'] + area_parts,
                   axis=1, errors='ignore')

In [None]:
# Scatter of the combined floor area (TotalSF) against the sale price.
data_total = train[['SalePrice', 'TotalSF']]
data_total.plot.scatter(
    x='TotalSF',
    y='SalePrice',
    ylim=(0, 800000),
)
plt.show()

In [None]:
# Three panels: each of GrLivArea and TotRmsAbvGrd against SalePrice, then
# the two features against each other.
f, (ax1, ax2, ax3) = plt.subplots(1, 3)

data_gr = train[['SalePrice', 'GrLivArea']]
data_to = train[['SalePrice', 'TotRmsAbvGrd']]
gr_to = train[['GrLivArea', 'TotRmsAbvGrd']]

# Feature vs. target, with a shared price range on the y-axis.
for panel_frame, feature, panel_ax in ((data_gr, 'GrLivArea', ax1),
                                       (data_to, 'TotRmsAbvGrd', ax2)):
    panel_frame.plot.scatter(x=feature, y='SalePrice', ylim=(0, 800000), ax=panel_ax)

# Feature vs. feature (no y-limit here).
gr_to.plot.scatter(x='TotRmsAbvGrd', y='GrLivArea', ax=ax3)
plt.show()

In [None]:
# Features removed from the training frame before the remaining exploration.
unused_features = [
    'Utilities', 'RoofMatl', 'MasVnrArea', 'MasVnrType', 'Heating', 'LowQualFinSF',
    'BsmtFullBath', 'BsmtHalfBath', 'BsmtQual', 'BsmtCond', 'BsmtExposure',
    'BsmtFinType1', 'BsmtFinType2',
    'Functional', 'GarageYrBlt', 'GarageCond', 'GarageType', 'GarageFinish',
    'GarageQual', 'WoodDeckSF',
    'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea',
    'MiscVal',
]
# Reassigning with errors='ignore' (instead of inplace=True) keeps the cell
# re-runnable: a second execution is a no-op rather than a KeyError.
train = train.drop(unused_features, axis=1, errors='ignore')
# Show which columns are left.
print(train.columns)

In [None]:
# Box plot of SalePrice for every OverallQual level.
overall_qual = train[['SalePrice', 'OverallQual']]
f, ax = plt.subplots(figsize=(8, 6))
# Despite its name, `fig` is the Axes object returned by seaborn.
fig = sns.boxplot(x='OverallQual', y="SalePrice", data=overall_qual, ax=ax)
fig.set_ylim(0, 800000)
plt.show()

In [None]:
# Box plot of SalePrice for every YearBuilt value.
year_built = train[['SalePrice', 'YearBuilt']]
f, ax = plt.subplots(figsize=(16, 8))
# Despite its name, `fig` is the Axes object returned by seaborn.
fig = sns.boxplot(x='YearBuilt', y="SalePrice", data=year_built, ax=ax)
fig.set_ylim(0, 800000)
# Rotate the x tick labels by 90 degrees.
plt.xticks(rotation=90)
plt.show()

In [None]:
# Distribution of GrLivArea with a fitted normal curve (left) and its normal
# probability plot (right), followed by the shape measures.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases.
living_area = train['GrLivArea']

area_fig, (hist_ax, qq_ax) = plt.subplots(1, 2)
hist_ax.set_title("GrLivArea Dist")
sns.distplot(living_area, fit=stats.norm, ax=hist_ax)
stats.probplot(living_area, plot=qq_ax)
plt.show()

print("Skewness: %f" % living_area.skew())
print("Kurtosis: %f" % living_area.kurt())

In [None]:
# Log-log scatter of above-ground living area against sale price.
sf = np.log(train['GrLivArea'])
sp = np.log(train['SalePrice'])

# Plot only the rows whose log-area is strictly positive (GrLivArea > 1).
positive_log_area = sf > 0
plt.scatter(sf[positive_log_area], sp[positive_log_area])
plt.show()

In [None]:
# Box plot of SalePrice for every ExterQual category.
# NOTE(review): the frame keeps its historical name `mssubclass`, although
# the feature it holds is ExterQual.
mssubclass = train[['SalePrice', 'ExterQual']]
f, ax = plt.subplots(figsize=(8, 6))
# Despite its name, `fig` is the Axes object returned by seaborn.
fig = sns.boxplot(x='ExterQual', y="SalePrice", data=mssubclass, ax=ax)
fig.set_ylim(0, 800000)
plt.xticks(rotation=90)
plt.show()

In [None]:
# Scatter of OverallCond against SalePrice.
# NOTE(review): the frame keeps its historical name `LotFrontage`, although
# the feature it holds is OverallCond.
LotFrontage = train[['SalePrice', 'OverallCond']]
LotFrontage.plot.scatter(
    x='OverallCond',
    y='SalePrice',
    ylim=(0, 800000),
)
plt.show()