# EDA - General Dataset Analyses

In [None]:
import warnings

# Silence every warning for the rest of the notebook.
# NOTE(review): this also hides deprecation warnings emitted by pandas,
# seaborn and matplotlib, so outdated API usage goes unnoticed.
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from datetime import datetime, date, time, timedelta
from scipy import stats as stats


from matplotlib.ticker import PercentFormatter
plt.rcParams.update({ "figure.figsize" : (8, 5),"axes.facecolor" : "white", "axes.edgecolor":  "black"})
plt.rcParams["figure.facecolor"]= "w"
pd.plotting.register_matplotlib_converters()
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline


In [None]:
# import cleaned data
df_kc_clean = pd.read_csv('data/king_county_house_clean_dataset.csv')

# disable column truncation so wide frames are displayed in full
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# None is the supported "no limit" value; -1 has been deprecated since
# pandas 1.0 and is no longer accepted by newer pandas releases
pd.set_option('display.max_colwidth', None)

## Distribution of Housing Properties

In [None]:
# distribution of the living area (sqm_living column), default binning
_, ax = plt.subplots()
ax.hist(df_kc_clean['sqm_living']);

In [None]:
# distribution of the bedrooms column, default binning
_, ax = plt.subplots()
ax.hist(df_kc_clean['bedrooms']);

In [None]:
# Histogram of the `condition` column, exported for the presentation
fig, ax = plt.subplots(figsize=(10, 5))

# draw the five-bin histogram onto the prepared axes
condition_plot = df_kc_clean.hist(
    column='condition',
    bins=5,
    color='#7b3294',
    edgecolor='#a6dba0',
    linewidth=2,
    ax=ax,
)

# title and labels; x ticks are hidden, y ticks sit at fixed counts
ax.set_title('Housing Conditions in King County')
ax.set_xlabel('(1) = poor to (5) = very good')
ax.set_ylabel('')
ax.set_xticks([])
ax.set_yticks([0, 5000, 10000, 15000])

# save figure as .jpg to include in presentation
fig.savefig("figures/condition.jpg");

## Does month of purchase affect the price?

In [None]:
# Does the purchase timing throughout the year affect the price?
# Are there price differences between the desired neighborhood and others?
# Plotted: price/sqm of living area per month sold, colored by social_ngh.

# Build the palette object explicitly. sns.set_palette() only sets the
# global default and returns None, so its return value must not be used
# as a palette.
colors = ['#7b3294', '#a6dba0']
customPalette = sns.color_palette(colors)
sns.set_palette(customPalette)
sns.set_style("darkgrid")

# Create a scatterplot for categorical variables using catplot:
timing_plot = sns.catplot(
    data=df_kc_clean,
    x='month_sold', y='price_sqm_living', hue='social_ngh',
    jitter=True,
    palette=customPalette,
    legend=False  # the legend is added below with a custom title
    )

# set figure properties
timing_plot.fig.set_size_inches(15, 5)
timing_plot.fig.suptitle('Does Timing Matter?', y=(1.05), size=22, color='#7b3294');
timing_plot.add_legend(title='Social Neighborhood')
timing_plot.set_axis_labels('Month of Property Selling', 'Price/sqm Living Area in US$', fontsize=12)

# save figure as .jpg to include in presentation
timing_plot.savefig("figures/timing.jpg")

# display plot (plt.show() does not take a figure/grid argument)
plt.show()

## Are houses in the desired neighborhood more expensive?

In [None]:
# Partition the listings by the social_ngh flag (1 vs. 0) so that the mean
# price/sqm_living of the two neighborhood groups can be compared
df_social_1 = df_kc_clean.loc[df_kc_clean['social_ngh'] == 1]
df_social_0 = df_kc_clean.loc[df_kc_clean['social_ngh'] == 0]

In [None]:
# summary statistics of price for rows with social_ngh == 1
df_social_1['price'].describe()

In [None]:
# summary statistics of price for rows with social_ngh == 0
df_social_0['price'].describe()

In [None]:
# Rounded mean price/sqm_living for each of the two neighborhood groups
mean_desired = df_social_1['price_sqm_living'].mean().round()
mean_other = df_social_0['price_sqm_living'].mean().round()

print(f'mean price per sqm_living in desired neighborhoods: {mean_desired}')
print(f'mean price per sqm_living in other neighborhoods: {mean_other}')

In [None]:
# point plot of price/sqm_living for each social_ngh category
sns.pointplot(
    data=df_kc_clean,
    x='social_ngh',
    y='price_sqm_living',
);

## Does year of construction affect the price?

In [None]:
# Plot the mean price/sqm of living area by year of building construction.

# sns.set_palette() returns None, so keep the palette object itself and
# register it as the global default in a separate call.
colors = ['#a6dba0']
sns.set_style("darkgrid")
customPalette = sns.color_palette(colors)
sns.set_palette(customPalette)

# `palette` only applies together with `hue`; a single line is colored via
# `color` instead.
# NOTE(review): `ci` is deprecated in seaborn >= 0.12 in favor of
# `errorbar=None`; kept here for compatibility with older seaborn versions.
price_yr_built = sns.lineplot(
    x='yr_built', y='price_sqm_living', data=df_kc_clean,
    ci=None, color=colors[0]
)
price_yr_built.set(xlabel='Year of Construction', ylabel='Mean of Price/sqm Living Area')
price_yr_built.set(ylim=(2000, 4500))

# despine after drawing so it acts on the axes of this plot
sns.despine()

# save figure as .jpg to include in presentation
# (save THIS plot's figure, not the catplot from the timing section)
price_yr_built.figure.savefig("figures/build_year.jpg")

# display plot (plt.show() does not take a figure argument)
plt.show();

In [None]:
# summary statistics for rows with yr_built before 1950
df_kc_clean[df_kc_clean['yr_built'] < 1950].describe()

In [None]:
# summary statistics for rows with yr_built of 1950 or later
df_kc_clean[df_kc_clean['yr_built'] >= 1950].describe()

## Which features are correlated with the price?

In [None]:
# descriptive statistics of the price_sqm_living column
df_kc_clean['price_sqm_living'].describe()

In [None]:
# Pearson correlation heatmap to check for features correlated with price.

# Select numeric/bool columns explicitly: newer pandas versions no longer
# drop non-numeric columns silently inside DataFrame.corr().
corr = df_kc_clean.select_dtypes(include=['number', 'bool']).corr()

# Styler.set_precision() was deprecated in pandas 1.3 and removed in 2.0;
# a plain format string gives the same two-decimal display on all versions.
corr.style.background_gradient(cmap='coolwarm').format('{:.2f}')

In [None]:
# Pearson's r between the two continuous columns price and sqm_living
stats.pearsonr(
    df_kc_clean.price,
    df_kc_clean.sqm_living,
)

In [None]:
# Spearman's rank correlation: price/sqm living area vs. bathrooms
stats.spearmanr(
    a=df_kc_clean.price_sqm_living,
    b=df_kc_clean.bathrooms,
    axis=0,
    alternative='two-sided',
)

In [None]:
# Spearman's rank correlation: price/sqm living area vs. construction
stats.spearmanr(
    a=df_kc_clean.price_sqm_living,
    b=df_kc_clean.construction,
    axis=0,
    alternative='two-sided',
)

In [None]:
# Spearman's rank correlation: price/sqm living area vs. condition
stats.spearmanr(
    a=df_kc_clean.price_sqm_living,
    b=df_kc_clean.condition,
    axis=0,
    alternative='two-sided',
)

In [None]:
# Spearman's rank correlation: price/sqm living area vs. bedrooms
stats.spearmanr(
    a=df_kc_clean.price_sqm_living,
    b=df_kc_clean.bedrooms,
    axis=0,
    alternative='two-sided',
)

In [None]:
# Spearman's rank correlation: price/sqm living area vs. month_sold
stats.spearmanr(
    a=df_kc_clean.price_sqm_living,
    b=df_kc_clean.month_sold,
    axis=0,
    alternative='two-sided',
)

## Subset of affordable housing for stakeholder (lower 15% of price range):

In [None]:
# The client's budget is limited to property in the lowest 15% of the
# price range; show the 15th percentile of price/sqm living area that
# bounds this 'affordable' segment:

np.percentile(df_kc_clean.price_sqm_living, q=15)

In [None]:
# Subset of 'affordable' property: rows whose price/sqm living area lies at
# or below the 15th percentile.
# The cut-off is computed from the data instead of being hard-coded, so it
# cannot drift out of sync when the cleaned dataset changes.
low15_cutoff = np.percentile(df_kc_clean['price_sqm_living'], 15)
df_kc_low15 = df_kc_clean.query('price_sqm_living <= @low15_cutoff')
df_kc_low15.describe()

In [None]:
# write the affordable subset to .csv, leaving out the index column
df_kc_low15.to_csv(
    path_or_buf='data/king_county_house_affordable_dataset.csv',
    index=False,
)