# EDA - General Dataset Analyses

In [None]:
import warnings

# Silence every warning category for the whole notebook.
# NOTE(review): this also hides deprecation warnings from pandas/seaborn — consider narrowing the filter.
warnings.filterwarnings("ignore")

from datetime import datetime, date, time, timedelta

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from scipy import stats


from matplotlib.ticker import PercentFormatter
# Notebook-wide matplotlib defaults: 8x5 figures with white backgrounds and black axes edges.
plot_defaults = {
    "figure.figsize": (8, 5),
    "figure.facecolor": "w",
    "axes.facecolor": "white",
    "axes.edgecolor": "black",
}
plt.rcParams.update(plot_defaults)

# Register pandas' date/time converters with matplotlib.
pd.plotting.register_matplotlib_converters()

# Render floats with three decimals in DataFrame output.
pd.set_option('display.float_format', lambda value: '%.3f' % value)
%matplotlib inline


In [None]:
# Load the cleaned dataset from disk into the working DataFrame.
clean_csv_path = 'data/king_county_house_clean_dataset.csv'
df_kc_clean = pd.read_csv(clean_csv_path)

# Disable column truncation: show every column, auto-detect the output width,
# and never cut off long cell contents.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
# `None` means "no limit"; the former sentinel -1 is deprecated since pandas 1.0
# and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)

In [None]:
# Histogram of the `sqm_living` column, labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(df_kc_clean['sqm_living'])
ax.set(title='Distribution of sqm_living', xlabel='sqm_living', ylabel='Number of records');

In [None]:
# Histogram of the `bedrooms` column, labelled so the figure can be read on its own.
fig, ax = plt.subplots()
ax.hist(df_kc_clean['bedrooms'])
ax.set(title='Distribution of bedrooms', xlabel='bedrooms', ylabel='Number of records');

In [None]:
# Print column names, non-null counts and dtypes of the cleaned frame.
df_kc_clean.info()

In [None]:
# Plot price per sqm of living area against the month of sale,
# coloured by the `social_ngh` flag.

colors = ['#7b3294', '#a6dba0']
# sns.set_palette() returns None, so build the palette object first and
# register it as the default afterwards; `customPalette` is then a real palette.
customPalette = sns.color_palette(colors)
sns.set_palette(customPalette)
sns.set_style("darkgrid")

# Categorical scatter plot; the legend is added manually below to give it a title.
timing_plot = sns.catplot(
    data=df_kc_clean,
    x='month_sold', y='price_sqm_living', hue='social_ngh',
    jitter=True,
    palette=customPalette,
    legend=False
    )

timing_plot.fig.set_size_inches(15, 5)
timing_plot.fig.suptitle('Does Timing Matter?', y=(1.05), size=22, color='#7b3294')
timing_plot.add_legend(title='Social Neighborhood')
timing_plot.set_axis_labels('Month of Property Selling', 'Price/sqm Living Area in US$', fontsize=12)

# plt.show() renders the current figure; the previous argument `my_plot`
# was never defined and raised a NameError.
plt.show()

In [None]:
# Print column names, non-null counts and dtypes of the cleaned frame.
df_kc_clean.info()

In [None]:
# Summary statistics of the cleaned frame's columns.
df_kc_clean.describe()

In [None]:
# Split the data by the `social_ngh` flag (1 vs. 0).
social_flag = df_kc_clean['social_ngh']
df_social_1 = df_kc_clean[social_flag == 1]
df_social_0 = df_kc_clean[social_flag == 0]

In [None]:
# Price summary for rows with social_ngh == 1.
df_social_1['price'].describe()

In [None]:
# Price summary for rows with social_ngh == 0.
df_social_0['price'].describe()

In [None]:
# Rebuild the social_ngh == 1 / == 0 subsets (same split as the earlier cell).
df_social_1 = df_kc_clean.loc[df_kc_clean['social_ngh'] == 1]
df_social_0 = df_kc_clean.loc[df_kc_clean['social_ngh'] == 0]


In [None]:
# Rounded mean price per sqm for rows with social_ngh == 0.
df_social_0['price_sqm_living'].mean().round()

In [None]:
# Rounded mean price per sqm for rows with social_ngh == 1.
df_social_1['price_sqm_living'].mean().round()

In [None]:
# Point estimate of price per sqm for each value of the social_ngh flag.
sns.pointplot(
    data=df_kc_clean,
    x='social_ngh',
    y='price_sqm_living',
)



In [None]:

# Line plot of the mean price per sqm of living area by year of construction.
colors = ['#a6dba0']

sns.set_style("darkgrid")
# sns.set_palette() returns None, so create the palette first and register it afterwards.
customPalette = sns.color_palette(colors)
sns.set_palette(customPalette)

# `palette` only applies together with `hue`; pass the line colour directly instead.
price_yr_built = sns.lineplot(
    x='yr_built',
    y='price_sqm_living',
    data=df_kc_clean,
    ci=None,
    color=colors[0],
)

price_yr_built.set(
    xlabel='Year of Construction',
    ylabel='Mean of Price/sqm Living Area',
    ylim=(2000, 4500),
)

# despine must run after the axes exist; calling it before plotting has no effect.
sns.despine(ax=price_yr_built)

In [None]:
# Summary statistics for rows with yr_built before 1950.
built_before_1950 = df_kc_clean['yr_built'] < 1950
df_kc_clean[built_before_1950].describe()

In [None]:
# Summary statistics for rows with yr_built of 1950 or later.
built_since_1950 = df_kc_clean['yr_built'] >= 1950
df_kc_clean[built_since_1950].describe()

In [None]:
# Summary statistics of the cleaned frame's columns (repeats the earlier overview for comparison with the subsets below).
df_kc_clean.describe()

In [None]:
# Summary statistics for rows with social_ngh == 1.
df_kc_clean[df_kc_clean['social_ngh'] == 1].describe()

In [None]:
# Summary statistics for rows with social_ngh == 0.
df_kc_clean[df_kc_clean['social_ngh'] == 0].describe()

In [None]:
# Histogram of the `condition` column with five bins, drawn on an explicit 10x5 axes.
fig, ax = plt.subplots(figsize=(10, 5))

bar_style = dict(bins=5, color='#7b3294', edgecolor='#a6dba0', linewidth=2)
df_kc_clean.hist(column='condition', ax=ax, **bar_style)

# Replace the default title/labels set by pandas.
ax.set_title('Housing Conditions in King')
ax.set_xlabel('(1) = poor to (5) = very good')
ax.set_ylabel('')

# Hide tick marks on both axes.
plt.xticks([])
plt.yticks([])

In [None]:
# Print column names, non-null counts and dtypes of the cleaned frame.
df_kc_clean.info()


In [None]:
# Pairwise Pearson correlation of the numeric/boolean columns, shown as a heat-mapped table.
# Selecting the dtypes explicitly keeps corr() working on pandas 2.x, which no
# longer drops non-numeric columns silently.
corr = df_kc_clean.select_dtypes(include=['number', 'bool']).corr()
# Styler.set_precision() was removed in pandas 2.0; format(precision=...) replaces it.
corr.style.background_gradient(cmap='coolwarm').format(precision=2)

In [None]:
# Pearson correlation (coefficient and p-value) between price and living area.
stats.pearsonr(
    x=df_kc_clean['price'],
    y=df_kc_clean['sqm_living'],
)

In [None]:
# Spearman rank correlation between price per sqm and the `bathrooms` column.
# scipy.stats is imported with the other dependencies at the top of the notebook;
# the former in-cell import ran only after `stats` had already been used one cell earlier.
stats.spearmanr(df_kc_clean['price_sqm_living'], df_kc_clean['bathrooms'], axis=0, alternative='two-sided')

In [None]:
# Spearman rank correlation between price per sqm and the `construction` column.
stats.spearmanr(
    a=df_kc_clean['price_sqm_living'],
    b=df_kc_clean['construction'],
    axis=0,
    alternative='two-sided',
)

In [None]:
# Spearman rank correlation between price per sqm and the `condition` column.
stats.spearmanr(
    a=df_kc_clean['price_sqm_living'],
    b=df_kc_clean['condition'],
    axis=0,
    alternative='two-sided',
)

In [None]:
# Spearman rank correlation between price per sqm and the `bedrooms` column.
stats.spearmanr(
    a=df_kc_clean['price_sqm_living'],
    b=df_kc_clean['bedrooms'],
    axis=0,
    alternative='two-sided',
)

In [None]:
# Spearman rank correlation between price per sqm and the `month_sold` column.
stats.spearmanr(
    a=df_kc_clean['price_sqm_living'],
    b=df_kc_clean['month_sold'],
    axis=0,
    alternative='two-sided',
)

In [None]:
# Summary statistics of price per sqm of living area.
df_kc_clean['price_sqm_living'].describe()

In [None]:
# The client's budget is limited to property in the lowest 15% of the price range.
# Compute the 15th percentile of price per sqm as the cut-off for 'affordable' property:

np.percentile(a=df_kc_clean.price_sqm_living, q=15)

In [None]:
# Keep only rows at or below the price-per-sqm cap and summarise them.
# NOTE(review): 1717 is presumably the rounded 15th percentile from the previous cell —
# confirm, and consider deriving it from the data instead of hard-coding it.
price_cap = 1717
df_kc_low15 = df_kc_clean[df_kc_clean['price_sqm_living'] <= price_cap]
df_kc_low15.describe()

In [None]:
# Write the affordable subset to CSV without the index column.
affordable_csv_path = 'data/king_county_house_affordable_dataset.csv'
df_kc_low15.to_csv(affordable_csv_path, index=False)