# EDA - Stakeholder Analyses


In [None]:
import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from datetime import datetime, date, time, timedelta


from matplotlib.ticker import PercentFormatter
plt.rcParams.update({ "figure.figsize" : (8, 5),"axes.facecolor" : "white", "axes.edgecolor":  "black"})
plt.rcParams["figure.facecolor"]= "w"
pd.plotting.register_matplotlib_converters()
pd.set_option('display.float_format', lambda x: '%.3f' % x)
%matplotlib inline

In [None]:
# Load the cleaned King County listings produced by the earlier cleaning step.
df_kc_clean = pd.read_csv(
    filepath_or_buffer='data/king_county_house_clean_dataset.csv',
)

# Disable column truncation so wide frames are shown in full.
# `None` means "no limit" for each of these options. The previous value of -1
# for display.max_colwidth has been deprecated since pandas 1.0 and is
# rejected by newer pandas versions, so `None` is used there as well.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

## Stakeholder Requirement
- budget assumption: lower 15 percent of price/sqm living area
- location: social neighborhood, family friendly
- five children: 6 bedrooms or >= 4 bedrooms with at least 150 sqm living area 
- fair condition, >= 3
- good construction, >4


Crowding measures (WHO Housing and Health Guidelines):

- Overcrowding occurs if there are more than three people per habitable room (88).
- American Crowding Index: crowding occurs if there is more than one person per room; severe crowding occurs if there are more than 1.5 persons per room (excluding bathrooms, balconies, porches, foyers, hallways and half-rooms).

Assumption: The requirements can be fulfilled with at least a fair chance if there are at least 100 listings matching these criteria in the data set.

In [None]:
# 15th percentile of the price per square metre of living area;
# listings at or below this value form the "lowest 15%" budget segment.
np.percentile(a=df_kc_clean['price_sqm_living'], q=15)

In [None]:
# Keep only listings whose price per sqm of living area is at most 1717.
# NOTE(review): 1717 is presumably the rounded 15th percentile from the
# previous cell - confirm, and re-check it whenever the input data changes.
is_low_priced = df_kc_clean['price_sqm_living'] <= 1717
df_kc_low15 = df_kc_clean.loc[is_low_priced]


In [None]:
# Persist the affordable subset as CSV, without writing the index column.
df_kc_low15.to_csv(
    path_or_buf='data/king_county_house_affordable_dataset.csv',
    index=False,
)

In [None]:
# Summary statistics of the affordable subset.
df_kc_low15.describe()

In [None]:
# Strictest search: social_ngh == 1, at least 6 bedrooms, and construction
# and condition both rated >= 4. Only the (rows, columns) shape is displayed.
(
    df_kc_low15
    .query('social_ngh == 1 and bedrooms >= 6 and construction >= 4 and condition >= 4')
    .sort_values('price')
    .shape
)

In [None]:
# Alternative to the 6-bedroom rule: at least 120 sqm of living area with
# at least 4 bedrooms, construction and condition still rated >= 4.
(
    df_kc_low15
    .query('social_ngh == 1 and sqm_living >= 120 and bedrooms >= 4 and construction >= 4 and condition >= 4')
    .sort_values('price')
    .shape
)

I want to look at the distribution of the criteria variables to see which requirements I need to relax to get more listings:

Reducing the bedroom requirement to at least 3 was a good idea, but not sufficient. Since this is the minimum number of bedrooms required, we need to look at other factors.

There are only a limited number of listings with construction ratings above 3, so this criterion reduces the results a lot. Construction should be set to at least fair (>= 3).

In [None]:
# Histogram of bedroom counts among the affordable (lower 15%) listings.
#
# The seaborn style is set BEFORE the figure is created: set_style() only
# updates matplotlib's rcParams, which axes read when they are created, so
# calling it after plt.subplots() leaves the current figure unstyled and makes
# the look depend on which cells ran earlier.
sns.set_style("darkgrid")

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(
    ax=ax,
    data=df_kc_low15,
    x='bedrooms',
    discrete=True,
    binwidth=1,
    color='#7b3294',
    edgecolor='#a6dba0',
    linewidth=2,
)
sns.despine()

ax.set_title('Number of Bedrooms in Listed Housing', color='#7b3294')
ax.set_ylabel('')
ax.set_xlabel('')
ax.set_xticks([1, 2, 3, 4, 5, 6, 7, 8, 9])
ax.set_yticks([])  # hide the count axis

plt.savefig("bedrooms_distribution.jpg")

In [None]:
# Histogram of construction ratings among the affordable (lower 15%) listings.
#
# The seaborn style is set BEFORE the figure is created: set_style() only
# updates matplotlib's rcParams, which axes read when they are created, so
# calling it after plt.subplots() would not style this figure unless an
# earlier cell had already set the style.
sns.set_style("darkgrid")

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(
    ax=ax,
    data=df_kc_low15,
    x='construction',
    discrete=True,
    binwidth=1,
    color='#7b3294',
    edgecolor='#a6dba0',
    linewidth=2,
)
sns.despine()

ax.set_title('Construction Ratings', color='#7b3294')
ax.set_ylabel('')
ax.set_xlabel('(1) = poor to (5) = very good')
ax.set_xticks([1, 2, 3, 4, 5])
ax.set_yticks([])  # hide the count axis

plt.savefig("construction_distribution.jpg")

In [None]:
# Histogram of condition ratings among the affordable (lower 15%) listings.
#
# The seaborn style is set BEFORE the figure is created: set_style() only
# updates matplotlib's rcParams, which axes read when they are created, so
# calling it after plt.subplots() would not style this figure unless an
# earlier cell had already set the style.
sns.set_style("darkgrid")

fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(
    ax=ax,
    data=df_kc_low15,
    x='condition',
    discrete=True,
    binwidth=1,
    color='#7b3294',
    edgecolor='#a6dba0',
    linewidth=2,
)
sns.despine()

ax.set_title('Condition Ratings', color='#7b3294')
ax.set_ylabel('')
ax.set_xlabel('(1) = poor to (5) = very good')
ax.set_xticks([1, 2, 3, 4, 5])
ax.set_yticks([])  # hide the count axis

plt.savefig("condition_distribution.jpg")

Most listings are in fair condition, so relaxing this criterion makes a lot of sense.

In [None]:
# Share of listings per social_ngh value, for the full data set and for the
# affordable (lower 15%) subset, drawn on the same axes.
#
# The seaborn style is set BEFORE the figure is created: set_style() only
# updates matplotlib's rcParams, which axes read when they are created.
sns.set_style("darkgrid")

fig, ax = plt.subplots(figsize=(10, 5))

# Each histogram carries its own `label`, so the legend entry is bound to the
# right artist. Passing only a list of labels to ax.legend() matches labels to
# artists purely by order, which matplotlib discourages and which can attach
# the labels to the wrong bars.
sns.histplot(
    ax=ax,
    data=df_kc_clean,
    x='social_ngh',
    label='Total Price Range',
    color='w',
    edgecolor='#463f1a',
    linewidth=2,
    binwidth=.8,
    bins=2,
    discrete=True,
    stat='probability',
    alpha=0.2,
)
sns.histplot(
    ax=ax,
    data=df_kc_low15,
    x='social_ngh',
    label='Lower 15%',
    color='w',
    edgecolor='#ffac81',
    linewidth=2,
    binwidth=.8,
    bins=2,
    discrete=True,
    stat='probability',
    alpha=0.2,
)
sns.despine()

ax.set_title('Housing Listings in King County', color='#7b3294')
ax.set_ylabel('')
# The two bar positions are named through the x-label instead of tick labels.
ax.set_xlabel('Other Areas                                                      Desired Neighborhoods')
ax.set_xticks([])

ax.legend()

plt.savefig("social_ngh_distribution over sample.jpg");

Location, location, location: as expected, the good neighborhood limits the chances greatly; a last resort would be to move to a less desired neighborhood.

In [None]:
# 6-bedroom search with the condition requirement relaxed to >= 3
# (construction still >= 4, social_ngh == 1).
(
    df_kc_low15
    .query('social_ngh == 1 and bedrooms >= 6 and construction >= 4 and condition >= 3')
    .sort_values('price')
    .shape
)

In [None]:
# Square-metre alternative (>= 120 sqm and >= 4 bedrooms) with construction
# and condition both still required to be >= 4.
(
    df_kc_low15
    .query('social_ngh == 1 and sqm_living >= 120 and bedrooms >= 4 and construction >= 4 and condition >= 4')
    .sort_values('price')
    .shape
)

## Reducing Stakeholder Requirements
- As seen in the distributions, reducing to 4 bedrooms is a good idea, but it was not enough to yield a sufficient number of available listings.

In [None]:
# >= 120 sqm and >= 4 bedrooms, condition relaxed to >= 3, construction still >= 4.
(
    df_kc_low15
    .query('social_ngh == 1 and sqm_living >= 120 and bedrooms >= 4 and construction >= 4 and condition >= 3')
    .sort_values('price')
    .shape
)

In [None]:
# >= 6 bedrooms with both construction and condition relaxed to >= 3.
(
    df_kc_low15
    .query('social_ngh == 1 and bedrooms >= 6 and construction >= 3 and condition >= 3')
    .sort_values('price')
    .shape
)

In [None]:
# Relaxed search the notebook settles on: social_ngh == 1, >= 120 sqm,
# >= 4 bedrooms, construction >= 3 and condition >= 3.
(
    df_kc_low15
    .query('social_ngh == 1 and sqm_living >= 120 and bedrooms >= 4 and construction >= 3 and condition >= 3')
    .sort_values('price')
    .shape
)

In [None]:
# Same relaxed search, with the bedroom minimum lowered further to 3.
(
    df_kc_low15
    .query('social_ngh == 1 and sqm_living >= 120 and bedrooms >= 3 and construction >= 3 and condition >= 3')
    .sort_values('price')
    .shape
)

## Feasible Requirements
- social neighborhood
- 120 square meter living and at least four bedrooms
- fair condition
- fair construction

- Searching outside of the desired neighborhood would increase the chances significantly:


In [None]:
# Relaxed search (>= 120 sqm, >= 3 bedrooms, construction and condition >= 3)
# applied to listings with social_ngh == 0.
(
    df_kc_low15
    .query('social_ngh == 0 and sqm_living >= 120 and bedrooms >= 3 and construction >= 3 and condition >= 3')
    .sort_values('price')
    .shape
)