# EDA

Make sure to run and check the Data prep first so that you have a clean csv table to start with. Original data is incomplete when it comes to zip to names and city evals.

## Setup

In [220]:
# import the necessary libraries you need for your analysis
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

from matplotlib.ticker import PercentFormatter
plt.rcParams.update({ "figure.figsize" : (8, 5),"axes.facecolor" : "white", "axes.edgecolor":  "black"})
plt.rcParams["figure.facecolor"]= "w"
pd.plotting.register_matplotlib_converters()
# Floats (decimal numbers) should be displayed rounded with 1 decimal place
pd.set_option('display.float_format', lambda x: '%.1f' % x)
# Set style for plots
plt.style.use('fivethirtyeight') 

In [221]:
# Load cleaned csv
dfviz = pd.read_csv('data/king_county_data_viz.csv',)
dfviz.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 36 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      21597 non-null  int64  
 1   date                    21597 non-null  object 
 2   price                   21597 non-null  float64
 3   bedrooms                21597 non-null  int64  
 4   bathrooms               21597 non-null  float64
 5   sqft_living             21597 non-null  int64  
 6   sqft_lot                21597 non-null  int64  
 7   floors                  21597 non-null  float64
 8   waterfront              21597 non-null  bool   
 9   view                    21597 non-null  float64
 10  condition               21597 non-null  int64  
 11  grade                   21597 non-null  int64  
 12  sqft_above              21597 non-null  int64  
 13  sqft_basement           21597 non-null  float64
 14  yr_built                21597 non-null

## Hypothesis testing

### **Hypothesis 1:** 

The stakeholder has keen interest in storing concrete related objects and needs an object with waterview and a rather large basement. 
Are there any good options on the market with not too many competitors?

**There are waterfront objects with basements.**

In [222]:
hypo1 = dfviz[['id','price','price_per_sqft_house','price_per_sqft_lot','price_tag_house','price_tag_lot','renovation_indicator','waterfront','basement_sizes_string','basement_sizes_int','basement_to_rest_ratio','classification']]
hypo1_waterfront = hypo1.query('waterfront == True')
hypo1_waterfront['classification'].unique()

array(['Island', 'Has Shore', 'Richie rich', 'City ', 'Forest',
       'Poor dogs', 'City', 'Belle', 'Wide land', 'Outskirt'],
      dtype=object)

In [223]:
# Plot Number of waterfront objects with basement sizes and pricetags
#sortingf1 = dfviz.groupby('basement_sizes_string')['basement_sizes_int'].count().sort_values(ascending=False)
f1x = px.histogram(hypo1_waterfront, x='basement_sizes_string',barmode='group',color='price_tag_house',
    color_discrete_sequence=px.colors.qualitative.Dark24,height=500,width=1000) # create countplot // order='sortingf1'
f1x.update_layout(title="Number of properties with basements near the waterfront by pricy category",
    xaxis_title="Basement size clusters",
    yaxis_title="Number of properties",
    legend_title="Property price cluster",
)
f1x.update_xaxes(categoryorder='array', categoryarray= ['xsmall','small','medium','large','xlarge'])
f1x.show()

In [224]:
# Plot Number of waterfront objects with basement sizes and region classification
#sortingf1 = dfviz.groupby('basement_sizes_string')['basement_sizes_int'].count().sort_values(ascending=False)
plt.figure(figsize=(15,5)) # specify figure size
f2x = px.histogram(hypo1_waterfront, x='basement_sizes_string',barmode='group',color='classification',
    color_discrete_sequence=px.colors.qualitative.Dark24, height=500,width=1000)
f2x.update_layout(title="Number of properties with basements near the waterfront by region classification",
    xaxis_title="Basement size clusters",
    yaxis_title="Number of properties",
    legend_title="Classification cluster",
)
f2x.update_xaxes(categoryorder='array', categoryarray= ['xsmall','small','medium','large','xlarge'])
f2x.show()

<Figure size 1500x500 with 0 Axes>

In [225]:
# renovation status - refine if time is left - renovation indicator needs finer granularity
# Plot Number of waterfront objects with renovation status and pricetags
#sortingf1 = dfviz.groupby('basement_sizes_string')['basement_sizes_int'].count().sort_values(ascending=False)
plt.figure(figsize=(15,5)) # specify figure size
f1x = px.bar(hypo1_waterfront, x='price_tag_house',color='renovation_indicator',color_discrete_sequence=px.colors.qualitative.Dark24) # create countplot // order='sortingf1'
f1x.update_layout(title="Number of properties with basements near the waterfront by pricy category",
    xaxis_title="Basement size clusters",
    yaxis_title="Number of properties",
    legend_title="Property price cluster",
)
f1x.update_xaxes(categoryorder='array', categoryarray= ['xsmall','small','medium','large','xlarge'])
f1x.show()

<Figure size 1500x500 with 0 Axes>

### **Hypothesis 2:** 

The stakeholder wants to know which of the Top10% are highly in demand to send a problem solver to make prices go down. Possible options include ruining the condition or overall grade of the neighborhood or killing the view with construction sites. She wants to know what will reduce the prices of other properties the most.

**Pricing for top10% priced houses is driven by view, condition, grade.**

- create pricing variable using quantile function: small, medium, large, x-tra-large (filter)
- try out correlation matrix with view, condition, grade.

In [226]:
hypo2 =  dfviz[['id','price_tag_house','price','price_per_sqft_house','sqft_living15','sqft_lot15','bedrooms','bathrooms','price_per_sqft_lot','sqft_living','sqft_basement_cleaned','sqft_lot','waterfront','view','condition','grade','classification']]
hypo2_top_10_pricing = hypo2.query('price_tag_house == "Top 10"')
hypo2_top_10_pricing

Unnamed: 0,id,price_tag_house,price,price_per_sqft_house,sqft_living15,sqft_lot15,bedrooms,bathrooms,price_per_sqft_lot,sqft_living,sqft_basement_cleaned,sqft_lot,waterfront,view,condition,grade,classification
21,2524049179,Top 10,2000000.0,655.7,4110,20336,3,2.8,44.6,3050,720.0,44867,False,4.0,3,9,Richie rich
27,3303700376,Top 10,667000.0,476.4,1860,3861,3,1.0,421.9,1400,0.0,1581,False,0.0,5,8,Richie rich
33,7589200193,Top 10,535000.0,490.8,1570,5080,3,1.0,178.3,1090,0.0,3000,False,0.0,4,8,City
43,9270200160,Top 10,685000.0,436.3,1580,2640,3,1.0,300.4,1570,0.0,2280,False,0.0,3,7,Richie rich
49,822039084,Top 10,1350000.0,490.4,2680,72513,3,2.5,20.8,2753,588.0,65005,True,2.0,5,9,Island
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21551,9521100031,Top 10,690000.0,448.1,1660,3300,3,3.2,483.2,1540,0.0,1428,False,0.0,3,9,City
21552,524059330,Top 10,1700000.0,443.9,2190,10777,4,3.5,189.7,3830,710.0,8963,False,0.0,3,10,Belle
21560,9253900271,Top 10,3570000.0,736.1,3470,18270,5,4.5,337.3,4850,1310.0,10584,True,4.0,3,10,Richie rich
21567,2025049203,Top 10,399950.0,563.3,1370,1173,2,1.0,345.7,710,0.0,1157,False,0.0,4,7,City


In [227]:
# correlation matrix between the 
corr_columns=["price","sqft_living",'sqft_lot','sqft_living15','sqft_lot15','bedrooms','bathrooms',"waterfront", "view", "grade",'condition', "classification"]
corr_mtrx= round(hypo2_top_10_pricing[corr_columns].corr(),1)
corr_mtrx

Unnamed: 0,price,sqft_living,sqft_lot,sqft_living15,sqft_lot15,bedrooms,bathrooms,waterfront,view,grade,condition
price,1.0,1.0,0.1,0.8,0.2,0.6,0.8,0.3,0.5,0.8,0.0
sqft_living,1.0,1.0,0.1,0.8,0.1,0.7,0.9,0.3,0.4,0.8,0.0
sqft_lot,0.1,0.1,1.0,0.1,0.8,0.0,0.0,0.0,0.1,0.1,0.0
sqft_living15,0.8,0.8,0.1,1.0,0.2,0.6,0.6,0.2,0.5,0.7,0.1
sqft_lot15,0.2,0.1,0.8,0.2,1.0,0.1,0.1,0.1,0.1,0.1,0.0
bedrooms,0.6,0.7,0.0,0.6,0.1,1.0,0.7,0.1,0.2,0.6,0.1
bathrooms,0.8,0.9,0.0,0.6,0.1,0.7,1.0,0.2,0.4,0.8,-0.0
waterfront,0.3,0.3,0.0,0.2,0.1,0.1,0.2,1.0,0.6,0.2,0.0
view,0.5,0.4,0.1,0.5,0.1,0.2,0.4,0.6,1.0,0.4,0.1
grade,0.8,0.8,0.1,0.7,0.1,0.6,0.8,0.2,0.4,1.0,-0.1


In [228]:
h2 = px.imshow(corr_mtrx, text_auto=True, aspect="auto", width=1000, height=800, color_continuous_scale=px.colors.sequential.Cividis_r,
    x=['Price', 'Sqft living', 'Sqft lot', 'Sqft living 15 nb', 'Sqft lot 15 nb', 'Num bedrooms', 'Num bathrooms','Is waterfront','Has view','Design&Constr. grade','House condition'],
    y=['Price', 'Sqft living', 'Sqft lot', 'Sqft living 15 nb', 'Sqft lot 15 nb', 'Num bedrooms', 'Num bathrooms','Is waterfront','Has view','Design&Constr. grade','House condition']
)
h2.update_layout(title="Predictors for property pricing in the Top10 tier",
    legend_title="Corr strength",
)
h2.show()

### **Hypothesis 3:** 

The FBI uses for no reason geo profiling and the client wants to have fair chances, therefore some dummy houses are needed in the outskirts to store and produce items in the basement

**There are currently no interesting real-estate investment opportunities in the market.**

Opportunities: 
- buildings with large basements and below average prices
- buildings in rich location that are cheaper than their surrondings

In [237]:
# Hot picks advice
dfviz.query('sqft_basement < sqft_basement_cleaned') # 170 instances

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,price_tag_lot,renovation_indicator,view_eval,condition_eval,grade_eval,sqft_basement_cleaned,has_secret_storage,basement_sizes_string,basement_sizes_int,basement_to_rest_ratio
112,2525310310,2014-09-16,272500.0,3,1.8,1540,12600,1.0,False,0.0,...,Below average,Piece of crap,Wall facing,Good,Average!,380.0,True,medium,3,0.3
115,3626039325,2014-11-21,740500.0,3,3.5,4380,6350,2.0,False,0.0,...,Above average,Piece of crap,Wall facing,Average,Above average,1600.0,True,xlarge,5,0.6
309,3204800200,2015-01-08,665000.0,4,2.8,3320,10574,2.0,False,0.0,...,Above average,Piece of crap,Wall facing,Jackpot!,Above average,1100.0,True,xlarge,5,0.5
384,713500030,2014-07-28,1350000.0,5,3.5,4800,14984,2.0,False,2.0,...,Above average,Piece of crap,Can see daylight,Average,High quality +,1320.0,True,xlarge,5,0.4
508,5113400431,2014-05-08,615000.0,2,1.0,1540,6872,1.0,False,0.0,...,Above average,Piece of crap,Wall facing,Good,Average!,720.0,True,large,4,0.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21000,291310180,2014-06-13,379500.0,3,2.2,1410,1287,2.0,False,0.0,...,Top 10,Piece of crap,Wall facing,Average,Average!,120.0,True,medium,3,0.1
21109,3438500250,2014-06-23,515000.0,5,3.2,2910,5027,2.0,False,0.0,...,Above average,Piece of crap,Wall facing,Average,Above average,870.0,True,large,4,0.4
21210,3278600680,2014-06-27,235000.0,1,1.5,1170,1456,2.0,False,0.0,...,Above average,Piece of crap,Wall facing,Average,Above average,100.0,True,medium,3,0.1
21356,6169901185,2014-05-20,490000.0,5,3.5,4460,2975,3.0,False,2.0,...,Above average,Piece of crap,Can see daylight,Average,Very good,1180.0,True,xlarge,5,0.4


In [238]:
# look up the secret storage facilities
df_base = dfviz[['id','sqft_living','sqft_above','sqft_basement','sqft_basement_cleaned','basement_sizes_string','lat','long','city','price','price_tag_house','classification','basement_to_rest_ratio']]
secret_storage = dfviz.query('has_secret_storage == True')
secret_storage

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,price_tag_lot,renovation_indicator,view_eval,condition_eval,grade_eval,sqft_basement_cleaned,has_secret_storage,basement_sizes_string,basement_sizes_int,basement_to_rest_ratio
112,2525310310,2014-09-16,272500.0,3,1.8,1540,12600,1.0,False,0.0,...,Below average,Piece of crap,Wall facing,Good,Average!,380.0,True,medium,3,0.3
115,3626039325,2014-11-21,740500.0,3,3.5,4380,6350,2.0,False,0.0,...,Above average,Piece of crap,Wall facing,Average,Above average,1600.0,True,xlarge,5,0.6
309,3204800200,2015-01-08,665000.0,4,2.8,3320,10574,2.0,False,0.0,...,Above average,Piece of crap,Wall facing,Jackpot!,Above average,1100.0,True,xlarge,5,0.5
384,713500030,2014-07-28,1350000.0,5,3.5,4800,14984,2.0,False,2.0,...,Above average,Piece of crap,Can see daylight,Average,High quality +,1320.0,True,xlarge,5,0.4
508,5113400431,2014-05-08,615000.0,2,1.0,1540,6872,1.0,False,0.0,...,Above average,Piece of crap,Wall facing,Good,Average!,720.0,True,large,4,0.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21000,291310180,2014-06-13,379500.0,3,2.2,1410,1287,2.0,False,0.0,...,Top 10,Piece of crap,Wall facing,Average,Average!,120.0,True,medium,3,0.1
21109,3438500250,2014-06-23,515000.0,5,3.2,2910,5027,2.0,False,0.0,...,Above average,Piece of crap,Wall facing,Average,Above average,870.0,True,large,4,0.4
21210,3278600680,2014-06-27,235000.0,1,1.5,1170,1456,2.0,False,0.0,...,Above average,Piece of crap,Wall facing,Average,Above average,100.0,True,medium,3,0.1
21356,6169901185,2014-05-20,490000.0,5,3.5,4460,2975,3.0,False,2.0,...,Above average,Piece of crap,Can see daylight,Average,Very good,1180.0,True,xlarge,5,0.4


In [241]:
# Buildings with secret basements
h3ss = px.scatter_mapbox(secret_storage, lat="lat", lon="long", color='basement_sizes_string', size = 'sqft_basement_cleaned', hover_name="id", hover_data=["id", "city","price_tag_house","sqft_basement_cleaned",'basement_sizes_string'],
                        color_discrete_sequence=px.colors.qualitative.Dark24, zoom=9,  center={'lon': -122.2, 'lat': 47.5 }, width=1000, height=600)
h3ss.update_layout(mapbox_style="open-street-map",
    title="Opportunities with secret basements by size",
    legend_title="Property price clusters",
    margin={"r":10,"t":40,"l":10,"b":10}
)
h3ss.show()

In [232]:
# buildings in rich location that are cheaper than their surroundings
h3c = px.scatter_mapbox(secret_storage, lat="lat", lon="long", color='classification', size = 'price', hover_name="id", hover_data=["id", "city","price",'basement_sizes_string','price_tag_house','classification'],
                        color_discrete_sequence=px.colors.qualitative.Dark24, zoom=9, center={'lon': -122.2, 'lat': 47.5 }, width=1000, height=600)
h3c.update_layout(mapbox_style="open-street-map",
    title="Opportunities by location classification and price indicator",
    legend_title="Property price clusters",
    margin={"r":10,"t":40,"l":10,"b":10}
    )
h3c.show()

In [233]:
# create best pics list
secret_storage.query('classification in ("Has Shore","Richie rich","City")').sort_values(['price']).head(10)

Unnamed: 0,id,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,...,price_tag_lot,renovation_indicator,view_eval,condition_eval,grade_eval,sqft_basement_cleaned,has_secret_storage,basement_sizes_string,basement_sizes_int,basement_to_rest_ratio
7921,5029451080,2014-08-21,203000.0,2,1.0,1440,6650,1.0,False,0.0,...,Below average,Piece of crap,Wall facing,Average,Average!,470.0,True,medium,3,0.5
8985,7893203480,2015-05-12,205000.0,3,1.5,1420,5000,1.0,False,0.0,...,Below average,Piece of crap,Wall facing,Average,Average!,500.0,True,medium,3,0.5
21210,3278600680,2014-06-27,235000.0,1,1.5,1170,1456,2.0,False,0.0,...,Above average,Piece of crap,Wall facing,Average,Above average,100.0,True,medium,3,0.1
11201,3210400060,2014-12-24,255000.0,3,1.0,1580,8206,1.0,False,0.0,...,Below average,Piece of crap,Wall facing,Average,Average!,480.0,True,medium,3,0.4
9364,8100900115,2015-02-06,259000.0,3,1.8,1270,4815,1.5,False,0.0,...,Below average,Piece of crap,Wall facing,Average,Below average,290.0,True,medium,3,0.3
7325,4310700020,2014-10-10,280000.0,3,1.0,1100,5132,1.0,False,0.0,...,Below average,Piece of crap,Wall facing,Average,Below average,260.0,True,medium,3,0.3
5409,8058500005,2015-02-03,290000.0,2,1.0,1620,5400,1.0,False,0.0,...,Below average,Piece of crap,Wall facing,Average,Below average,700.0,True,large,4,0.8
11319,3438500677,2014-06-13,305000.0,3,1.5,1210,5240,1.0,False,0.0,...,Below average,Piece of crap,Wall facing,Good,Average!,600.0,True,large,4,1.0
7311,4358700188,2015-03-31,305000.0,3,2.5,1260,895,3.0,False,0.0,...,Top 10,Piece of crap,Wall facing,Average,Average!,100.0,True,medium,3,0.1
6631,3126049439,2015-01-09,313000.0,2,1.5,870,747,2.0,False,0.0,...,Top 10,Piece of crap,Wall facing,Average,Above average,70.0,True,medium,3,0.1


In [242]:
dfviz.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21597 entries, 0 to 21596
Data columns (total 36 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   id                      21597 non-null  int64  
 1   date                    21597 non-null  object 
 2   price                   21597 non-null  float64
 3   bedrooms                21597 non-null  int64  
 4   bathrooms               21597 non-null  float64
 5   sqft_living             21597 non-null  int64  
 6   sqft_lot                21597 non-null  int64  
 7   floors                  21597 non-null  float64
 8   waterfront              21597 non-null  bool   
 9   view                    21597 non-null  float64
 10  condition               21597 non-null  int64  
 11  grade                   21597 non-null  int64  
 12  sqft_above              21597 non-null  int64  
 13  sqft_basement           21597 non-null  float64
 14  yr_built                21597 non-null

In [244]:
# best basement to rest ratio overall
large_basement_to_rest = dfviz.query('basement_to_rest_ratio > 0.75')
# create data frame for the question
large_basement_to_rest = large_basement_to_rest[['id','price','has_secret_storage','bedrooms','bathrooms', "city",'basement_to_rest_ratio','sqft_living','sqft_above','sqft_basement_cleaned','basement_sizes_string','price_tag_house','classification','lat','long']]
# create best pics list
large_basement_to_rest.query('has_secret_storage').sort_values(['price']).head(10)

Unnamed: 0,id,price,has_secret_storage,bedrooms,bathrooms,city,basement_to_rest_ratio,sqft_living,sqft_above,sqft_basement_cleaned,basement_sizes_string,price_tag_house,classification,lat,long
17097,8864000425,242000.0,True,3,1.8,Burien,1.0,1580,790,790.0,large,Below average,Outskirt,47.5,-122.3
12933,798000145,244500.0,True,2,1.8,Burien,1.0,1300,650,650.0,large,Below average,Outskirt,47.5,-122.3
17241,1545806720,254950.0,True,4,2.0,Maple Valley,0.9,2180,1170,1010.0,xlarge,Below average,Wide land,47.4,-122.0
15301,9122500080,275000.0,True,5,2.0,Kent,0.8,2260,1250,1010.0,xlarge,Below average,Outskirt,47.4,-122.2
8039,6928600330,278000.0,True,5,1.8,Federal Way,1.0,2170,1100,1070.0,xlarge,Below average,Outskirt,47.3,-122.3
17528,1180002745,285000.0,True,3,1.8,Seattle,0.8,2380,1320,1060.0,xlarge,Below average,City,47.5,-122.2
5409,8058500005,290000.0,True,2,1.0,Seattle,0.8,1620,920,700.0,large,Below average,City,47.7,-122.3
11319,3438500677,305000.0,True,3,1.5,Seattle,1.0,1210,610,600.0,large,Above average,City,47.6,-122.4
8094,8068000730,315000.0,True,4,2.0,Seattle,0.9,1780,930,850.0,large,Below average,City,47.5,-122.3
12161,3211200140,350000.0,True,4,2.0,Kirkland,1.0,1720,860,860.0,large,Below average,Richie rich,47.7,-122.2


In [235]:
# Opportunities by basement size and official knowledge of the basement, indication by cheap picks first
h3b = px.scatter_mapbox(large_basement_to_rest, lat="lat", lon="long", color='has_secret_storage', size = 'price', hover_name="id", hover_data=["id", "city","price",'basement_to_rest_ratio','basement_sizes_string','price_tag_house','classification'],
                        color_discrete_sequence=px.colors.qualitative.Dark24, zoom=9, center={'lon': -122.2, 'lat': 47.5 }, width=1000, height=600)
h3b.update_layout(mapbox_style="open-street-map",
    title="Opportunities by basement to rest ratio and official basement knowledge",
    legend_title="Has secret storage",
    margin={"r":10,"t":40,"l":10,"b":10}
    )
h3b.show()