## Set-up

### Second EDA file
- EDA part 1 became too slow, so the analysis continues in this second notebook
- df_house is called df_house2 in this notebook

In [None]:
import json
import warnings
from urllib.request import urlopen

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns

from matplotlib.ticker import PercentFormatter
# Notebook-wide display defaults: 8x5 figures on a white background with
# black axes edges, and pandas floats rendered with two decimals.
plt.rcParams.update(
    {
        "figure.figsize": (8, 5),
        "axes.facecolor": "white",
        "axes.edgecolor": "black",
        "figure.facecolor": "w",
    }
)
pd.plotting.register_matplotlib_converters()
pd.set_option('display.float_format', '{:.2f}'.format)

In [None]:
# Input files, relative to the notebook's working directory.
house_csv_path = 'data/eda.csv'
income_csv_path = 'data/seattle_income_zip.csv'

# df_house2: house table used throughout this notebook.
# df_house3: income-by-zipcode table (renamed and plotted further below).
df_house2 = pd.read_csv(house_csv_path)
df_house3 = pd.read_csv(income_csv_path)


In [None]:
# Tidy the house table: clearer id column name, real datetimes for `date`,
# rescaled `yr_renovated` with missing values set to 0, and zipcodes stored
# as strings (they are matched against string GeoJSON ids later on).
df_house2 = df_house2.rename(columns={'id': 'house_id'})
df_house2['date'] = pd.to_datetime(df_house2['date'], format='%Y-%m-%d')

# NOTE(review): the 0.1 scaling is applied to the column in place, so running
# this cell a second time without reloading the CSV rescales it again.
# Presumably the raw values are stored as year * 10 - confirm against the data.
df_house2['yr_renovated'] = (df_house2['yr_renovated'] * 0.1).fillna(0)

df_house2['zipcode'] = df_house2['zipcode'].astype('string')



## Average House Grade by Zipcode

In [None]:
# Mean sale price and mean grade per zipcode, each sorted ascending.
# The value column is selected *before* aggregating: the first positional
# argument of GroupBy.mean() is `numeric_only`, not a column name, so a call
# such as `.mean('price')` only works because a non-empty string is truthy.
zip_price = (
    df_house2.groupby('zipcode')['price']
    .mean()
    .sort_values()
    .reset_index()
)
zip_grade = (
    df_house2.groupby('zipcode')['grade']
    .mean()
    .sort_values()
    .rename('grade_average')
    .reset_index()
)

# One row per zipcode with both averages; rows follow zip_price's order
# (ascending mean price) because an inner merge keeps the left key order.
price_grade = zip_price.merge(zip_grade, on='zipcode', how='inner')
price_grade

In [None]:

# Washington-state ZIP code boundaries as GeoJSON; loaded once here and
# reused by every choropleth in this notebook via the `zipcodes` variable.
WA_ZIP_GEOJSON_URL = 'https://raw.githubusercontent.com/OpenDataDE/State-zip-code-GeoJSON/master/wa_washington_zip_codes_geo.min.json'

# The timeout keeps the cell from hanging indefinitely if the host is unreachable.
with urlopen(WA_ZIP_GEOJSON_URL, timeout=30) as response:
    zipcodes = json.load(response)


df_zip = price_grade

# Average house grade per zipcode. Rows are matched to GeoJSON features by
# comparing the `zipcode` column with each feature's properties.ZCTA5CE10.
fig = px.choropleth_mapbox(df_zip, geojson=zipcodes, locations='zipcode', color='grade_average',
                            color_continuous_scale="Viridis",
                            range_color=(df_zip['grade_average'].min(),df_zip['grade_average'].max()),
                            mapbox_style="carto-positron",
                            zoom=8, center = {"lat": 47.553306, "lon": -122.237702},
                            featureidkey="properties.ZCTA5CE10",
                            opacity= 0.5,
                            labels={'grade_average':'average house grade'}
                            )

# Last expression of the cell: update_layout returns the figure, which renders it.
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})

### Insights
- average house grades seem to be lower south of Seattle city center
- average house grades seem to be higher in water-facing zipcodes than in others
- average house grades are higher towards the East, especially the North East 
- Zipcode 98039, primarily in Medina, has the highest average house grade (upper middle class area, predominantly white, household income $208K)
- Zipcode 98168 (Tukwila, Burien, SeaTac, Boulevard Park, and White Center) has the lowest average house grade (lower middle class, mixed, household income of $71K)

## Property locations in King County

In [None]:
# Density heat map of property locations, one point per row of df_house2,
# drawn from its `lat` / `long` columns on an OpenStreetMap base layer.
fig = px.density_mapbox(
    df_house2,
    lat='lat',
    lon='long',
    radius=10,
    zoom=8,
    mapbox_style="open-street-map",
)

fig.show()

## Changes in price per sqft_living by grade
- Assumption is that the change in price per "sqft living area" is a good proxy for general price changes across grade-based property categories

In [None]:
# Two three-month comparison windows (both bounds inclusive):
# May-July 2014 versus March-May 2015.
in_2014_window = df_house2['date'].between(pd.Timestamp('2014-05-01'), pd.Timestamp('2014-07-31'))
in_2015_window = df_house2['date'].between(pd.Timestamp('2015-03-01'), pd.Timestamp('2015-05-31'))

# Mean price and mean living area per grade in each window. The value columns
# are selected explicitly instead of calling `.mean('price')`: that positional
# argument is `numeric_only`, and the result relied on it silently dropping
# the `date` column.
comp_1 = (
    df_house2.loc[in_2014_window]
    .groupby('grade')[['price', 'sqft_living']]
    .mean()
    .rename(columns={'price': 'price2014', 'sqft_living': 'sqft_living2014'})
)
comp_2 = (
    df_house2.loc[in_2015_window]
    .groupby('grade')[['price', 'sqft_living']]
    .mean()
    .rename(columns={'price': 'price2015', 'sqft_living': 'sqft_living2015'})
)

# Price per sqft is mean(price) / mean(sqft_living) within each grade,
# i.e. a ratio of means rather than a mean of per-house ratios.
comp_1['price_sqft_ave_2014MJ'] = comp_1['price2014'] / comp_1['sqft_living2014']
comp_2['price_sqft_ave_2015MM'] = comp_2['price2015'] / comp_2['sqft_living2015']

# Only grades with sales in both windows survive the inner merge.
merged_comp_range = pd.merge(comp_1, comp_2, on='grade', how='inner')

# Percentage change in price per sqft from the 2014 window to the 2015 window.
merged_comp_range['delta_price_in_%'] = (
    merged_comp_range['price_sqft_ave_2015MM'] / merged_comp_range['price_sqft_ave_2014MJ'] - 1
) * 100
merged_comp_range

### Insights
- Assumption is that average price per sqft living area is a good indication for price changes.
- Property prices rose significantly between the period May to July 2014 and March to May 2015 but not for all grades.
- Super mansion prices stayed the same, while prices for high end luxury homes rose by 34%.
- Grade 4, grade 5 and grade 6 average property prices rose significantly (14% and 23%).

## Changes in price per sqft_living by zipcode
- Assumption is that the change in price per "sqft living area" is a good proxy for general price changes across zipcodes (a better reflection of property price deltas than the variable "price")

In [None]:
# Two three-month comparison windows (both bounds inclusive):
# May-July 2014 versus March-May 2015.
sold_in_2014_window = df_house2['date'].between(pd.Timestamp('2014-05-01'), pd.Timestamp('2014-07-31'))
sold_in_2015_window = df_house2['date'].between(pd.Timestamp('2015-03-01'), pd.Timestamp('2015-05-31'))

# Mean price and mean living area per zipcode in each window. The value
# columns are selected explicitly instead of calling `.mean('price')`: that
# positional argument is `numeric_only`, and the result relied on it silently
# dropping the `date` column.
comp_zip_1 = (
    df_house2.loc[sold_in_2014_window]
    .groupby('zipcode')[['price', 'sqft_living']]
    .mean()
    .rename(columns={'price': 'price2014', 'sqft_living': 'sqft_living2014'})
    .reset_index()
)
comp_zip_2 = (
    df_house2.loc[sold_in_2015_window]
    .groupby('zipcode')[['price', 'sqft_living']]
    .mean()
    .rename(columns={'price': 'price2015', 'sqft_living': 'sqft_living2015'})
    .reset_index()
)

# Price per sqft is mean(price) / mean(sqft_living) within each zipcode,
# i.e. a ratio of means rather than a mean of per-house ratios.
comp_zip_1['price_sqft_ave_2014MJ'] = comp_zip_1['price2014'] / comp_zip_1['sqft_living2014']
comp_zip_2['price_sqft_ave_2015MM'] = comp_zip_2['price2015'] / comp_zip_2['sqft_living2015']

# Only zipcodes with sales in both windows survive the inner merge.
merged_comp_zip = pd.merge(comp_zip_1, comp_zip_2, on='zipcode', how='inner')

# Percentage change in price per sqft from the 2014 window to the 2015 window.
merged_comp_zip['delta_price_in_%'] = (
    merged_comp_zip['price_sqft_ave_2015MM'] / merged_comp_zip['price_sqft_ave_2014MJ'] - 1
) * 100
merged_comp_zip

In [None]:
# Choropleth of the percentage change in price per sqft, one polygon per
# zipcode; the colour scale spans the observed minimum to maximum.
df_zip = merged_comp_zip
color_col = 'delta_price_in_%'

fig = px.choropleth_mapbox(
    df_zip,
    geojson=zipcodes,
    featureidkey="properties.ZCTA5CE10",  # GeoJSON property matched against `zipcode`
    locations='zipcode',
    color=color_col,
    color_continuous_scale="Viridis",
    range_color=(df_zip[color_col].min(), df_zip[color_col].max()),
    mapbox_style="carto-positron",
    zoom=8,
    center={"lat": 47.553306, "lon": -122.237702},
    opacity=0.5,
    labels={color_col: 'delta in ave house prices 2014 Q2 to 2015 Q2'},
)

# update_layout returns the figure, so ending the cell with it renders the map.
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))

### Insights
- Prices in zipcode 98125 rose significantly during period (27%).
- Prices in zipcodes in the South rose less than prices in zipcodes in the North East.
- Prices in inner borough zipcodes towards the N, S and SE of Seattle centre remained flat (98108, 98109, 98144).
- Prices rose more in "outer" inner borough zipcodes to the N and S of Seattle Center (e.g. 98118, 98168, 98188).

## Median income per neighborhood
- table df_house3 is based on seattle_income_zip.csv, which contains median household income data for Seattle plus King County

In [None]:
# Give the income table short column names and store its zipcodes as strings,
# the same dtype used for df_house2['zipcode'].
df_house3 = df_house3.rename(
    columns={
        'ZIP Code': 'zipcode',
        'Median Household Income(2021)': 'median_income',
    }
)
df_house3['zipcode'] = df_house3['zipcode'].astype('string')
df_house3.head()


In [None]:
# Choropleth of median household income for every zipcode in df_house3;
# the colour scale spans the observed minimum to maximum.
df_zip = df_house3
color_col = 'median_income'

fig = px.choropleth_mapbox(
    df_zip,
    geojson=zipcodes,
    featureidkey="properties.ZCTA5CE10",  # GeoJSON property matched against `zipcode`
    locations='zipcode',
    color=color_col,
    color_continuous_scale="Viridis",
    range_color=(df_zip[color_col].min(), df_zip[color_col].max()),
    mapbox_style="carto-positron",
    zoom=8,
    center={"lat": 47.553306, "lon": -122.237702},
    opacity=0.5,
    labels={color_col: 'median_income_by_zip'},
)

# update_layout returns the figure, so ending the cell with it renders the map.
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))

# Analysis for Nicole Johnson

### Main table for plotting median income by zipcode
- Reference dataframe for all median income zipcodes that might form part of the analysis
- Contains zipcodes of "preferred areas" and "adjacent zipcodes"
- df_house3_1 is based on df_house3

In [None]:
# Zipcodes considered for the Nicole Johnson search: the preferred areas
# plus the additional (adjacent) zipcodes. "98121" is listed once here;
# repeating it in the membership test has no effect on the selected rows.
income_search_zipcodes = [
    # preferred
    "98101", "98102", "98103", "98104", "98105", "98107", "98121", "98122",
    # adjacent
    "98109", "98112", "98115", "98119", "98117", "98125", "98133", "98177", "98199",
]

# Income rows restricted to those zipcodes.
df_house3_1 = df_house3[df_house3['zipcode'].isin(income_search_zipcodes)]
df_house3_1

### Main table for identifying properties by zipcode
- df_trendy is a zipcode-based subset of df_house2. It contains all property information for the zipcodes relevant to the Nicole Johnson search (preferred and adjacent).

In [None]:
# Zipcodes used to subset the property table. "98121" is listed once here;
# repeating it in the membership test has no effect on the selected rows.
# NOTE(review): unlike the income reference table, this list does not include
# 98125 and 98133 - confirm that the omission is intentional.
property_search_zipcodes = [
    # preferred
    "98101", "98102", "98103", "98104", "98105", "98107", "98121", "98122",
    # adjacent
    "98109", "98112", "98115", "98119", "98117", "98177", "98199",
]

# All df_house2 rows located in those zipcodes.
df_trendy = df_house2[df_house2['zipcode'].isin(property_search_zipcodes)]

df_trendy

### Identify prices and property sales numbers in Nicole's preferred areas
- the merged table makes it possible to identify target zipcodes

In [None]:
# Per-zipcode summary of the search area: mean price and mean bedroom count
# (df_trendy1) plus the number of non-null sale_id values, named `turnover`
# (df_trendy2), merged and ordered by ascending mean price.
trendy_by_zip = df_trendy.groupby('zipcode')

df_trendy1 = trendy_by_zip[['price', 'bedrooms']].mean()
df_trendy2 = trendy_by_zip['sale_id'].count().rename('turnover').reset_index()

# `zipcode` is an index level on the left frame and a column on the right;
# merge accepts either form as a key name.
df_trendy_merge = df_trendy1.merge(df_trendy2, on='zipcode', how='inner').sort_values("price")

df_trendy_merge

### Graphic analysis of Nicole's preferred zipcodes (average price and turnover)

In [None]:
# Display the per-zipcode summary rows for Nicole's preferred zipcodes only.
preferred_zipcodes = ["98101", "98102", "98103", "98104", "98105", "98107", "98121", "98122"]
df_trendy_merge.query('zipcode in @preferred_zipcodes')

In [None]:
# Per-zipcode summary restricted to Nicole's preferred zipcodes.
preferred_zipcodes = ["98101", "98102", "98103", "98104", "98105", "98107", "98121", "98122"]
df_trendy_merge_preferred = df_trendy_merge.query('zipcode in @preferred_zipcodes')

df_zip = df_trendy_merge_preferred
color_col = 'price'

# Choropleth of the mean price per preferred zipcode; the colour scale spans
# the observed minimum to maximum.
fig = px.choropleth_mapbox(
    df_zip,
    geojson=zipcodes,
    featureidkey="properties.ZCTA5CE10",  # GeoJSON property matched against `zipcode`
    locations='zipcode',
    color=color_col,
    color_continuous_scale="Viridis",
    range_color=(df_zip[color_col].min(), df_zip[color_col].max()),
    mapbox_style="carto-positron",
    zoom=8,
    center={"lat": 47.553306, "lon": -122.237702},
    opacity=0.5,
    labels={color_col: 'average_price_by_zip'},
)

# update_layout returns the figure, so ending the cell with it renders the map.
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))

### Identification of additional search areas 
- zipcode areas are visualized by median income per zipcode
- we select zipcodes with median household income > USD 100K


In [None]:
# Hand-picked zipcodes for the additional search areas.
# NOTE(review): "98144" can never match here, because df_house3_1 was built
# from a zipcode list that does not contain it - confirm whether it should be
# added upstream or dropped from this list.
# NOTE(review): the list is hard-coded; no income threshold is applied in
# code - confirm it still reflects the intended selection rule.
selected_zipcodes = [
    "98109", "98112", "98115", "98119", "98117", "98121",
    "98125", "98133", "98144", "98177", "98199",
]
df_trendy_merge_selected = df_house3_1[df_house3_1['zipcode'].isin(selected_zipcodes)]


df_zip = df_trendy_merge_selected
color_col = 'median_income'

# Choropleth of median household income for the selected zipcodes; the colour
# scale spans the observed minimum to maximum.
fig = px.choropleth_mapbox(
    df_zip,
    geojson=zipcodes,
    featureidkey="properties.ZCTA5CE10",  # GeoJSON property matched against `zipcode`
    locations='zipcode',
    color=color_col,
    color_continuous_scale="Viridis",
    range_color=(df_zip[color_col].min(), df_zip[color_col].max()),
    mapbox_style="carto-positron",
    zoom=8,
    center={"lat": 47.553306, "lon": -122.237702},
    opacity=0.5,
    labels={color_col: 'median_income_by_zip'},
)

# update_layout returns the figure, so ending the cell with it renders the map.
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))

## 4.6 Identification of properties in selected zipcodes
We need a two-bedroom, two-bathroom property priced below USD 450,000 in zipcode 98117, 98103, 98107 or 98155.

In [None]:
# Candidate properties: located in zipcode 98117, 98103, 98107 or 98155,
# priced below 450,000, with exactly two bedrooms and exactly two bathrooms.
target_zipcodes = ["98117", "98103", "98107", "98155"]

is_candidate = (
    df_house2['zipcode'].isin(target_zipcodes)
    & (df_house2['price'] < 450000)
    & (df_house2['bedrooms'] == 2)
    & (df_house2['bathrooms'] == 2)
)
df_house2[is_candidate]
