## Prepare analysis

In [None]:
import warnings

warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


from matplotlib.ticker import PercentFormatter
# Notebook-wide plotting defaults: figure size plus white backgrounds and black axes edges
plt.rcParams.update(
    {
        "figure.figsize": (8, 5),
        "figure.facecolor": "w",
        "axes.facecolor": "white",
        "axes.edgecolor": "black",
    }
)

# Register pandas' date converters with matplotlib
pd.plotting.register_matplotlib_converters()

# Render floats with two decimal places in DataFrame output
pd.set_option('display.float_format', lambda value: '%.2f' % value)

Load the CSV file

In [None]:
# Read the house sales data from the project's data folder
df_house = pd.read_csv(filepath_or_buffer='data/eda.csv')


## Clean data

Overview table columns

In [None]:
# Preview the first rows. display() is required here because a notebook cell
# only renders its last expression automatically; a bare head() call in the
# middle of the cell would be silently discarded.
display(df_house.head())

# Column dtypes, to see which columns need converting
df_house.dtypes

Rename and change columns - clean data if necessary

In [None]:
# Give the generic 'id' column a more descriptive name
df_house.rename(columns={'id': 'house_id'}, inplace=True)

In [None]:
# Convert the 'date' column to datetimes, parsing with the YYYY-MM-DD format
df_house['date'] = pd.to_datetime(df_house['date'], format='%Y-%m-%d')

# Sanity check: type of the entry in the row labelled 0
type(df_house.loc[0, 'date'])

In [None]:
# Keep prices at two decimal places
df_house['price'] = df_house['price'].round(decimals=2)

In [None]:
# ZIPCODES
# Zipcodes are labels, not quantities: store them with pandas' string dtype
df_house['zipcode'] = df_house['zipcode'].astype(dtype='string')

In [None]:
#View
#- No changes
#- NaN means  that we don't know whether change took place

In [None]:
#WATERFRONT
#- No changes
#- NaN means "we don't know whether property has waterfront view"

In [None]:
# YR_RENOVATED

# The year values need to be divided by 10. Vectorised true division is used
# instead of a row-wise multiplication by 0.1, because 0.1 has no exact binary
# representation and can leave float residue on values that should be whole years.
# The guard keeps the cell safe to re-run: values are only rescaled while they
# are still larger than any four-digit year.
if df_house['yr_renovated'].max() > 9999:
    df_house['yr_renovated'] = df_house['yr_renovated'] / 10

# We assume that NaN means "not yet renovated"; a year of 0 is converted to NaN
# as well so that this case is represented consistently.
df_house['yr_renovated'] = df_house['yr_renovated'].replace(0, np.nan)

In [None]:
# SQFT_BASEMENT
# assumption is that '0' means 'no basement'
# NaN means that we don't know whether a basement exists

## Explore data

### House price descriptive statistics

- mean     540,296.57
- std      367,368.14
- min       78,000.00
- 25%      322,000.00
- 50%      450,000.00
- 75%      645,000.00
- max     7,700,000.00

- There seem to be a lot of outliers. Roughly 200 houses (198) cost more than USD 2m.
- Median is USD 450,000.
- Modes are USD 350,000 and USD 450,000.
- IQR is USD 323,000.
- Upper fence (Q3 + 1.5 × IQR) is USD 1,129,500.

In [None]:
# Number of sales priced above USD 2m (count() ignores missing prices)
n_above_2m = df_house.query("price > 2000000").price.count()
print(f'There are {n_above_2m} houses more expensive than USD 2m.')

print(f'Median is {df_house.price.median()}.')

# mode() returns a Series because there can be several modes. Join the values
# so the sentence does not contain the Series repr (index, name and dtype).
price_modes = ', '.join(str(mode) for mode in df_house.price.mode())
print(f'Mode is {price_modes}.')


### House price box diagram

In [None]:
# Box plot of sale prices, restricted to houses below USD 2m
df_house.query('price < 2000000')['price'].plot.box()

### House price frequency distribution

In [None]:
# Histogram (100 bins) of sale prices, restricted to houses below USD 2m
df_house.query('price < 2000000')['price'].plot.hist(bins=100)

## Zip code enquiry

In [None]:
# Number of distinct zipcodes in the data
# (the original run found 70 different zipcodes in King County)
df_house['zipcode'].nunique(dropna=True)

### Zip code and price

In [None]:
# Price statistics per zipcode, showing the ten zipcodes with the lowest mean price
(
    df_house
    .groupby('zipcode')['price']
    .describe()
    .sort_values(by='mean')
    .head(10)
)

### Average property prices per zipcode
- Averages are based on data from May 2014 to May 2015

In [None]:
# zip code plot

# Mean sale price per zipcode, sorted from lowest to highest.
# The price column is selected before aggregating: the first positional
# argument of GroupBy.mean() is `numeric_only`, not a column name, so
# `.mean('price')` only worked by accident.
zip_price_plot = (
    df_house
    .groupby('zipcode')['price']
    .mean()
    .sort_values()
    .reset_index()
)

# Set the width of the bars
bar_width = 0.3

# One bar position per zipcode
index = np.arange(len(zip_price_plot))

# Plotting the bars
plt.figure(figsize=(12, 6))  # Adjust the figure size as needed
bars1 = plt.bar(index, zip_price_plot['price'], bar_width, label='average price')

# Adding labels and title; zipcodes are used as tick labels, rotated for legibility
plt.xlabel('Zipcode')
plt.ylabel('Average price')
plt.title('Average House price per zipcode')
plt.xticks(index, zip_price_plot['zipcode'], rotation=90)
plt.legend()

# Show the plot
plt.tight_layout()
plt.show()

### Insights
- Average property prices are zipcode dependent
- Highest property prices in 98039 (upper-middle class neighborhood)

### Turnover in zipcode area during period
- Figures are based on data from May 2014 to May 2015

In [None]:
# Number of sales per zipcode, sorted ascending (highest turnover ends up on the right)
zip_turnover_plot = (
    df_house[['zipcode', 'sale_id']]
    .groupby('zipcode')
    .count()
    .sort_values('sale_id')
    .reset_index()
)

# Bar geometry: fixed width, one position per zipcode
bar_width = 0.3
index = np.arange(len(zip_turnover_plot))

# Draw the bars on a wide figure so that all zipcode labels fit
plt.figure(figsize=(12, 6))
bars1 = plt.bar(
    index,
    zip_turnover_plot['sale_id'],
    bar_width,
    label='turnover',
)

# Axis labels, title and zipcode tick labels
plt.xlabel('Zipcode')
plt.ylabel('Sales in Period')
plt.title('Sales in period per zipcode')
plt.xticks(index, zip_turnover_plot['zipcode'], rotation=90)
plt.legend()

# Render the figure
plt.tight_layout()
plt.show()

### Insights
- highest turnovers in zipcodes 98103, 98038, 98115 (all upper-middle and middle class white neighborhoods)
- lowest turnovers in zipcode 98039 (upper middle class, small, very high house prices) and 98092 (lower middle class, white)

### Turnover and average property price by zipcode
- Averages are based on data from May 2014 to May 2015

In [None]:
# Combine average price and sales count per zipcode into one table;
# the sales count column is renamed to make its meaning explicit
merged_zip_plot = (
    zip_price_plot
    .merge(zip_turnover_plot, on='zipcode', how='inner')
    .rename(columns={'sale_id': 'turnover_in_period'})
)
merged_zip_plot


In [None]:
# Comparison Average Price and Turnover by Zipcode
# Two bar series share the x axis (zipcodes) but use separate y axes,
# because prices and sales counts are on very different scales.

# Bar geometry: fixed width, one slot per zipcode
bar_width = 0.3
index = np.arange(len(merged_zip_plot))

fig, ax1 = plt.subplots(figsize=(12, 6))

# Left axis: average price, drawn half a bar width to the left of each slot
bars1 = ax1.bar(
    index - bar_width / 2,
    merged_zip_plot['price'],
    bar_width,
    label='price',
)
ax1.set_xlabel('zipcode')
ax1.set_ylabel('average house price', color='b')
ax1.tick_params(axis='y', labelcolor='b')
ax1.set_xticks(index)
ax1.set_xticklabels(merged_zip_plot['zipcode'], rotation=90)

# Right axis: number of sales, drawn half a bar width to the right of each slot
ax2 = ax1.twinx()
bars2 = ax2.bar(
    index + bar_width / 2,
    merged_zip_plot['turnover_in_period'],
    bar_width,
    label='turnover_in_period',
    color='red',
)
ax2.set_ylabel('Turnover in Period', color='red')
ax2.tick_params(axis='y', labelcolor='red')

# Title and a single legend covering both bar series
plt.title('Zipcodes by turnover and average sales price')
plt.legend([bars1, bars2], ['price', 'turnover_in_period'])

# Render the figure
plt.tight_layout()
plt.show()

### Zipcode and property grades

In [None]:
# test assumption that property price averages are linked to property grades

# Mean grade per zipcode, sorted ascending.
# The grade column is selected before aggregating: the first positional
# argument of GroupBy.mean() is `numeric_only`, not a column name, so
# `.mean('grade')` only worked by accident.
zip_grade_plot = (
    df_house
    .groupby('zipcode')['grade']
    .mean()
    .sort_values()
    .rename('grade_average')
    .reset_index()
)

# Put average price and average grade side by side for each zipcode
merged_zip_grade_plot = pd.merge(zip_price_plot, zip_grade_plot, on='zipcode', how='inner')
merged_zip_grade_plot


In [None]:
# Comparison Average Price and Average Grade by Zipcode
# Two bar series share the x axis (zipcodes) but use separate y axes,
# because prices and grades are on very different scales.

# Bar geometry: fixed width, one slot per zipcode
bar_width = 0.3
index = np.arange(len(merged_zip_grade_plot))

fig, ax1 = plt.subplots(figsize=(12, 6))

# Left axis: average price, drawn half a bar width to the left of each slot
bars1 = ax1.bar(
    index - bar_width / 2,
    merged_zip_grade_plot['price'],
    bar_width,
    label='price',
)
ax1.set_xlabel('zipcode')
ax1.set_ylabel('average house price', color='b')
ax1.tick_params(axis='y', labelcolor='b')
ax1.set_xticks(index)
ax1.set_xticklabels(merged_zip_grade_plot['zipcode'], rotation=90)

# Right axis: average grade, drawn half a bar width to the right of each slot
ax2 = ax1.twinx()
bars2 = ax2.bar(
    index + bar_width / 2,
    merged_zip_grade_plot['grade_average'],
    bar_width,
    label='grade_average',
    color='red',
)
ax2.set_ylabel('Average grade', color='red')
ax2.tick_params(axis='y', labelcolor='red')

# Title and a single legend covering both bar series
plt.title('Comparison Average Price and Average Grade by Zipcode')
plt.legend([bars1, bars2], ['price', 'grade_average'])

# Render the figure
plt.tight_layout()
plt.show()

## Overview properties

In [None]:
# Per-grade means of the main property attributes (indexed by grade)
property_overview_grade_1 = (
    df_house[[
        'grade', 'price', 'bedrooms', 'bathrooms', 'sqft_living',
        'floors', 'condition', 'yr_built', 'yr_renovated',
    ]]
    .groupby('grade')
    .mean()
)

# Price per square foot, computed as mean price divided by mean living area per grade
property_overview_grade_1['sqft_price_ave'] = (
    property_overview_grade_1['price'] / property_overview_grade_1['sqft_living']
)

# Number of sales per grade
property_overview_grade_2 = (
    df_house[['grade', 'sale_id']]
    .groupby('grade')
    .count()
    .reset_index()
    .rename(columns={'sale_id': 'turnover_in_period'})
)

# One table with the averages and the sales count for each grade
property_overview_grade = pd.merge(
    property_overview_grade_1,
    property_overview_grade_2,
    on='grade',
    how='inner',
)
property_overview_grade
# Observations:
# - grade 3 houses tend to have one bedroom; some of them have no separate bedroom
# - grade 5 and above tend to have two or more bedrooms
# - grade 7 and above tend to have more than one floor
# - grade 3 homes tend to be in very good condition on average; however, they tend
#   not to be renovated (or we don't know)
# - houses up to grade 6 tend to be pre-war


### Insights about King County properties
- Highest turnover with grade 4 and 5 properties. Properties have up to 4 bedrooms and 2 bathrooms on average. 
Price per sqft is approx. USD 240.
- Smaller apartments seem to be more expensive in terms of sqft prices.
- Grade 7 and above tend to have more than one bathroom.
- Properties of higher quality tend to be bigger and more expensive in terms of sqft price per living area. They might have a bigger plot size.
- Up to grade 5, properties tend to be either apartments or bungalows. Grade 6 and above tend to be multi-storied.
- Grade 5 to grade 8 properties tend to be of better condition than the rest.
- Grade 3 to grade 6 properties tend to be pre-war, grade 9 and above approx. 25 yrs old.

## Geographical map 

### Price per zipcode

In [None]:
# import packages
from urllib.request import urlopen
import json


In [None]:
# Choropleth map of the average house price per zipcode
import plotly.express as px

# Fetch the Washington State zip code boundaries (GeoJSON)
with urlopen('https://raw.githubusercontent.com/OpenDataDE/State-zip-code-GeoJSON/master/wa_washington_zip_codes_geo.min.json') as response:
    zipcodes = json.load(response)

# Mean sale price per zipcode, with zipcode as a regular column for plotly
by_zipcode = df_house.groupby('zipcode')['price'].mean().reset_index()

# Rows are matched to GeoJSON features through the ZCTA5CE10 property.
# The colour scale is capped at USD 1m.
fig = px.choropleth_mapbox(
    by_zipcode,
    geojson=zipcodes,
    locations='zipcode',
    featureidkey="properties.ZCTA5CE10",
    color='price',
    color_continuous_scale="Viridis",
    range_color=(by_zipcode['price'].min(), 1000000),
    labels={'price': 'average house price'},
    mapbox_style="carto-positron",
    zoom=8,
    center={"lat": 47.553306, "lon": -122.237702},
    opacity=0.5,
)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
fig.show()

### Average # of bedrooms

In [None]:
# Average # of bedrooms
# Choropleth map of the average number of bedrooms per zipcode
with urlopen('https://raw.githubusercontent.com/OpenDataDE/State-zip-code-GeoJSON/master/wa_washington_zip_codes_geo.min.json') as response:
    zipcodes = json.load(response)

# Mean price and mean bedroom count per zipcode, with zipcode as a regular column
by_zipcode = df_house.groupby('zipcode').agg({'price': 'mean', 'bedrooms': 'mean'}).reset_index()

import plotly.express as px


# Rows are matched to GeoJSON features through the ZCTA5CE10 property.
# `labels` must name the column that is actually plotted ('bedrooms'); the
# previous mapping only covered 'price', so the colour bar had no readable label.
fig = px.choropleth_mapbox(by_zipcode, geojson=zipcodes, locations='zipcode', color='bedrooms',
                           color_continuous_scale="Viridis",
                           range_color=(by_zipcode['bedrooms'].min(), by_zipcode['bedrooms'].max()),
                           mapbox_style="carto-positron",
                           zoom=8, center = {"lat": 47.553306, "lon": -122.237702},
                           featureidkey="properties.ZCTA5CE10",
                           opacity=0.5,
                           labels={'bedrooms': 'average number of bedrooms',
                                   'price': 'average house price'}
                          )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

### Insights
- House sizes (in terms of bedrooms) are smaller in adjacent inner-city zipcode areas
- bigger houses in 98039 and 98040, and 98004, 98005, and 98006

### Average property grade per zipcode

In [None]:
# Choropleth map of the average property grade per zipcode
import plotly.express as px

# Both names refer to the merged price/grade table built earlier
new_zip_price_grade = merged_zip_grade_plot
by_zipcode = merged_zip_grade_plot

# Fetch the Washington State zip code boundaries (GeoJSON)
with urlopen('https://raw.githubusercontent.com/OpenDataDE/State-zip-code-GeoJSON/master/wa_washington_zip_codes_geo.min.json') as response:
    zipcodes = json.load(response)

# Rows are matched to GeoJSON features through the ZCTA5CE10 property.
# The colour scale spans the observed range of average grades.
fig = px.choropleth_mapbox(
    by_zipcode,
    geojson=zipcodes,
    locations='zipcode',
    featureidkey="properties.ZCTA5CE10",
    color='grade_average',
    color_continuous_scale="Viridis",
    range_color=(by_zipcode['grade_average'].min(), by_zipcode['grade_average'].max()),
    labels={'grade_average': 'average house grade'},
    mapbox_style="carto-positron",
    zoom=8,
    center={"lat": 47.553306, "lon": -122.237702},
    opacity=0.5,
)
fig.update_layout(margin=dict(r=0, t=0, l=0, b=0))
fig.show()

# 4. Analysis for Nicole Johnson
### 4.1 Main table for plotting median income by zipcode
- Reference dataframe for all median income zipcodes that might form part of the analysis
- Contains zipcodes of "preferred areas" and "adjacent zipcodes"
- df_house3_1 is based on df_house3