# NYC Real Estate Sales Analysis
----
Messing around with a data set of NYC real estate sales to create analysis based on the 254 different neighborhoods

In [1]:
import pandas as pd
import math
from bokeh.plotting import figure, output_file, show
from bokeh.io import output_notebook
output_notebook(hide_banner=True)

sales = pd.read_csv("nyc-sales.csv")
nyc = sales.drop(columns = 'Unnamed: 0')
nyc.head()

Unnamed: 0,BOROUGH,NEIGHBORHOOD,BUILDING CLASS CATEGORY,TAX CLASS AT PRESENT,BLOCK,LOT,EASE-MENT,BUILDING CLASS AT PRESENT,ADDRESS,APARTMENT NUMBER,...,RESIDENTIAL UNITS,COMMERCIAL UNITS,TOTAL UNITS,LAND SQUARE FEET,GROSS SQUARE FEET,YEAR BUILT,TAX CLASS AT TIME OF SALE,BUILDING CLASS AT TIME OF SALE,SALE PRICE,SALE DATE
0,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,392,6,,C2,153 AVENUE B,,...,5,0,5,1633,6440,1900,2,C2,6625000,2017-07-19 00:00:00
1,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,26,,C7,234 EAST 4TH STREET,,...,28,3,31,4616,18690,1900,2,C7,-,2016-12-14 00:00:00
2,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2,399,39,,C7,197 EAST 3RD STREET,,...,16,1,17,2212,7803,1900,2,C7,-,2016-12-09 00:00:00
3,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2B,402,21,,C4,154 EAST 7TH STREET,,...,10,0,10,2272,6794,1913,2,C4,3936272,2016-09-23 00:00:00
4,1,ALPHABET CITY,07 RENTALS - WALKUP APARTMENTS,2A,404,55,,C2,301 EAST 10TH STREET,,...,6,0,6,2369,4615,1900,2,C2,8000000,2016-11-17 00:00:00


## Functions
----

In [2]:
def neighborhood_mean(neighborhood):
    """
    Takes the neighborhood name as input
    Returns the mean square footage
    """
    finder = nyc['NEIGHBORHOOD'] == str(neighborhood)
    found = nyc[finder]
    strip = found['LAND SQUARE FEET'].str.isnumeric()
    found_sqft = found[strip]
    sqft = found_sqft['LAND SQUARE FEET'].astype(int)
    return round(sqft.mean(),2)

def neighborhood_mean_sale(neighborhood):
    """
    Takes the neighborhood name as input
    Returns the mean sale price
    """
    finder = nyc['NEIGHBORHOOD'] == str(neighborhood)
    found = nyc[finder]
    strip = found['SALE PRICE'].str.isnumeric()
    found_sale = found[strip]
    sale = found_sale['SALE PRICE'].astype(int)
    return round(sale.mean(),2)

def neighborhood_median_year(neighborhood):
    """
    Takes the neighborhood name as input
    Returns the median year of homes built
    """
    finder = nyc['NEIGHBORHOOD'] == str(neighborhood)
    found = nyc[finder].astype(str)
    strip = found['YEAR BUILT'].str.isnumeric()
    found_year = found[strip]
    year = found_year['YEAR BUILT'].astype(int)
    return round(year.median())

def sorter(column):
    """
    Takes the column in the new data frame as input
    Returns the top 10 in the given category
    """
    print('Neighborhood DataFrame sorted (descending) by ' + str(column))
    return neighborhoods.sort_values(by=[str(column)], ascending=False)[:10]

## Visual Analysis
---

In [3]:
# Filtering neighborhoods
n = nyc['NEIGHBORHOOD'].value_counts()
neighborhoods = pd.DataFrame(n)
neighborhoods = neighborhoods.reset_index() # Fixes column/index issue
neighborhoods.columns = ['Neighborhood', 'Sale Count']
neighborhoods['Neighborhood'] = neighborhoods['Neighborhood'].astype('str')

# Slicing
top20 = neighborhoods[:20]
other = neighborhoods[20:]
other_count = sum(other['Sale Count'])
top20_count = sum(top20['Sale Count'])
percentage = (top20_count / other_count) * 100 # percentage of the top 20 vs total neighborhoods

# Plotting
names = top20['Neighborhood'].tolist()
counts = top20['Sale Count'].tolist()

print('The sales in these 20 neighborhoods make up ' + str(round(percentage, 2)) +
    '% of the ' + str(len(neighborhoods)) + ' neighborhoods in NYC')

p = figure(x_range=names, plot_height=500, title="Sale Counts for the Top 20 NYC Neighborhoods",
           toolbar_location=None)
p.vbar(x=names, top=counts, width=0.9)
p.xgrid.grid_line_color = None
p.y_range.start = 0
p.xaxis.major_label_orientation = math.pi/2
p.yaxis[0].axis_label = 'Sale Count'
show(p)

The sales in these 20 neighborhoods make up 40.85% of the 254 neighborhoods in NYC


## Creating a new Data Frame
---
Using a 'foor loop' to add new columns to the neighborhoods data frame.

In [4]:
tester = neighborhoods['Neighborhood'].tolist()

# for loop to populate new lists using created functions
result = []
sale_result = []
year_result = []
for i in tester:
    result.append(neighborhood_mean(i))
    sale_result.append(neighborhood_mean_sale(i))
    year_result.append(neighborhood_median_year(i))

# Lists into new rows
neighborhoods['Avg Sqft'] = result
neighborhoods['Avg Sale'] = sale_result
neighborhoods['Med Year'] = year_result

sorter('Med Year')

Neighborhood DataFrame sorted (descending) by Med Year


Unnamed: 0,Neighborhood,Sale Count,Avg Sqft,Avg Sale,Med Year
209,SPRING CREEK,70,17815.59,456187.24,2016
214,STAPLETON-CLIFTON,66,2797.03,404265.15,2015
225,NAVY YARD,47,1274.19,1099145.45,2014
108,DOWNTOWN-FULTON MALL,276,814.21,2880535.33,2012
67,WILLIAMSBURG-SOUTH,427,505.22,1384813.96,2009
97,CIVIC CENTER,323,3432.68,7105168.25,2007
199,HAMMELS,85,4078.22,538241.43,2002
220,ROSSVILLE-CHARLESTON,62,25709.76,509589.95,2000
103,WILLIAMSBURG-CENTRAL,291,622.05,403442.61,1996
237,CONCORD-FOX HILLS,28,1465.88,308214.8,1995


----
Data set pulled from Kaggle