In [53]:
# Import Dependencies
import pandas as pd
from pathlib import Path

## Cleaning Data

In [54]:
# Create path
housing_csv = Path("../Resources/realtor-data.csv")

In [55]:
# Use pandas to read in housing data
housing_df = pd.read_csv(housing_csv)
housing_df

Unnamed: 0,status,bed,bath,acre_lot,city,state,zip_code,house_size,prev_sold_date,price
0,for_sale,3.0,2.0,0.12,Adjuntas,Puerto Rico,601.0,920.0,,105000.0
1,for_sale,4.0,2.0,0.08,Adjuntas,Puerto Rico,601.0,1527.0,,80000.0
2,for_sale,2.0,1.0,0.15,Juana Diaz,Puerto Rico,795.0,748.0,,67000.0
3,for_sale,4.0,2.0,0.10,Ponce,Puerto Rico,731.0,1800.0,,145000.0
4,for_sale,6.0,2.0,0.05,Mayaguez,Puerto Rico,680.0,,,65000.0
...,...,...,...,...,...,...,...,...,...,...
1204061,for_sale,3.0,3.0,0.61,North Greenbush,New York,12198.0,1900.0,,536250.0
1204062,for_sale,3.0,3.0,0.48,North Greenbush,New York,12198.0,2031.0,,550000.0
1204063,for_sale,4.0,3.0,1.00,North Greenbush,New York,12198.0,2480.0,,449900.0
1204064,for_sale,3.0,3.0,4.79,North Greenbush,New York,12198.0,2950.0,,657850.0


In [56]:
# Select columns to keep
housing_df_reduced = housing_df[['bed', 'bath', 'acre_lot', 'city', 'state', 'zip_code', 'house_size', 'price']]
housing_df_reduced

Unnamed: 0,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,3.0,2.0,0.12,Adjuntas,Puerto Rico,601.0,920.0,105000.0
1,4.0,2.0,0.08,Adjuntas,Puerto Rico,601.0,1527.0,80000.0
2,2.0,1.0,0.15,Juana Diaz,Puerto Rico,795.0,748.0,67000.0
3,4.0,2.0,0.10,Ponce,Puerto Rico,731.0,1800.0,145000.0
4,6.0,2.0,0.05,Mayaguez,Puerto Rico,680.0,,65000.0
...,...,...,...,...,...,...,...,...
1204061,3.0,3.0,0.61,North Greenbush,New York,12198.0,1900.0,536250.0
1204062,3.0,3.0,0.48,North Greenbush,New York,12198.0,2031.0,550000.0
1204063,4.0,3.0,1.00,North Greenbush,New York,12198.0,2480.0,449900.0
1204064,3.0,3.0,4.79,North Greenbush,New York,12198.0,2950.0,657850.0


In [57]:
# Drop null values
housing_df_clean = housing_df_reduced.dropna(how='any')
housing_df_clean

Unnamed: 0,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,3.0,2.0,0.12,Adjuntas,Puerto Rico,601.0,920.0,105000.0
1,4.0,2.0,0.08,Adjuntas,Puerto Rico,601.0,1527.0,80000.0
2,2.0,1.0,0.15,Juana Diaz,Puerto Rico,795.0,748.0,67000.0
3,4.0,2.0,0.10,Ponce,Puerto Rico,731.0,1800.0,145000.0
5,4.0,3.0,0.46,San Sebastian,Puerto Rico,612.0,2520.0,179000.0
...,...,...,...,...,...,...,...,...
1204060,4.0,3.0,0.56,North Greenbush,New York,12198.0,2231.0,581625.0
1204061,3.0,3.0,0.61,North Greenbush,New York,12198.0,1900.0,536250.0
1204062,3.0,3.0,0.48,North Greenbush,New York,12198.0,2031.0,550000.0
1204063,4.0,3.0,1.00,North Greenbush,New York,12198.0,2480.0,449900.0


Because the price of homes are heavily dependent on location, we are chosing to focus on the state of New York.  This will make the relationship between the house features and price more comparable for analysis. 

## More Cleaning

In [58]:
value_NY = 'New York'
filtered_df = housing_df_clean[housing_df_clean['state'] == value_NY]
filtered_df

Unnamed: 0,bed,bath,acre_lot,city,state,zip_code,house_size,price
30149,3.0,1.0,60.00,Berlin,New York,12022.0,1176.0,175000.0
54248,3.0,2.0,2.02,Claverack,New York,12521.0,1600.0,425000.0
54258,4.0,2.0,0.24,Copake,New York,12521.0,1239.0,225000.0
54259,3.0,3.0,1.90,Copake,New York,12516.0,1800.0,419000.0
54262,3.0,2.0,2.00,Copake,New York,12517.0,1482.0,365000.0
...,...,...,...,...,...,...,...,...
1204060,4.0,3.0,0.56,North Greenbush,New York,12198.0,2231.0,581625.0
1204061,3.0,3.0,0.61,North Greenbush,New York,12198.0,1900.0,536250.0
1204062,3.0,3.0,0.48,North Greenbush,New York,12198.0,2031.0,550000.0
1204063,4.0,3.0,1.00,North Greenbush,New York,12198.0,2480.0,449900.0


In [59]:
# Find number of unique cities in New York
unique_ny = housing_df_clean['city'].nunique()
unique_ny

2641

In [60]:
# Find value counts of cities in New York
count_ny = filtered_df['city'].value_counts()
count_ny

New York City        10190
Brooklyn             10038
Bronx                 8353
Staten Island         7465
Albany                5375
                     ...  
Olive                    1
North Hudson             1
Cottekill                1
Cold Spring Hrbr         1
Richfield Springs        1
Name: city, Length: 849, dtype: int64

In [61]:
# Save the clean df to csv
filtered_df.to_csv("../Resources/ny_data.csv")

## Even More Cleaning

In [62]:
# Reduce dataframe to homes with less than 5 bedrooms and 5 bathrooms
cleanest_data = filtered_df[(filtered_df['bed'] < 5) & (filtered_df['bath'] < 5)]
cleanest_data

Unnamed: 0,bed,bath,acre_lot,city,state,zip_code,house_size,price
30149,3.0,1.0,60.00,Berlin,New York,12022.0,1176.0,175000.0
54248,3.0,2.0,2.02,Claverack,New York,12521.0,1600.0,425000.0
54258,4.0,2.0,0.24,Copake,New York,12521.0,1239.0,225000.0
54259,3.0,3.0,1.90,Copake,New York,12516.0,1800.0,419000.0
54262,3.0,2.0,2.00,Copake,New York,12517.0,1482.0,365000.0
...,...,...,...,...,...,...,...,...
1204060,4.0,3.0,0.56,North Greenbush,New York,12198.0,2231.0,581625.0
1204061,3.0,3.0,0.61,North Greenbush,New York,12198.0,1900.0,536250.0
1204062,3.0,3.0,0.48,North Greenbush,New York,12198.0,2031.0,550000.0
1204063,4.0,3.0,1.00,North Greenbush,New York,12198.0,2480.0,449900.0


In [63]:
# Save the clean df to csv
cleanest_data.to_csv("../Resources/ny_data_cleanest.csv")

## Exploring the cleaned data

In [64]:
# house_size, highest to lowest
price = cleanest_data.sort_values(["price"], ascending=False)
price = price.reset_index(drop=True)
price

Unnamed: 0,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,4.0,4.0,32.00,Sagaponack,New York,11962.0,3800.0,49500000.0
1,4.0,4.0,32.00,Sagaponack,New York,11962.0,3800.0,49500000.0
2,4.0,4.0,32.00,Sagaponack,New York,11962.0,3800.0,49500000.0
3,4.0,4.0,2.55,Bridgehampton,New York,11932.0,3846.0,47500000.0
4,4.0,4.0,2.55,Bridgehampton,New York,11932.0,3846.0,47500000.0
...,...,...,...,...,...,...,...,...
121315,3.0,1.0,0.38,Schenectady,New York,12304.0,1144.0,8000.0
121316,3.0,1.0,0.38,Schenectady,New York,12304.0,1144.0,8000.0
121317,3.0,3.0,0.69,Gloversville,New York,12078.0,2904.0,1.0
121318,3.0,3.0,0.69,Gloversville,New York,12078.0,2904.0,1.0


In [65]:
# acre_lot, highest to lowest 
acres = cleanest_data.sort_values(["acre_lot"], ascending=False)
acres = acres.reset_index(drop=True)
acres

Unnamed: 0,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,3.0,2.0,100000.0,Bethlehem,New York,12158.0,1396.0,160000.0
1,2.0,1.0,100000.0,Whitehall,New York,12887.0,1352.0,129000.0
2,3.0,2.0,100000.0,Bethlehem,New York,12158.0,1396.0,160000.0
3,3.0,2.0,100000.0,Bethlehem,New York,12158.0,1396.0,160000.0
4,3.0,2.0,100000.0,Bethlehem,New York,12158.0,1396.0,160000.0
...,...,...,...,...,...,...,...,...
121315,4.0,4.0,0.0,Brooklyn,New York,11233.0,2544.0,2300000.0
121316,3.0,4.0,0.0,Cortlandt Manor,New York,10567.0,3100.0,729000.0
121317,3.0,3.0,0.0,Brooklyn,New York,11236.0,1157.0,739000.0
121318,4.0,4.0,0.0,Brooklyn,New York,11233.0,2544.0,2300000.0


In [66]:
# house_size, highest to lowest 
size = filtered_df.sort_values(["house_size"], ascending=False)
size = size.reset_index(drop=True)
size

Unnamed: 0,bed,bath,acre_lot,city,state,zip_code,house_size,price
0,3.0,3.0,9.50,Palenville,New York,12463.0,352836.0,419000.0
1,3.0,3.0,9.50,Palenville,New York,12463.0,352836.0,419000.0
2,3.0,3.0,9.50,Palenville,New York,12463.0,352836.0,419000.0
3,3.0,3.0,9.50,Palenville,New York,12463.0,352836.0,419000.0
4,1.0,1.0,2.80,Port Chester,New York,10573.0,112714.0,69999.0
...,...,...,...,...,...,...,...,...
170494,3.0,4.0,0.03,Staten Island,New York,10304.0,122.0,568888.0
170495,3.0,4.0,0.03,Staten Island,New York,10304.0,122.0,568888.0
170496,3.0,4.0,0.03,Staten Island,New York,10304.0,122.0,568888.0
170497,3.0,4.0,0.03,Staten Island,New York,10304.0,122.0,568888.0


## Plots


In [67]:
import matplotlib.pyplot as plt
import numpy as np