In [1]:
# Import Dependencies
import pandas as pd
import numpy as np

In [2]:
# Load csv into df
housing_data = pd.read_csv("../Resources/housingDataUpdated.csv")
housing_data.head()

Unnamed: 0,address,price,home_type,bedrooms,bathrooms,square_feet,built,lot_size,neighborhood,county,city,zipcode,high_school,middle_school,elementary_school
0,"3157 NE MARINE DR, Portland OR 97035",65000,Floating Home - 1 Story,1,1.0,800,1964,,unknown,Multnomah,Portland,97035,Current Price:,Jefferson,Faubion
1,"17452 NE GLISAN ST #7, Portland OR 97230",72000,Manufactured - Double Wide Manufact,2,2.0,1152,1988,,unknown,Multnomah,Portland,97230,Reynolds,Reynolds,Hartley
2,"9034 SE 78TH PL, Portland OR 97206",79950,Manufactured - Double Wide Manufact,3,2.0,1344,1997,,unknown,Clackamas,Portland,97206,Current Price:,Milwaukie,Whitman
3,"16000 SE POWELL BLVD 75, Portland OR 97236",79950,Manufactured - Double Wide Manufact,3,2.0,1404,1990,,unknown,Multnomah,Portland,97236,Centennial,Centennial,Powell Butte
4,"12846 SE RAMONA ST 6, Portland OR 97236",93900,Manufactured - Double Wide Manufact,3,2.0,1297,1997,,unknown,Multnomah,Portland,97236,David Douglas,Alice Ott,Gilbert Hts


In [3]:
# Find lot_size with null values
null_lots = housing_data.loc[housing_data["lot_size"].isnull(), :]
null_lots.shape

(797, 15)

In [4]:
# Comparing home type data for simplification
home_type = housing_data[["home_type","price"]]
ht_cost = home_type.groupby(["home_type"]).mean()
ht_cost.sort_values(by=["price"], inplace=True)
ht_cost.head()

Unnamed: 0_level_0,price
home_type,Unnamed: 1_level_1
Manufactured - Double Wide Manufact,81450.0
Floating Home - Cabin,129500.0
Floating Home - Other,179000.0
Condo - Tri Level,185000.0
Floating Home - Manufactured Home,199999.0


In [5]:
# Simplify home types 
for i in housing_data.index:
    if "Floating" in housing_data.at[i, "home_type"]:
        housing_data.at[i, "home_type"] = "Floating"
    if "Condo" in housing_data.at[i, "home_type"]:
        housing_data.at[i, "home_type"] = "Condo"
    if "Single Family" in housing_data.at[i, "home_type"]:
        housing_data.at[i, "home_type"] = "Single Family"
    if "Manufactured" in housing_data.at[i, "home_type"]:
        housing_data.at[i, "home_type"] = "Manufactured"
    
housing_data.home_type.unique()   

array(['Floating', 'Manufactured', 'Condo', 'Single Family'], dtype=object)

In [6]:
# Print data to compare how many data points lost
print(f'Current Amount of Listings: {len(housing_data)}')

# Change lot size to 0 for floating homes and condos
for i in housing_data.index:
    if housing_data.at[i, "home_type"] == "Floating":
        housing_data.at[i, "lot_size"] = 0
    if housing_data.at[i, "home_type"] == "Condo":
        housing_data.at[i, "lot_size"] = 0

# Drop listing with null lot_size
cleaned_housing_data = housing_data.drop(housing_data[housing_data["lot_size"].isnull()].index)
      
# Print length of data
print(f'Updated Amount of Listings: {len(cleaned_housing_data)}')

Current Amount of Listings: 2246
Updated Amount of Listings: 2133


In [7]:
# Drop listings with unclear Highschool data
cleaned_housing_data.drop(cleaned_housing_data[cleaned_housing_data.high_school == "Current Price:"].index, inplace = True)
cleaned_housing_data.drop(cleaned_housing_data[cleaned_housing_data.high_school == "Other"].index, inplace = True)
cleaned_housing_data.shape

(2121, 15)

In [8]:
# Create a cost ranker based on zipcode
zipcode = cleaned_housing_data[["price","zipcode"]]
zipcodeAVG = zipcode.groupby(["zipcode"]).mean().sort_values(by=["price"], ascending=False)
zipcodeRanker = zipcodeAVG.reset_index(drop=False)
zipcodeRanker.reset_index(drop=False, inplace=True)
zipcodeRanker.rename(columns={"index":"zipcode_rank","price":"zipcodeAVGcost"}, inplace=True)
zipcodeRanker["zipcode_rank"]=zipcodeRanker["zipcode_rank"]+1


# Merge into df
cleaned_housing_data2 = pd.merge(cleaned_housing_data, zipcodeRanker, on="zipcode")
cleaned_housing_data2.rename(columns={"price_y":"zipcodeAVGcost"}, inplace = True)
cleaned_housing_data2.head()

Unnamed: 0,address,price,home_type,bedrooms,bathrooms,square_feet,built,lot_size,neighborhood,county,city,zipcode,high_school,middle_school,elementary_school,zipcode_rank,zipcodeAVGcost
0,"19609 NE Marine DR E-4, Portland OR 97230",129500,Floating,1,1.0,735,1960,0.0,unknown,Multnomah,Portland,97230,Reynolds,Reynolds,Salish Pond,29,412757.415584
1,"3389 NE 162ND AVE, Portland OR 97230",160000,Condo,2,2.0,1073,1979,0.0,Fremont Village Park,Multnomah,Portland,97230,Reynolds,H.B. Lee,Margaret Scott,29,412757.415584
2,"19609 NE MARINE DR E1, Portland OR 97230",224500,Floating,3,2.0,1150,1945,0.0,Big Eddy Marina,Multnomah,Portland,97230,Reynolds,Reynolds,Salish Pond,29,412757.415584
3,"15041 NE SISKIYOU CT, Portland OR 97230",229900,Condo,2,2.0,1638,1973,0.0,unknown,Multnomah,Portland,97230,Reynolds,H.B. Lee,Scott,29,412757.415584
4,"15025 NE SACRAMENTO ST 56, Portland OR 97230",239000,Condo,2,2.0,1128,1986,0.0,SUMMERPLACE,Multnomah,Portland,97230,Reynolds,H.B. Lee,Margaret Scott,29,412757.415584


In [9]:
# Create district df
school_dict = ({"high_school" : ['Reynolds', 'Parkrose', 'David Douglas', 'Centennial', 'Cleveland',
        'Lincoln', 'Madison', 'Jefferson', 'Roosevelt', 'Sunset','Westview', 'Liberty', 'Beaverton', 
        'Grant', 'Southridge', 'Tigard', 'Wilson', 'Riverdale', 'Lake Oswego', 'Franklin',
        'Tualatin', 'Milwaukie', 'Scappoose'], "district" : ['Reynolds', 'Parkrose','David Douglas',
        'Centennial', 'Portland Public', 'Portland Public', 'Portland Public', 'Portland Public',
        'Portland Public', 'Beaverton', 'Beaverton', 'Hillsboro', 'Beaverton', 'Portland Public',
        'Beaverton', 'Tigard-Tualatin', 'Portland Public', 'Riverdale', 'Lake Oswego', 'Portland Public',
        'Tigard-Tualatin', 'North Clackamas', 'Scappose']})
district_df = pd.DataFrame (school_dict)

# Merge into OG df
cleaned_housing_data3 = pd.merge(cleaned_housing_data2, district_df, on="high_school")
cleaned_housing_data3.head()

Unnamed: 0,address,price,home_type,bedrooms,bathrooms,square_feet,built,lot_size,neighborhood,county,city,zipcode,high_school,middle_school,elementary_school,zipcode_rank,zipcodeAVGcost,district
0,"19609 NE Marine DR E-4, Portland OR 97230",129500,Floating,1,1.0,735,1960,0.0,unknown,Multnomah,Portland,97230,Reynolds,Reynolds,Salish Pond,29,412757.415584,Reynolds
1,"3389 NE 162ND AVE, Portland OR 97230",160000,Condo,2,2.0,1073,1979,0.0,Fremont Village Park,Multnomah,Portland,97230,Reynolds,H.B. Lee,Margaret Scott,29,412757.415584,Reynolds
2,"19609 NE MARINE DR E1, Portland OR 97230",224500,Floating,3,2.0,1150,1945,0.0,Big Eddy Marina,Multnomah,Portland,97230,Reynolds,Reynolds,Salish Pond,29,412757.415584,Reynolds
3,"15041 NE SISKIYOU CT, Portland OR 97230",229900,Condo,2,2.0,1638,1973,0.0,unknown,Multnomah,Portland,97230,Reynolds,H.B. Lee,Scott,29,412757.415584,Reynolds
4,"15025 NE SACRAMENTO ST 56, Portland OR 97230",239000,Condo,2,2.0,1128,1986,0.0,SUMMERPLACE,Multnomah,Portland,97230,Reynolds,H.B. Lee,Margaret Scott,29,412757.415584,Reynolds


In [10]:
# Create a cost ranker based on high schools
hs = cleaned_housing_data3[["price","high_school"]]
hsAVG = hs.groupby(["high_school"]).mean().sort_values(by=["price"], ascending=False)
hsRanker = hsAVG.reset_index(drop=False)
hsRanker.reset_index(drop=False, inplace=True)
hsRanker.rename(columns={"index":"hs_rank","price":"hsAVGcost"}, inplace=True)
hsRanker["hs_rank"]= hsRanker["hs_rank"]+1

# Create a cost ranker based on districts
district = cleaned_housing_data3[["price","district"]]
districtAVG = district.groupby(["district"]).mean().sort_values(by=["price"], ascending=False)
districtRanker = districtAVG.reset_index(drop=False)
districtRanker.reset_index(drop=False, inplace=True)
districtRanker.rename(columns={"index":"district_rank","price":"districtAVGcost"}, inplace=True)
districtRanker["district_rank"]= districtRanker["district_rank"]+1

In [11]:
# Merge high school and district rankers 
cleaned_housing_data4 = pd.merge(cleaned_housing_data3, hsRanker, on="high_school")
cleaned_housing_data_5 = pd.merge(cleaned_housing_data4, districtRanker, on="district")
cleaned_housing_data_final = cleaned_housing_data_5[['address', 'price', 'home_type', 'bedrooms', 
                                'bathrooms', 'square_feet', 'built', 'lot_size', 'neighborhood', 
                                'county', 'city', 'zipcode', 'zipcode_rank', 'zipcodeAVGcost',
                                'elementary_school', 'middle_school', 'high_school','hs_rank', 
                                'hsAVGcost', 'district', 'district_rank', 'districtAVGcost']]

cleaned_housing_data_final.head()

Unnamed: 0,address,price,home_type,bedrooms,bathrooms,square_feet,built,lot_size,neighborhood,county,...,zipcode_rank,zipcodeAVGcost,elementary_school,middle_school,high_school,hs_rank,hsAVGcost,district,district_rank,districtAVGcost
0,"19609 NE Marine DR E-4, Portland OR 97230",129500,Floating,1,1.0,735,1960,0.0,unknown,Multnomah,...,29,412757.415584,Salish Pond,Reynolds,Reynolds,21,396434.078125,Reynolds,10,396434.078125
1,"3389 NE 162ND AVE, Portland OR 97230",160000,Condo,2,2.0,1073,1979,0.0,Fremont Village Park,Multnomah,...,29,412757.415584,Margaret Scott,H.B. Lee,Reynolds,21,396434.078125,Reynolds,10,396434.078125
2,"19609 NE MARINE DR E1, Portland OR 97230",224500,Floating,3,2.0,1150,1945,0.0,Big Eddy Marina,Multnomah,...,29,412757.415584,Salish Pond,Reynolds,Reynolds,21,396434.078125,Reynolds,10,396434.078125
3,"15041 NE SISKIYOU CT, Portland OR 97230",229900,Condo,2,2.0,1638,1973,0.0,unknown,Multnomah,...,29,412757.415584,Scott,H.B. Lee,Reynolds,21,396434.078125,Reynolds,10,396434.078125
4,"15025 NE SACRAMENTO ST 56, Portland OR 97230",239000,Condo,2,2.0,1128,1986,0.0,SUMMERPLACE,Multnomah,...,29,412757.415584,Margaret Scott,H.B. Lee,Reynolds,21,396434.078125,Reynolds,10,396434.078125


In [12]:
# # Save to csv
# cleaned_housing_data_final.to_csv("../Resources/housingDataUpdatedandCleaned.csv", index=False)

# Save to csv (conversion models from zipcode/district to rank and/or AVGcost)
districtRanker.to_csv("../Resources/district.csv", index=False)
zipcodeRanker.to_csv("../Resources/zipcode.csv", index=False)

In [13]:
print(districtRanker.district.tolist())
print(zipcodeRanker.zipcode.tolist())

['Riverdale', 'Scappose', 'Lake Oswego', 'Tigard-Tualatin', 'Portland Public', 'Beaverton', 'Hillsboro', 'Parkrose', 'Centennial', 'Reynolds', 'David Douglas', 'North Clackamas']
[97035, 97231, 97210, 97221, 97219, 97239, 97212, 97202, 97225, 97215, 97201, 97209, 97229, 97223, 97214, 97224, 97205, 97211, 97227, 97213, 97232, 97217, 97218, 97204, 97203, 97222, 97206, 97220, 97230, 97236, 97216, 97266, 97233]


In [14]:
# # Drop duplicate data
# duplicates_check = pd.read_csv("../Resources/housingDataUpdated.csv")
# print(duplicates_check.shape)
# bye_bye_dups = duplicates_check.drop_duplicates()
# bye_bye_dups.to_csv("../Resources/housingDataUpdated.csv", index=False)
# print(bye_bye_dups.shape)

In [16]:
cleaned_housing_data_final["house_age"] = 2020 - cleaned_housing_data_final["built"]
cleaned_housing_data_final.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,address,price,home_type,bedrooms,bathrooms,square_feet,built,lot_size,neighborhood,county,...,zipcodeAVGcost,elementary_school,middle_school,high_school,hs_rank,hsAVGcost,district,district_rank,districtAVGcost,house_age
0,"19609 NE Marine DR E-4, Portland OR 97230",129500,Floating,1,1.0,735,1960,0.0,unknown,Multnomah,...,412757.415584,Salish Pond,Reynolds,Reynolds,21,396434.078125,Reynolds,10,396434.078125,60
1,"3389 NE 162ND AVE, Portland OR 97230",160000,Condo,2,2.0,1073,1979,0.0,Fremont Village Park,Multnomah,...,412757.415584,Margaret Scott,H.B. Lee,Reynolds,21,396434.078125,Reynolds,10,396434.078125,41
2,"19609 NE MARINE DR E1, Portland OR 97230",224500,Floating,3,2.0,1150,1945,0.0,Big Eddy Marina,Multnomah,...,412757.415584,Salish Pond,Reynolds,Reynolds,21,396434.078125,Reynolds,10,396434.078125,75
3,"15041 NE SISKIYOU CT, Portland OR 97230",229900,Condo,2,2.0,1638,1973,0.0,unknown,Multnomah,...,412757.415584,Scott,H.B. Lee,Reynolds,21,396434.078125,Reynolds,10,396434.078125,47
4,"15025 NE SACRAMENTO ST 56, Portland OR 97230",239000,Condo,2,2.0,1128,1986,0.0,SUMMERPLACE,Multnomah,...,412757.415584,Margaret Scott,H.B. Lee,Reynolds,21,396434.078125,Reynolds,10,396434.078125,34
