# Isabel's Jupyter NB: Homeownership

In [1]:
import pandas as pd
import numpy as np

### Format homeownership data to be numeric

In [73]:
# read in the data
homeownership = pd.read_csv("Homeownership.csv")
# check types
print("homeownership dftypes BEFORE\n",homeownership.dtypes)

# fix types to be numeric
homeownership[['TotalPopulation','OwnedMortgage','OwnedFree','RenterOccupied']]\
= homeownership.apply(lambda x: x[['TotalPopulation','OwnedMortgage','OwnedFree','RenterOccupied']].str.replace(',','').astype(float), axis=1)

print("\nhomeownership dftypes AFTER\n",homeownership.dtypes)
#homeownership.head(10)

homeownership dftypes BEFORE
 Census Tract       float64
TotalPopulation     object
OwnedMortgage       object
OwnedFree           object
RenterOccupied      object
dtype: object

homeownership dftypes AFTER
 Census Tract       float64
TotalPopulation    float64
OwnedMortgage      float64
OwnedFree          float64
RenterOccupied     float64
dtype: object


### Format the neighborhood data to be numeric

In [66]:
# read in neighborhoods dataset
neighborhood = pd.read_csv("neighborhood.csv")

print("dtype before :", neighborhood['tractce10'].dtype)

# utilize Pandas Series to convert all elements into one datatype
census_tract = pd.Series(neighborhood.iloc[:,4]) # index 4 is tractce10 column

# need to cast census tract nums (currently objects) into floats
# start by editing the format of each element
for i in range(len(census_tract)):
    string = census_tract[i]
    # if not empty string
    if (string != ''):
        firstChar = string[0]
        # run code only if the first char can be cast to a float
        try:
            float(firstChar)
            # if the firstChar is a number that starts with 0 as a placeholder (e.g., 024500)
            if (firstChar == '0' and len(string) > 1):
                # remove the first character
                string = string[1:]
            # remove extra zeros at the end
            census_tract[i] = string[0:len(string)-2]
            
        # first char cannot be converted to a float
        except ValueError:
            census_tract[i] = 0
    
    # if empty string
    else:
        census_tract[i] = 0
# end of for loop

# create dataframe and cast elements to float
census_tracts_mapping = pd.DataFrame(census_tract).astype(float, errors = 'raise')

print("dtype after:", census_tracts_mapping['tractce10'].dtypes)
#census_tracts_mapping.head(10)

dtype before : object
dtype after: float64


### Find neighborhood with large amount of homeowners (payed off mortgage or making payments) compared to renters.
Homeowners...
* make up a higher percentage of partners with children aged K-12 compared to renters
* are typically more wealthy (we want more expensive candy bowls)

This sub-metric also reduces the likelihood of getting neighborhoods with a high volume of undergraduate students.   
  
**Goal**: find neighborhoods with settled families that would participate in giving out candy

In [67]:
# sum OwnedMortgage and OwnedFree columns
mortgage_holders = homeownership['OwnedMortgage']
house_owners = homeownership['OwnedFree']
homeowners = mortgage_holders + house_owners

# find ratio of homeowners to renters
renters = homeownership['RenterOccupied']
ratioSettled2Renting = homeowners/renters
#ratioSettled2Renting.head(4)

In [69]:
# create new dataframe
ratio = pd.DataFrame(ratioSettled2Renting, columns=["Settled/renting"])
census_tracts = pd.DataFrame(homeownership['Census Tract'])
population = pd.DataFrame(homeownership['TotalPopulation'])

my_df = census_tracts.join(ratio).join(population)

my_df.head(10)

Unnamed: 0,Census Tract,Settled/renting,TotalPopulation
0,103.0,0.473684,588.0
1,201.0,0.228461,2538.0
2,203.0,0.107914,616.0
3,305.0,0.355288,2140.0
4,402.0,0.484547,1345.0
5,404.0,0.68196,2163.0
6,405.0,0.080351,2958.0
7,406.0,0.087767,2392.0
8,409.0,0.513414,2877.0
9,501.0,0.544536,1682.0


In [71]:
# sort and filter
# sort: highest ratio of settled/renting
my_dfsorted = my_df.sort_values(by=['Settled/renting'], ascending=False)
# filter: get rid of any tract with less than 500 people
my_dfFiltered = my_dfsorted[my_dfsorted['TotalPopulation'] >= 500]

my_dfFiltered.head(10)

Unnamed: 0,Census Tract,Settled/renting,TotalPopulation
152,4268.0,43.741667,5369.0
244,4753.03,33.151261,4064.0
384,5641.0,32.923077,882.0
122,4100.0,30.648649,1171.0
237,4742.01,28.545455,2600.0
239,4742.03,27.30625,4529.0
125,4120.02,26.331461,4865.0
318,5190.0,23.680328,3011.0
149,4263.0,22.647059,6030.0
130,4134.0,21.97861,4297.0


### Filter out Allegheny census tracts not included in the City of Pittsburgh. 
### Then, map census tract to one of the 90 Pittsburgh neighborhoods.

In [64]:
# list of 90 census tracts mapped to city of Pittsburgh neighborhood
city_of_pgh_cetracts = census_tracts_mapping['tractce10']
# list of census tracts included in the homeowners dataset for allegheny county
allegheny_cetracts = my_dfFiltered['Census Tract']
# create a dictionary of city of pgh census tract -> city of pgh neighborhood
dict = {'census tract': city_of_pgh_cetracts, 'pgh neighborhood': neighborhood['hood']}
dict = pd.DataFrame(dict)

best_tracts_pgh = [0]*len(allegheny_cetracts)
best_nbhds= []
best_match = 6000
limit = 5 # ensure we find the closest census tracts to represent the neighborhood 
cnt = 0
# loop through allegheny census tracts
for i in allegheny_cetracts:
    best_match = 6000
    # loop through all 90 pgh cetracts
    for j in city_of_pgh_cetracts:
        diff = i - j
        # if diff is 0 then we found the neighborhood
        if diff == 0:
            # add i to best_tracts_pgh
            best_tracts_pgh[cnt] = j
            break
        elif abs(diff) < best_match and abs(diff) < limit:
            best_match = diff
            best_tracts_pgh[cnt] = j
        else:
            best_tracts_pgh[cnt] = 0
    cnt = cnt + 1

### Finally, retrieve the top ten neighborhoods based off of homeownership

In [72]:
# retrive best neighborhoods using key-value pairs in dict
for w in best_tracts_pgh:
    index = 0
    if (w!=0):
        for p in dict['census tract']:
            if (w==p):
                best_nbhds.append(dict['pgh neighborhood'][index])
                break
            index = index + 1
                
best_nbhds_homeownership = pd.DataFrame(best_nbhds[0:10])
best_nbhds_homeownership.columns = ['Neighborhood']
best_nbhds_homeownership     

Unnamed: 0,Neighborhood
0,New Homestead
1,Stanton Heights
2,Lincoln Place
3,Swisshelm Park
4,Overbrook
5,Summer Hill
6,Regent Square
7,Brookline
8,Squirrel Hill North
9,Brighton Heights
