# Analyzing ACS 2016 and standardized decennial Census data from 2000

This script combines and analyzes data from the 5-year estimates of the American Community Survey (2012-2016) and standardized decennial Census data from the [US2010 Longitudinal Tract Data Base](https://s4.ad.brown.edu/Projects/Diversity/Researcher/Bridging.htm) to determine the following:
* whether a tract gentrified or not according to [this methodology](http://www.governing.com/gov-data/gentrification-report-methodology.html)
* percent-point changes for different, non-overlapping racial groups

In [1]:
import pandas as pd

#### First load data and filter it down to metro level

* Load data from the [US2010 Longitudinal Tract Data Base](https://s4.ad.brown.edu/Projects/Diversity/Researcher/Bridging.htm) (used the [dictionary](https://s4.ad.brown.edu/Projects/Diversity/Researcher/LTBDDload/Dfiles/codebooks.pdf) to look up column names)
* Load data from 2016 ACS


In [2]:
CSV_OPTIONS_2000 = dict(
    encoding = 'ISO-8859-1',
    dtype = {
        'GEOID': str,
        'MHMVAL00': float,
        'HINC00': float
    },
    na_values = [ " " ]
)

In [3]:
census_data_2000 = pd.read_csv(
    '../data/edited_LTDB_Std_2000_fullcount_sample.csv',
    **CSV_OPTIONS_2000
)


print(len(census_data_2000))
census_data_2000.head()

4700


Unnamed: 0.1,Unnamed: 0,GEOID,total_population00,median_income00,median_home_value00,educational_attainment_00,white_alone00,black_alone00,native00,asian00,native_hawaiian_pacific_islander00,hispanic_or_latino00
0,0,34003001000,6683.181012,105711.9486,414634.7,2726.303365,6034.576265,26.959115,4.989669,435.983661,0.0,169.665195
1,1,34003002100,2183.0,130740.0,1000001.0,961.0,1647.0,37.0,6.0,435.0,0.0,55.0
2,2,34003002200,4851.0,74423.0,251500.0,1023.0,3889.0,56.0,3.0,657.0,0.0,230.0
3,3,34003002300,5751.0,92447.0,340800.0,1669.0,4356.0,70.0,1.0,1122.0,0.0,172.0
4,4,34003003100,5057.0,65750.0,179700.0,1021.0,2880.0,260.0,10.0,1067.0,0.0,816.0


Rename columns

In [4]:
census_data_2000 = census_data_2000.rename( columns = {
    'GEOID': 'GEOID', 
    'POP00':'total_population00', # total population    
    'NHWHT00': 'white_alone00', # persons of white race, not Hispanic origin -- equivalent of 2010 "white alone"
    'NHBLK00':'black_alone00', # persons of black race, not Hispanic origin -- equivalent of 2010 "black alone"
    'NTV00':'native00', #  persons of Native American race
    'ASIAN00':'asian00', # Asian and Pacific Islander race
    'HAW00':'native_hawaiian_pacific_islander00',  # persons of Hawaiian race
    'HISP00':'hispanic_or_latino00', # persons of Hispanic origin 
    'HINC00': 'median_income00', # Median household income, total
    'MHMVAL00':'median_home_value00', # Median home value
    'COL00':'educational_attainment_00' # percentage of persons with at least a four-year college degree
})

        
print("census: " + str(len(census_data_2000)))
print(len(census_data_2000))

census: 4700
4700


Replace median home and income values that are 0 with null

#### Join with ny metrolevel data

* load data containing 2016 data
* add columnn with nyc data

In [5]:
census_2016_data = pd.read_csv(
    '../output/2016_census_data.csv',
    dtype = {'geoid': str},
    na_values = [ -666666666 ]
).rename(
    columns = {'geoid':'GEOID'}
)

print(len(census_2016_data))

4700


In [6]:
census_merged = pd.merge(
    census_2016_data,
    census_data_2000[[
        'GEOID', 
        'total_population00',
        'median_income00',
        'median_home_value00',
        'educational_attainment_00',
        'white_alone00',
        'black_alone00',
        'native00',
        'asian00',
        'native_hawaiian_pacific_islander00',
        'hispanic_or_latino00'
    ]], 
    on = "GEOID",
    how = 'left'
)



In [7]:
census_merged['median_home_value00'] = census_merged['median_home_value00'].replace(0, pd.np.nan)

In [8]:
(census_merged == 0).sum()

GEOID                                    0
name                                     0
total_population                        51
median_income                            0
median_home_value                        0
educational_attainment                  59
white_alone                            101
black_alone                            312
native                                3151
asian                                  369
native_hawaiian_pacific_islander      4495
some_other_race_alone                 2748
two_or_more                            693
hispanic_or_latino                      74
total_population00                       7
median_income00                          0
median_home_value00                      0
educational_attainment_00               34
white_alone00                           10
black_alone00                           23
native00                               184
asian00                                 33
native_hawaiian_pacific_islander00    3577
hispanic_or

In [9]:
census_merged.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4700 entries, 0 to 4699
Data columns (total 24 columns):
GEOID                                 4700 non-null object
name                                  4700 non-null object
total_population                      4700 non-null int64
median_income                         4610 non-null float64
median_home_value                     4416 non-null float64
educational_attainment                4700 non-null float64
white_alone                           4700 non-null float64
black_alone                           4700 non-null float64
native                                4700 non-null float64
asian                                 4700 non-null float64
native_hawaiian_pacific_islander      4700 non-null float64
some_other_race_alone                 4700 non-null float64
two_or_more                           4700 non-null float64
hispanic_or_latino                    4700 non-null float64
total_population00                    4673 non-null float

In [10]:
census_merged.to_csv(
    '../output/census_data_nyc_metro.csv',
    index=False
)

# The gentrification measure
It is based on two tests detailed [here](http://www.governing.com/gov-data/gentrification-report-methodology.html).

#### Test 1: does the tract qualify for gentrification?

* The tract had a population of at least 500 residents at the beginning and end of a decade and was located within a central city

* The tract’s median household income was in the bottom 40th percentile when compared to all tracts within its metro area at the beginning of the decade.

* The tract’s median home value was in the bottom 40th percentile when compared to all tracts within its metro area at the beginning of the decade.

#### Test 2: has it gentrified?

* An increase in a tract's educational attainment, as measured by the percentage of residents age 25 and over holding bachelor’s degrees, was in the top third percentile of all tracts within a metro area.

* A tract’s median home value increased when adjusted for inflation.

* The percentage increase in a tract’s inflation-adjusted median home value was in the top third percentile of all tracts within a metro area.

*Calculate the metro-level variables needed for these tests on a city-wide level:*

* 40th percentile household income threshold for metro level data 
* 40th percentile median home value for metro level
* bachelor's degree — top third percentile for metro level
* inflation-adjusted median home value — top third percentile for all metro level

Add new columns for pct of tract population with college degree (subtracting the percentages takes into consideration the changes in total population within each tract over time).

In [11]:
census_merged['educational_attainment_pct'] = (
    100 * census_merged['educational_attainment'] /
    census_merged['total_population']
)

census_merged['educational_attainment_pct00'] = (
    100 * census_merged['educational_attainment_00'] / 
    census_merged['total_population00']
)

census_merged['educational_attainment_change'] = (
    census_merged['educational_attainment_pct'] - 
    census_merged['educational_attainment_pct00']
)

### Calculate percentiles 

Data notes on information needed to be cleaned or to calculate inflation: 
* [BLS inflation rate](https://www.bls.gov/cpi/#): \$1 in 2000 was worth \$1.40 in 2016
* [Census data dictionary](https://www.census.gov/data/developers/data-sets/acs-5year/data-notes.html) suggests that `-666666666` is a placeholder value for data that is now available:
```A '-' entry in the estimate column indicates that either no sample observations or too few sample observations were available to compute an estimate, or a ratio of medians cannot be calculated because one or both of the median estimates falls in the lowest interval or upper interval of an open-ended distribution.```

In [12]:
inflation_rate  = 1.40

Now calculate the varios thresholds for our gentrification tests: 
* `40th percentile household income threshold for metro level data`
* `40th percentile median home value for metro level` 
* `inflation-adjusted median home value — top third percentile for all metro level`

In [13]:
median_home_2000_bottom_40th = census_merged['median_home_value00'].quantile(0.4)

median_income_2000_bottom_40th = census_merged['median_income00'].quantile(0.4)

census_merged['home_pct_change'] = (
    (
        census_merged['median_home_value'] - # current median home value
        (census_merged['median_home_value00'] * inflation_rate)  # previous median home value adjusted for inflation
    ) / 
    census_merged['median_home_value'] # current median home value 
)

median_home_change_top3rd = census_merged[census_merged['home_pct_change'] !=None ]['home_pct_change'].quantile(0.66)

print("Bottom 40th percentile household income threshold for metro level data: " , median_home_2000_bottom_40th)
print("Bottom 40th percentile median home value for metro level data: " , median_income_2000_bottom_40th)
print("Top third percentile for all-metro-level percentage increase in a tract’s inflation-adjusted median home value: " , median_home_change_top3rd)

Bottom 40th percentile household income threshold for metro level data:  181600.0
Bottom 40th percentile median home value for metro level data:  43583.600072
Top third percentile for all-metro-level percentage increase in a tract’s inflation-adjusted median home value:  0.3681192142695868


Do calculations to find out whether the change in percentage points of people with a bachelor's degree was in the top third percentile for metro level. 

In [14]:
educational_attainment_threshold = census_merged['educational_attainment_change'].quantile(2.0/3)
print(census_merged['educational_attainment_change'].min())
print(census_merged['educational_attainment_change'].max())
print(educational_attainment_threshold)


-71.44026147512926
100.0
7.645906120888976


### Run tests
* pare down data to nyc alone so we can run tests
* make binary column (`True` / `False`) for each test

In [15]:
NYC_COUNTIES = [
    "005", # Bronx
    "047", # Kings (Brooklyn)
    "061", # New York County (Manhattan)
    "081", # Queens
    "085", # Richmond (Staten Island)
]

In [16]:
census_nyc = census_merged[
    (census_merged['GEOID'].str.slice(0, 2) == "36") & # NY state FIPS code,
    census_merged['GEOID'].str.slice(2, 5).isin(NYC_COUNTIES)
].copy()

print(len(census_nyc), len(census_merged))

2167 4700


### Run tests
1. has more than 500 residents
2. was in the bottom 40th percentile for metro-area-level tracts by median household income
3. tract’s median home value was in the bottom 40th percentile for all metro-area-level tracts

In [17]:
# The tract had a population of at least 500 residents at the beginning and 
# end of a decade and was located within a central city
census_nyc['has_pop_500+_2000'] = census_nyc['total_population00'] >= 500
census_nyc['has_pop_500+_2016'] = census_nyc['total_population'] >= 500

# The tract’s median household income was in the bottom 40th percentile when compared 
# to all tracts within its metro area at the beginning of the decade.
census_nyc['median_40th_home_2000'] = census_nyc['median_home_value00'] < median_home_2000_bottom_40th

# The tract’s median home value was in the bottom 40th percentile when compared to all tracts within its metro area at the beginning of the decade.
census_nyc['median_40th_income_2000'] = census_nyc['median_income00'] < median_income_2000_bottom_40th
census_nyc.head(5)

Unnamed: 0,GEOID,name,total_population,median_income,median_home_value,educational_attainment,white_alone,black_alone,native,asian,...,native_hawaiian_pacific_islander00,hispanic_or_latino00,educational_attainment_pct,educational_attainment_pct00,educational_attainment_change,home_pct_change,has_pop_500+_2000,has_pop_500+_2016,median_40th_home_2000,median_40th_income_2000
1443,36005000100,"Census Tract 1, Bronx County, New York",7503,,,76.0,510.0,4496.0,14.0,135.0,...,0.0,3443.0,1.012928,0.0,1.012928,,True,True,False,False
1444,36005000200,"Census Tract 2, Bronx County, New York",5251,70893.0,394800.0,615.0,93.0,950.0,0.0,147.0,...,0.0,2464.0,11.712055,4.964739,6.747316,0.40461,True,True,True,True
1445,36005000400,"Census Tract 4, Bronx County, New York",5980,76667.0,353500.0,972.0,243.0,1778.0,19.0,34.0,...,0.0,2255.0,16.254181,6.910078,9.344102,0.331485,True,True,True,True
1446,36005001600,"Census Tract 16, Bronx County, New York",6056,31540.0,376400.0,358.0,142.0,1776.0,0.0,0.0,...,0.0,3250.0,5.911493,6.70231,-0.790818,0.305207,True,True,False,True
1447,36005001900,"Census Tract 19, Bronx County, New York",2682,39130.0,374400.0,370.0,195.0,875.0,0.0,50.0,...,0.0,987.0,13.795675,2.335859,11.459816,0.564966,True,True,True,True


Create columns for every test that needs to be passed:
1. First whether they qualify
2. Whether they gentrified

In [18]:
# An increase in a tract's educational attainment, as measured by the 
# percentage of residents age 25 and over holding bachelor’s degrees, 
# was in the top third percentile of all tracts within a metro area.
census_nyc['educational_is_upper3rd'] = census_nyc['educational_attainment_change'] > educational_attainment_threshold

# The median home value increased, after accounting for inflation
census_nyc['median_home_higher'] = census_nyc['median_home_value'] > (census_nyc['median_home_value00'] * inflation_rate)

# The percentage increase in a tract’s inflation-adjusted median home 
# value was in the top third percentile of all tracts within a metro area.
census_nyc['median_home_top_third'] = census_nyc['home_pct_change'] > median_home_change_top3rd

census_nyc.head()

Unnamed: 0,GEOID,name,total_population,median_income,median_home_value,educational_attainment,white_alone,black_alone,native,asian,...,educational_attainment_pct00,educational_attainment_change,home_pct_change,has_pop_500+_2000,has_pop_500+_2016,median_40th_home_2000,median_40th_income_2000,educational_is_upper3rd,median_home_higher,median_home_top_third
1443,36005000100,"Census Tract 1, Bronx County, New York",7503,,,76.0,510.0,4496.0,14.0,135.0,...,0.0,1.012928,,True,True,False,False,False,False,False
1444,36005000200,"Census Tract 2, Bronx County, New York",5251,70893.0,394800.0,615.0,93.0,950.0,0.0,147.0,...,4.964739,6.747316,0.40461,True,True,True,True,False,True,True
1445,36005000400,"Census Tract 4, Bronx County, New York",5980,76667.0,353500.0,972.0,243.0,1778.0,19.0,34.0,...,6.910078,9.344102,0.331485,True,True,True,True,True,True,False
1446,36005001600,"Census Tract 16, Bronx County, New York",6056,31540.0,376400.0,358.0,142.0,1776.0,0.0,0.0,...,6.70231,-0.790818,0.305207,True,True,False,True,False,True,False
1447,36005001900,"Census Tract 19, Bronx County, New York",2682,39130.0,374400.0,370.0,195.0,875.0,0.0,50.0,...,2.335859,11.459816,0.564966,True,True,True,True,True,True,True


In [19]:
census_nyc['gentrified'] = (
    (census_nyc['has_pop_500+_2000'] == True) & 
    (census_nyc['has_pop_500+_2016'] == True) & 
    (census_nyc['median_40th_home_2000'] == True) & 
    (census_nyc['median_40th_income_2000'] == True) & 
    (census_nyc['educational_is_upper3rd'] == True) & 
    (census_nyc['median_home_higher'] == True) & 
    (census_nyc['median_home_top_third'] == True)
)

In [20]:
census_nyc.head()

Unnamed: 0,GEOID,name,total_population,median_income,median_home_value,educational_attainment,white_alone,black_alone,native,asian,...,educational_attainment_change,home_pct_change,has_pop_500+_2000,has_pop_500+_2016,median_40th_home_2000,median_40th_income_2000,educational_is_upper3rd,median_home_higher,median_home_top_third,gentrified
1443,36005000100,"Census Tract 1, Bronx County, New York",7503,,,76.0,510.0,4496.0,14.0,135.0,...,1.012928,,True,True,False,False,False,False,False,False
1444,36005000200,"Census Tract 2, Bronx County, New York",5251,70893.0,394800.0,615.0,93.0,950.0,0.0,147.0,...,6.747316,0.40461,True,True,True,True,False,True,True,False
1445,36005000400,"Census Tract 4, Bronx County, New York",5980,76667.0,353500.0,972.0,243.0,1778.0,19.0,34.0,...,9.344102,0.331485,True,True,True,True,True,True,False,False
1446,36005001600,"Census Tract 16, Bronx County, New York",6056,31540.0,376400.0,358.0,142.0,1776.0,0.0,0.0,...,-0.790818,0.305207,True,True,False,True,False,True,False,False
1447,36005001900,"Census Tract 19, Bronx County, New York",2682,39130.0,374400.0,370.0,195.0,875.0,0.0,50.0,...,11.459816,0.564966,True,True,True,True,True,True,True,True


In [21]:
census_nyc['pct_white_alone_change'] = ((census_nyc['white_alone']/census_nyc['total_population'])*100)-((census_nyc['white_alone00']/census_nyc['total_population00'])*100)
census_nyc['pct_white_alone_2016']= ((census_nyc['white_alone']/census_nyc['total_population'])*100)

census_nyc['pct_black_alone_change'] = ((census_nyc['black_alone']/census_nyc['total_population'])*100)-((census_nyc['black_alone00']/census_nyc['total_population00'])*100)
census_nyc['pct_black_alone_2016'] = ((census_nyc['black_alone']/census_nyc['total_population'])*100)

census_nyc['pct_native_alone_change'] = ((census_nyc['native00']/census_nyc['total_population'])*100)-((census_nyc['native00']/census_nyc['total_population00'])*100)
census_nyc['pct_native_alone_2016'] = ((census_nyc['native']/census_nyc['total_population'])*100)

census_nyc['pct_asian_alone_change'] = ((census_nyc['asian00']/census_nyc['total_population'])*100)-((census_nyc['asian00']/census_nyc['total_population00'])*100)
census_nyc['pct_asian_alone_2016'] = ((census_nyc['asian']/census_nyc['total_population'])*100)

census_nyc['pct_hispanic_or_latino_alone_change'] = ((census_nyc['hispanic_or_latino']/census_nyc['total_population'])*100)-((census_nyc['hispanic_or_latino00']/census_nyc['total_population00'])*100)
census_nyc['pct_hispanic_or_latino_alone_2016'] = ((census_nyc['hispanic_or_latino']/census_nyc['total_population'])*100)

census_nyc['pct_native_hawaiian_pacific_islander_change'] = ((census_nyc['native_hawaiian_pacific_islander']/census_nyc['total_population'])*100)-((census_nyc['native_hawaiian_pacific_islander00']/census_nyc['total_population00'])*100)
census_nyc['pct_native_hawaiian_pacific_islander_2016'] = ((census_nyc['native_hawaiian_pacific_islander']/census_nyc['total_population'])*100)

In [22]:
census_nyc["low_population"] = (census_nyc["total_population00"] < 500) | (census_nyc["total_population"] < 500)

In [23]:
census_nyc["eligible_for_gentrification"] = (
  (census_nyc["low_population"] == False) &
  (census_nyc["median_home_value00"] < median_home_2000_bottom_40th) &
  (census_nyc["median_income00"] < median_income_2000_bottom_40th)
)

In [24]:
census_nyc["eligible_not_gentrified_highpop"] = (
  (census_nyc["low_population"] == False) &
  (census_nyc["eligible_for_gentrification"] == True) &
  (census_nyc["gentrified"] == False)
)

In [25]:
census_nyc["not_eligible_highpop"] = (
  (census_nyc["low_population"] == False) &
  (census_nyc["eligible_for_gentrification"] == False) 
)

len(census_nyc[census_nyc["not_eligible_highpop"]])

1635

In [26]:
census_nyc.to_csv(
    '../output/gentrification.csv',
    index = False
)

---

---

---