# Wrangle New Jersey tract-level census data

In [49]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
from shapely.geometry import Point
from shapely.geometry import Polygon
from shapely.geometry import mapping
import numpy as np
from pathlib import Path


import warnings
# Suppress every warning category for the remainder of the notebook so the
# rendered output stays clean.
# NOTE(review): with no `category` argument this also hides pandas/geopandas
# deprecation and chained-assignment warnings raised by later cells -- consider
# narrowing it to specific categories.
warnings.simplefilter(action='ignore')

### Health Insurance Coverage Status By Sex By Age (2018 5-year estimates)

In [50]:
# load the tract-level health insurance coverage table;
# header=1 takes the column names from the second row of the file
insurance_in = pd.read_csv(
    '../data/census-tables/2018-nj-insurance-data.csv',
    header=1,
    encoding='latin-1',
)
insurance_in.columns.tolist()

['id',
 'Geographic Area Name',
 'Estimate!!Total',
 'Margin of Error!!Total',
 'Estimate!!Total!!Male',
 'Margin of Error!!Total!!Male',
 'Estimate!!Total!!Male!!Under 6 years',
 'Margin of Error!!Total!!Male!!Under 6 years',
 'Estimate!!Total!!Male!!Under 6 years!!With health insurance coverage',
 'Margin of Error!!Total!!Male!!Under 6 years!!With health insurance coverage',
 'Estimate!!Total!!Male!!Under 6 years!!No health insurance coverage',
 'Margin of Error!!Total!!Male!!Under 6 years!!No health insurance coverage',
 'Estimate!!Total!!Male!!6 to 18 years',
 'Margin of Error!!Total!!Male!!6 to 18 years',
 'Estimate!!Total!!Male!!6 to 18 years!!With health insurance coverage',
 'Margin of Error!!Total!!Male!!6 to 18 years!!With health insurance coverage',
 'Estimate!!Total!!Male!!6 to 18 years!!No health insurance coverage',
 'Margin of Error!!Total!!Male!!6 to 18 years!!No health insurance coverage',
 'Estimate!!Total!!Male!!19 to 25 years',
 'Margin of Error!!Total!!Male!!19 to 

I only want to keep the identifying columns, and I ultimately need a column for total uninsured.

In [51]:
# keep the two identifying columns plus every estimate column, which drops the
# margin-of-error columns.
# Exact/prefix matching is used instead of a case-insensitive substring regex,
# so a column that merely contains the letters "id" cannot slip through.
is_identifier = insurance_in.columns.isin(['id', 'Geographic Area Name'])
is_estimate = insurance_in.columns.str.startswith('Estimate')
colNames = is_identifier | is_estimate

# .copy() makes insurance_filter an independent frame, so adding columns to it
# later does not write into a slice of insurance_in
insurance_filter = insurance_in.loc[:, colNames].copy()
list(insurance_filter.columns)

['id',
 'Geographic Area Name',
 'Estimate!!Total',
 'Estimate!!Total!!Male',
 'Estimate!!Total!!Male!!Under 6 years',
 'Estimate!!Total!!Male!!Under 6 years!!With health insurance coverage',
 'Estimate!!Total!!Male!!Under 6 years!!No health insurance coverage',
 'Estimate!!Total!!Male!!6 to 18 years',
 'Estimate!!Total!!Male!!6 to 18 years!!With health insurance coverage',
 'Estimate!!Total!!Male!!6 to 18 years!!No health insurance coverage',
 'Estimate!!Total!!Male!!19 to 25 years',
 'Estimate!!Total!!Male!!19 to 25 years!!With health insurance coverage',
 'Estimate!!Total!!Male!!19 to 25 years!!No health insurance coverage',
 'Estimate!!Total!!Male!!26 to 34 years',
 'Estimate!!Total!!Male!!26 to 34 years!!With health insurance coverage',
 'Estimate!!Total!!Male!!26 to 34 years!!No health insurance coverage',
 'Estimate!!Total!!Male!!35 to 44 years',
 'Estimate!!Total!!Male!!35 to 44 years!!With health insurance coverage',
 'Estimate!!Total!!Male!!35 to 44 years!!No health insurance

In [52]:
# create list of all columns that describe a group with no health coverage.
# Match the exact trailing label rather than the case-insensitive substring
# "no", which would also pick up any column whose name merely contains
# those two letters.
colNames = insurance_filter.columns.str.endswith('No health insurance coverage')
no_insurance = insurance_filter.loc[:, colNames]
col_list = list(no_insurance.columns)
col_list

['Estimate!!Total!!Male!!Under 6 years!!No health insurance coverage',
 'Estimate!!Total!!Male!!6 to 18 years!!No health insurance coverage',
 'Estimate!!Total!!Male!!19 to 25 years!!No health insurance coverage',
 'Estimate!!Total!!Male!!26 to 34 years!!No health insurance coverage',
 'Estimate!!Total!!Male!!35 to 44 years!!No health insurance coverage',
 'Estimate!!Total!!Male!!45 to 54 years!!No health insurance coverage',
 'Estimate!!Total!!Male!!55 to 64 years!!No health insurance coverage',
 'Estimate!!Total!!Male!!65 to 74 years!!No health insurance coverage',
 'Estimate!!Total!!Male!!75 years and over!!No health insurance coverage',
 'Estimate!!Total!!Female!!Under 6 years!!No health insurance coverage',
 'Estimate!!Total!!Female!!6 to 18 years!!No health insurance coverage',
 'Estimate!!Total!!Female!!19 to 25 years!!No health insurance coverage',
 'Estimate!!Total!!Female!!26 to 34 years!!No health insurance coverage',
 'Estimate!!Total!!Female!!35 to 44 years!!No health insu

In [53]:
# create new field that is the row-wise sum of all columns describing groups
# with no health coverage.
# assign() returns a new dataframe instead of writing into insurance_filter in
# place, which avoids chained-assignment problems when insurance_filter is a
# slice of insurance_in and keeps this cell safe to re-run.
insurance_filter = insurance_filter.assign(
    totalUninsured=insurance_filter[col_list].sum(axis=1)
)

insurance_filter.head()

Unnamed: 0,id,Geographic Area Name,Estimate!!Total,Estimate!!Total!!Male,Estimate!!Total!!Male!!Under 6 years,Estimate!!Total!!Male!!Under 6 years!!With health insurance coverage,Estimate!!Total!!Male!!Under 6 years!!No health insurance coverage,Estimate!!Total!!Male!!6 to 18 years,Estimate!!Total!!Male!!6 to 18 years!!With health insurance coverage,Estimate!!Total!!Male!!6 to 18 years!!No health insurance coverage,...,Estimate!!Total!!Female!!55 to 64 years,Estimate!!Total!!Female!!55 to 64 years!!With health insurance coverage,Estimate!!Total!!Female!!55 to 64 years!!No health insurance coverage,Estimate!!Total!!Female!!65 to 74 years,Estimate!!Total!!Female!!65 to 74 years!!With health insurance coverage,Estimate!!Total!!Female!!65 to 74 years!!No health insurance coverage,Estimate!!Total!!Female!!75 years and over,Estimate!!Total!!Female!!75 years and over!!With health insurance coverage,Estimate!!Total!!Female!!75 years and over!!No health insurance coverage,totalUninsured
0,1400000US34013019900,"Census Tract 199, Essex County, New Jersey",2604,1237,72,72,0,269,269,0,...,176,167,9,81,81,0,68,68,0,50
1,1400000US34013020700,"Census Tract 207, Essex County, New Jersey",4254,2203,111,111,0,544,535,9,...,314,284,30,224,214,10,154,154,0,132
2,1400000US34013012200,"Census Tract 122, Essex County, New Jersey",5122,2559,114,114,0,618,427,191,...,280,280,0,207,207,0,51,51,0,973
3,1400000US34013012400,"Census Tract 124, Essex County, New Jersey",4524,2185,153,153,0,469,433,36,...,537,413,124,309,303,6,141,141,0,713
4,1400000US34013013000,"Census Tract 130, Essex County, New Jersey",1935,1019,53,53,0,108,108,0,...,157,157,0,46,46,0,105,83,22,182


In [54]:
# create new insurance dataframe with only the columns I need, renaming the
# geographic label to 'name'.
# The previous selection also listed 'uninsuredUnder6', a column that is never
# created; DataFrame.filter silently ignored it. Explicit indexing raises a
# KeyError if an expected column is missing instead of dropping it silently.
insurance_coverage = (
    insurance_filter[['id', 'Geographic Area Name', 'totalUninsured']]
    .rename(columns={'Geographic Area Name': 'name'})
)
insurance_coverage.head()

Unnamed: 0,id,name,totalUninsured
0,1400000US34013019900,"Census Tract 199, Essex County, New Jersey",50
1,1400000US34013020700,"Census Tract 207, Essex County, New Jersey",132
2,1400000US34013012200,"Census Tract 122, Essex County, New Jersey",973
3,1400000US34013012400,"Census Tract 124, Essex County, New Jersey",713
4,1400000US34013013000,"Census Tract 130, Essex County, New Jersey",182


In [55]:
# view insurance coverage statistics
# (describe() summarises numeric columns only, so just totalUninsured is shown)
insurance_coverage.describe()

Unnamed: 0,totalUninsured
count,2010.0
mean,369.674129
std,366.057654
min,0.0
25%,126.0
50%,248.0
75%,496.0
max,3010.0


### Poverty Status in the past 12 months (2018 5-year estimates)

In [56]:
# load the tract-level poverty status table;
# header=1 takes the column names from the second row of the file
poverty_in = pd.read_csv(
    '../data/census-tables/2018-nj-poverty-data.csv',
    header=1,
    encoding='latin-1',
)
poverty_in.columns.tolist()

['id',
 'Geographic Area Name',
 'Estimate!!Total!!Population for whom poverty status is determined',
 'Margin of Error!!Total MOE!!Population for whom poverty status is determined',
 'Estimate!!Below poverty level!!Population for whom poverty status is determined',
 'Margin of Error!!Below poverty level MOE!!Population for whom poverty status is determined',
 'Estimate!!Percent below poverty level!!Population for whom poverty status is determined',
 'Margin of Error!!Percent below poverty level MOE!!Population for whom poverty status is determined',
 'Estimate!!Total!!Population for whom poverty status is determined!!AGE!!Under 18 years',
 'Margin of Error!!Total MOE!!Population for whom poverty status is determined!!AGE!!Under 18 years',
 'Estimate!!Below poverty level!!Population for whom poverty status is determined!!AGE!!Under 18 years',
 'Margin of Error!!Below poverty level MOE!!Population for whom poverty status is determined!!AGE!!Under 18 years',
 'Estimate!!Percent below pov

I just want to keep the identifying fields, and there is already a column for the percent below poverty level so I will not need to calculate that field from population totals.

In [57]:
# create poverty dataframe from the two columns I need: the id field (used to
# join later) and the published percent below poverty level
pct_below_poverty_col = 'Estimate!!Percent below poverty level!!Population for whom poverty status is determined'

# Explicit indexing fails fast if either column is missing. The old rename
# entry for 'Geographic Area Name' was dropped because that column is never
# selected here, so it had no effect.
poverty_level = (
    poverty_in[['id', pct_below_poverty_col]]
    .rename(columns={pct_below_poverty_col: 'percentBelowPoverty'})
)

# some rows had a '-' for a null value; turn those into NaN and cast to float.
# The replacement is limited to the measure column so the id field is untouched.
poverty_level = poverty_level.assign(
    percentBelowPoverty=poverty_level['percentBelowPoverty']
    .replace('-', np.nan)
    .astype(float)
)

In [58]:
# summary statistics for percentBelowPoverty
# (describe() counts only non-null values, so rows converted to NaN are excluded)
poverty_level.describe()

Unnamed: 0,percentBelowPoverty
count,1995.0
mean,11.166216
std,10.556708
min,0.0
25%,3.8
50%,7.2
75%,15.2
max,61.4


### Age and Sex, 2018 5-year estimates

In [59]:
# load the tract-level age and sex table;
# header=1 takes the column names from the second row of the file
pop_in = pd.read_csv(
    '../data/census-tables/2018-nj-population-data.csv',
    header=1,
    encoding='latin-1',
)
pop_in.columns.tolist()

['id',
 'Geographic Area Name',
 'Estimate!!Total!!Total population',
 'Margin of Error!!Total MOE!!Total population',
 'Estimate!!Percent!!Total population',
 'Margin of Error!!Percent MOE!!Total population',
 'Estimate!!Male!!Total population',
 'Margin of Error!!Male MOE!!Total population',
 'Estimate!!Percent Male!!Total population',
 'Margin of Error!!Percent Male MOE!!Total population',
 'Estimate!!Female!!Total population',
 'Margin of Error!!Female MOE!!Total population',
 'Estimate!!Percent Female!!Total population',
 'Margin of Error!!Percent Female MOE!!Total population',
 'Estimate!!Total!!Total population!!AGE!!Under 5 years',
 'Margin of Error!!Total MOE!!Total population!!AGE!!Under 5 years',
 'Estimate!!Percent!!Total population!!AGE!!Under 5 years',
 'Margin of Error!!Percent MOE!!Total population!!AGE!!Under 5 years',
 'Estimate!!Male!!Total population!!AGE!!Under 5 years',
 'Margin of Error!!Male MOE!!Total population!!AGE!!Under 5 years',
 'Estimate!!Percent Male!!T

I only want to keep the total population and total population under 5 years, as well as the id field to be used in merging later.

In [60]:
# map each source column I want to keep to its short output name
pop_renames = {
    'Estimate!!Total!!Total population': 'totalPop',
    'Estimate!!Total!!Total population!!AGE!!Under 5 years': 'under5',
}

# keep the id field (for merging later) plus the two population estimates,
# then apply the short names
pop = (
    pop_in
    .filter(items=['id', *pop_renames], axis='columns')
    .rename(columns=pop_renames)
)

pop.head()

Unnamed: 0,id,totalPop,under5
0,1400000US34001000100,2142,157
1,1400000US34001000200,3296,92
2,1400000US34001000300,4194,348
3,1400000US34001000400,2941,167
4,1400000US34001000500,3298,299


In [61]:
# summary statistics for totalPop and under5
pop.describe()

Unnamed: 0,totalPop,under5
count,2010.0,2010.0
mean,4418.828358,259.544279
std,1872.783828,185.518088
min,0.0,0.0
25%,3107.5,147.0
50%,4226.5,229.0
75%,5604.25,332.0
max,16571.0,2270.0


### Place of birth by educational attainment in the United States, 2018 5-year estimates (population 25 years and over in the United States)

In [62]:
# load the tract-level educational attainment table;
# header=1 takes the column names from the second row of the file
education_in = pd.read_csv(
    '../data/census-tables/2018-nj-education-data.csv',
    header=1,
    encoding='latin-1',
)
education_in.columns.tolist()

['id',
 'Geographic Area Name',
 'Estimate!!Total',
 'Margin of Error!!Total',
 'Estimate!!Total!!Less than high school graduate',
 'Margin of Error!!Total!!Less than high school graduate',
 'Estimate!!Total!!High school graduate (includes equivalency)',
 'Margin of Error!!Total!!High school graduate (includes equivalency)',
 "Estimate!!Total!!Some college or associate's degree",
 "Margin of Error!!Total!!Some college or associate's degree",
 "Estimate!!Total!!Bachelor's degree",
 "Margin of Error!!Total!!Bachelor's degree",
 'Estimate!!Total!!Graduate or professional degree',
 'Margin of Error!!Total!!Graduate or professional degree',
 'Estimate!!Total!!Born in state of residence',
 'Margin of Error!!Total!!Born in state of residence',
 'Estimate!!Total!!Born in state of residence!!Less than high school graduate',
 'Margin of Error!!Total!!Born in state of residence!!Less than high school graduate',
 'Estimate!!Total!!Born in state of residence!!High school graduate (includes equivale

I only need to keep the id field, and the total number with less than high school education.

In [63]:
# keep the id field and the count of people with less than a high school
# education, stored under the short name 'highSchoolEd'
less_than_hs_col = 'Estimate!!Total!!Less than high school graduate'
education = (
    education_in
    .filter(items=['id', less_than_hs_col], axis='columns')
    .rename(columns={less_than_hs_col: 'highSchoolEd'})
)

education.head()

Unnamed: 0,id,highSchoolEd
0,1400000US34013019900,38
1,1400000US34013020700,115
2,1400000US34013012200,508
3,1400000US34013012400,543
4,1400000US34013013000,87


### Limited English-speaking households, 2018 5-year estimates

In [64]:
# load the tract-level limited English-speaking households table;
# header=1 takes the column names from the second row of the file
english_in = pd.read_csv(
    '../data/census-tables/2018-nj-english-data.csv',
    header=1,
    encoding='latin-1',
)
english_in.columns.tolist()

['id',
 'Geographic Area Name',
 'Estimate!!Total!!All households',
 'Margin of Error!!Total MOE!!All households',
 'Estimate!!Percent!!All households',
 'Margin of Error!!Percent MOE!!All households',
 'Estimate!!Limited English-speaking households!!All households',
 'Margin of Error!!Limited English-speaking households MOE!!All households',
 'Estimate!!Percent limited English-speaking households!!All households',
 'Margin of Error!!Percent limited English-speaking households MOE!!All households',
 'Estimate!!Total!!All households!!Households speaking --!!Spanish',
 'Margin of Error!!Total MOE!!All households!!Households speaking --!!Spanish',
 'Estimate!!Percent!!All households!!Households speaking --!!Spanish',
 'Margin of Error!!Percent MOE!!All households!!Households speaking --!!Spanish',
 'Estimate!!Limited English-speaking households!!All households!!Households speaking --!!Spanish',
 'Margin of Error!!Limited English-speaking households MOE!!All households!!Households speaking

I only need to keep the id field, the total number of limited English-speaking households, and the percent of limited English-speaking households.

In [65]:
# map each source column I want to keep to its short output name
english_renames = {
    'Estimate!!Limited English-speaking households!!All households': 'totalLimitedEnglish',
    'Estimate!!Percent limited English-speaking households!!All households': 'percentLimitedEnglish',
}

# build the english dataframe in one pass:
#   1. keep the id field plus the two limited-English estimates
#   2. apply the short column names
#   3. swap the '-' null placeholder for NaN
#   4. cast the percentage column to float
english = (
    english_in
    .filter(items=['id', *english_renames], axis='columns')
    .rename(columns=english_renames)
    .replace('-', np.nan)
    .astype({'percentLimitedEnglish': float})
)

english.head()

Unnamed: 0,id,totalLimitedEnglish,percentLimitedEnglish
0,1400000US34001000100,100,13.3
1,1400000US34001000200,256,19.2
2,1400000US34001000300,359,28.2
3,1400000US34001000400,199,16.0
4,1400000US34001000500,225,25.7


### Merging the wrangled census datasets together

In [66]:
# attach the poverty measure to the population table, keeping every row of pop
census_data1 = pop.merge(poverty_level, how='left', on='id')
census_data1.head()

Unnamed: 0,id,totalPop,under5,percentBelowPoverty
0,1400000US34001000100,2142,157,40.1
1,1400000US34001000200,3296,92,23.3
2,1400000US34001000300,4194,348,30.2
3,1400000US34001000400,2941,167,33.3
4,1400000US34001000500,3298,299,36.5


In [67]:
# attach the education measure.
# how='left' is stated explicitly to match the other merges in this section;
# the default inner join would silently drop any tract missing from the
# education table.
census_data2 = pd.merge(census_data1, education, on='id', how='left')
census_data2.head()

Unnamed: 0,id,totalPop,under5,percentBelowPoverty,highSchoolEd
0,1400000US34001000100,2142,157,40.1,356
1,1400000US34001000200,3296,92,23.3,478
2,1400000US34001000300,4194,348,30.2,940
3,1400000US34001000400,2941,167,33.3,305
4,1400000US34001000500,3298,299,36.5,758


In [68]:
# attach the limited-English measures, keeping every row accumulated so far
census_data3 = census_data2.merge(english, how='left', on='id')
census_data3.head()

Unnamed: 0,id,totalPop,under5,percentBelowPoverty,highSchoolEd,totalLimitedEnglish,percentLimitedEnglish
0,1400000US34001000100,2142,157,40.1,356,100,13.3
1,1400000US34001000200,3296,92,23.3,478,256,19.2
2,1400000US34001000300,4194,348,30.2,940,359,28.2
3,1400000US34001000400,2941,167,33.3,305,199,16.0
4,1400000US34001000500,3298,299,36.5,758,225,25.7


In [69]:
# attach the tract name and uninsured total, keeping every row accumulated so far
census_data = census_data3.merge(insurance_coverage, how='left', on='id')
census_data.head()

Unnamed: 0,id,totalPop,under5,percentBelowPoverty,highSchoolEd,totalLimitedEnglish,percentLimitedEnglish,name,totalUninsured
0,1400000US34001000100,2142,157,40.1,356,100,13.3,"Census Tract 1, Atlantic County, New Jersey",282
1,1400000US34001000200,3296,92,23.3,478,256,19.2,"Census Tract 2, Atlantic County, New Jersey",263
2,1400000US34001000300,4194,348,30.2,940,359,28.2,"Census Tract 3, Atlantic County, New Jersey",1288
3,1400000US34001000400,2941,167,33.3,305,199,16.0,"Census Tract 4, Atlantic County, New Jersey",553
4,1400000US34001000500,3298,299,36.5,758,225,25.7,"Census Tract 5, Atlantic County, New Jersey",842


In [70]:
# split name column into separate columns for tract, county, and state.
# expand=True returns one column per piece. This replaces tuple-unpacking the
# `.str` accessor and passing `n` positionally, neither of which is supported
# by current pandas releases.
name_parts = census_data['name'].str.split(', ', n=2, expand=True)
census_data['censusTract'] = name_parts[0]
census_data['county'] = name_parts[1]
census_data['state'] = name_parts[2]

# grab last eleven characters from the id field to create a GEOID field for joining to shapefile
census_data['GEOID'] = census_data['id'].str[-11:]

census_data.head()

Unnamed: 0,id,totalPop,under5,percentBelowPoverty,highSchoolEd,totalLimitedEnglish,percentLimitedEnglish,name,totalUninsured,censusTract,county,state,GEOID
0,1400000US34001000100,2142,157,40.1,356,100,13.3,"Census Tract 1, Atlantic County, New Jersey",282,Census Tract 1,Atlantic County,New Jersey,34001000100
1,1400000US34001000200,3296,92,23.3,478,256,19.2,"Census Tract 2, Atlantic County, New Jersey",263,Census Tract 2,Atlantic County,New Jersey,34001000200
2,1400000US34001000300,4194,348,30.2,940,359,28.2,"Census Tract 3, Atlantic County, New Jersey",1288,Census Tract 3,Atlantic County,New Jersey,34001000300
3,1400000US34001000400,2941,167,33.3,305,199,16.0,"Census Tract 4, Atlantic County, New Jersey",553,Census Tract 4,Atlantic County,New Jersey,34001000400
4,1400000US34001000500,3298,299,36.5,758,225,25.7,"Census Tract 5, Atlantic County, New Jersey",842,Census Tract 5,Atlantic County,New Jersey,34001000500


In [71]:
# totalPop can be 0 for some tracts; using NaN as the denominator there gives
# NaN percentages instead of inf (or a division warning).
# NOTE(review): all three percentages use total population as the denominator,
# although the education table heading describes a population-25-and-over
# universe -- confirm this is the intended base.
total_pop = census_data['totalPop'].replace(0, np.nan)

# calculate percent uninsured
census_data['percentUninsured'] = census_data['totalUninsured'] / total_pop * 100

# calculate percent under 5
census_data['percentUnder5'] = census_data['under5'] / total_pop * 100

# calculate percent with less than high school education
census_data['percentHighSchool'] = census_data['highSchoolEd'] / total_pop * 100

census_data.head()

Unnamed: 0,id,totalPop,under5,percentBelowPoverty,highSchoolEd,totalLimitedEnglish,percentLimitedEnglish,name,totalUninsured,censusTract,county,state,GEOID,percentUninsured,percentUnder5,percentHighSchool
0,1400000US34001000100,2142,157,40.1,356,100,13.3,"Census Tract 1, Atlantic County, New Jersey",282,Census Tract 1,Atlantic County,New Jersey,34001000100,13.165266,7.329599,16.619981
1,1400000US34001000200,3296,92,23.3,478,256,19.2,"Census Tract 2, Atlantic County, New Jersey",263,Census Tract 2,Atlantic County,New Jersey,34001000200,7.979369,2.791262,14.502427
2,1400000US34001000300,4194,348,30.2,940,359,28.2,"Census Tract 3, Atlantic County, New Jersey",1288,Census Tract 3,Atlantic County,New Jersey,34001000300,30.710539,8.297568,22.412971
3,1400000US34001000400,2941,167,33.3,305,199,16.0,"Census Tract 4, Atlantic County, New Jersey",553,Census Tract 4,Atlantic County,New Jersey,34001000400,18.803128,5.678341,10.370622
4,1400000US34001000500,3298,299,36.5,758,225,25.7,"Census Tract 5, Atlantic County, New Jersey",842,Census Tract 5,Atlantic County,New Jersey,34001000500,25.530625,9.066101,22.983626


### Join census data to census tract shapefile

In [72]:
# load the census tract boundaries (tl_2019_34_tract) as a GeoDataFrame
tracts = gpd.read_file(
    '../data/new-jersey-tracts-shapefile/tl_2019_34_tract.shp'
)
tracts.head()

Unnamed: 0,STATEFP,COUNTYFP,TRACTCE,GEOID,NAME,NAMELSAD,MTFCC,FUNCSTAT,ALAND,AWATER,INTPTLAT,INTPTLON,geometry
0,34,31,256804,34031256804,2568.04,Census Tract 2568.04,G5020,S,29792621,809082,41.0787934,-74.3779531,"POLYGON ((-74.41591 41.08861, -74.41484 41.089..."
1,34,31,246300,34031246300,2463.0,Census Tract 2463,G5020,S,10100083,480508,40.910421,-74.2642143,"POLYGON ((-74.28932 40.92851, -74.28932 40.928..."
2,34,31,181200,34031181200,1812.0,Census Tract 1812,G5020,S,401624,0,40.9242986,-74.1499553,"POLYGON ((-74.15537 40.92394, -74.15535 40.924..."
3,34,31,256803,34031256803,2568.03,Census Tract 2568.03,G5020,S,14231776,740595,41.1221009,-74.3805752,"POLYGON ((-74.41296 41.11469, -74.41051 41.117..."
4,34,31,246003,34031246003,2460.03,Census Tract 2460.03,G5020,S,6580967,1795588,40.9770103,-74.2610348,"POLYGON ((-74.28242 40.97003, -74.28241 40.970..."


In [73]:
# keep only the join key, the tract labels, the land area, and the geometry
tract_columns = ['GEOID', 'NAME', 'NAMELSAD', 'ALAND', 'geometry']
tracts = tracts.filter(items=tract_columns, axis='columns')
tracts.head()

Unnamed: 0,GEOID,NAME,NAMELSAD,ALAND,geometry
0,34031256804,2568.04,Census Tract 2568.04,29792621,"POLYGON ((-74.41591 41.08861, -74.41484 41.089..."
1,34031246300,2463.0,Census Tract 2463,10100083,"POLYGON ((-74.28932 40.92851, -74.28932 40.928..."
2,34031181200,1812.0,Census Tract 1812,401624,"POLYGON ((-74.15537 40.92394, -74.15535 40.924..."
3,34031256803,2568.03,Census Tract 2568.03,14231776,"POLYGON ((-74.41296 41.11469, -74.41051 41.117..."
4,34031246003,2460.03,Census Tract 2460.03,6580967,"POLYGON ((-74.28242 40.97003, -74.28241 40.970..."


In [74]:
# attach the census attributes to each tract geometry, keeping every tract
# in the shapefile
census_tracts = tracts.merge(census_data, how='left', on='GEOID')
census_tracts.head()

Unnamed: 0,GEOID,NAME,NAMELSAD,ALAND,geometry,id,totalPop,under5,percentBelowPoverty,highSchoolEd,totalLimitedEnglish,percentLimitedEnglish,name,totalUninsured,censusTract,county,state,percentUninsured,percentUnder5,percentHighSchool
0,34031256804,2568.04,Census Tract 2568.04,29792621,"POLYGON ((-74.41591 41.08861, -74.41484 41.089...",1400000US34031256804,6760,273,2.8,179,0,0.0,"Census Tract 2568.04, Passaic County, New Jersey",275,Census Tract 2568.04,Passaic County,New Jersey,4.068047,4.038462,2.647929
1,34031246300,2463.0,Census Tract 2463,10100083,"POLYGON ((-74.28932 40.92851, -74.28932 40.928...",1400000US34031246300,4946,273,13.6,338,46,2.7,"Census Tract 2463, Passaic County, New Jersey",626,Census Tract 2463,Passaic County,New Jersey,12.656692,5.519612,6.833805
2,34031181200,1812.0,Census Tract 1812,401624,"POLYGON ((-74.15537 40.92394, -74.15535 40.924...",1400000US34031181200,4817,490,20.6,550,242,17.0,"Census Tract 1812, Passaic County, New Jersey",714,Census Tract 1812,Passaic County,New Jersey,14.822504,10.172306,11.417895
3,34031256803,2568.03,Census Tract 2568.03,14231776,"POLYGON ((-74.41296 41.11469, -74.41051 41.117...",1400000US34031256803,5129,281,4.5,194,54,2.6,"Census Tract 2568.03, Passaic County, New Jersey",329,Census Tract 2568.03,Passaic County,New Jersey,6.414506,5.478651,3.782414
4,34031246003,2460.03,Census Tract 2460.03,6580967,"POLYGON ((-74.28242 40.97003, -74.28241 40.970...",1400000US34031246003,5581,398,1.1,170,27,1.5,"Census Tract 2460.03, Passaic County, New Jersey",93,Census Tract 2460.03,Passaic County,New Jersey,1.666368,7.131338,3.046049


In [75]:
# square meters in one square mile (1609.344 m per mile, squared)
SQ_METERS_PER_SQ_MILE = 2589988.110336

# drop redundant columns; errors='ignore' keeps this cell safe to re-run after
# the columns have already been removed
census_tracts = census_tracts.drop(
    columns=['NAMELSAD', 'id', 'NAME', 'name'], errors='ignore'
)

# create a column with land area in square miles, converted from square meters
census_tracts['ALANDsquareMiles'] = census_tracts['ALAND'] / SQ_METERS_PER_SQ_MILE

# create population density column (people per square mile); a tract with zero
# land area gets NaN rather than an infinite density
land_area = census_tracts['ALANDsquareMiles'].replace(0, np.nan)
census_tracts['popDensity'] = census_tracts['totalPop'] / land_area

census_tracts.head()

Unnamed: 0,GEOID,ALAND,geometry,totalPop,under5,percentBelowPoverty,highSchoolEd,totalLimitedEnglish,percentLimitedEnglish,totalUninsured,censusTract,county,state,percentUninsured,percentUnder5,percentHighSchool,ALANDsquareMiles,popDensity
0,34031256804,29792621,"POLYGON ((-74.41591 41.08861, -74.41484 41.089...",6760,273,2.8,179,0,0.0,275,Census Tract 2568.04,Passaic County,New Jersey,4.068047,4.038462,2.647929,11.502995,587.673022
1,34031246300,10100083,"POLYGON ((-74.28932 40.92851, -74.28932 40.928...",4946,273,13.6,338,46,2.7,626,Census Tract 2463,Passaic County,New Jersey,12.656692,5.519612,6.833805,3.899664,1268.314448
2,34031181200,401624,"POLYGON ((-74.15537 40.92394, -74.15535 40.924...",4817,490,20.6,550,242,17.0,714,Census Tract 1812,Passaic County,New Jersey,14.822504,10.172306,11.417895,0.155068,31063.812739
3,34031256803,14231776,"POLYGON ((-74.41296 41.11469, -74.41051 41.117...",5129,281,4.5,194,54,2.6,329,Census Tract 2568.03,Passaic County,New Jersey,6.414506,5.478651,3.782414,5.494919,933.40768
4,34031246003,6580967,"POLYGON ((-74.28242 40.97003, -74.28241 40.970...",5581,398,1.1,170,27,1.5,93,Census Tract 2460.03,Passaic County,New Jersey,1.666368,7.131338,3.046049,2.540926,2196.443721


In [76]:
# convert crs to WGS84 (EPSG:4326) for web mapping.
# The EPSG code is passed directly because the {'init': 'epsg:...'} dictionary
# syntax is deprecated.
census_tracts = census_tracts.to_crs(epsg=4326)

# write joined census data to geojson
census_tracts.to_file("../data/census-outputs/new-jersey-tracts.geojson", driver='GeoJSON')