In [1]:
pip install census

Note: you may need to restart the kernel to use updated packages.


In [2]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

In [3]:
# Dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from census import Census
import os
import csv

# File path
#atm_data = "../Desktop/visa_atm_cleaned.csv"

# Census API Key
# The key is imported from a local api_keys module rather than written here.
from api_keys import api_key_census

# Census client pinned to the 2017 data year. The imported key is passed in;
# the previous argument `ce` is not defined in the visible notebook cells and
# raised NameError on a fresh kernel.
c = Census(api_key_census, year=2017)

In [8]:
# Query the 2017 ACS5 Census data for every zip code tabulation area.
# Library documentation: https://github.com/CommerceDataService/census-wrapper
# Variable labels:       https://gist.github.com/afhaque/60558290d6efd892351c4b64e5c01e9b
#
# Each trailing comment is the column name the code is renamed to in the
# DataFrame cell that follows.
census_data = c.acs5.get(
    (
        "NAME",
        "B19013_001E",   # Household Income
        "B01003_001E",   # Population
        "B01002_001E",   # Median Age
        "B19301_001E",   # Per Capita Income
        "B23025_002E",   # Employment Count
        "B23025_007E",   # Unemployment Count
        "B17001A_002E",  # White Poverty
        "B17001B_002E",  # Black Poverty
        "B17001D_002E",  # Asian Poverty
        "B17001I_002E",  # Hispanic Poverty
        "B02001_002E",   # White Population
        "B02001_003E",   # Black Population
        "B02001_005E",   # Asian Population
        "B03001_003E",   # Hispanic Population
        "B25058_001E",   # Avg Rent
        "B25064_001E",   # Avg Rent & Utilities
        "B25077_001E",   # Median Home Value
        "B25088_002E",   # Avg Monthly Cost of Home
        "B15003_002E",   # No HSch Ed
        "B15003_017E",   # HSch Ed
        "B15003_018E",   # GED
        "B15003_021E",   # Associate College
        "B15003_022E",   # Bachelors
        "B15003_023E",   # Masters
        "B15003_024E",   # Professional
        "B15003_025E",   # Doctorate
        "B17001_002E",   # Poverty Count
    ),
    {"for": "zip code tabulation area:*"},
)


In [9]:
# Turn the raw API records into a DataFrame and rename every Census variable
# code to a readable column name (this step renames; the column order is set
# by the selection further down).
# TODO: find the county or state column and add it to the mapping below.
census_pd = pd.DataFrame(census_data).rename(
    columns={
        # Identifiers
        "NAME": "Name",
        "zip code tabulation area": "Zipcode",
        # Population and age
        "B01003_001E": "Population",
        "B01002_001E": "Median Age",
        "B02001_002E": "White Population",
        "B02001_003E": "Black Population",
        "B02001_005E": "Asian Population",
        "B03001_003E": "Hispanic Population",
        # Income and poverty
        "B19013_001E": "Household Income",
        "B19301_001E": "Per Capita Income",
        "B17001_002E": "Poverty Count",
        "B17001A_002E": "White Poverty",
        "B17001B_002E": "Black Poverty",
        "B17001D_002E": "Asian Poverty",
        "B17001I_002E": "Hispanic Poverty",
        # Housing
        "B25058_001E": "Avg Rent",
        "B25064_001E": "Avg Rent & Utilities",
        "B25077_001E": "Median Home Value",
        "B25088_002E": "Avg Monthly Cost of Home",
        # Employment
        # NOTE(review): B23025_002E / B23025_007E look like the ACS
        # "In labor force" / "Not in labor force" estimates rather than
        # employed / unemployed counts -- confirm against the Census variable
        # list before relying on the two rates derived from them.
        "B23025_002E": "Employment Count",
        "B23025_007E": "Unemployment Count",
        # Education
        "B15003_002E": "No HSch Ed",
        "B15003_017E": "HSch Ed",
        "B15003_018E": "GED",
        "B15003_021E": "Associate College",
        "B15003_022E": "Bachelors",
        "B15003_023E": "Masters",
        "B15003_024E": "Professional",
        "B15003_025E": "Doctorate",
    }
)

# Derived columns. Each rate is a percentage of the total population of the
# zip code (100 * count / Population); "HSch/GED" adds the two high-school
# level attainment counts together.
census_pd = census_pd.assign(
    **{
        "Poverty Rate": lambda df: (
            100 * df["Poverty Count"].astype(int) / df["Population"].astype(int)
        ),
        "Unemployment Rate": lambda df: (
            100 * df["Unemployment Count"].astype(int) / df["Population"].astype(int)
        ),
        "Employment Rate": lambda df: (
            100 * df["Employment Count"].astype(int) / df["Population"].astype(int)
        ),
        "HSch/GED": lambda df: df["HSch Ed"].astype(int) + df["GED"].astype(int),
    }
)

# Final DataFrame: keep only the columns of interest, in presentation order.
census_pd = census_pd[
    [
        "Zipcode",
        "Population", "White Population", "Black Population",
        "Hispanic Population", "Asian Population",
        "Median Age",
        "Employment Rate", "Unemployment Rate",
        "Household Income", "Per Capita Income",
        "Avg Rent", "Avg Rent & Utilities",
        "Median Home Value", "Avg Monthly Cost of Home",
        "No HSch Ed", "HSch/GED", "Associate College",
        "Bachelors", "Masters", "Professional", "Doctorate",
        "Poverty Count", "Poverty Rate",
        "White Poverty", "Black Poverty", "Hispanic Poverty", "Asian Poverty",
    ]
]

# Row count, followed by a preview of the first rows.
print(census_pd.shape[0])
census_pd.head()

33120


Unnamed: 0,Zipcode,Population,White Population,Black Population,Hispanic Population,Asian Population,Median Age,Employment Rate,Unemployment Rate,Household Income,...,Bachelors,Masters,Professional,Doctorate,Poverty Count,Poverty Rate,White Poverty,Black Poverty,Hispanic Poverty,Asian Poverty
0,601,17599.0,13686.0,120.0,17533.0,0.0,38.9,33.825786,45.900335,11757.0,...,1787.0,269.0,15.0,80.0,11282.0,64.105915,8765.0,84.0,11223.0,0.0
1,602,39209.0,26213.0,1092.0,36736.0,0.0,40.9,36.700757,45.963937,16190.0,...,3694.0,1097.0,174.0,332.0,20428.0,52.100283,13813.0,557.0,19144.0,0.0
2,603,50135.0,35709.0,1985.0,48865.0,557.0,40.4,32.001596,49.905256,16645.0,...,5858.0,1605.0,285.0,234.0,25176.0,50.216416,17714.0,874.0,24744.0,382.0
3,606,6304.0,3045.0,160.0,6292.0,0.0,42.8,27.078046,55.076142,13387.0,...,321.0,77.0,17.0,17.0,4092.0,64.911168,1819.0,63.0,4080.0,0.0
4,610,27590.0,17038.0,845.0,26850.0,0.0,41.4,36.418992,46.208771,18741.0,...,2268.0,500.0,10.0,141.0,12553.0,45.498369,7471.0,426.0,12263.0,0.0


In [10]:
# census_pd holds one row per zip code (33,120 rows), far more than needed.
# Zip codes of interest: Richmond City plus the surrounding Henrico and
# Chesterfield Counties.
zip_list = [
    "23005", "23059", "23060", "23063",
    "23111", "23112", "23113", "23114", "23116", "23120", "23139",
    "23219", "23221", "23222", "23223", "23224", "23225", "23226",
    "23227", "23228", "23229", "23230", "23231", "23232", "23233",
    "23234", "23235", "23236", "23238", "23294", "23298",
    "23806", "23831", "23832", "23838", "23860",
]
print(len(zip_list))

36


In [11]:
# Keep only the rows whose Zipcode appears in zip_list.
# `isin` already tests every row against the whole list, so one vectorized
# filter is enough. The previous `for zip in zip_list:` loop recomputed this
# same result once per list entry, shadowed the builtin `zip`, and left
# census_va_pd undefined when the list was empty.
census_va_pd = census_pd[census_pd["Zipcode"].isin(zip_list)]

In [12]:
# Number of rows that matched zip_list, followed by the filtered frame itself.
print(census_va_pd.shape[0])
census_va_pd

34


Unnamed: 0,Zipcode,Population,White Population,Black Population,Hispanic Population,Asian Population,Median Age,Employment Rate,Unemployment Rate,Household Income,...,Bachelors,Masters,Professional,Doctorate,Poverty Count,Poverty Rate,White Poverty,Black Poverty,Hispanic Poverty,Asian Poverty
7025,23005,16337.0,12962.0,2555.0,786.0,201.0,37.8,52.402522,29.534186,67809.0,...,2366.0,911.0,171.0,268.0,1665.0,10.19159,1287.0,268.0,304.0,55.0
7047,23059,34031.0,23175.0,3230.0,1277.0,6288.0,41.0,53.051629,22.867386,104878.0,...,7550.0,4350.0,1304.0,843.0,1044.0,3.067791,655.0,249.0,25.0,124.0
7048,23060,37381.0,23419.0,7305.0,1857.0,4823.0,37.8,58.486932,20.68698,83802.0,...,7825.0,3579.0,749.0,337.0,1737.0,4.646746,1038.0,486.0,83.0,144.0
7051,23063,4466.0,2735.0,1618.0,112.0,3.0,44.0,51.724138,33.116883,65844.0,...,457.0,147.0,47.0,34.0,217.0,4.858934,98.0,119.0,0.0,0.0
7077,23111,37417.0,32378.0,3276.0,903.0,555.0,42.5,55.621776,24.579736,76342.0,...,6136.0,1771.0,264.0,246.0,2134.0,5.70329,1559.0,389.0,106.0,62.0
7078,23112,52485.0,39619.0,8325.0,2509.0,1855.0,37.7,55.629227,21.604268,89485.0,...,11002.0,4822.0,586.0,824.0,1713.0,3.26379,885.0,648.0,48.0,19.0
7079,23113,25195.0,21364.0,1758.0,965.0,1466.0,45.5,49.351062,29.06926,103683.0,...,6562.0,3026.0,973.0,535.0,1068.0,4.238936,844.0,81.0,76.0,124.0
7080,23114,18503.0,14790.0,1826.0,451.0,1355.0,39.2,54.510079,22.331514,100047.0,...,3889.0,2048.0,405.0,200.0,763.0,4.123656,342.0,91.0,11.0,305.0
7082,23116,29005.0,26448.0,1586.0,792.0,487.0,42.4,53.863127,24.123427,102376.0,...,6077.0,2621.0,599.0,395.0,1079.0,3.720048,935.0,142.0,0.0,2.0
7085,23120,8948.0,7770.0,568.0,245.0,273.0,35.4,52.749218,17.333482,132347.0,...,2151.0,898.0,206.0,57.0,143.0,1.598122,122.0,21.0,0.0,0.0


In [13]:
# Renumber the rows from 0. reset_index() keeps the old row labels by default
# (drop=False), so they end up in a new leading "index" column.
census_reset = census_va_pd.reset_index()
print(census_reset.shape[0])
census_reset.head(10)

34


Unnamed: 0,index,Zipcode,Population,White Population,Black Population,Hispanic Population,Asian Population,Median Age,Employment Rate,Unemployment Rate,...,Bachelors,Masters,Professional,Doctorate,Poverty Count,Poverty Rate,White Poverty,Black Poverty,Hispanic Poverty,Asian Poverty
0,7025,23005,16337.0,12962.0,2555.0,786.0,201.0,37.8,52.402522,29.534186,...,2366.0,911.0,171.0,268.0,1665.0,10.19159,1287.0,268.0,304.0,55.0
1,7047,23059,34031.0,23175.0,3230.0,1277.0,6288.0,41.0,53.051629,22.867386,...,7550.0,4350.0,1304.0,843.0,1044.0,3.067791,655.0,249.0,25.0,124.0
2,7048,23060,37381.0,23419.0,7305.0,1857.0,4823.0,37.8,58.486932,20.68698,...,7825.0,3579.0,749.0,337.0,1737.0,4.646746,1038.0,486.0,83.0,144.0
3,7051,23063,4466.0,2735.0,1618.0,112.0,3.0,44.0,51.724138,33.116883,...,457.0,147.0,47.0,34.0,217.0,4.858934,98.0,119.0,0.0,0.0
4,7077,23111,37417.0,32378.0,3276.0,903.0,555.0,42.5,55.621776,24.579736,...,6136.0,1771.0,264.0,246.0,2134.0,5.70329,1559.0,389.0,106.0,62.0
5,7078,23112,52485.0,39619.0,8325.0,2509.0,1855.0,37.7,55.629227,21.604268,...,11002.0,4822.0,586.0,824.0,1713.0,3.26379,885.0,648.0,48.0,19.0
6,7079,23113,25195.0,21364.0,1758.0,965.0,1466.0,45.5,49.351062,29.06926,...,6562.0,3026.0,973.0,535.0,1068.0,4.238936,844.0,81.0,76.0,124.0
7,7080,23114,18503.0,14790.0,1826.0,451.0,1355.0,39.2,54.510079,22.331514,...,3889.0,2048.0,405.0,200.0,763.0,4.123656,342.0,91.0,11.0,305.0
8,7082,23116,29005.0,26448.0,1586.0,792.0,487.0,42.4,53.863127,24.123427,...,6077.0,2621.0,599.0,395.0,1079.0,3.720048,935.0,142.0,0.0,2.0
9,7085,23120,8948.0,7770.0,568.0,245.0,273.0,35.4,52.749218,17.333482,...,2151.0,898.0,206.0,57.0,143.0,1.598122,122.0,21.0,0.0,0.0


In [None]:
# There are two rows (8 and 27) in this new dataframe with bad values (NaN, and negatives)
#census_reset = census_reset.dropna(how='any') # will remove the row with NaN (row 27)
# How to remove row 8 with negative values...?
#census_final = census_reset.drop([8,27], axis=0) #should remove both bad rows....

In [None]:
# Visualize
#print(len(census_final))
#census_final

In [15]:
# Export the filtered census data to a CSV file.
# Destination: ../Desktop/census_2017.csv, relative to the working directory.
output_path = os.path.join("..", "Desktop", "census_2017.csv")

# Write the header row; index=None leaves the DataFrame's row index out of the file.
census_reset.to_csv(output_path, header=True, index=None)

In [None]:
#zip_list = ['23005', '23059','23060', '23063', '23111', '23112', '23113', '23114', '23116', '23120', '23139', '23219', '23221', '23222', '23223', '23224', '23225', '23226', '23227', '23228', '23229', '23230', '23231', '23232', '23233', '23234', '23235', '23236', '23238', '23294', '23298', '23806', '23831', '23832', '23838', '23860']

In [None]:
#len(zip_list)