In [1]:
#import dependencies
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import requests
from census import Census

# Census API Key
from config import census_key
c = Census(census_key, year=2018)

## Import Census Data 

In [2]:
#import census variable names csv
census_codes_csv = pd.read_csv('../Resources/census_columns.csv')
census_codes_csv.dropna(inplace = True)


#extract the columns of csv into lists
codes = [code for code in census_codes_csv['Code'] ]
names = [name for name in census_codes_csv['Name'] ]

In [63]:
#display the first four codes
codes[0:3]

['B01003_001E', 'B19013_001E', 'B19301_001E']

In [4]:
#call census api
census_data = c.acs5.get(codes, {'for': 'zip code tabulation area:*'})

In [5]:
#create data frame
census_df = pd.DataFrame(census_data)
census_df.head()

Unnamed: 0,B01003_001E,B19013_001E,B19301_001E,B02018_007E,B04006_040E,B04006_044E,B02015_001E,B04006_051E,B02015_011E,B04006_022E,B03001_004E,B05003B_001E,B02015_021E,zip code tabulation area
0,17242.0,13092.0,6999.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,145.0,0.0,601
1,38442.0,16358.0,9277.0,0.0,0.0,0.0,0.0,25.0,0.0,0.0,83.0,1070.0,0.0,602
2,48814.0,16603.0,11307.0,112.0,27.0,0.0,364.0,66.0,0.0,12.0,68.0,1930.0,0.0,603
3,6437.0,12832.0,5943.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,149.0,0.0,606
4,27073.0,19309.0,10220.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,56.0,696.0,0.0,610


In [6]:
#create rename dictionary based on labels from the census columns csv
census_rename_dict = {codes[i]: names[i] for i in range(len(codes))} 
census_rename_dict['zip code tabulation area'] = 'zipcode'

#rename the columns
census_df.rename(columns = census_rename_dict,inplace = True)
census_df.head()

Unnamed: 0,population,household_income,per_capita_income,chinese,french,greek,indian,italian,japanese,brazillian,mexican,african_american,thai,zipcode
0,17242.0,13092.0,6999.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,145.0,0.0,601
1,38442.0,16358.0,9277.0,0.0,0.0,0.0,0.0,25.0,0.0,0.0,83.0,1070.0,0.0,602
2,48814.0,16603.0,11307.0,112.0,27.0,0.0,364.0,66.0,0.0,12.0,68.0,1930.0,0.0,603
3,6437.0,12832.0,5943.0,0.0,0.0,0.0,0.0,9.0,0.0,0.0,0.0,149.0,0.0,606
4,27073.0,19309.0,10220.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,56.0,696.0,0.0,610


In [7]:
census_df.dropna(inplace = True)
census_df['zipcode'] = census_df['zipcode'].astype('string')


In [8]:
census_df.agg(['mean','median','std','max'])

Unnamed: 0,population,household_income,per_capita_income,chinese,french,greek,indian,italian,japanese,brazillian,mexican,african_american,thai
mean,9952.712564,-38346450.0,-6479207.0,148.66927,238.49399,38.844368,536.381041,515.281334,23.608006,12.781059,1102.335917,1259.385465,6.108891
median,2885.0,52708.0,27151.0,0.0,59.0,0.0,9.0,69.0,0.0,0.0,41.0,35.0,0.0
std,14657.156911,155343700.0,65554330.0,888.197731,465.577616,104.316645,2095.808263,1141.493587,214.339454,101.058805,4377.175712,3915.764198,28.694656
max,122814.0,250001.0,461279.0,46205.0,10934.0,4498.0,58759.0,29680.0,13071.0,7440.0,92912.0,86679.0,1509.0


## Extract zipcodes of interest

In [49]:
#list cities of interest
cities = ['philadelphia','atlanta','dallas','indianapolis','seattle']

#import csv files contiaining zipcodes for the cities
zipcodes = [pd.read_csv(f"../Resources/{city}_zipcodes.csv",encoding= 'unicode_escape') for city in cities ]

#drop the last row from every list as it is a reference link
zipcodes = [city.drop(city.tail(1).index) for city in zipcodes]

#make strings
zipcodes = [city.astype('string') for city in zipcodes]

#show dataframe for one of the cities
zipcodes[0].head()

Unnamed: 0,Philadelphia
0,19102
1,19103
2,19104
3,19106
4,19107


In [39]:
#aim to make one long dataframe with 2 columns: city and zipcode to later merge with the larger dataframe
zipcodes_df = pd.concat(zipcodes)
zipcodes_df = zipcodes_df.stack()
zipcodes_df = zipcodes_df.reset_index()
zipcodes_df.drop(columns = 'level_0', inplace = True)
zipcodes_df.rename(columns = {'level_1' : 'city',0:'zipcode'},inplace = True)
zipcodes_df

Unnamed: 0,city,zipcode
0,Philadelphia,19102
1,Philadelphia,19103
2,Philadelphia,19104
3,Philadelphia,19106
4,Philadelphia,19107
...,...,...
186,Seattle,98164
187,Seattle,98174
188,Seattle,98177
189,Seattle,98195


In [47]:
#merge dataframes 
census_cities_df = pd.merge(zipcodes_df,census_df,how = 'left', on = 'zipcode')
census_cities_df

Unnamed: 0,city,zipcode,population,household_income,per_capita_income,chinese,french,greek,indian,italian,japanese,brazillian,mexican,african_american,thai
0,Philadelphia,19102,4937.0,90750.0,87197.0,264.0,214.0,34.0,809.0,591.0,69.0,0.0,4.0,220.0,0.0
1,Philadelphia,19103,24219.0,73611.0,78815.0,927.0,388.0,234.0,2840.0,2641.0,32.0,66.0,213.0,1499.0,127.0
2,Philadelphia,19104,54311.0,25865.0,15478.0,3760.0,430.0,117.0,7662.0,2190.0,147.0,5.0,550.0,22962.0,0.0
3,Philadelphia,19106,12375.0,109393.0,82364.0,304.0,416.0,245.0,798.0,2027.0,86.0,0.0,88.0,1054.0,0.0
4,Philadelphia,19107,13696.0,53534.0,49131.0,2147.0,554.0,173.0,3720.0,1267.0,66.0,38.0,93.0,1117.0,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
186,Seattle,98164,155.0,155391.0,163222.0,0.0,0.0,0.0,32.0,16.0,0.0,0.0,0.0,0.0,0.0
187,Seattle,98174,0.0,-666666666.0,-666666666.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
188,Seattle,98177,21619.0,109571.0,59544.0,987.0,665.0,282.0,1735.0,671.0,166.0,35.0,339.0,325.0,68.0
189,Seattle,98195,,,,,,,,,,,,,


## Cleaning Data 

In [48]:
census_cities_df[census_cities_df['population'].isnull()]

Unnamed: 0,city,zipcode,population,household_income,per_capita_income,chinese,french,greek,indian,italian,japanese,brazillian,mexican,african_american,thai
139,Indianapolis,46183,,,,,,,,,,,,,
189,Seattle,98195,,,,,,,,,,,,,


In [51]:
census_cities_df.dropna(inplace=True)
census_cities_df

Unnamed: 0,city,zipcode,population,household_income,per_capita_income,chinese,french,greek,indian,italian,japanese,brazillian,mexican,african_american,thai
0,Philadelphia,19102,4937.0,90750.0,87197.0,264.0,214.0,34.0,809.0,591.0,69.0,0.0,4.0,220.0,0.0
1,Philadelphia,19103,24219.0,73611.0,78815.0,927.0,388.0,234.0,2840.0,2641.0,32.0,66.0,213.0,1499.0,127.0
2,Philadelphia,19104,54311.0,25865.0,15478.0,3760.0,430.0,117.0,7662.0,2190.0,147.0,5.0,550.0,22962.0,0.0
3,Philadelphia,19106,12375.0,109393.0,82364.0,304.0,416.0,245.0,798.0,2027.0,86.0,0.0,88.0,1054.0,0.0
4,Philadelphia,19107,13696.0,53534.0,49131.0,2147.0,554.0,173.0,3720.0,1267.0,66.0,38.0,93.0,1117.0,17.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
185,Seattle,98154,0.0,-666666666.0,-666666666.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
186,Seattle,98164,155.0,155391.0,163222.0,0.0,0.0,0.0,32.0,16.0,0.0,0.0,0.0,0.0,0.0
187,Seattle,98174,0.0,-666666666.0,-666666666.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
188,Seattle,98177,21619.0,109571.0,59544.0,987.0,665.0,282.0,1735.0,671.0,166.0,35.0,339.0,325.0,68.0


I notice that some seattle entries do not look right (population of zero, negative per capita income) I investigate below:

In [53]:
census_cities_df.agg(['count','mean','max','min'])

Unnamed: 0,city,zipcode,population,household_income,per_capita_income,chinese,french,greek,indian,italian,japanese,brazillian,mexican,african_american,thai
count,189,189.0,189.0,189.0,189.0,189.0,189.0,189.0,189.0,189.0,189.0,189.0,189.0,189.0,189.0
max,Seattle,98199.0,74971.0,167656.0,163222.0,6609.0,1826.0,539.0,14652.0,12570.0,681.0,1717.0,34853.0,62340.0,580.0
min,Atlanta,19102.0,0.0,-666666700.0,-666666700.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,,inf,28276.037037,-31686310.0,-24653790.0,558.84127,413.994709,90.31746,1806.174603,1087.31746,60.603175,41.957672,3277.962963,8624.825397,19.89418


In [70]:
census_cities_df[census_cities_df['population']<= 0]

Unnamed: 0,city,zipcode,population,household_income,per_capita_income,chinese,french,greek,indian,italian,japanese,brazillian,mexican,african_american,thai
5,Philadelphia,19109,0.0,-666666666.0,-666666666.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,Philadelphia,19112,0.0,-666666666.0,-666666666.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
70,Atlanta,30334,0.0,-666666666.0,-666666666.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
136,Dallas,75270,0.0,-666666666.0,-666666666.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
137,Dallas,75390,0.0,-666666666.0,-666666666.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
185,Seattle,98154,0.0,-666666666.0,-666666666.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
187,Seattle,98174,0.0,-666666666.0,-666666666.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
