In [2]:
!pip install requests census pandas us

Collecting census
  Downloading census-0.8.25-py3-none-any.whl.metadata (8.2 kB)
Collecting us
  Downloading us-3.2.0-py3-none-any.whl.metadata (10 kB)
Collecting jellyfish (from us)
  Downloading jellyfish-1.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (642 bytes)
Downloading census-0.8.25-py3-none-any.whl (11 kB)
Downloading us-3.2.0-py3-none-any.whl (13 kB)
Downloading jellyfish-1.2.1-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (360 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m360.5/360.5 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jellyfish, us, census
Successfully installed census-0.8.25 jellyfish-1.2.1 us-3.2.0


In [3]:
import pandas as pd
from census import Census
from us import states
from google.colab import userdata

### Pull in PA demographic data off ACS API

In [4]:
# 1. Load your API key
key = userdata.get("CensusAPIKey")
c = Census(key)

# 2. Choose the relevant dataset (e.g., acs5 for 5-year estimates)
dataset = c.acs5

# 3. Define variable codes to pull in and readable column names to replace them with in DF
variables = (
    'NAME',                 # Geography Name
    'B01001_001E',          # Total Population
    'B01001A_001E',         # Race: White Alone
    'B01001B_001E',         # Race: Black/African American Alone
    'B03002_012E',          # Ethnicity: Hispanic or Latino
    'B19013_001E',          # Income: Median Household Income
    'B15003_022E',          # Education: Bachelor's Degree or Higher (Pop 25+)
    'B01001_020E',          # Age: 65-66 years (often higher turnout)
    'B01001_021E',          # Age: 67-69 years
    'B01001_022E',          # Age: 70-74 years
    'B01001_023E',          # Age: 75-79 years
    'B01001_024E',          # Age: 80-84 years
    'B01001_025E',          # Age: 85+ years
    'B01001_007E',          # Age: 18-19 years (often lower turnout)
    'B01001_008E',          # Age: 20 years
    'B01001_009E',          # Age: 21 years
    'B01001_010E',          # Age: 22-24 years
    'B25003_002E',          # Housing: Owner-Occupied Units
    'B25003_003E',          # Housing: Renter-Occupied Units
    'B05001_006E'           # Citizenship: Foreign Born (Naturalized/Not)
)

labels = (
    'NAME',
    'Total Population',
    'Pop White Alone',
    'Pop Black Alone',
    'Pop Hispanic or Latino',
    'Median Household Income',
    'Edu Bachelors or Higher',
    'Age 65-66',
    'Age 67-69',
    'Age 70-74',
    'Age 75-79',
    'Age 80-84',
    'Age 85 Plus',
    'Age 18-19',
    'Age 20',
    'Age 21',
    'Age 22-24',
    'Owner Occupied Units',
    'Renter Occupied Units',
    'Foreign Born Pop'
)

# 4. Define geographic aggregations and criterias
# Example: Get data for all counties in a specific state (e.g., Colorado)
geo_for = 'county:*'
geo_in_PA = f'state:{states.PA.fips}' # Using the 'us' library to get FIPS code for PA
geo_in_MI = f'state:{states.MI.fips}' # Using the 'us' library to get FIPS code for MI
geo_in_NC = f'state:{states.NC.fips}' # Using the 'us' library to get FIPS code for NC

# 5. Run the query and store as a pandas DataFrame
# Pull in 2024 data for all three states
r = dataset.get(variables, {'for': geo_for, 'in': geo_in_PA}, year=2024)
df_pa24 = pd.DataFrame(r).rename(columns={v: l for v, l in zip(variables, labels)})
r = dataset.get(variables, {'for': geo_for, 'in': geo_in_MI}, year=2024)
df_mi24 = pd.DataFrame(r).rename(columns={v: l for v, l in zip(variables, labels)})
r = dataset.get(variables, {'for': geo_for, 'in': geo_in_NC}, year=2024)
df_nc24 = pd.DataFrame(r).rename(columns={v: l for v, l in zip(variables, labels)})
# Pull in 2020 data for all three states
r = dataset.get(variables, {'for': geo_for, 'in': geo_in_PA}, year=2020)
df_pa20 = pd.DataFrame(r).rename(columns={v: l for v, l in zip(variables, labels)})
r = dataset.get(variables, {'for': geo_for, 'in': geo_in_MI}, year=2020)
df_mi20 = pd.DataFrame(r).rename(columns={v: l for v, l in zip(variables, labels)})
r = dataset.get(variables, {'for': geo_for, 'in': geo_in_NC}, year=2020)
df_nc20 = pd.DataFrame(r).rename(columns={v: l for v, l in zip(variables, labels)})
# Pull in 2016 data for all three states
r = dataset.get(variables, {'for': geo_for, 'in': geo_in_PA}, year=2016)
df_pa16 = pd.DataFrame(r).rename(columns={v: l for v, l in zip(variables, labels)})
r = dataset.get(variables, {'for': geo_for, 'in': geo_in_MI}, year=2016)
df_mi16 = pd.DataFrame(r).rename(columns={v: l for v, l in zip(variables, labels)})
r = dataset.get(variables, {'for': geo_for, 'in': geo_in_NC}, year=2016)
df_nc16 = pd.DataFrame(r).rename(columns={v: l for v, l in zip(variables, labels)})
# Add year column
df_pa24['year'] = 2024
df_mi24['year'] = 2024
df_nc24['year'] = 2024
df_pa20['year'] = 2020
df_mi20['year'] = 2020
df_nc20['year'] = 2020
df_pa16['year'] = 2016
df_mi16['year'] = 2016
df_nc16['year'] = 2016
# Merge DFs
df_fullData = pd.concat([df_pa16, df_mi16, df_nc16, df_pa20, df_mi20, df_nc20, df_pa24, df_mi24, df_nc24], ignore_index=True)
# Print the first few rows of the DataFrame
print(df_fullData)
has_nulls = df_fullData.isnull().values.sum()
print(f"The DataFrame has {has_nulls} null values.")

                                 NAME  Total Population  Pop White Alone  \
0           Pike County, Pennsylvania           56210.0          50416.0   
1         Snyder County, Pennsylvania           40246.0          38891.0   
2    Susquehanna County, Pennsylvania           41832.0          40887.0   
3       Crawford County, Pennsylvania           87027.0          83395.0   
4           Erie County, Pennsylvania          279133.0         243823.0   
..                                ...               ...              ...   
745      Wayne County, North Carolina          118652.0          60942.0   
746     Wilkes County, North Carolina           65935.0          57339.0   
747     Wilson County, North Carolina           79290.0          36124.0   
748     Yadkin County, North Carolina           37574.0          31232.0   
749     Yancey County, North Carolina           18797.0          17315.0   

     Pop Black Alone  Pop Hispanic or Latino  Median Household Income  \
0             

In [5]:
df_fullData.describe()

Unnamed: 0,Total Population,Pop White Alone,Pop Black Alone,Pop Hispanic or Latino,Median Household Income,Edu Bachelors or Higher,Age 65-66,Age 67-69,Age 70-74,Age 75-79,Age 80-84,Age 85 Plus,Age 18-19,Age 20,Age 21,Age 22-24,Owner Occupied Units,Renter Occupied Units,Foreign Born Pop,year
count,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0,750.0
mean,132821.6,98362.241333,19762.445333,10170.068,55289.121333,17788.726667,1529.801333,2024.24,2660.294667,1768.976,1114.241333,980.261333,1848.533333,965.694667,935.686667,2653.885333,36202.316,16317.528,4993.28,2020.0
std,234273.2,149980.499051,67623.683045,23856.634729,14150.560753,37981.625589,2427.976504,3130.770157,4047.341596,2648.978917,1712.555864,1642.044722,3176.530119,1647.234806,1610.716257,4782.819684,59616.915519,34756.347068,13922.526958,3.268166
min,2102.0,1962.0,0.0,11.0,30013.0,134.0,27.0,58.0,58.0,31.0,13.0,0.0,4.0,0.0,0.0,1.0,905.0,106.0,2.0,2016.0
25%,25611.25,22369.5,407.25,640.0,45124.25,2430.75,397.75,525.25,741.5,515.5,307.5,225.5,300.25,145.0,130.75,422.25,8724.75,2329.25,215.75,2016.0
50%,54794.0,44130.0,2423.5,2438.5,52612.0,4899.0,717.0,961.5,1300.5,866.0,532.0,416.0,694.5,356.5,352.0,980.5,15603.5,5650.0,842.0,2020.0
75%,137647.2,109671.0,14316.0,8391.0,62925.25,14285.5,1633.75,2104.5,2819.5,1874.5,1185.25,975.0,1820.0,993.5,918.25,2564.5,37183.25,14694.5,2626.75,2024.0
max,1772259.0,992002.0,694872.0,245882.0,127208.0,271137.0,20479.0,24968.0,35827.0,22590.0,12683.0,12416.0,23726.0,14043.0,13652.0,37451.0,453652.0,327523.0,123286.0,2024.0
