# US Census - Extraction

### Import Libraries

In [18]:
import requests
import pandas as pd
from bs4 import BeautifulSoup

### Parameters

In [19]:
# Identifiers of the table being extracted. Every URL below is derived from
# these, so changing the table or geography keeps all three URLs in sync.
DATASET = "ACSDT1Y2021"
TABLE_CODE = "B25040"
GEO_ID = "160XX00US0643000"  # value of the `g` query parameter
TABLE_ID = f"{DATASET}.{TABLE_CODE}"

# Table page on data.census.gov.
URL = f"https://data.census.gov/table?q={TABLE_CODE}&g={GEO_ID}&tid={TABLE_ID}"
# JSON endpoints for the table's metadata and for its data.
API_METADATA = f"https://data.census.gov/api/search/metadata/table?id={TABLE_ID}&g={GEO_ID}"
API_TABLE = f"https://data.census.gov/api/access/data/table?id={TABLE_ID}&g={GEO_ID}"

In [3]:
# Request headers sent with every call to data.census.gov, grouped by purpose.
# Key order and values are unchanged; the groups are merged into one flat dict.
headers = {
    # NOTE(review): these four look like HTTP/2 pseudo-headers copied from
    # browser dev tools; sent as ordinary headers they are presumably ignored
    # by the server - confirm before relying on them. "path" names the /table
    # page, not the API endpoints that are actually requested.
    **{
        "authority": "data.census.gov",
        "method": "GET",
        "path": "/table?q=B25040&g=160XX00US0643000&tid=ACSDT1Y2021.B25040",
        "scheme": "https",
    },
    # Content negotiation and caching.
    **{
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
        "Accept-Encoding": "gzip, deflate, br",
        "Accept-Language": "en-US,en;q=0.9",
        "Cache-Control": "max-age=0",
    },
    # Client hints describing the browser.
    **{
        "Sec-Ch-Ua": "\"Not.A/Brand\";v=\"8\", \"Chromium\";v=\"114\", \"Google Chrome\";v=\"114\"",
        "Sec-Ch-Ua-Mobile": "?0",
        "Sec-Ch-Ua-Platform": "\"Windows\"",
    },
    # Fetch metadata.
    **{
        "Sec-Fetch-Dest": "document",
        "Sec-Fetch-Mode": "navigate",
        "Sec-Fetch-Site": "same-origin",
        "Sec-Fetch-User": "?1",
    },
    # Remaining browser identification.
    **{
        "Upgrade-Insecure-Requests": "1",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36",
    },
}

In [23]:
def get_metadata_content(API_METADATA, headers, timeout=30):
    """
    Gets metadata content from API

    Parameters
    ----------
    API_METADATA : str
        URL of the metadata endpoint to request.
    headers : dict
        HTTP headers sent with the request.
    timeout : float or tuple, optional
        Passed to ``requests.get``; seconds to wait for the server before
        giving up. Without it a stalled connection blocks the kernel
        indefinitely. Defaults to 30.

    Returns
    -------
    The value stored under the top-level ``'response'`` key of the JSON body.
    The ``'response'`` wrapper is already removed, so callers must not index
    ``['response']`` again on the result.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    KeyError
        If the JSON body has no ``'response'`` key.
    """
    r = requests.get(API_METADATA, headers=headers, timeout=timeout)
    # Fail on HTTP errors here rather than with an opaque JSON/Key error below.
    r.raise_for_status()

    return r.json()['response']

In [20]:
def get_mapping_dict(metadata_content):
    """
    Gets mapping dictionary from metadata content

    Parameters
    ----------
    metadata_content : dict
        Metadata payload. Both shapes are accepted: the full JSON body (with a
        top-level ``'response'`` key) and the already-unwrapped payload (whose
        top-level key is ``'metadataContent'``).

    Returns
    -------
    dict
        Maps every ID listed in a category's ``'item_mapping'`` to
        ``"<label> Estimate"`` or ``"<label> Margin of Error"``.

    Raises
    ------
    KeyError, IndexError
        If the payload lacks the expected ``metadataContent`` / ``dimensions``
        structure.
    """
    # Unwrap the 'response' envelope only when it is present, so the function
    # works on the raw JSON body as well as on the unwrapped payload.
    payload = metadata_content['response'] if 'response' in metadata_content else metadata_content
    # NOTE(review): the position of this dimension (index 2) is hard-coded;
    # presumably stable for this table - confirm for other tables.
    mapping_content = payload['metadataContent']['dimensions'][2]['categories']
    # Note - M is for margin of error and E is for estimate
    mapping_dict = {}

    for item in mapping_content:
        label = item['label']

        for corresponding_id in item['item_mapping']:
            # Only the part after the last underscore carries the E/M marker.
            # Searching the whole ID would mislabel margin IDs whose table code
            # itself contains an "E" (e.g. "B01001E_001M").
            suffix = corresponding_id.rsplit('_', 1)[-1]
            measure = "Estimate" if 'E' in suffix else "Margin of Error"
            mapping_dict[corresponding_id] = f"{label} {measure}"

    return mapping_dict

### Tests

In [5]:
# Fetch the raw metadata JSON here so the notebook runs on a fresh kernel;
# no other visible cell assigns `metadata_content`. The full body is kept,
# including the top-level 'response' key, because this cell and later ones
# index through it.
_metadata_response = requests.get(API_METADATA, headers=headers, timeout=30)
_metadata_response.raise_for_status()
metadata_content = _metadata_response.json()

metadata_content['response']['metadataContent']['dataset']

{'component': 'Detailed Tables',
 'provider': 'ACS',
 'vintage': '2021',
 'dataCategory': 'AGGREGATE',
 'name': 'ACS 1-Year Estimates Detailed Tables',
 'program': 'American Community Survey',
 'subProgram': '1-Year Estimates',
 'dataset': 'ACSDT1Y2021',
 'programShortLabel': 'ACS'}

In [6]:
# Inspect the second entry of the metadata's dimensions list.
metadata_dimensions = metadata_content['response']['metadataContent']['dimensions']
metadata_dimensions[1]

{'globalDimension': False,
 'dimension_type': {'description': 'MEASURE', 'id': 'MEASURE'},
 'id': 'ESTIMATE',
 'categories': [{'orderNumber': '1',
   'id': 'COLUMN1',
   'label': 'Estimate',
   'item_mapping': ['B25040_001E',
    'B25040_002E',
    'B25040_003E',
    'B25040_004E',
    'B25040_005E',
    'B25040_006E',
    'B25040_007E',
    'B25040_008E',
    'B25040_009E',
    'B25040_010E']},
  {'orderNumber': '2',
   'id': 'COLUMN2',
   'label': 'Margin of Error',
   'item_mapping': ['B25040_001M',
    'B25040_002M',
    'B25040_003M',
    'B25040_004M',
    'B25040_005M',
    'B25040_006M',
    'B25040_007M',
    'B25040_008M',
    'B25040_009M',
    'B25040_010M']}],
 'axis': 'H',
 'order_number': 2}

In [22]:
# Build the ID -> readable-label lookup and display it as the cell output.
mapping_dict = get_mapping_dict(metadata_content=metadata_content)
mapping_dict

{'B25040_001E': 'Total: Estimate',
 'B25040_001M': 'Total: Margin of Error',
 'B25040_002E': 'Utility gas Estimate',
 'B25040_002M': 'Utility gas Margin of Error',
 'B25040_003E': 'Bottled, tank, or LP gas Estimate',
 'B25040_003M': 'Bottled, tank, or LP gas Margin of Error',
 'B25040_004E': 'Electricity Estimate',
 'B25040_004M': 'Electricity Margin of Error',
 'B25040_005E': 'Fuel oil, kerosene, etc. Estimate',
 'B25040_005M': 'Fuel oil, kerosene, etc. Margin of Error',
 'B25040_006E': 'Coal or coke Estimate',
 'B25040_006M': 'Coal or coke Margin of Error',
 'B25040_007E': 'Wood Estimate',
 'B25040_007M': 'Wood Margin of Error',
 'B25040_008E': 'Solar energy Estimate',
 'B25040_008M': 'Solar energy Margin of Error',
 'B25040_009E': 'Other fuel Estimate',
 'B25040_009M': 'Other fuel Margin of Error',
 'B25040_010E': 'No fuel used Estimate',
 'B25040_010M': 'No fuel used Margin of Error'}

In [8]:
# Download the table payload. The timeout keeps a stalled connection from
# hanging the kernel, and raise_for_status() reports HTTP errors directly
# instead of failing later on JSON decoding or the 'response' lookup.
r = requests.get(API_TABLE, headers=headers, timeout=30)
r.raise_for_status()
table_content = r.json()['response']

In [17]:
# The first two entries of the payload's 'data' list are paired element-wise:
# entry 0 supplies the "labels" column and entry 1 the "data" column.
labels, data = table_content['data'][0], table_content['data'][1]

# Build the two-column frame, swap IDs found in mapping_dict for their
# readable labels, then drop every row that has a missing value.
df = (
    pd.DataFrame({"labels": labels, "data": data})
    .replace(mapping_dict)
    .dropna()
)
df

Unnamed: 0,labels,data
0,Wood Margin of Error,96
1,Total: Estimate,172599
2,Other fuel Margin of Error,186
3,"Bottled, tank, or LP gas Estimate",1688
4,"Fuel oil, kerosene, etc. Estimate",106
5,Wood Estimate,56
6,Other fuel Estimate,207
22,Total: Margin of Error,4170
25,"Bottled, tank, or LP gas Margin of Error",522
27,"Fuel oil, kerosene, etc. Margin of Error",128
