# Step 3: Extracting demographics

Now that we have the geocoded data, we can look up the demographics of each relevant location so that we can later draw connections between the two.

In [1]:
# Import relevant libraries

import requests
import pandas as pd

In [2]:
# Load geocoded data (the path is relative).
# NOTE(review): read_csv infers column dtypes, so any code stored with leading
# zeros would come back as a number — confirm that the "Census Tract",
# "County" and "Block" columns used later are read in the form the Census API
# expects.

data = pd.read_csv("../data/processed/gbh_geocoded_output.csv")

### Get Census Demographics

This function has been used in earlier work. Given basic location information (tract, county, state and, optionally, block), it returns the relevant demographics for that location. Keep in mind that the Census API limits how many queries can be made. We can also choose which year, data source and survey name to collect the data from, and which demographics we are interested in. All the relevant links are inside the function.

In [3]:
"""
# CENSUS DATA API 
# https://www.census.gov/content/dam/Census/library/publications/2020/acs/acs_api_handbook_2020_ch02.pdf
# any user can query small quantities of data with minimal restrictions - up to 50 variables in a single query, up to 500 queries per IP address per day 
# more than 500 queries per IP address per day requires you to register for API key - www.census.gov/developers
# https://www.census.gov/data/developers/data-sets/decennial-census.html 
"""
# Get census demographics for any given article
def get_census_demographics(year, dsource, dname, tract, county, state, block=False, timeout=30):
    """Query the Census Data API for the P2 race/ethnicity counts of one area.

    The request always asks for ``NAME`` plus the ten variables below, in this
    order (see https://api.census.gov/data/2020/dec/pl/variables.html):

        P2_001N - Total
        P2_002N - Total: Hispanic or Latino
        P2_003N - Total: Not Hispanic or Latino
        P2_004N - Total: Not Hispanic or Latino: Population of One Race
        P2_005N - ... One Race: White alone
        P2_006N - ... One Race: Black or African American alone
        P2_007N - ... One Race: American Indian and Alaska Native alone
        P2_008N - ... One Race: Asian alone
        P2_009N - ... One Race: Native Hawaiian and Other Pacific Islander alone
        P2_010N - ... One Race: Some Other Race Alone

    Args:
        year: Census year segment of the endpoint path (e.g. "2020").
        dsource: Data source segment of the endpoint path (e.g. "dec").
        dname: Survey name segment of the endpoint path (e.g. "pl").
        tract: Tract code used in the geography filter.
        county: County code used in the geography filter.
        state: State code used in the geography filter.
        block: Block code. When truthy, the query targets that block inside
            the tract; when falsy (the default), it targets the tract itself.
        timeout: Seconds to wait on the API before giving up. Without a
            timeout a single stalled request would block the caller forever.

    Returns:
        The decoded JSON body of the response, unmodified.

    Raises:
        requests.exceptions.RequestException: on connection problems, on a
            timeout, or when the API answers with an HTTP error status.
        ValueError: when the response body is not valid JSON (for example an
            empty body).
    """
    cols = 'NAME,P2_001N,P2_002N,P2_003N,P2_004N,P2_005N,P2_006N,P2_007N,P2_008N,P2_009N,P2_010N'
    base_url = f"https://api.census.gov/data/{year}/{dsource}/{dname}"

    if block:
        # to get block demographics
        census_url = f"{base_url}?get={cols}&for=block:{block}&in=tract:{tract}&in=county:{county}&in=state:{state}"
    else:
        # to get tract demographics
        census_url = f"{base_url}?get={cols}&for=tract:{tract}&in=county:{county}&in=state:{state}"

    census_response = requests.get(census_url, timeout=timeout)
    # Surface HTTP failures as an explicit HTTPError instead of letting them
    # show up later as a confusing JSON decoding error.
    census_response.raise_for_status()

    return census_response.json()

In [4]:
# Function to save the data collected

def save_data(data1, filename1):
    """Write ``data1`` to ``filename1`` as CSV, leaving the index out."""
    data1.to_csv(path_or_buf=filename1, index=False)

We can also create a function that calls `get_census_demographics` for each row and further automates the process.

In [5]:
# Get census data for all given articles (with unique tract/block) and save to CSV
def collect_data(data, output, filename, block=False):
    """Fetch 2020 census demographics for every row of ``data`` and save them.

    For each row the "Census Tract" and "County" values (and "Block" when
    ``block`` is truthy) are sent to ``get_census_demographics`` with the
    state fixed to "25". Each successful answer is appended to ``output`` as
    one row and ``output`` is written to ``filename`` through ``save_data``
    whenever the row index is a multiple of 10, and once more at the end.

    Args:
        data: Table iterated with ``iterrows()``; must provide the columns
            named above.
        output: Table that receives the results; it is modified in place and
            must have 25 columns (five identifying columns followed by ten
            count/percent pairs).
        filename: CSV destination handed to ``save_data``.
        block: When truthy, query per block instead of per tract.

    Returns:
        A list of ``[tract, county, state]`` entries, one for every row whose
        lookup or processing raised an exception.
    """
    state = "25"  # Massachusetts
    error_tracts = []

    for index, row in data.iterrows():
        tract = str(row["Census Tract"])
        county = str(row["County"])
        print(index, tract, county)
        try:
            # Keep the per-row block id in its own name so the caller's
            # ``block`` flag is never overwritten inside the loop.
            block_id = str(row["Block"]) if block else False

            # Make API call to get census demographics data
            census_data = get_census_demographics("2020", "dec", "pl", tract, county, state, block_id)

            # If successful, save data to output dataframe. Element 0 of the
            # answer is the header row; element 1 holds the values.
            if census_data:
                output.loc[len(output.index)] = _demographics_row(index, tract, county, state, census_data[1])

            # Save data to CSV after every 10 iterations in case there's any errors or takes too long
            if index % 10 == 0:
                save_data(output, filename)

        except Exception as e:
            # Best effort: report the failure, remember the area, keep going.
            print(f"Error retrieving data for tract: {tract}. With error: {e}")
            error_tracts.append([tract, county, state])

    # Save final data to CSV
    save_data(output, filename)

    return error_tracts


def _demographics_row(index, tract, county, state, values):
    """Build one output row: ids, area name, then a (count, percent) pair per variable."""
    counts = [values[i] for i in range(1, 11)]
    total_population = int(counts[0])

    row = [index, tract, county, state, values[0]]
    for count in counts:
        if total_population:
            percent = round(100 * (int(count) / total_population), 2)
        else:
            # An area with no residents has no meaningful shares; record 0.0
            # rather than failing with a division by zero and losing the row.
            percent = 0.0
        row.extend([count, percent])
    return row


In the interest of limiting API calls and making the process faster and less memory-intensive, we collect only the unique tracts and link their demographics back to the individual articles when needed.

In [6]:
# Create a table of unique tracts from the geocoded data: one row per distinct
# ("Census Tract", "County") pair, re-indexed from 0.

unique_tracts = data[['Census Tract', 'County']].drop_duplicates().reset_index(drop=True)

In [None]:
# Empty results table for the tract run: five identifying columns, then for
# every census measure its raw count followed by its "<measure> Percent".

demographics = pd.DataFrame(
    columns=["Index", "Census Tract", "County", "State", "Name"]
    + [
        label
        for measure in (
            "Total",
            "Hispanic or Latino",
            "Not Hispanic or Latino",
            "One Race Total",
            "White",
            "African American",
            "American Indian and Alaska Native",
            "Asian",
            "Native Hawaiian and Other Pacific Island",
            "Other",
        )
        for label in (measure, measure + " Percent")
    ]
)

# Collect the demographics and save them to CSV. Note that the [:3] slice
# limits this run to the first three unique tracts.
collect_data(
    data=unique_tracts[:3],
    output=demographics,
    filename='../data/processed/demographics_by_tract.csv',
)

We can also do the same with blocks to be more specific, but for now focusing on the census tract gives us a better holistic perspective, so this is left as something that could be used in the future.

In [15]:
# Create a table of unique blocks from the geocoded data: one row per distinct
# ("Block", "Census Tract", "County") combination, re-indexed from 0.
unique_blocks = data[['Block', 'Census Tract', 'County']].drop_duplicates().reset_index(drop=True)

In [None]:
# Empty results table for the block run: five identifying columns, then for
# every census measure its raw count followed by its "<measure> Percent".
# The table has no "Block" column.
block_demographics = pd.DataFrame(
    columns=["Index", "Census Tract", "County", "State", "Name"]
    + [
        label
        for measure in (
            "Total",
            "Hispanic or Latino",
            "Not Hispanic or Latino",
            "One Race Total",
            "White",
            "African American",
            "American Indian and Alaska Native",
            "Asian",
            "Native Hawaiian and Other Pacific Island",
            "Other",
        )
        for label in (measure, measure + " Percent")
    ]
)

# Collect data for each unique block and save it to CSV
collect_data(
    data=unique_blocks,
    output=block_demographics,
    filename='../data/processed/demographics_by_block.csv',
    block=True,
)