# Step 2: Geocoding data

Even though the initial data had some information about census tracts and neighborhoods, we want to make sure it is up to date. This notebook therefore uses the relevant APIs and mapping files to collect that data again.

In [2]:
# Import necessary libraries

import pandas as pd
import ast # For converting string to list
import requests # For making API requests
import json # For parsing JSON of neighborhoods

In [3]:
# Load the processed articles data; the geocoding cell below reads its "Coordinates" column

articles = pd.read_csv("../data/processed/gbh_processed_output.csv")

### Get Census geographies

This function is used in other parts of the website, so it is worth understanding. It takes a dictionary that maps each place to a nested dictionary holding its longitude and latitude (`{place: {"lon": ..., "lat": ...}}`) and returns the relevant census geographies (block, block group, tract and county) for each location.

In [4]:
# Get census geographies for any given location

def get_census_geos(geocode_results): # format: {place: {lon: lon, lat: lat}}
    """
    Get census geographies - block, block group, tract, county - by coordinates.

    Every place is looked up through the geocoding.geo.census.gov
    "geographies/coordinates" endpoint, reading the "2020 Census Blocks" layer
    of the JSON response.

    input: geocode_results, a dictionary of the form
        {place: {'lon': longitude, 'lat': latitude}}
    return: a dictionary with one entry for EVERY input place, of the form
        {place: {'block': ..., 'blkgrp': ..., 'tract': ..., 'county': ...}}.
        When the geographies cannot be read from the response, all four values
        are None, so the output stays aligned with the input and the failed
        rows can be filtered out afterwards.

    Errors raised by requests.get() or response.json() are not caught here and
    propagate to the caller.
    """
    # These URL pieces are identical for every place, so build them only once
    base_url = 'https://geocoding.geo.census.gov/geocoder/geographies/coordinates?'
    survey_ver = '&benchmark=4&vintage=4&layers=2020 Census Blocks&format=json'
    geo_fields = ('block', 'blkgrp', 'tract', 'county')

    census_geos = {}
    for place, coords in geocode_results.items():
        # building the geocoding url
        lon = coords['lon']
        lat = coords['lat']
        census_geo_url = f'{base_url}x={lon}&y={lat}{survey_ver}'

        # getting the census geographies
        response = requests.get(census_geo_url)
        response_json = response.json()

        # Extract and organize the census geographies
        try:
            first_block = response_json['result']['geographies']['2020 Census Blocks'][0]
            census_geos[place] = {'block': first_block['BLOCK'],
                                  'blkgrp': first_block['BLKGRP'],
                                  'tract': first_block['TRACT'],
                                  'county': first_block['COUNTY']}
        except IndexError:
            # The layer came back empty. Store a placeholder here as well: skipping the
            # place would leave the result shorter than the input and break callers
            # that line the two up row by row.
            print(f"Unable to retrieve census geography for: {place}")
            census_geos[place] = dict.fromkeys(geo_fields)
        except KeyError:
            print(f"Location is outside of the United States: {place}")
            # Still saves it as None, but will be able to filter out later
            census_geos[place] = dict.fromkeys(geo_fields)
    return census_geos

In [None]:
# Build the coordinate lookup for every article, then fetch its census geographies

geocodes = {}
# No place name is available, so the row position (as a string) is used as the key
for position, raw_coordinates in enumerate(articles["Coordinates"]):
    # Values that are not already lists are parsed from their literal text form
    pair = raw_coordinates if isinstance(raw_coordinates, list) else ast.literal_eval(raw_coordinates)
    geocodes[str(position)] = {"lon": pair[0], "lat": pair[1]}

census_geos = get_census_geos(geocodes)

# One row per key of census_geos, one column per geography field
census_geos_df = pd.DataFrame.from_dict(census_geos, orient='index')

Now that the census geographies have been retrieved, we can add them to the articles data and filter out the locations that could not be geocoded, so that the result can be saved to a new file.

In [None]:
# Add census info to data

# Insert the county, tract and block columns. Every insert targets position 3 and pushes
# the earlier ones to the right, so the final order from position 3 is
# Block, Census Tract, County.
for column_label, geo_field in (("County", "county"), ("Census Tract", "tract"), ("Block", "block")):
    articles.insert(3, column_label, census_geos_df[geo_field].values)

# Keep only the rows for which a census tract was found
has_tract = articles["Census Tract"].notna()
geocoded_articles = articles[has_tract]

### We can also collect the relevant neighborhood information

Using a data set that contains the neighborhood for any given census tract, we can easily relate the two. The following class is used in other sections of the project to achieve this.

In [11]:
# Reload the geocoded articles from disk; this replaces any in-memory `geocoded_articles`
# and reads the same CSV path that the last cell of this notebook writes to.
geocoded_articles = pd.read_csv("../data/processed/gbh_geocoded_output.csv")

In [12]:
# Relate neighborhoods to census geographies
class neighborhood_mapping():
    """
    Look up the Boston neighborhood for a census tract or a census block.

    The lookup tables are read from two JSON files when the object is created:
      - tract_mapping: loaded from ./data/tracts-neighbors.json
      - block_mapping: loaded from ./data/blocks-neighbors.json
    Both are assumed to be JSON objects (Python dicts) keyed by the census id -
    TODO confirm against the data files.
    """
    def __init__(self):
        self.load_mappings()

    def load_mappings(self):
        """Read both mapping files from disk into tract_mapping and block_mapping."""
        # load census tract to boston neighborhood mapping
        # File is not comprehensive! have to add more tracts and connections to their neighborhoods
        # Will have to download this file and add manually to the relevant folder
        # (the files are opened with `with` so the handles are closed right after loading)
        with open("./data/tracts-neighbors.json", encoding="utf-8") as tract_file:
            self.tract_mapping = json.load(tract_file)

        # load census block to boston neighborhood mapping
        # Not currently in use
        with open("./data/blocks-neighbors.json", encoding="utf-8") as block_file:
            self.block_mapping = json.load(block_file)

    def tract_to_neighborhood(self, tract):
        """
        Given a census tract id, return the boston neighborhood it is in.

        Raises KeyError when the tract is not present in the mapping.
        """
        return self.tract_mapping[tract]

    def block_to_neighborhood(self, block):
        """
        Given a census block id, return the boston neighborhood it is in.

        Raises KeyError when the block is not present in the mapping.
        """
        # Index the mapping like tract_to_neighborhood does; it is data, not a callable
        return self.block_mapping[block]

In [None]:
# Create the neighborhood mapping object (its constructor loads the tract and block JSON mapping files)
mappings = neighborhood_mapping()

# Return a formatted id for the census tract
def format_id(state, county, census_tract):
    """
    Build the full census tract id from its parts.

    The input is of the form: MA code, county code, census tract.
    The county code is left-padded with zeros to 3 characters and the census
    tract to 6 characters, then all three parts are concatenated.

    county and census_tract may be strings or numbers. A trailing ".0" (what an
    integer code looks like after passing through a float, e.g. 25.0 or "25.0")
    is dropped so it does not end up inside the id.

    Returns the id as a string, e.g. format_id("25", "25", "10103") -> "25025010103".
    """
    def as_code(value):
        # Normalise to a plain digit string before padding
        text = str(value)
        return text[:-2] if text.endswith(".0") else text

    full_id = f"{state}{as_code(county).zfill(3)}{as_code(census_tract).zfill(6)}"

    return full_id
    
# Start from an all-None Neighborhood column (this also resets the column if it already exists)
geocoded_articles['Neighborhood'] = None

# Walk the rows, build each tract id and look up its neighborhood
for row_label, record in geocoded_articles.iterrows():
    tract_geoid = format_id("25", str(record["County"]), str(record["Census Tract"]))
    try:
        matched_neighborhood = mappings.tract_to_neighborhood(tract_geoid)
    except KeyError:
        # Tract is missing from the mapping: report it and leave the cell empty
        print("Unable to retrieve neighborhood for: " + tract_geoid)
        matched_neighborhood = None
    geocoded_articles.at[row_label, 'Neighborhood'] = matched_neighborhood

# Move the finished column to position 5
neighborhood_column = geocoded_articles.pop("Neighborhood")
geocoded_articles.insert(5, "Neighborhood", neighborhood_column)


Now that the neighborhoods have been added alongside the census geographies, we can save the data to the geocoded output file.

In [None]:
# Write the articles, now including the Neighborhood column, to the geocoded output CSV
# (index=False keeps the DataFrame index out of the file)
geocoded_articles.to_csv("../data/processed/gbh_geocoded_output.csv", index=False)
# Preview the first five rows as a quick sanity check
geocoded_articles.head(5)