# Prototype Spatial Analytics
These prototype spatial analytics are based on interviews we conducted with epidemiologists at Fairfax County.

More details about Fairfax County Spatial Analytics use cases are documented in this Google [Doc](https://docs.google.com/document/d/1l3HAq-skWRRF_lm8Cf659uvvx7lTbBUm526-RsAse10/edit#heading=h.45gyn9vnonff).

**Note: all data in this notebook is synthetic.**

In [35]:
import csv
import logging
import os
import random
from concurrent.futures import ThreadPoolExecutor
from random import uniform

import censusgeocode as cg
import geopandas as gpd
import git
import names
import pandas as pd
from faker import Faker
from geopy import Point
from geopy.distance import geodesic
from tqdm.notebook import tqdm

PROJ: proj_create_from_database: SQLite error on SELECT name, type, coordinate_system_auth_name, coordinate_system_code, datum_auth_name, datum_code, area_of_use_auth_name, area_of_use_code, text_definition, deprecated FROM geodetic_crs WHERE auth_name = ? AND code = ?: no such column: area_of_use_auth_name


In [3]:
def get_git_root(path):
    """Resolve the top-level directory of the Git repository containing ``path``.

    The repository is opened with ``search_parent_directories=True``, so
    ``path`` may be any location inside the working tree. The returned value
    is the output of ``git rev-parse --show-toplevel`` for that repository.
    """
    repository = git.Repo(path, search_parent_directories=True)
    return repository.git.rev_parse("--show-toplevel")

In [4]:
def geocode(row):
    """
    Placeholder geocoder for now, real data would come geocoded.

    Looks up the 2020 Census block for one coordinate through the
    ``censusgeocode`` client (``cg``).

    Parameters
    ----------
    row : tuple
        An ``(index, lat, lng)`` triple, e.g. one row of
        ``DataFrame.itertuples()`` over latitude/longitude columns.
        ``index`` is unpacked but not used.

    Returns
    -------
    dict
        Always has the keys ``geoid``, ``state``, ``county``, ``tract``,
        ``block``, ``lat`` and ``lng``. If the lookup fails for any reason
        (service error, no matching block, missing field) the five census
        fields are ``None`` and the failure is logged, so every record has
        the same schema and one bad point does not abort a batch run.
    """
    index, lat, lng = row
    # Start from the "lookup failed" shape so success and failure share one schema.
    data = dict(geoid=None,
                state=None,
                county=None,
                tract=None,
                block=None,
                lat=lat,
                lng=lng)
    try:
        # The client takes x (longitude) first, then y (latitude).
        census = cg.coordinates(lng, lat)['2020 Census Blocks'][0]
        # Keyword values are all evaluated before update() runs, so a missing
        # field leaves `data` untouched rather than half-filled.
        data.update(geoid=census['GEOID'],
                    state=census['STATE'],
                    county=census['COUNTY'],
                    tract=census['TRACT'],
                    block=census['BLOCK'])
    except Exception as e:
        # Best effort by design: keep the coordinate, but do not hide the failure.
        logging.getLogger(__name__).warning(
            "Geocoding failed for lat=%s, lng=%s: %r", lat, lng, e)

    return data

In [29]:
# Paths and constants shared by the rest of the notebook.

# abspath('') resolves to the current working directory
# (presumably the folder the notebook was launched from -- confirm).
NOTEBOOK_PATH = os.path.abspath('')
HOME_DIR = os.path.expanduser('~')
# Local folder under the user's home directory; the Virginia shapefile is read from here.
INVENTORY_DIR = os.path.join(HOME_DIR, 'inventory')
# Top level of the Git repository that contains the notebook.
GIT_ROOT_DIR = get_git_root(NOTEBOOK_PATH)
# Folder that write_points() writes its CSV into.
OUTPUTS_DIR = os.path.join(GIT_ROOT_DIR, 'src', 'notebooks', 'outputs')
FAIRFAX_COUNTY_CENTER = Point(38.845262, -77.307035)  # Approximate center of Fairfax County chosen on Google Maps

# ZIP codes that synthetic records are randomly assigned from.
FAIRFAX_COUNTY_ZIPS = [
    22003, 22030, 20171, 22015, 20170, 20120, 22033, 22309, 22079,
    22306, 22031, 22042, 22312, 22310, 22153, 22032, 22315, 22152,
    20191, 20121, 22101, 22150, 22041, 22182, 22043, 20151, 22180,
    22102, 22311, 20190, 22124, 22046, 22151, 22039, 22066, 20124,
    22303, 22181, 22308, 22044, 20194, 22307, 22060, 22027, 22185,
    22035, 22081, 22092, 22082, 22095, 22096, 22103, 22107, 22106,
    22109, 22108, 22118, 22116, 22120, 22119, 22122, 22121, 22158,
    22156, 22160, 22159, 22161, 22183, 22184, 22199, 22009, 22037,
    22036, 22047, 22067, 20122, 20153, 20172, 20193, 20192, 20195,
    20196, 20511
]

In [6]:
def generate_point(center: Point, radius: int) -> Point:
    """
    Pick a random point no farther than ``radius`` from ``center``.

    ``radius`` is scaled by 1e-3 into kilometers before use (the caller passes
    meters). A distance in ``[0, radius)`` and a bearing in ``[0, 360)``
    degrees are each drawn with ``random.random()``, and the result is the
    geodesic destination reached from ``center`` along that bearing.

    Approach taken from
    https://stackoverflow.com/questions/31192451/generate-random-geo-coordinates-within-specific-radius-from-seed-point
    """
    max_distance_km = radius * 1e-3
    # Draw the distance first, then the bearing.
    distance_km = random.random() * max_distance_km
    bearing_degrees = random.random() * 360
    travel = geodesic(kilometers=distance_km)
    return travel.destination(center, bearing_degrees)

In [7]:
def write_points(points, filename='synthetic_locations.csv'):
    """
    Output a CSV so that you can upload to Google Maps to just check the synthetic points.

    Parameters
    ----------
    points : iterable
        Items indexable as ``p[0]`` (latitude) and ``p[1]`` (longitude).
    filename : str
        Name of the CSV file, created inside ``OUTPUTS_DIR``.

    The target directory is created if it does not exist yet, so the function
    also works on a fresh checkout. An existing file is overwritten.
    """
    output_path = os.path.join(OUTPUTS_DIR, filename)
    parent_dir = os.path.dirname(output_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # newline="" lets the csv module control line endings itself.
    with open(output_path, 'w', encoding='UTF8', newline="") as f:
        writer = csv.writer(f)
        writer.writerow(['latitude', 'longitude'])
        writer.writerows([p[0], p[1]] for p in points)

In [8]:
# Seed the standard-library RNG so the synthetic points come out the same
# on every "Restart Kernel -> Run All".
RANDOM_SEED = 42
random.seed(RANDOM_SEED)

radius = 10000  # By default this is in meters
number_of_points = 2000
center = FAIRFAX_COUNTY_CENTER
points = [generate_point(center, radius) for _ in range(number_of_points)]

# Dump the points to the default CSV so they can be eyeballed on a map.
write_points(points)

In [9]:
# Generating the synthetic data as a dataframe
latitudes = []
longitudes = []
# A shuffled 0..N-1, so ids are unique but not in row order.
personids = random.sample(range(len(points)), len(points))
first_names = []
last_names = []
addresses = []
# Seed Faker's shared generator so the fake street addresses are reproducible.
Faker.seed(42)
fake = Faker()
zip_codes = []
for p in points:
    latitudes.append(p[0])
    longitudes.append(p[1])
    # Keep only the first line of the generated address (the street part).
    address = fake.address()
    street = address.split('\n')[0]
    first_names.append(names.get_first_name())
    last_names.append(names.get_last_name())
    addresses.append(street)
    # The ZIP is drawn at random from the county list; it is not derived from the point.
    zip_codes.append(random.choice(FAIRFAX_COUNTY_ZIPS))

In [11]:
# One row per synthetic person, assembled from the parallel attribute lists.
df = pd.DataFrame(dict(
    personid=personids,
    first_name=first_names,
    last_name=last_names,
    street_address=addresses,
    zip_code=zip_codes,
    latitude=latitudes,
    longitude=longitudes,
))

In [14]:
# Geocode every (index, latitude, longitude) row on a thread pool, with a
# tqdm progress bar, then collect the per-row dicts into a frame.
coordinate_rows = df[['latitude', 'longitude']].itertuples()
with ThreadPoolExecutor() as tpe:
    geocoded_rows = tpe.map(geocode, coordinate_rows)
    data = list(tqdm(geocoded_rows, total=len(df)))
data_df = pd.DataFrame.from_records(data)

  0%|          | 0/2000 [00:00<?, ?it/s]

In [22]:
# Preview the first rows of the geocoding results.
data_df.head()

Unnamed: 0,geoid,state,county,tract,block,lat,lng
0,510594714021005,51,59,471402,1005,38.877225,-77.202055
1,510594920002034,51,59,492000,2034,38.804877,-77.365538
2,510594920002011,51,59,492000,2011,38.813562,-77.330581
3,510594920002034,51,59,492000,2034,38.810908,-77.367862
4,510594406001000,51,59,440600,1000,38.849496,-77.331123


In [26]:
# Build the nested census identifiers by concatenating the geocoder's code fields.
tract_geoid = data_df['state'] + data_df['county'] + data_df['tract']
df['census_tract'] = tract_geoid
# The block group id is the tract id followed by the first character of the block code.
df['census_block_group'] = tract_geoid + data_df['block'].str[0]
df['census_block'] = data_df['geoid']

In [None]:
def generate_covid_positive(row):
    """Draw a synthetic COVID test result (0 = negative, 1 = positive) for one row.

    ``row['is_fully_vax'] == 0`` gives a 3% chance of a positive result;
    any other value gives a 1% chance.
    """
    # Weights are ordered (negative, positive) to match the [0, 1] outcomes.
    if row['is_fully_vax'] == 0:
        outcome_weights = (0.97, 0.03)
    else:
        outcome_weights = (0.99, 0.01)
    return random.choices([0, 1], weights=outcome_weights, k=1)[0]

# Vaccination status is drawn per person with weights 0.4 (not vaccinated) / 0.6 (fully vaccinated).
is_fully_vax = random.choices([0, 1], weights=(0.4, 0.6), k=len(points))
df['is_fully_vax'] = is_fully_vax
# The test result is drawn row by row because its odds depend on is_fully_vax.
df['tested_covid_positive'] = df.apply(generate_covid_positive, axis=1)

In [27]:
# Preview the first rows of the synthetic person table.
df.head()

Unnamed: 0,personid,first_name,last_name,street_address,zip_code,latitude,longitude,is_fully_vax,tested_covid_positive,census_tract,census_block,census_block_group
0,1111,Bradley,Keef,96260 Johnson Station,20194,38.877225,-77.202055,0,0,51059471402,510594714021005,510594714021
1,578,Gail,Mcmunn,3469 John Lock Apt. 118,22096,38.804877,-77.365538,1,0,51059492000,510594920002034,510594920002
2,1328,Joshua,Montgomery,685 Blankenship Bridge Suite 497,22042,38.813562,-77.330581,1,0,51059492000,510594920002011,510594920002
3,806,Edward,Enos,8599 Hill Shores Apt. 784,20153,38.810908,-77.367862,1,0,51059492000,510594920002034,510594920002
4,120,Hazel,Richardson,433 Ramos Lights,22092,38.849496,-77.331123,0,0,51059440600,510594406001000,510594406001


# Spatial Analytics

I found this Medium [post](https://towardsdatascience.com/mapping-census-data-fbab6722def0) and this [notebook](https://github.com/allisoncstafford/seattle_tree_canopy_blog/blob/master/census_ethnicity.ipynb) to be helpful.

In [36]:
# Load the "2019 Virginia Census Block Groups" shapefile from the local inventory folder.
virginia_shapefile = os.path.join(
    INVENTORY_DIR,
    '2019 Virginia Census Block Groups',
    'geo_export_64de6806-7ef0-4c39-97f1-b52e5d986702.shp',
)
virginia_map = gpd.read_file(virginia_shapefile)


CRSError: Invalid projection: epsg:4326: (Internal Proj Error: proj_create: SQLite error on SELECT name, type, coordinate_system_auth_name, coordinate_system_code, datum_auth_name, datum_code, area_of_use_auth_name, area_of_use_code, text_definition, deprecated FROM geodetic_crs WHERE auth_name = ? AND code = ?: no such column: area_of_use_auth_name)

In [None]:
# Ad-hoc lookup of a single coordinate (x = longitude, y = latitude) to inspect the raw geocoder response.
result = cg.coordinates(x=-77.289, y=38.836)

In [None]:
# GEOID of the first 'Census Tracts' entry in the lookup result.
result['Census Tracts'][0]['GEOID']