In [12]:
import os
from arcgis.gis import GIS
from arcgis.features import GeoAccessor
from arcgis.geometry import Geometry
from arcgis.geoenrichment import enrich, Country
import pandas as pd
import numpy as np

import sys
sys.path.append('../src/geoai_retail')
import utils
import config

ent_gis = GIS(config.ent_url, username=config.ent_user, password=config.ent_pass)
ent_gis

In [9]:
raw_gdb = r'../data/raw/raw.gdb'
origin_fc = os.path.join(raw_gdb, 'blocks')
origin_id_fld = 'GEOID'

vars_csv = '../data/raw/enrichment_variables.csv'

In [8]:
geo_fc = origin_fc
geo_id_fld = origin_id_fld

geo_df = utils.get_dataframe(geo_fc)
geo_df = geo_df[[geo_id_fld, 'SHAPE']].copy()
geo_df.head()

Unnamed: 0,GEOID,SHAPE
0,410050201001000,"{""rings"": [[[-122.65566145899999, 45.426378462..."
1,410050201001001,"{""rings"": [[[-122.65534145899994, 45.427037462..."
2,410050201001002,"{""rings"": [[[-122.65548645999996, 45.429533462..."
3,410050201001003,"{""rings"": [[[-122.65696245999999, 45.428736462..."
4,410050201001004,"{""rings"": [[[-122.65702845999999, 45.428751462..."


In [10]:
vars_df = pd.read_csv(vars_csv, index_col=0)
vars_df['variable_name'] = vars_df['variable_name'].str.upper()
vars_df.sample(5)

Unnamed: 0,alias,category_id,local_enrich_name,name,rest_enrich_name,variable_name
998,2018 Asian Pop Age 80-84,agebyracebysex,agebyracebysex.asn80_cy,agebyracebysex_asn80_cy,agebyracebysex_asn80_cy,ASN80_CY
1165,2018 Multiple Races Pop Age 15-19,agebyracebysex,agebyracebysex.mlt15_cy,agebyracebysex_mlt15_cy,agebyracebysex_mlt15_cy,MLT15_CY
107,ACS Pop 18-64 speak Oth/English NW,age,age.languacsotnwa18,language_acsotnwa18,age_languacsotnwa18,LANGUACSOTNWA18
1531,2018 Hispanic Asian Pop,hispanicorigin,hispanicorigin.hispasn_cy,hispanicorigin_hispasn_cy,hispanicorigin_hispasn_cy,HISPASN_CY
186,2018 Median Disposable Income,disposableincome,disposableincome.meddi_cy,disposableincome_meddi_cy,disposableincome_meddi_cy,MEDDI_CY


In [11]:
vars_df['enrich_name'] = vars_df['category_id'].str.cat(vars_df['variable_name'].str.upper(), sep='.')
vars_df.sample(5)

Unnamed: 0,alias,category_id,local_enrich_name,name,rest_enrich_name,variable_name,enrich_name
103,ACS Pop 18-64 speak API/English NW,age,age.languacsapnwa18,language_acsapnwa18,age_languacsapnwa18,LANGUACSAPNWA18,age.LANGUACSAPNWA18
926,2018 American Indian Pop Age 20-24,agebyracebysex,agebyracebysex.ai20_cy,agebyracebysex_ai20_cy,agebyracebysex_ai20_cy,AI20_CY,agebyracebysex.AI20_CY
845,2018 White Females Age 25-29,agebyracebysex,agebyracebysex.whtf25_cy,agebyracebysex_whtf25_cy,agebyracebysex_whtf25_cy,WHTF25_CY,agebyracebysex.WHTF25_CY
280,2018 Disposable Inc Base/HHr 75+,disposableincome,disposableincome.dia75bascy,disposableincome_dia75bascy,disposableincome_dia75bascy,DIA75BASCY,disposableincome.DIA75BASCY
351,2018 HHr 55-64/Inc $35K-49999,age,age.incomebya55i35_cy,incomebyage_a55i35_cy,age_incomebya55i35_cy,INCOMEBYA55I35_CY,age.INCOMEBYA55I35_CY


In [25]:
usa = Country.get('USA')
enrich_df = usa.data_collections
enrich_df['variable_name'] = enrich_df['analysisVariable'].apply(lambda val: val.split('.')[1])
enrich_df.set_index('variable_name', inplace=True)

five_year_out = str(np.max([int(val) for val in enrich_df['vintage'].unique() if pd.notna(val) and not '-' in val]))
enrich_df = enrich_df[enrich_df['vintage'] != five_year_out]

enrich_df.sample(5)

Unnamed: 0_level_0,analysisVariable,alias,fieldCategory,vintage
variable_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
NHSPAI_CY,raceandhispanicorigin.NHSPAI_CY,2018 Non-Hispanic American Indian Pop,2018 Race and Hispanic Origin (Esri),2018
MP05028a_B,LeisureActivitiesLifestyle.MP05028a_B,Bought novel in last 12 months,2018 Books & Cards (Market Potential),2018
MP03001a_I,BabyProductsToysGames.MP03001a_I,Index: Used baby food in last 6 months,2018 Baby Products (Market Potential),2018
X3009_X,HousingHousehold.X3009_X,Owned Dwellings: Ground Rent,2018 Housing (Consumer Spending),2018
MP18038a_I,FinancialInsurance.MP18038a_I,Index: Have life insurance: separate whole lif...,2018 Insurance (Market Potential),2018


In [27]:
full_enrich_df = vars_df.join(enrich_df, on='variable_name', rsuffix='_online')
full_enrich_df.drop_duplicates('variable_name', inplace=True)
full_enrich_df.dropna(0, subset=['analysisVariable'], inplace=True)
enrich_vars = list(full_enrich_df['analysisVariable'].values)
enrich_vars[:20]

['AtRisk.GQPOP_CY',
 'Generations.GENALPHACY',
 'Generations.GENZ_CY',
 'Generations.MILLENN_CY',
 'Generations.GENX_CY',
 'Generations.BABYBOOMCY',
 'Generations.OLDRGENSCY',
 'Generations.GENBASE_CY',
 '5yearincrements.POP0_CY',
 '5yearincrements.POP5_CY',
 '5yearincrements.POP10_CY',
 '5yearincrements.POP15_CY',
 '5yearincrements.POP20_CY',
 '5yearincrements.POP25_CY',
 '5yearincrements.POP30_CY',
 '5yearincrements.POP35_CY',
 '5yearincrements.POP40_CY',
 '5yearincrements.POP45_CY',
 '5yearincrements.POP50_CY',
 '5yearincrements.POP55_CY']

In [None]:
%%time
enrich_df = enrich(geo_df, analysis_variables=enrich_vars, return_geometry=False)

In [None]:
enrich_df.sample(5)

In [51]:
enriched_result.to_csv('./home/block_enrich_result.csv')