In [289]:
import pandas as pd
import os
from datetime import datetime
import time
import numpy as np

In [290]:
# START SCRIPT TIMER
print(f"Start Time: {datetime.now()}")
tic = time.perf_counter()

Start Time: 2024-04-22 19:49:50.747228


In [291]:
# ADD USER FILE PATHS

# BEDDN spatial file used to create generate near table
dunslocs_file = r'Z:\UHC_Data\NETS_UHC\NETS2022\Geodatabases\NETS2022_locs.gdb\DunsLocations20231130_v2'

# participant location file used to create generate near table
mesa_locs_file = r'X:\AddressGeocoding\From_MESA_Air\Data\MESAAIR_locs.gdb\mesa_locs_aeac_zcta10'

# generate near table
near_table_file = r'X:\AddressGeocoding\From_MESA_Air\Data\Temp\scratch\NETS_linkage_test.gdb\mesa_nets_linkage_test_sample'

classifiedlong_file = r'Z:\UHC_Data\NETS_UHC\NETS2022\Data\Final\ClassifiedLong20231127.txt'
dunsmove_file = r'Z:\UHC_Data\NETS_UHC\NETS2022\Data\Final\DunsMove20231201.txt'
cat_descriptions_file = r'Z:\UHC_Data\NETS_UHC\NETS2022\Data\Final\CategoryDescriptions20231127.txt'
xwalk_file = r'Z:\UHC_Data\NETS_UHC\NETS2022\Data\Final\BG_CC_TC_Xwalk20231023.txt'
output_file = r'D:\scratch\beddn_bufferlevel.txt'

In [292]:
# USER INPUTS

# provide list of year(s)
years = [2000, 2005]

# use hierarchy? True or False
hierarchy = True

# limit categories? True or False
limit_cats = True

# if True, pick categories:

# print list of domains
desc = pd.read_csv(cat_descriptions_file, sep='\t')
domlist = list(desc['Domain'].unique())
print(domlist)

# provide list of categories by entire domain (optional):
domains = ['Food']

# provide list of individual categories (optional):
categories = ['DLR', 'CMN', 'GRY']

# provide UHCMatchCodeRank threshold (<=) for NETS/BEDDN and participant location. if no threshold desired, use value of 99:
UHCMCR_NETS = 6
UHCMCR_PART = 6

# provide buffer sizes in km
buff_sizes = [0.5, 1, 1.609]

['Food', 'Healthcare', 'Physical Activity', 'Social', 'Cognitive Enrichment', 'Financial', 'Alcohol, Tobacco, Marijuana, Firearm', 'Walking', 'Transportation', 'Disaster/Construction']


In [293]:
arcpy.env.workspace = r"F:\Arc_Projects\NETS_test_linkage"

In [294]:
# CONVERT FC/GDB TABLE TO PANDAS DF

# define function to convert fc table to pandas dataframe
def table_to_data_frame(in_table, input_fields=None, where_clause=None):
    """Function will convert an arcgis table into a pandas dataframe with an object ID index, and the selected
    input fields using an arcpy.da.SearchCursor."""
    OIDFieldName = arcpy.Describe(in_table).OIDFieldName
    if input_fields:
        final_fields = [OIDFieldName] + input_fields
    else:
        final_fields = [field.name for field in arcpy.ListFields(in_table)]
    data = [row for row in arcpy.da.SearchCursor(in_table, final_fields, where_clause=where_clause)]
    fc_dataframe = pd.DataFrame(data, columns=final_fields)
    fc_dataframe = fc_dataframe.set_index(OIDFieldName, drop=True)
    return fc_dataframe

In [295]:
# LOAD NEAR TABLE

# load table and round NEAR_DIST to 3 decimal points
near_df = table_to_data_frame(near_table_file, input_fields=['IN_FID', 'NEAR_FID', 'NEAR_DIST'])
near_df['NEAR_DIST'] = near_df['NEAR_DIST'].round(3)

# get unique NEAR_FIDs to subset NETS/BEDDN info
near_fids = list(near_df['NEAR_FID'].unique())

In [296]:
len(near_fids)

604043

In [297]:
# LOAD MESA LOCS AND MERGE TO NEAR TABLE
mesa_locs = table_to_data_frame(mesa_locs_file, input_fields=['LOCID_DREXEL', 'UHCMatchCodeRank'])
print(f'nrows of mesa_locs before removing low quality geocodes: {len(mesa_locs)}')

# remove records where UHCMatchCodeRank is above threshold
mesa_locs = mesa_locs.loc[mesa_locs['UHCMatchCodeRank'] <= UHCMCR_PART]
print(f'nrows of mesa_locs after removing low quality geocodes: {len(mesa_locs)}')

# merge participant location unique ids and uhcmatchcoderank to near table
join_mesa = (near_df
              .merge(mesa_locs, how='left', left_on='IN_FID', right_on='OBJECTID')
              .drop(columns=['IN_FID', 'UHCMatchCodeRank'])
             )
del mesa_locs, near_df

nrows of mesa_locs before removing low quality geocodes: 16703
nrows of mesa_locs after removing low quality geocodes: 15160


In [298]:
# LOAD NETS/BEDDN LOCS, SUBSET FOR THOSE USED IN GENERATE NEAR 
duns_locs = table_to_data_frame(dunslocs_file, input_fields=['AddressID', 'UHCMatchCodeRank'])
duns_locs = (duns_locs
             .iloc[near_fids]
             .reset_index()
            )
print(duns_locs.head())
print(f'nrows of duns_locs before removing low quality geocodes: {len(duns_locs)}')

# remove records where UHCMatchCodeRank is above threshold
duns_locs = (duns_locs
             .loc[duns_locs['UHCMatchCodeRank'] <= UHCMCR_NETS]
             .drop(columns = ['UHCMatchCodeRank'])
            )
print(f'nrows of duns_locs after removing low quality geocodes: {len(duns_locs)}')

   OBJECTID   AddressID  UHCMatchCodeRank
0  14454030  A014454025               1.0
1   9584403  A009584403               1.0
2  11312063  A011312063               1.0
3  16498116  A016498114               1.0
4  23517596  A023517598               1.0
nrows of duns_locs before removing low quality geocodes: 604043
nrows of duns_locs after removing low quality geocodes: 584718


In [None]:
# LOAD DUNSMOVE, SUBSET BY YEAR, MERGE WITH DUNS LOCS
dunsmove = pd.read_csv(dunsmove_file, sep='\t', usecols=['DunsYear', 'AddressID', 'Year'])

# subset dunsmove for years requested
dunsmove = dunsmove.loc[dunsmove['Year'].isin(years)] 

# merge in dunsmove columns
join_dunsmove = duns_locs.merge(dunsmove, how='inner', on='AddressID')
# del dunsmove, duns_locs

In [None]:
print(join_dunsmove.shape)
print(join_dunsmove.head())

In [None]:
join_dunsmove['AddressID'].nunique()

In [None]:
# READ IN CLASSIFIED LONG AND SUBSET BY CATEGORY IF APPLICABLE
classlong = pd.read_csv(classifiedlong_file, sep='\t', usecols=['DunsYear','BaseGroup'])

# subset for provided categories
if limit_cats == True:
    # grab all categories in chosen domain(s)
    domain_cats = desc['Category'].loc[desc['Domain'].isin(domains)]  
    all_cats = list(domain_cats)

    # grab all additional categories and order alphabetically
    [all_cats.append(category) for category in categories]
    all_cats.sort()
    
    # subset classlong for all provided categories
    classlong = classlong.loc[classlong['BaseGroup'].isin(all_cats)]
else:
    pass

In [None]:
# MERGE CLASSLONG WITH OTHER DUNS VARS

# this merges all of the NETS/BEDDN data together into one table.
#it drops records where BaseGroup is Null.
join_classlong = (join_dunsmove
                  .merge(classlong, how='left', on='DunsYear')
                  .dropna(subset=['BaseGroup'])               
                 )
del classlong

In [None]:
print(join_classlong.shape)
print(join_classlong.head())
print(join_classlong['DunsYear'].nunique())

In [None]:
# APPLY HIERARCHY IF APPLICABLE
if hierarchy == True:
    # join hierarchy
    join_classlong = (join_classlong
                   .merge(desc[['Category', 'Hierarchy']], how='left', left_on='BaseGroup', right_on='Category')
                   .drop(columns=['Category']))
    
    # sort by hierarchy, then drop all duplicates of dunsyear, keep first instance
    join_classlong = (join_classlong
                         .sort_values(by='Hierarchy')
                         .drop_duplicates(subset=['DunsYear', 'Year'], keep='first')
                         .drop(columns=['Hierarchy'])
                        )
else: 
    pass


In [None]:
print(join_classlong.shape)
print(join_classlong.head())
print(join_classlong['DunsYear'].nunique())

In [None]:
# LOOP THROUGH YEARS, MERGE NEAR TABLE TO DUNS VARS, EXPORT FINAL BASEGROUP MEASURES TO FILE

xwalk = pd.read_csv(xwalk_file, sep='\t')
main_df = pd.DataFrame()
for year in years:
    # subset for year
    join_classlong_1year = join_classlong.loc[join_classlong['Year'] == year]
    temp = (join_mesa
            .merge(join_classlong_1year, how='inner', left_on = 'NEAR_FID', right_on = 'OBJECTID')
            .drop(columns=['OBJECTID','NEAR_FID'])
           )
    
    # separate into separate dataframes for each buffer
    halfkm = temp.loc[temp['NEAR_DIST'] <= 0.5]
    onekm = temp.loc[temp['NEAR_DIST'] <= 1]
    onemi = temp.loc[temp['NEAR_DIST'] <= 1.60934]
    fivekm = temp.loc[temp['NEAR_DIST'] <= 5]
    fivemi = temp.copy()
    del temp
    
    # create distance, distance_unit cols to identify buffers
    halfkm['distance'] = 0.5 
    halfkm['distance_unit'] = 'km'
    onekm['distance'] = 1
    onekm['distance_unit'] = 'km'
    onemi['distance'] = 1
    onemi['distance_unit'] = 'mi'    
    fivekm['distance'] = 5
    fivekm['distance_unit'] = 'km'    
    fivemi['distance'] = 5
    fivemi['distance_unit'] = 'mi'
    
    # get base group counts
    basegroup_df = pd.DataFrame()
    buffers = [halfkm, onekm, onemi, fivekm, fivemi]
    for buffer in buffers:
        
        # groupby participant id and buffer distance
        buffer_counts_bg = pd.DataFrame(buffer
                            .groupby(['LOCID_DREXEL', 'Year', 'distance', 'distance_unit'])['BaseGroup']
                            .value_counts()
                            .reset_index(level=4)
                           )
        buffer_wide_bg = pd.pivot(buffer_counts_bg, columns='BaseGroup', values='count')
        
        # append basegroups to year_df
        basegroup_df = pd.concat([basegroup_df,buffer_wide_bg])
        
        
    # get high level counts
    join_xwalk = (join_classlong_1year
          .merge(xwalk[['BaseGroup', 'HighLevel']], how='left', on='BaseGroup')
          .drop(columns='BaseGroup')
         )
    del join_classlong_1year

    # drop duplicates of dunsyear-highlevel, so none are double counted
    #due to a dunsyear being in more than one basegroup that feeds into 
    #a higher level category. this only matters if hierarchy=False
    if hierarchy == False:
        join_xwalk = join_xwalk.drop_duplicates(subset=['DunsYear', 'Year', 'HighLevel'], keep='last')
    else:
        pass
    
    highlevel_df = pd.DataFrame()
    for buffer in buffers:
        # join HighLevel column from xwalk
        join_xwalk = (buffer
          .merge(xwalk[['BaseGroup', 'HighLevel']], how='left', on='BaseGroup')
          .drop(columns='BaseGroup')
         )
        # groupby participant id and buffer distance
        buffer_counts_hl = pd.DataFrame(join_xwalk
                            .groupby(['LOCID_DREXEL', 'Year', 'distance', 'distance_unit'])['HighLevel']
                            .value_counts()
                            .reset_index(level=4)
                           )
        buffer_wide_hl = pd.pivot(buffer_counts_hl, columns='HighLevel', values='count')
        
        # append highlevel cats in each buffer to highlevel_df
        highlevel_df = pd.concat([highlevel_df,buffer_wide_hl])
        
    # merge highlevel cats to year_df
    year_df = (basegroup_df
               .merge(highlevel_df, how='inner', on=['LOCID_DREXEL', 'Year', 'distance', 'distance_unit'])
               .fillna(0)
              )
    main_df = pd.concat([main_df, year_df])
    
main_df.to_csv(output_file, sep='\t', index=True)

In [None]:
year_df

In [None]:
# END SCRIPT TIMER 
toc = time.perf_counter()
t = toc - tic
print(f'total time: {round(t/60, 2)} minutes')

In [None]:
# add user input for matchcoderank threshold for mesa and nets
# round NEAR_DIST to 3 decimal points and split into bins
# set year to integer?
# make output only participant id, year, basegroup, highlevel (wide?)