In [21]:
import os
import time
from datetime import datetime

import arcpy
import pandas as pd

In [22]:
# START SCRIPT TIMER
# report the wall-clock start, then take the reference reading used for the elapsed time
print("Start Time: {}".format(datetime.now()))
tic = time.perf_counter()

Start Time: 2024-04-17 17:24:26.103929


In [23]:
# ADD USER FILE PATHS

# parent folders shared by several of the inputs below
nets_root = r'Z:\UHC_Data\NETS_UHC\NETS2022'
nets_final_dir = nets_root + r'\Data\Final'
mesa_data_dir = r'X:\AddressGeocoding\From_MESA_Air\Data'

# BEDDN spatial file used to generate the near table
dunslocs_file = nets_root + r'\Geodatabases\NETS2022_locs.gdb\DunsLocations20231130_v2'

# participant location file used to generate the near table
mesa_locs_file = mesa_data_dir + r'\MESAAIR_locs.gdb\mesa_locs_aeac_zcta10'

# generated near table
near_table_file = mesa_data_dir + r'\Temp\scratch\NETS_linkage_test.gdb\mesa_nets_linkage_test_sample'

# tab-delimited text inputs
classifiedlong_file = nets_final_dir + r'\ClassifiedLong20231127.txt'
dunsmove_file = nets_final_dir + r'\DunsMove20231201.txt'
cat_descriptions_file = nets_final_dir + r'\CategoryDescriptions20231127.txt'
xwalk_file = nets_final_dir + r'\BG_CC_TC_Xwalk20231023.txt'

# folder for outputs
output_folder = r'D:\scratch'

In [24]:
# USER INPUTS

# year(s) to process, given as a list
years = [2000, 2005]

# use hierarchy? True or False
hierarchy = True

# limit categories? True or False
limit_cats = False

# if limit_cats is True, pick categories using the settings below:

# read the category descriptions and show the distinct domains available
desc = pd.read_csv(cat_descriptions_file, sep='\t')
domlist = desc['Domain'].unique().tolist()
print(domlist)

# (optional) entire domains whose categories should be included
domains = ['Financial']

# (optional) individual categories to include
categories = ['DLR', 'CMN', 'GRY']


['Food', 'Healthcare', 'Physical Activity', 'Social', 'Cognitive Enrichment', 'Financial', 'Alcohol, Tobacco, Marijuana, Firearm', 'Walking', 'Transportation', 'Disaster/Construction']


In [25]:
# point arcpy's environment workspace setting at this folder
arcpy.env.workspace = r"F:\Arc_Projects\NETS_test_linkage"

In [26]:
#%% CONVERT FC TABLE TO PANDAS DF

# define function to convert fc table to pandas dataframe
def table_to_data_frame(in_table, input_fields=None, where_clause=None):
    """Convert an ArcGIS table into a pandas DataFrame indexed by object ID.

    Rows are read with an ``arcpy.da.SearchCursor``; the table's object ID
    field (as reported by ``arcpy.Describe``) becomes the DataFrame index.

    Parameters
    ----------
    in_table : str
        Path to the table or feature class to read.
    input_fields : str or iterable of str, optional
        Field name(s) to load in addition to the object ID field. When
        omitted or empty, every field returned by ``arcpy.ListFields`` is
        loaded.
    where_clause : str, optional
        Passed through to the search cursor to limit the rows read.

    Returns
    -------
    pandas.DataFrame
        One row per cursor row, indexed by the object ID field, with the
        remaining requested fields as columns.
    """
    oid_field = arcpy.Describe(in_table).OIDFieldName
    if input_fields:
        # Accept a single field name as well as any iterable of names.
        if isinstance(input_fields, str):
            input_fields = [input_fields]
        # Skip the OID field if the caller listed it explicitly: a duplicated
        # column label would make the set_index call below ambiguous.
        final_fields = [oid_field] + [name for name in input_fields if name != oid_field]
    else:
        final_fields = [field.name for field in arcpy.ListFields(in_table)]
    # The with-statement releases the cursor as soon as the rows have been
    # read, instead of leaving that to garbage collection.
    with arcpy.da.SearchCursor(in_table, final_fields, where_clause=where_clause) as cursor:
        data = list(cursor)
    fc_dataframe = pd.DataFrame(data, columns=final_fields)
    return fc_dataframe.set_index(oid_field, drop=True)

In [27]:
# LOAD NEAR TABLE AND MESA LOCS THEN MERGE

near_df = table_to_data_frame(near_table_file, input_fields=['IN_FID', 'NEAR_FID', 'NEAR_DIST'])
mesa_locs = table_to_data_frame(mesa_locs_file, input_fields=['LOCID_DREXEL', 'UHCMatchCodeRank'])

# merge participant location unique ids and uhcmatchcoderank to near table.
# table_to_data_frame indexes each frame by the table's own object ID field,
# so join on the name of that index rather than a hard-coded 'OBJECTID'; this
# keeps the merge working when the source table names its OID field differently.
join_mesa = (
    near_df
    .merge(mesa_locs, how='left', left_on='IN_FID', right_on=mesa_locs.index.name)
    .drop(columns=['IN_FID'])
    .rename(columns={'UHCMatchCodeRank': 'UHCMatchCodeRank_MESA'})
)

# free the inputs now that they have been merged
del mesa_locs, near_df

In [28]:
# LOAD DUNS LOCS THEN MERGE

dunslocs = table_to_data_frame(dunslocs_file, input_fields=['AddressID', 'UHCMatchCodeRank'])

# merge beddn addressids and uhcmatchcoderank.
# table_to_data_frame indexes the frame by the table's own object ID field,
# so join on the name of that index rather than a hard-coded 'OBJECTID'; this
# keeps the merge working when the source table names its OID field differently.
join_addressid = (
    join_mesa
    .merge(dunslocs, how='left', left_on='NEAR_FID', right_on=dunslocs.index.name)
    .drop(columns=['NEAR_FID'])
    .rename(columns={'UHCMatchCodeRank': 'UHCMatchCodeRank_NETS'})
)

# free the inputs now that they have been merged
del join_mesa, dunslocs

# check shape (rows, cols) of dataframe
join_addressid.shape

(23178595, 5)

In [29]:
# SUBSET LIST OF BEDDN CATEGORIES

# grab all categories in chosen domain(s)
domain_cats = desc['Category'].loc[desc['Domain'].isin(domains)]
all_cats = list(domain_cats)

# grab all additional categories.
# extend() appends them in place; the earlier list comprehension was used only
# for its side effect and echoed a throwaway [None, ...] list as cell output.
all_cats.extend(categories)

[None, None, None]

In [36]:
# READ IN DUNSMOVE AND SUBSET BY YEAR
# load only the needed columns, then keep the rows for the requested years
dunsmove = (
    pd.read_csv(
        dunsmove_file,
        sep='\t',
        usecols=['DunsYear', 'DunsMove', 'AddressID', 'Year'],
        dtype={'Year': int},
    )
    .loc[lambda frame: frame['Year'].isin(years)]
)

# left-join the dunsmove columns onto the linkage table by AddressID
join_dunsmove = pd.merge(join_addressid, dunsmove, how='left', on='AddressID')

# drop the inputs that are no longer needed
del dunsmove, join_addressid

ValueError: Usecols do not match columns, columns expected but not found: ['AddressId']

In [None]:
# READ IN CLASSIFIED LONG AND SUBSET BY CATEGORY IF APPLICABLE
# only the DunsYear key and its BaseGroup are read from the file
classlong = pd.read_csv(classifiedlong_file, sep='\t', usecols=['DunsYear', 'BaseGroup'])

# subset for provided categories; when limit_cats is off, every row is kept
if limit_cats:
    classlong = classlong.loc[classlong['BaseGroup'].isin(all_cats)]

In [None]:
# MERGE BASE GROUP (CLASSIFIED) DATA
# left-join the BaseGroup column onto the linkage table by DunsYear
join_classlong = pd.merge(join_dunsmove, classlong, how='left', on='DunsYear')

In [None]:
# END SCRIPT TIMER
# elapsed seconds since the start-of-script reading, reported in minutes
toc = time.perf_counter()
t = toc - tic
print('total time: {} minutes'.format(round(t/60, 2)))