In [166]:
from pathlib import Path
import pandas as pd
import numpy as np
import json
from pandas.api.types import union_categoricals
from itertools import islice
import re
import addfips
import requests
import urllib
import time
import os

# The `us` package decides whether DC is included in `us.states.STATES` by reading the
# DC_STATEHOOD *environment variable* when it is imported; a plain Python variable is
# invisible to it, so the flag has to be exported before `import us`.
DC_STATEHOOD = 1 # Enables DC to be included in the state list
os.environ['DC_STATEHOOD'] = str(DC_STATEHOOD)
import us
import pickle
import rapidfuzz

In [167]:
# Load the Census geographic code tables (pipe-delimited text files).
# Every field is read as text, blanks stay as empty strings (keep_default_na=False),
# and all values are upper-cased so they compare cleanly with the H-2A data.
(
    census_county,
    census_countysub,
    census_place,
    census_placebycounty,
    census_zip,
) = (
    pd.read_csv(census_path, sep='|', dtype='string', keep_default_na=False).apply(lambda col: col.str.upper())
    for census_path in (
        "../Data/census_geography_codes/national_county2020.txt",
        "../Data/census_geography_codes/national_cousub2020.txt",
        "../Data/census_geography_codes/national_place2020.txt",
        "../Data/census_geography_codes/national_place_by_county2020.txt",
        "../Data/census_geography_codes/tab20_zcta520_county20_natl.txt",
    )
)

In [149]:
# Add a county 'fips' column to each lookup table.
# County and place-by-county tables: state code followed by county code.
for census_frame in (census_county, census_placebycounty):
    census_frame['fips'] = census_frame['STATEFP'] + census_frame['COUNTYFP']

# The ZCTA table already carries the combined county identifier.
census_zip['fips'] = census_zip['GEOID_COUNTY_20']

In [150]:
# A place or ZIP code (ZCTA) can map to several counties; keep one row per key and
# store all of its county FIPS codes as a single comma-separated string.
census_county = census_county.loc[:, ['STATE', 'COUNTYNAME', 'fips']]
census_place_agg = (
    census_placebycounty
    .groupby(['STATE', 'COUNTYNAME', 'PLACENAME'])['fips']
    .agg(",".join)
    .reset_index()
)
census_zip_agg = (
    census_zip
    .groupby(['GEOID_ZCTA5_20'])['fips']
    .agg(",".join)
    .reset_index()
)

In [151]:
# Discard lookup rows whose name / ZCTA key is blank
census_county = census_county.loc[census_county['COUNTYNAME'].ne('')]
census_place_agg = census_place_agg.loc[census_place_agg['PLACENAME'].ne('')]
census_zip_agg = census_zip_agg.loc[census_zip_agg['GEOID_ZCTA5_20'].ne('')]

In [152]:
# Names of H-2A program disclosure files from the DOL-OFLC, keyed by fiscal year (as text).
# The list is in fiscal-year order starting at FY2008; note the naming scheme changes between years.
h2a_file_name_dict = {
    str(fiscal_year): file_name
    for fiscal_year, file_name in enumerate(
        [
            'H2A_FY2008.xlsx',
            'H2A_FY2009.xlsx',
            'H-2A_FY2010.xlsx',
            'H-2A_FY2011.xlsx',
            'H-2A_FY2012.xlsx',
            'H2A_FY2013.xls',
            'H-2A_FY14_Q4.xlsx',
            'H-2A_Disclosure_Data_FY15_Q4.xlsx',
            'H-2A_Disclosure_Data_FY16_updated.xlsx',
            'H-2A_Disclosure_Data_FY17.xlsx',
            'H-2A_Disclosure_Data_FY2018_EOY.xlsx',
            'H-2A_Disclosure_Data_FY2019.xlsx',
            'H-2A_Disclosure_Data_FY2020.xlsx',
            'H-2A_Disclosure_Data_FY2021.xlsx',
            'H-2A_Disclosure_Data_FY2022_Q4.xlsx',
            'H-2A_Disclosure_Data_FY2023_Q4.xlsx',
            'H-2A_Disclosure_Data_FY2024_Q4.xlsx',
        ],
        start=2008,
    )
}

In [153]:
# Columns wanted from every fiscal year's disclosure file, and the dtype each is read as.
# Everything is read as text, except the "date of need" columns: some dates cannot be read as
# string type because Excel stores them as dates, so those are read as objects and the
# trailing time is stripped later.
_OBJECT_DTYPE_COLUMNS = ('REQUESTED_START_DATE_OF_NEED', 'REQUESTED_END_DATE_OF_NEED')


def _h2a_dtypes(*columns):
    """Map each column name to 'object' for the date-of-need columns and to 'string' otherwise."""
    return {
        column: ('object' if column in _OBJECT_DTYPE_COLUMNS else 'string')
        for column in columns
    }


# Column groups that recur across fiscal years (order inside each group matches the files' layout here)
_EMPLOYER = ('CASE_STATUS', 'EMPLOYER_NAME', 'EMPLOYER_CITY', 'EMPLOYER_STATE', 'EMPLOYER_POSTAL_CODE')
_WORKER_COUNTS = ('NBR_WORKERS_REQUESTED', 'NBR_WORKERS_CERTIFIED')
_NEED_DATES = ('REQUESTED_START_DATE_OF_NEED', 'REQUESTED_END_DATE_OF_NEED')
_CERT_DATES = ('CERTIFICATION_BEGIN_DATE', 'CERTIFICATION_END_DATE')
_JOB_DATES = ('JOB_START_DATE', 'JOB_END_DATE')
_PAY = ('BASIC_RATE_OF_PAY', 'BASIC_UNIT_OF_PAY')
_HOURS_AND_PAY = ('BASIC_NUMBER_OF_HOURS',) + _PAY
_ALIEN_WORK = ('ALIEN_WORK_CITY', 'ALIEN_WORK_STATE')
_WORKSITE_NO_COUNTY = ('WORKSITE_CITY', 'WORKSITE_STATE', 'WORKSITE_POSTAL_CODE')
_WORKSITE = ('WORKSITE_CITY', 'WORKSITE_COUNTY', 'WORKSITE_STATE', 'WORKSITE_POSTAL_CODE')
_FY2020_CORE = (
    'TOTAL_WORKERS_NEEDED', 'TOTAL_WORKERS_H2A_REQUESTED', 'TOTAL_WORKERS_H2A_CERTIFIED',
    'REQUESTED_BEGIN_DATE', 'REQUESTED_END_DATE',
    'EMPLOYMENT_BEGIN_DATE', 'EMPLOYMENT_END_DATE',
    'ANTICIPATED_NUMBER_OF_HOURS', 'WAGE_OFFER', 'PER',
)

h2a_dtype_dict = {}

h2a_dtype_dict['2008'] = _h2a_dtypes(
    'CASE_NO', *_EMPLOYER, 'NBR_WORKERS_CERTIFIED', *_CERT_DATES, *_PAY, *_ALIEN_WORK, 'ORGANIZATION_FLAG'
)

h2a_dtype_dict['2009'] = _h2a_dtypes(
    'CASE_NO', *_EMPLOYER, 'NBR_WORKERS_CERTIFIED', *_CERT_DATES, *_PAY, *_ALIEN_WORK, 'ORGANIZATION_FLAG'
)

# FY2010 shares the FY2009 mapping (same dict object)
h2a_dtype_dict['2010'] = h2a_dtype_dict['2009']

h2a_dtype_dict['2011'] = _h2a_dtypes(
    'CASE_NO', *_EMPLOYER, *_WORKER_COUNTS, *_NEED_DATES, *_CERT_DATES, *_HOURS_AND_PAY,
    *_ALIEN_WORK, 'ORGANIZATION_FLAG'
)

# FY2012 shares the FY2011 mapping (same dict object)
h2a_dtype_dict['2012'] = h2a_dtype_dict['2011']

h2a_dtype_dict['2013'] = _h2a_dtypes(
    'CASE_NO', *_EMPLOYER, 'NBR_WORKERS_CERTIFIED', *_NEED_DATES, *_CERT_DATES, *_HOURS_AND_PAY,
    *_ALIEN_WORK, 'ORGANIZATION_FLAG'
)

h2a_dtype_dict['2014'] = _h2a_dtypes(
    'CASE_NO', *_EMPLOYER, 'NBR_WORKERS_CERTIFIED', *_NEED_DATES, *_CERT_DATES, *_HOURS_AND_PAY,
    'WORKSITE_LOCATION_CITY', 'WORKSITE_LOCATION_STATE', 'ORGANIZATION_FLAG'
)

h2a_dtype_dict['2015'] = _h2a_dtypes(
    'CASE_NUMBER', *_EMPLOYER, *_WORKER_COUNTS, *_CERT_DATES, *_HOURS_AND_PAY,
    *_WORKSITE_NO_COUNTY, 'ORGANIZATION_FLAG'
)

h2a_dtype_dict['2016'] = _h2a_dtypes(
    'CASE_NUMBER', *_EMPLOYER, *_WORKER_COUNTS, *_NEED_DATES, *_JOB_DATES, *_HOURS_AND_PAY,
    *_WORKSITE_NO_COUNTY, 'ORGANIZATION_FLAG', 'PRIMARY/SUB'
)

h2a_dtype_dict['2017'] = _h2a_dtypes(
    'CASE_NUMBER', *_EMPLOYER, *_WORKER_COUNTS, *_NEED_DATES, *_JOB_DATES, *_HOURS_AND_PAY,
    *_WORKSITE, 'ORGANIZATION_FLAG', 'PRIMARY/SUB'
)

# FY2018 names the case column CASE_NO and the primary/sub column PRIMARY_SUB
h2a_dtype_dict['2018'] = _h2a_dtypes(
    'CASE_NO', *_EMPLOYER, *_WORKER_COUNTS, *_NEED_DATES, *_JOB_DATES, *_HOURS_AND_PAY,
    *_WORKSITE, 'ORGANIZATION_FLAG', 'PRIMARY_SUB'
)

# 'PRMARY/SUB' is kept exactly as written here: it is used as the column name to read from the FY2019 file
h2a_dtype_dict['2019'] = _h2a_dtypes(
    'CASE_NUMBER', *_EMPLOYER, *_WORKER_COUNTS, *_NEED_DATES, *_JOB_DATES, *_HOURS_AND_PAY,
    *_WORKSITE, 'ORGANIZATION_FLAG', 'PRMARY/SUB'
)

h2a_dtype_dict['2020'] = _h2a_dtypes(
    'CASE_NUMBER', *_EMPLOYER, *_FY2020_CORE, *_WORKSITE,
    'TYPE_OF_EMPLOYER_APPLICATION', 'H2A_LABOR_CONTRACTOR'
)

# FY2021-FY2023 share the FY2020 mapping (same dict object)
h2a_dtype_dict['2021'] = h2a_dtype_dict['2020']
h2a_dtype_dict['2022'] = h2a_dtype_dict['2020']
h2a_dtype_dict['2023'] = h2a_dtype_dict['2020']

# FY2024 adds AG_ASSN_OR_AGENCY_STATUS
h2a_dtype_dict['2024'] = _h2a_dtypes(
    'CASE_NUMBER', *_EMPLOYER, *_FY2020_CORE, *_WORKSITE,
    'TYPE_OF_EMPLOYER_APPLICATION', 'AG_ASSN_OR_AGENCY_STATUS', 'H2A_LABOR_CONTRACTOR'
)

In [154]:
# h2a_df_dict = {}
# for year, file_name in h2a_file_name_dict.items():
#     h2a_path = Path(f"../Data/h2a/{file_name}")
#     print(h2a_path)

#     dtype_dict = h2a_dtype_dict[year]
#     col_list = list(dtype_dict.keys())
#     h2a_df_dict[year] = pd.read_excel(h2a_path, usecols = col_list, dtype = dtype_dict, parse_dates=False)

# # Pickling
# with open("h2a_pickle", "wb") as fp:
#     pickle.dump(h2a_df_dict, fp)

In [155]:
# Load the per-year H-2A frames from the local pickle cache. If the cache does not exist yet
# (fresh checkout / Restart & Run All), build it from the Excel disclosure files and save it,
# so this cell no longer depends on code that is commented out.
h2a_pickle_path = Path("h2a_pickle")

if h2a_pickle_path.exists():
    # NOTE: pickle.load can execute arbitrary code; only load a cache generated locally by this notebook.
    with open(h2a_pickle_path, "rb") as fp:
        h2a_df_dict = pickle.load(fp)
else:
    h2a_df_dict = {}
    for year, file_name in h2a_file_name_dict.items():
        dtype_dict = h2a_dtype_dict[year]
        # Read only the columns of interest, with the dtypes declared for that fiscal year
        h2a_df_dict[year] = pd.read_excel(
            Path(f"../Data/h2a/{file_name}"),
            usecols=list(dtype_dict.keys()),
            dtype=dtype_dict,
            parse_dates=False,
        )

    with open(h2a_pickle_path, "wb") as fp:
        pickle.dump(h2a_df_dict, fp)

In [156]:
# Common column names used for concatenating the fiscal years.
# Written as {common name: [source column names across fiscal years]} and flattened into the
# {source name: common name} mapping that DataFrame.rename expects.
h2a_rename_dict = {
    source_name: common_name
    for common_name, source_names in {
        'case_number': ['CASE_NO', 'CASE_NUMBER'],
        'case_status': ['CASE_STATUS'],
        'employer_name': ['EMPLOYER_NAME'],
        'employer_city': ['EMPLOYER_CITY'],
        'employer_state': ['EMPLOYER_STATE'],
        'employer_postal_code': ['EMPLOYER_POSTAL_CODE'],
        'nbr_workers_needed': ['TOTAL_WORKERS_NEEDED'],
        'nbr_workers_requested': ['NBR_WORKERS_REQUESTED', 'TOTAL_WORKERS_H2A_REQUESTED'],
        'nbr_workers_certified': ['NBR_WORKERS_CERTIFIED', 'TOTAL_WORKERS_H2A_CERTIFIED'],
        'requested_begin_date': ['REQUESTED_START_DATE_OF_NEED', 'REQUESTED_BEGIN_DATE'],
        'requested_end_date': ['REQUESTED_END_DATE_OF_NEED', 'REQUESTED_END_DATE'],
        'certification_begin_date': ['CERTIFICATION_BEGIN_DATE'],
        'certification_end_date': ['CERTIFICATION_END_DATE'],
        'job_begin_date': ['EMPLOYMENT_BEGIN_DATE', 'JOB_START_DATE'],
        'job_end_date': ['EMPLOYMENT_END_DATE', 'JOB_END_DATE'],
        'number_of_hours': ['BASIC_NUMBER_OF_HOURS', 'ANTICIPATED_NUMBER_OF_HOURS'],
        'wage_rate': ['BASIC_RATE_OF_PAY', 'WAGE_OFFER'],
        'wage_unit': ['BASIC_UNIT_OF_PAY', 'PER'],
        'worksite_city': ['ALIEN_WORK_CITY', 'WORKSITE_LOCATION_CITY', 'WORKSITE_CITY'],
        'worksite_county': ['WORKSITE_COUNTY'],
        'worksite_state': ['ALIEN_WORK_STATE', 'WORKSITE_LOCATION_STATE', 'WORKSITE_STATE'],
        'worksite_zip': ['WORKSITE_POSTAL_CODE'],
        'primary_sub': ['PRIMARY/SUB', 'PRIMARY_SUB', 'PRMARY/SUB'],
        'organization_flag': ['ORGANIZATION_FLAG'],
        'type_of_employer_application': ['TYPE_OF_EMPLOYER_APPLICATION'],
        'h2a_labor_contractor': ['H2A_LABOR_CONTRACTOR'],
        'ag_association_or_agency': ['AG_ASSN_OR_AGENCY_STATUS'],
    }.items()
    for source_name in source_names
}

In [157]:
# Harmonise the column names of every fiscal year, tag each row with its fiscal year,
# and stack all years with a single concat (instead of re-concatenating inside the loop,
# which copies the accumulated frame on every iteration and starts from an empty frame).
h2a_yearly_frames = [
    yearly_df.rename(columns=h2a_rename_dict).assign(fiscal_year=year)
    for year, yearly_df in h2a_df_dict.items()
]

# The original row labels of each year are kept (no ignore_index), as before.
h2a_df = pd.concat(h2a_yearly_frames) if h2a_yearly_frames else pd.DataFrame()

In [158]:
# Strip time, define consistent NAs, convert all entries to uppercase
# Some date columns were read with dtype 'object' and hold datetime values rather than text.
# The .str accessor returns NaN for non-string elements, so those dates would be silently
# blanked out below; convert every column to text first so they survive as 'YYYY-MM-DD 00:00:00'.
h2a_df = h2a_df.astype('string')
h2a_df = h2a_df.apply(lambda col: col.str.replace(' 00:00:00', '', regex=False))
h2a_df = h2a_df.fillna(value='').apply(lambda col: col.str.upper())

Match FIPS codes for worksite locations

In [159]:
# Distinct worksite locations for which we want to obtain county FIPS codes
worksite_cols = ['worksite_city', 'worksite_county', 'worksite_state', 'worksite_zip']
h2a_worksite_locations = h2a_df.loc[:, worksite_cols].drop_duplicates()

We want to match as many worksites as possible with AddFIPS

Start by fixing some basic data entry errors

In [160]:
# There are some entries with state in the county column; fill in state name in state column if empty
# Upper-cased state abbreviations and full names, used to recognise such entries
state_abbr_list = [state.abbr.upper() for state in us.states.STATES]
state_name_list = [state.name.upper() for state in us.states.STATES]

In [None]:
# Working copies of the location columns. The original worksite_* columns are left untouched
# because they are the keys used to merge the FIPS codes back into the H-2A data.
# Some of these are cities, some are multiple counties smushed together
h2a_worksite_locations['city'] = h2a_worksite_locations['worksite_city']
h2a_worksite_locations['county'] = h2a_worksite_locations['worksite_county']
h2a_worksite_locations['state'] = h2a_worksite_locations['worksite_state']
h2a_worksite_locations['zip'] = h2a_worksite_locations['worksite_zip']

# Hand-set the state for known (city, county) entries.
# This must run AFTER the working columns exist: previously it ran first, referenced columns
# that were not created yet, and any value it set would have been overwritten by the copy above.
known_state_fixes = [
    ('PLANKINTON', 'AURORA', "SD"),
    ('SOUTH LAKE TAHOE', 'CALIFORNIA', "CA"),
    ('PARMA', 'IDAHO', "ID"),
    ('IOWA', 'LOUISIANA', "LA"),
    ('BRIGGSDALE', 'WELD', "CO"),
]
for fix_city, fix_county, fix_state in known_state_fixes:
    needs_state_fix = (
        (h2a_worksite_locations['city'] == fix_city)
        & (h2a_worksite_locations['county'] == fix_county)
    )
    h2a_worksite_locations.loc[needs_state_fix, 'state'] = fix_state

In [131]:
# Entries listing several counties use AND, & or / between names; turn each of these
# separators into ',' so the names can be split apart
for separator in (' AND ', ' & ', '/'):
    h2a_worksite_locations['county'] = h2a_worksite_locations['county'].str.replace(separator, ',')

In [73]:
# Explode
# Give every worksite row a positional id (stored as text) so the exploded rows can be
# grouped back into their original worksite entry later
h2a_worksite_locations['id'] = pd.RangeIndex(len(h2a_worksite_locations)).astype('str')

# One row per comma-separated county name
h2a_worksite_locations = (
    h2a_worksite_locations
    .assign(county_list=h2a_worksite_locations['county'].str.split(','))
    .explode('county_list')
)

# Remove leading and trailing whitespace
h2a_worksite_locations['county_list'] = h2a_worksite_locations['county_list'].str.strip()

In [None]:
# Strip suffixes from names
# By this point the per-county names used for FIPS matching live in 'county_list' (created by
# the explode step), so the suffixes must be stripped there too; stripping only the combined
# 'county' column has no effect on the matching.
# NOTE(review): the fuzzy-match cutoffs further down were chosen before this cell was ever run;
# re-check them after enabling it.
county_suffixes = [' COUNTY', ' COUNTIES', ' PARISH', ' PARRISH']
for name_col in ('county', 'county_list'):
    for suffix in county_suffixes:
        h2a_worksite_locations[name_col] = h2a_worksite_locations[name_col].str.replace(suffix, '', regex=False)

In [74]:
# Fix common typos
# "ST " -> "ST. " only when ST is a whole word. The unanchored pattern also rewrote names that
# merely end in ST (e.g. "WEST X" -> "WEST. X", "FOREST X" -> "FOREST. X").
h2a_worksite_locations['county_list'] = h2a_worksite_locations['county_list'].str.replace(r"\bST ", "ST. ", regex=True)

# Louisiana-specific fixes, applied in order to Louisiana rows only
is_louisiana = h2a_worksite_locations['state'] == "LA"
louisiana_fixes = [
    # Insert the missing space after "ST." without consuming the next letter
    # (the old pattern r"ST\.\w" turned "ST.LANDRY" into "ST. ANDRY")
    (r"\bST\.(?=\w)", "ST. "),
    (r"NORTH\. ", "NORTH "),
    (r"SOUTH\. ", "SOUTH "),
    (r"EAST\. ", "EAST "),
    (r"WEST\. ", "WEST "),
    (r"BATON ROGUE", "BATON ROUGE"),
    (r"JEFF DAVIS", "JEFFERSON DAVIS"),
    (r"IBERIAL", "IBERIA"),
]
for pattern, replacement in louisiana_fixes:
    h2a_worksite_locations.loc[is_louisiana, 'county_list'] = (
        h2a_worksite_locations.loc[is_louisiana, 'county_list'].str.replace(pattern, replacement, regex=True)
    )

In [64]:
# Split some entries with multiple counties by hand
# These are entries with multiple counties where simple replacement of separators with ',' for explosion doesn't work
# NOTE(review): in top-to-bottom order this cell runs AFTER the separator replacement and the
# explode, so keys containing ' & ', ' AND ' or '/' can no longer match and 'county_list' is not
# affected. It likely needs to move above the separator-replacement cell - confirm and reorder.
manual_county_splits = {
    'HARRISON & BOURBON COUNTIES': 'HARRISON,BOURBON',
    'STAMPING GROUND SCOTT COUNTY': 'SCOTT',
    'HENRY & UNION COUNTY': 'HENRY,UNION',
    'SILVER BOW AND MADISON': 'SILVER BOW,MADISON',
    'CANDLER BULLOCH & EVANS COUNTIES': 'CANDLER,BULLOCH,EVANS',
    'SAMPSON AND JOHNSTON COUNTIES': 'SAMPSON,JOHNSTON',
    'LINCOLN AND BAYFIELD COUNTIES': 'LINCOLN,BAYFIELD',
    'ALLEN & MONROE COUNTIES': 'ALLEN,MONROE',
    'OWEN & HENRY COUNTIES': 'OWEN,HENRY',
    'UPTON/MAGNOLIA HART COUNTY': 'UPTON,MAGNOLIA,HART',
    'WARREN & SIMPSON COUNTY': 'WARREN,SIMPSON',
    'CASEY & BOYLE COUNTIES': 'CASEY,BOYLE',
    # Replacement previously misspelled as 'MONTOGOMERY'
    'MONTGOMERY & HALIFAX COUNTIES': 'MONTGOMERY,HALIFAX',
    'COUNTIES: RISING SUN DILLISBORO DEARBORN': 'RISING SUN,DILLSBORO,DEARBORN',
    'GEORGE AND RANKIN COUNTIES': 'GEORGE,RANKIN',
    'MISSAUKEE WEXFORD OSCEOLA & ANTRIM COUNTIES': 'MISSAUKEE,WEXFORD,OSCEOLA,ANTRIM',
    'WASHAKIE BIG HORN JOHNSON COUNTY': 'WASHAKIE,BIG HORN,JOHNSON',
    'SOMERSET/BALD MOUNTAIN-UNORGANIZED TS': 'SOMERSET,BALD MOUNTAIN',
    # The key keeps the misspelling found in the data; the replacement uses the correct county name (Malheur)
    'MAHLEUR AND HARNEY COUNTIES': 'MALHEUR,HARNEY',
    'FRANKLIN & SOMERSET': 'FRANKLIN,SOMERSET',
}
for raw_county, split_county in manual_county_splits.items():
    h2a_worksite_locations.loc[h2a_worksite_locations['county'] == raw_county, 'county'] = split_county

In [77]:
# A county entry that only repeats the state (its own state value, any state abbreviation or
# any full state name) or says USA / U.S.A etc. carries no county information; blank it out
usa_list = ['USA', 'U.S.A', 'U.S.A.', 'UNITED STATES', 'UNITED STATES OF AMERICA']
non_county_values = state_abbr_list + state_name_list + usa_list

not_a_county = (
    (h2a_worksite_locations['county_list'] == h2a_worksite_locations['state'])
    | h2a_worksite_locations['county_list'].isin(non_county_values)
)
h2a_worksite_locations.loc[not_a_county, 'county_list'] = ""

In [78]:
# Use addfips package to get FIPS codes for each state-county pair
af = addfips.AddFIPS()
h2a_worksite_locations['fips_from_addfips'] = [
    af.get_county_fips(county_name, state=state_name)
    for county_name, state_name in zip(h2a_worksite_locations['county_list'], h2a_worksite_locations['state'])
]

# Normalise every missing value in the frame (including lookups that found nothing) to ''
h2a_worksite_locations = h2a_worksite_locations.fillna(value='')

In [79]:
# Split the worksites by whether addfips produced a code
has_addfips_match = h2a_worksite_locations['fips_from_addfips'] != ''
h2a_matched = h2a_worksite_locations.loc[has_addfips_match].copy()

# For unmatched, see if there are issues we can fix
h2a_unmatched = h2a_worksite_locations.loc[~has_addfips_match].copy()

Now match the unmatched using Census county names, place names, and ZCTA (ZIP) codes

In [80]:
# There are mistakes in the ZIP code
# Left-pad short codes with zeros (lost leading zeros) and keep only the first five characters
# (drops ZIP+4 extensions). Blank ZIPs are left blank: padding them would create a fake
# '00000' code that later looks like a real, merely unmatched, ZIP.
raw_zip = h2a_unmatched['zip'].fillna('').str.strip()
padded_zip = raw_zip.str.pad(width=5, side='left', fillchar='0').str[0:5]
h2a_unmatched['zip'] = padded_zip.where(raw_zip != '', '')

In [81]:
# Start with the unmatched rows that have both a county name and a state, ordered by state then county
has_county_and_state = h2a_unmatched['county_list'].ne('') & h2a_unmatched['state'].ne('')
h2a_unmatched_counties = h2a_unmatched.loc[has_county_and_state].sort_values(by=['state', 'county_list'])

Use fuzzy string matching to check for obvious typos in county names

In [82]:
# Define function for fuzzy string matching
def fuzz_search(census_df, census_col, state_to_search, name_to_match):
    """Find the Census entry within one state whose name best matches ``name_to_match``.

    Parameters
    ----------
    census_df : DataFrame with a 'STATE' column, a 'fips' column and the column ``census_col``.
    census_col : name of the column in ``census_df`` holding the names to compare against.
    state_to_search : value of 'STATE' used to restrict the candidates.
    name_to_match : the (possibly misspelled) name to look up.

    Returns
    -------
    tuple ``(fips, score, census_name)`` for the best-scoring candidate, with ``fips`` and
    ``score`` converted to text; ``('', '', '')`` when the name is blank/missing or the state
    has no candidates. Scores come from rapidfuzz's ``partial_ratio_alignment``. When several
    candidates share the top score, the first one in ``census_df`` order is returned
    (previously an unstable sort decided the tie).
    """
    no_match = ('', '', '')

    # A missing or blank name cannot be matched meaningfully
    if not isinstance(name_to_match, str) or name_to_match.strip() == '':
        return no_match

    state_df = census_df[census_df['STATE'] == state_to_search]
    if len(state_df) == 0:
        return no_match

    scores = [
        rapidfuzz.fuzz.partial_ratio_alignment(census_name, name_to_match).score
        for census_name in state_df[census_col]
    ]

    # max() returns the first position holding the highest score, so ties resolve deterministically
    best_position = max(range(len(scores)), key=scores.__getitem__)
    best_row = state_df.iloc[best_position]

    return (str(best_row['fips']), str(scores[best_position]), best_row[census_col])

In [83]:
# Score every unmatched (state, county name) pair against the Census county names
census_df, census_col = census_county, 'COUNTYNAME'
fuzzy_result_df = (
    h2a_unmatched_counties
    .apply(
        lambda row: fuzz_search(census_df, census_col, row['state'], row['county_list']),
        axis=1,
        result_type='expand',
    )
    .rename(columns={0: 'fips_from_county', 1: 'score_from_county', 2: 'census_name_county'})
)

# fuzz_search returns the score as text; turn it back into a number so it can be thresholded
fuzzy_result_df['score_from_county'] = pd.to_numeric(fuzzy_result_df['score_from_county'], errors='coerce')

In [84]:
# It appears 80 is a good cutoff: discard the county FIPS of any match scoring below it
county_score_too_low = fuzzy_result_df['score_from_county'].lt(80)
fuzzy_result_df.loc[county_score_too_low, 'fips_from_county'] = ''

In [85]:
# Combine match list back in: append the county fuzzy-match columns side by side (axis=1).
# fuzzy_result_df was produced by .apply() over h2a_unmatched_counties, so both frames carry
# the same row index in the same order, which is what lets the columns line up row for row.
h2a_unmatched_counties = pd.concat([h2a_unmatched_counties, fuzzy_result_df], axis=1)

Now use fuzzy string matching for city names

In [86]:
# Score the same rows again, this time comparing the city against Census place names
census_df, census_col = census_placebycounty, 'PLACENAME'
fuzzy_result_df = (
    h2a_unmatched_counties
    .apply(
        lambda row: fuzz_search(census_df, census_col, row['state'], row['city']),
        axis=1,
        result_type='expand',
    )
    .rename(columns={0: 'fips_from_city', 1: 'score_from_city', 2: 'census_name_city'})
)

# fuzz_search returns the score as text; turn it back into a number so it can be thresholded
fuzzy_result_df['score_from_city'] = pd.to_numeric(fuzzy_result_df['score_from_city'], errors='coerce')

In [87]:
# It appears 90 is a good cutoff: discard the city-based FIPS of any match scoring below it
city_score_too_low = fuzzy_result_df['score_from_city'].lt(90)
fuzzy_result_df.loc[city_score_too_low, 'fips_from_city'] = ''

In [88]:
# Combine match list back in: append the city fuzzy-match columns side by side (axis=1).
# fuzzy_result_df was produced by .apply() over h2a_unmatched_counties, so both frames carry
# the same row index in the same order, which is what lets the columns line up row for row.
h2a_unmatched_counties = pd.concat([h2a_unmatched_counties, fuzzy_result_df], axis=1)

We can also match using ZIP codes

In [89]:
# Rename the ZCTA lookup columns to the names used on the H-2A side, then attach the
# county FIPS code(s) of each row's ZIP with a left join (rows without a ZCTA match get NaN)
census_zip_agg = census_zip_agg.rename(columns={'GEOID_ZCTA5_20': 'zip', 'fips': 'fips_from_zip'})
h2a_unmatched_counties = pd.merge(h2a_unmatched_counties, census_zip_agg, on='zip', how='left')

New England has to be mapped separately as they put place names in their county column

In [90]:
# New England states have to be handled separately, as they use town names as counties;
# rematch these using the county entry against Census city/place names
new_england_states = ['CT', 'ME', 'MA', 'NH', 'RI', 'VT']

# Score the county entry of every row against the place names of its state
census_df, census_col = census_place_agg, 'PLACENAME'
fuzzy_result_df = (
    h2a_unmatched_counties
    .apply(
        lambda row: fuzz_search(census_df, census_col, row['state'], row['county_list']),
        axis=1,
        result_type='expand',
    )
    .rename(columns={0: 'fips_from_city_ne', 1: 'score_from_city_ne', 2: 'census_name_city_ne'})
)

# fuzz_search returns the score as text; turn it back into a number so it can be thresholded
fuzzy_result_df['score_from_city_ne'] = pd.to_numeric(fuzzy_result_df['score_from_city_ne'], errors='coerce')

In [91]:
# Combine match list back in: append the New England place-match columns side by side (axis=1).
# fuzzy_result_df was produced by .apply() over h2a_unmatched_counties, so both frames carry
# the same row index in the same order, which is what lets the columns line up row for row.
h2a_unmatched_counties = pd.concat([h2a_unmatched_counties, fuzzy_result_df], axis=1)

In [92]:
# Different states have different cutoffs: a New England place match scoring below its
# state's cutoff has its FIPS code discarded
ne_score_cutoffs = {
    'CT': 90,
    'MA': 95,
    'ME': 80,
    'NH': 83,
    'RI': 99,
    'VT': 89,
}
for ne_state, ne_cutoff in ne_score_cutoffs.items():
    ne_score_too_low = (
        (h2a_unmatched_counties['state'] == ne_state)
        & (h2a_unmatched_counties['score_from_city_ne'] < ne_cutoff)
    )
    h2a_unmatched_counties.loc[ne_score_too_low, 'fips_from_city_ne'] = ''

In [93]:
# Some mistakes in the New England matches: discard these known false positives
# (state, entered name, wrongly matched Census place)
# NOTE(review): census_name_city_ne was obtained by matching 'county_list', yet the filter
# below tests the 'city' column - confirm that 'city' is the intended column.
ne_false_matches = [
    ('MA', 'HAMPTON', 'NORTHAMPTON CITY'),
    ('MA', 'STOW', 'WILLIAMSTOWN CDP'),
    ('ME', 'SO', 'SOUTH PARIS CDP'),
    ('NH', 'CHESTER', 'MANCHESTER CITY'),
]
for ne_state, entered_name, wrong_census_name in ne_false_matches:
    is_false_match = (
        (h2a_unmatched_counties['state'] == ne_state)
        & (h2a_unmatched_counties['city'] == entered_name)
        & (h2a_unmatched_counties['census_name_city_ne'] == wrong_census_name)
    )
    h2a_unmatched_counties.loc[is_false_match, 'fips_from_city_ne'] = ''

In [94]:
# Fix New England FIPS codes
# For New England rows with no city-based FIPS (missing or blank), fall back to the FIPS found
# by matching the county entry against place names.
# The blank test is parenthesised explicitly: `a.isna() | a == ''` parses as `(a.isna() | a) == ''`
# because `|` binds tighter than `==`.
city_fips_missing = h2a_unmatched_counties['fips_from_city'].fillna('') == ''
needs_ne_fips = h2a_unmatched_counties['worksite_state'].isin(new_england_states) & city_fips_missing
h2a_unmatched_counties.loc[needs_ne_fips, 'fips_from_city'] = h2a_unmatched_counties.loc[needs_ne_fips, 'fips_from_city_ne']

# The helper column is no longer needed once its values have been folded in
h2a_unmatched_counties = h2a_unmatched_counties.drop(columns = ['fips_from_city_ne'])

For entries without county information, merge just on city and ZIP

In [95]:
# Unmatched rows that have a state but no county name: these can only be located via city or ZIP
lacks_county_name = h2a_unmatched['county_list'].eq('') & h2a_unmatched['state'].ne('')
h2a_unmatched_cities = h2a_unmatched.loc[lacks_county_name]

In [96]:
# Score the city of every county-less row against the Census place names of its state
census_df, census_col = census_place_agg, 'PLACENAME'
fuzzy_result_df = (
    h2a_unmatched_cities
    .apply(
        lambda row: fuzz_search(census_df, census_col, row['state'], row['city']),
        axis=1,
        result_type='expand',
    )
    .rename(columns={0: 'fips_from_city', 1: 'score_from_city', 2: 'census_name_city'})
)

# fuzz_search returns the score as text; turn it back into a number so it can be thresholded
fuzzy_result_df['score_from_city'] = pd.to_numeric(fuzzy_result_df['score_from_city'], errors='coerce')

In [97]:
# It appears 85 is a good cutoff: discard the city-based FIPS of any match scoring below it
city_only_score_too_low = fuzzy_result_df['score_from_city'].lt(85)
fuzzy_result_df.loc[city_only_score_too_low, 'fips_from_city'] = ''

In [98]:
# Combine match list back in: append the city fuzzy-match columns side by side (axis=1).
# fuzzy_result_df was produced by .apply() over h2a_unmatched_cities, so both frames carry
# the same row index in the same order, which is what lets the columns line up row for row.
h2a_unmatched_cities = pd.concat([h2a_unmatched_cities, fuzzy_result_df], axis=1)

In [99]:
# Match using ZIP code: left-join the ZCTA lookup on the cleaned 'zip' column
# (relies on census_zip_agg already having its columns renamed to 'zip' / 'fips_from_zip')
h2a_unmatched_cities = pd.merge(h2a_unmatched_cities, census_zip_agg, on='zip', how='left')

Now we need to choose which matched FIPS codes to use

In [100]:
# Keep only the identifying columns plus the candidate FIPS codes from each matching method
location_id_cols = ['worksite_city', 'worksite_county', 'worksite_state', 'worksite_zip', 'zip', 'id']
h2a_unmatched_counties = h2a_unmatched_counties.loc[
    :, location_id_cols + ['fips_from_county', 'fips_from_city', 'fips_from_zip']
]
h2a_unmatched_cities = h2a_unmatched_cities.loc[
    :, location_id_cols + ['fips_from_city', 'fips_from_zip']
]

In [101]:
# Stack the city-only and county rows; columns missing on one side (fips_from_county for the
# city-only rows) and unmatched lookups come out as NaN, so convert NaNs into blank strings
h2a_unmatched_with_fips = (
    pd.concat([h2a_unmatched_cities, h2a_unmatched_counties], axis=0)
    .fillna(value='')
)

In [102]:
# Diagnostic export: rows that have a ZIP code but no county FIPS from the ZIP lookup
zip_without_fips = h2a_unmatched_with_fips['fips_from_zip'].eq('') & h2a_unmatched_with_fips['zip'].ne('')
h2a_unmatched_with_fips.loc[zip_without_fips].to_csv("test.csv")

Choose FIPS

In [103]:
# Write function with logic for choosing FIPS
def fips_choice(county_fips, zip_fips, city_fips):
    """Pick one FIPS value out of the three candidate matches.

    The candidates are tried in the order ZIP, county, city, and the first
    one that is not the empty string is returned. If all three are empty
    strings, '' is returned.
    """
    for candidate in (zip_fips, county_fips, city_fips):
        if candidate != '':
            return candidate
    return ''

In [104]:
# Choose one FIPS per row, then keep only the worksite identifiers and the chosen value
h2a_unmatched_with_fips['fips_from_census'] = h2a_unmatched_with_fips.apply(
    lambda row: fips_choice(row['fips_from_county'], row['fips_from_zip'], row['fips_from_city']),
    axis=1,
)
census_output_cols = ['worksite_city', 'worksite_county', 'worksite_state', 'worksite_zip', 'id', 'fips_from_census']
h2a_unmatched_with_fips = h2a_unmatched_with_fips[census_output_cols]

Combine back into original H-2A data

In [105]:
# All worksite locations in the original data (exploded), with the Census-derived FIPS attached where one was found
location_keys = ['worksite_city', 'worksite_county', 'worksite_state', 'worksite_zip', 'id']
h2a_worksite_locations_all = pd.merge(h2a_worksite_locations, h2a_unmatched_with_fips, how='left', on=location_keys)

In [106]:
# Finalized FIPS codes: prefer the addfips result, fall back to the Census-based one where it is blank
h2a_worksite_locations_all = h2a_worksite_locations_all.fillna(value = '')
h2a_worksite_locations_all['fips'] = h2a_worksite_locations_all['fips_from_addfips']
addfips_blank = h2a_worksite_locations_all['fips'] == ''
h2a_worksite_locations_all.loc[addfips_blank, 'fips'] = h2a_worksite_locations_all['fips_from_census']

In [107]:
# Aggregate back into original worksite entries using original ID
# Each group's per-location FIPS strings are comma-joined into a single string.
# NOTE(review): empty strings are joined too, so a group with several rows can produce values such as
# ',' or '01001,'; an all-empty multi-row group then does not equal '' in later `fips == ''` checks.
# Confirm whether that is intended before changing it.
h2a_worksite_locations_agg = h2a_worksite_locations_all.groupby(['worksite_city', 'worksite_county', 'worksite_state', 'worksite_zip', 'id']).agg({'fips': lambda x: ','.join(x)}).reset_index()

In [108]:
# Merge fips back into original H-2a file
# The aggregate is keyed on the four worksite columns plus 'id', but the merge uses only the four columns.
# NOTE(review): assumes each (city, county, state, zip) combination maps to exactly one id; otherwise this
# left merge duplicates h2a_df rows — confirm.
h2a_df_with_fips = h2a_df.merge(h2a_worksite_locations_agg, how = 'left', on = ['worksite_city', 'worksite_county', 'worksite_state', 'worksite_zip'])

For the remaining locations, find county using Google's Places API

Start by finding the Place ID for each location using Find Place

In [109]:
# Worksites that still have a blank FIPS after the addfips and Census passes.
# .copy() makes this an independent frame: later cells add columns to it, and assigning onto a
# filtered slice of another DataFrame triggers pandas' SettingWithCopyWarning.
h2a_unmatched_census = h2a_worksite_locations_agg.loc[
    h2a_worksite_locations_agg['fips'] == '',
    ['worksite_city', 'worksite_county', 'worksite_state', 'worksite_zip'],
].copy()

In [110]:
# Some of these are counties: flag them, then strip the county wording from the name.
# The same pattern drives both the flag and the removal.
county_word_pat = r" COUNTY| COUNTIES| COUNTIE| COUNTRY|COUNTIES: | CO\."
h2a_unmatched_census['place_name'] = h2a_unmatched_census['worksite_city']
h2a_unmatched_census['are_counties'] = h2a_unmatched_census['place_name'].str.contains(pat=county_word_pat, regex=True)
h2a_unmatched_census['place_name'] = h2a_unmatched_census['place_name'].str.replace(pat=county_word_pat, regex=True, repl='')

In [111]:
# Remove parenthesised "(SEE ATTACH...)" notes from the place name.
# `.*` is greedy, so the match runs to the last ")" in the string.
h2a_unmatched_census['place_name'] = h2a_unmatched_census['place_name'].str.replace(pat=r"\(SEE ATTACH.*\)", regex=True, repl='')

In [112]:
# Manual clean-up of place names that the automatic steps cannot parse.
# Keys are matched against place_name exactly; values are the replacement ('' = nothing searchable).
# No replacement value is itself a key, so the order of the exact-match rules does not matter.
#
# NOTE(review): three keys below contain ' COUNTY' or ' CO.', which the earlier county-tagging cell
# strips from place_name, so they appear unable to match when cells run in order. They are kept
# unchanged so results stay identical; confirm before rewriting them.
place_name_fixes = {
    'WASHAKIE, BIG HORN JOHNSON COUNTY': 'WASHAKIE,BIG HORN,JOHNSON',  # NOTE(review): contains ' COUNTY'
    '1) WENDEN 2)TONOPAH  3) DATELAND': 'WENDEN,TONOPAH,DATELAND',
    '1.- TONOPAH                           2.- DATELAND': 'TONOPAH,DATELAND',
    'BRANCH & ST. JOSEPH COS. IN MI & LAGRANGE CO. IN': 'BRANCH COUNTY,ST. JOSEPH',  # NOTE(review): contains ' CO.'
    'CHARLOTTE  CT. HTS.': 'CHARLOTTE COURT HOUSE',
    'CHARLOTTE CT, HS': 'CHARLOTTE COURT HOUSE',
    'CHARLOTTE CT. HS.': 'CHARLOTTE COURT HOUSE',
    'CHARLOTTE CT.HS': 'CHARLOTTE COURT HOUSE',
    'CHARLOTTE CT.HS.': 'CHARLOTTE COURT HOUSE',
    'CHARLOTTEE CT.HS.': 'CHARLOTTE COURT HOUSE',
    'DUMAS AND AMARILLO  79029 & 79120': 'DUMAS,AMARILLO',
    'EDISON, NE 68932': 'EDISON',
    'EMPLOYER OWNED AND OPERATED': '',
    'ESSEX JCT.': 'ESSEX JUNCTION',
    'ESSEX, JCT.': 'ESSEX JUNCTION',
    'FANNETTSBURG, PA 17221': 'FANNETTSBURG',
    'FAYETTE C. & BOURBON CO.': 'FAYETTE,BOURBON',  # NOTE(review): contains ' CO.'
    'HAWESVILLE, KY AND HAWESVILLE': 'HAWESVILLE',
    'HILLSVILLE SEE ATTACHMENT # 1': 'HILLSVILLE',
    'LEBANON, NJ 08833': 'LEBANON',
    'MT, PLEASANT': 'MT. PLEASANT',
    'NEAREST TOWN IS FORT KENT MAINE': 'FORT KENT',
    'ODELL, TEXAS 79247': 'ODELL',
    'PITTSBURG, ATKINSON GILMANTON ACADEMY': 'ATKINSON GILMANTON ACADEMY',
    'PITTSBURG, ATKINSON GILMANTON ACADEMY GRANT': 'ATKINSON GILMANTON ACADEMY',
    'PITTSBURG, ATKINSON, GILMANTON ACADEMY': 'ATKINSON GILMANTON ACADEMY',
    'PLEASE SEE 7A. BELOW.': '',
    'RIPLEY, TN.': 'RIPLEY',
    'SEE ADDENDUM': '',
    'SEE ATTACHMENT': '',
    'SEE ATTACHMENT # 1': '',
    'SEE ATTACHMENT #1': '',
    'STARTS IN HAYS, KS': 'HAYS',
    'T13R14, T5R11, ESTCOURT': '',
    'T8R12,T7R13,T8R13,T9R13,T7R14,T8R14,T5R15,T6R15,T7': '',
    'T911,T8R12,T7R13,T7R14,T8R14,T5R15,T6R15,T7R15': '',
    'T9R11,T8R12,T7R13,T8R13,T9R13,T7R14,T8R14,T5R15,T6': '',
    'TOWNSHIP': '',
    'TOWNSHIP T15R15': '',
    'UNORANIZED TOWNSHIPS AROUND': '',
    'VARIOUS': '',
    'VARIOUS IN WESTERN': '',
    'VARIOUS UNORGAINIZED TOWNSHIPS': '',
    'VARIOUS, BEGINNING IN STRATFORD': 'STRATFORD',
    'WESTON, TX': 'WESTON',
}

for raw_name, fixed_name in place_name_fixes.items():
    h2a_unmatched_census.loc[h2a_unmatched_census['place_name'] == raw_name, 'place_name'] = fixed_name

# Catch-all: anything mentioning "UNORGANIZED TOWN" has no searchable place name.
# NOTE(review): the original cell listed three more specific rules AFTER this catch-all:
#   'UNORGANIZED TOWNSHIPS AROUND THE CLAYTON LAKE AREA' -> 'CLAYTON LAKE'
#   'UNORGANIZED TOWNSHIPS: NEAREST TOWN IS JACKSON MAI' -> 'JACKSON'
#   'UNORGANIZED TOWNSHIPS; AROUND EUSTIS'               -> 'EUSTIS'
# They could never match because the catch-all had already blanked those rows, so they are left out
# here with identical results. Activating them changes which rows survive the later empty-name filter
# and therefore shifts the positional ids that key the cached API JSON — re-run the API cells if you do.
h2a_unmatched_census.loc[h2a_unmatched_census['place_name'].str.contains(pat=r"UNORGANIZED TOWN", regex=True), 'place_name'] = ''

In [113]:
# Split entries naming several places into a list, using " AND ", " & ", "," or "/" as separators
h2a_unmatched_census['place_list'] = h2a_unmatched_census['place_name'].str.split(pat = r" AND | & |,|/", regex=True)

In [114]:
# Add county suffix to entries that are county names
def add_county_suffix(is_county, name):
    """Append ' COUNTY' to a place name, or to every name in a collection of names.

    Parameters
    ----------
    is_county : value compared against True; the suffix is only added when it equals True.
    name : a single string, or an iterable of strings (e.g. a list).

    Returns
    -------
    `name` unchanged when `is_county` does not equal True; otherwise the suffixed
    string, or a new list holding each suffixed element.
    """
    if not (is_county == True):
        return name
    if isinstance(name, str):
        return name + ' COUNTY'
    return [entry + ' COUNTY' for entry in name]
    
# Applied row by row: for rows flagged in are_counties, every name in place_list gets ' COUNTY' appended
h2a_unmatched_census['place_list'] =  h2a_unmatched_census.apply(lambda x: add_county_suffix(x.are_counties, x.place_list), axis=1)

In [115]:
# Lookup from state abbreviation to full state name
state_name_dict = {state.abbr: state.name for state in us.states.STATES}

In [116]:
# We don't have to search for rows without a place name or state name.
# .copy() after the first filter so the column assignment below writes to an independent frame
# instead of a slice (avoids pandas' SettingWithCopyWarning).
has_place_name = h2a_unmatched_census['place_name'] != ''
h2a_unmatched_census = h2a_unmatched_census[has_place_name].copy()
# state_name_dict must be the abbreviation -> full-name mapping here. A later cell rebinds the same
# variable name to a place-id -> state mapping, so this cell is only correct when run in notebook order.
h2a_unmatched_census['state_name'] = h2a_unmatched_census['worksite_state'].map(state_name_dict)
# Abbreviations missing from the mapping come back as NaN and are dropped
h2a_unmatched_census = h2a_unmatched_census[h2a_unmatched_census['state_name'].notna()]

In [117]:
# Explode for rows with multiple places
# Each element of place_list becomes its own row; the other columns are repeated.
h2a_unmatched_census = h2a_unmatched_census.explode(column = 'place_list')

In [118]:
# Create ID for each row to link with API request responses
# The id is the row's position (0, 1, 2, ...) as a string. The saved API responses loaded later are
# keyed by this id, so any upstream change to the number or order of rows here makes the saved JSON
# line up with the wrong places unless the API requests are re-run.
h2a_unmatched_census['id'] = h2a_unmatched_census.reset_index().index.astype('str')

In [119]:
# Split API calls into chunks of 100
# Integer division of the numeric id: ids 0-99 fall in chunk 0, 100-199 in chunk 1, and so on
h2a_unmatched_census['chunk'] = h2a_unmatched_census['id'].astype(int)//100

In [120]:
# Google Maps API key, read from a local text file so the key itself is not stored in the notebook
with open("../tools/google_places_api_key.txt") as f:
    lines = f.readlines()

# readlines() keeps each line's trailing newline; strip it so the key can be embedded in a request URL
api_key = lines[0].strip()

In [121]:
# # Base url to call Find Place API
# base_url = "https://maps.googleapis.com/maps/api/place/findplacefromtext/json?"

# for c in range(0, 14):
#     h2a_chunk = h2a_unmatched_census[h2a_unmatched_census['chunk'] == c]

#     # Dict to store API responses
#     api_placeid_dict = {}

#     for ind in range(0, len(h2a_chunk)):
#         row = h2a_chunk.iloc[ind]
#         id = row['id']
#         state_name = row['state_name']
#         place_name = row['place_list']
#         name_to_search = place_name + ', ' + state_name

#         print(id, name_to_search)

#         # Create API request
#         # URL'ed location name we want to search
#         input = urllib.parse.quote(name_to_search) # Encode place name as URL string
#         request_url = base_url + "input=" + input + "&inputtype=textquery" + "&fields=place_id" + "&key=" + api_key

#         payload = {}
#         headers = {}

#         # Sleep one second between each API call
#         time.sleep(1)

#         # Make API call
#         response = requests.request("GET", request_url, headers=headers, data=payload)
#         response_json = response.json()
        
#         # If API call is successful, then place response result into dict
#         if response_json['status']=='OK':
#             print('Successful')
#             api_placeid_dict[id] = response_json
#         else:
#             # If API call is unsuccessful, then wait 5 seconds and retry
#             print('NOT successful, retrying')
#             time.sleep(5)
#             response = requests.request("GET", request_url, headers=headers, data=payload)
#             response_json = response.json()

#             if response_json['status']=='OK':
#                 print('Retry successful')
#                 api_placeid_dict[id] = response_json
#             else:
#                 error_type = response_json['status']
#                 print('Retry unsuccessful, error: ' + error_type)

#     # Save API request results as JSON
#     with open(f'json/placeid_api_request_result_chunk_{c}.json', 'w') as f:
#         json.dump(api_placeid_dict, f)

Now use the Place ID to find the county name of each location

In [122]:
# Load the saved Find Place responses (chunk files 0-13) and merge them into one dict keyed by row id
api_placeid_dict = {}
for chunk_no in range(14):
    chunk_path = f'json/placeid_api_request_result_chunk_{chunk_no}.json'
    with open(chunk_path, 'r') as infile:
        # Later chunks overwrite earlier ones on a duplicate key
        api_placeid_dict.update(json.load(infile))

In [123]:
# Put place IDs into a DataFrame: one row per (row id, candidate place_id).
# Records are collected first and the frame is built once, instead of growing it one row at a time
# with .loc[len(df)]; this also avoids rebinding the builtin name `id` as a loop variable.
placeid_records = [
    (row_id, candidate['place_id'])
    for row_id, response in api_placeid_dict.items()
    for candidate in response['candidates']
]
api_placeid_df = pd.DataFrame(placeid_records, columns=['id', 'placeid'])

In [124]:
# Split API calls into chunks of 100
# Chunk number is the numeric row id integer-divided by 100, so all candidates for one id share a chunk
api_placeid_df['chunk'] = api_placeid_df['id'].astype(int)//100

In [125]:
# # Use the Geocoding API (looked up by place_id) to get county names
# base_url = 'https://maps.googleapis.com/maps/api/geocode/json?'

# for c in range(0, 16):
#     api_placeid_chunk = api_placeid_df[api_placeid_df['chunk'] == c]
#     api_place_details_dict = {}

#     # Iterate over each place ID
#     for index, row in api_placeid_chunk.iterrows():
#         print(row['id'], row['placeid'])

#         # Create API request
#         input = row['placeid']
#         request_url = base_url + "place_id=" + input + "&key=" + api_key

#         payload = {}
#         headers = {}

#         response = requests.request("GET", request_url, headers=headers, data=payload)
#         response_json = response.json()

#         # If API call is successful, then place response result into dict
#         if response_json['status']=='OK':
#             print('Successful')
#             api_place_details_dict[input] = response_json
#         else:
#             # If API call is unsuccessful, then wait 5 seconds and retry
#             print('NOT successful, retrying')
#             time.sleep(5)
#             response = requests.request("GET", request_url, headers=headers, data=payload)
#             response_json = response.json()

#             if response_json['status']=='OK':
#                 print('Retry successful')
#                 api_place_details_dict[input] = response_json
#             else:
#                 error_type = response_json['status']
#                 print('Retry unsuccessful, error: ' + error_type)

#     # Save API request results as JSON
#     with open(f'json/place_details_api_request_result_chunk_{c}.json', 'w') as f:
#         json.dump(api_place_details_dict, f)

In [126]:
# Load the saved per-place responses (chunk files 0-15) and merge them into one dict keyed by place id
api_place_details_dict = {}
for chunk_no in range(16):
    chunk_path = f'json/place_details_api_request_result_chunk_{chunk_no}.json'
    with open(chunk_path, 'r') as infile:
        # Later chunks overwrite earlier ones on a duplicate key
        api_place_details_dict.update(json.load(infile))

In [127]:
# Store county name from place details into dictionary (store state names too as there may be incorrect states)
# Both dicts are filled by the extraction loop, keyed by place id.
# NOTE(review): this rebinds state_name_dict, which earlier in the notebook holds the
# state-abbreviation -> name mapping; re-running earlier cells after this one will use the wrong dict.
county_name_dict = {}
state_name_dict = {}

In [128]:
# Pull the county (administrative_area_level_2) and state (administrative_area_level_1) names
# out of every address component of every result. Only a component's first listed type is checked,
# and when a place id yields several matches the last one seen wins.
for placeid, response in api_place_details_dict.items():
    for geocode_result in response['results']:
        for component in geocode_result['address_components']:
            primary_type = component['types'][0]

            if primary_type == 'administrative_area_level_2':
                county_name_dict[placeid] = component['long_name']

            if primary_type == 'administrative_area_level_1':
                state_name_dict[placeid] = component['long_name']

In [129]:
# Add county and state name columns to Place ID
# Place ids with no entry in the corresponding dict come back as NaN
api_placeid_df['county_name_api'] = api_placeid_df['placeid'].map(county_name_dict)
api_placeid_df['state_name_api'] = api_placeid_df['placeid'].map(state_name_dict)

In [130]:
# Some of these multiple responses per place name are in the same county, so we can collapse those
# Keeps the first row for each (id, county, state) combination
api_placeid_df = api_placeid_df.drop_duplicates(subset = ['id', 'county_name_api', 'state_name_api'])

In [131]:
# For the remainder, manually resolve
# Attach the searched place name and reported state to each API result via the row id
api_placeid_df = api_placeid_df.merge(h2a_unmatched_census[['place_list', 'state_name', 'id']], how = 'left', on = ['id'])
# Ids that still have more than one distinct result are exported for manual inspection
multiple_response = api_placeid_df[api_placeid_df.duplicated(subset=['id'], keep=False)]
# NOTE(review): scratch export; "test.csv" is also written by an earlier debugging cell in this
# notebook, so each run overwrites the other.
multiple_response.to_csv("test.csv")

In [132]:
# Manual resolution of searches that returned several (or wrong) counties.
# Each entry is (searched place name, reported state, county to use); None discards the API county.
# Matching uses only place_list and state_name, which this cell never modifies, so entry order is irrelevant.
manual_county_overrides = [
    ('33 CADDO', 'Texas', 'Stephens County'),           # Caddo, Texas is in Stephens County
    ('ALICE', 'Tennessee', None),                       # Alice, Tennessee is ambiguous
    ('BOX 78', 'Kansas', None),                         # Box 78, Kansas?
    # NOTE(review): the original comment read "Britton, South Dakota is in Marshall County" but the
    # rule filters on North Dakota — confirm which state is intended.
    ('BRITTON', 'North Dakota', 'Marshall County'),
    ('CASA GRANDE', 'Arkansas', None),                  # There is no Casa Grande in Arkansas
    ('CLARK', 'North Dakota', None),                    # There is no Clark in North Dakota
    ('COTHERTOWN', 'Tennessee', None),                  # There is no Cothertown in Tennessee
    ('CRAIG', 'Wyoming', None),                         # There is no Craig in Wyoming
    (' COUNTY', 'South Carolina', None),
    ('GLADSTONE', 'South Dakota', None),
    ('HOLLIS', 'Massachusetts', None),                  # Ambiguous
    ('IDAHO COUNTY', 'Nevada', None),
    ('LONE TREE', 'Michigan', None),
    ('MACK (MAIL)', 'Colorado', None),
    ('MACK(MAIL)', 'Colorado', None),
    ('MAMOIA', 'Louisiana', 'Evangeline Parish'),
    ('MANNNING', 'North Carolina', 'Nash County'),
    ('MAURICECHURCH POINT', 'Louisiana', 'Acadia Parish, Vermilion Parish'),
    ('NORTH SPRING VALLEY', 'Utah', None),
    ('PARKERS PRAIRIE', 'Missouri', None),
    ('PORTON', 'Arizona', None),
    ('ROYNE', 'Louisiana', None),
    ('SAVERY', 'Colorado', None),
    ('STRATFORD', 'North Dakota', None),
    ('THREE RIVERS', 'Florida', None),
    ('UNORGNAIZED TOWNSHIPS: AROUND', 'Maine', None),
    ('VASS', 'Tennessee', None),
    ('WLIESBURG', 'Virginia', 'Charlotte County'),
    ('WOLCOTT', 'Idaho', None),
]

for searched_place, reported_state, resolved_county in manual_county_overrides:
    override_rows = (api_placeid_df['place_list'] == searched_place) & (api_placeid_df['state_name'] == reported_state)
    api_placeid_df.loc[override_rows, 'county_name_api'] = resolved_county

In [133]:
# Recollapse after fixing
# Rows for the same id that now share one county collapse to a single row. This pass keys on the
# reported state_name, whereas the earlier collapse keyed on state_name_api.
api_placeid_df = api_placeid_df.drop_duplicates(subset = ['id', 'county_name_api', 'state_name'])

In [134]:
# MAURICECHURCH POINT, Louisiana is actually 2 cities: keep the existing row(s) for Acadia Parish
# and append a duplicate of them for Vermilion Parish
is_two_city_row = api_placeid_df['place_list'] == 'MAURICECHURCH POINT'
extra_row = api_placeid_df[is_two_city_row].copy()
extra_row['county_name_api'] = 'Vermilion Parish'
api_placeid_df.loc[is_two_city_row, 'county_name_api'] = 'Acadia Parish'
api_placeid_df = pd.concat([api_placeid_df, extra_row])

In [135]:
# Look up a FIPS code with addFIPS for every row that still has a county name.
# The lookup uses the state reported in the H-2A data (state_name), not the one the API returned.
has_api_county = api_placeid_df['county_name_api'].notna()
api_placeid_df = api_placeid_df[has_api_county].copy()
api_placeid_df['fips_api'] = api_placeid_df.apply(
    lambda row: af.get_county_fips(row['county_name_api'], state=row['state_name']),
    axis=1,
)

In [136]:
# Keep only API results whose returned state equals the state reported in the H-2A data
states_agree = api_placeid_df['state_name'] == api_placeid_df['state_name_api']
api_placeid_df = api_placeid_df[states_agree]

In [137]:
# Recollapse back into individual entries (some entries had multiple places per entry):
# attach API results by row id, drop rows without a FIPS, then comma-join the FIPS per worksite
worksite_keys = ['worksite_city', 'worksite_county', 'worksite_state', 'worksite_zip']
api_results = api_placeid_df[['id', 'county_name_api', 'fips_api']]
h2a_api_df = h2a_unmatched_census.merge(api_results, how = 'left', on = ['id'])
h2a_api_df = h2a_api_df[h2a_api_df['fips_api'].notna()].copy()
h2a_api_df = (
    h2a_api_df
    .groupby(worksite_keys)
    .agg({'fips_api': lambda codes: ",".join(codes)})
    .reset_index()
)

In [138]:
# Add FIPS from API to the H-2A entries based on worksites; only rows whose fips is still blank are filled
worksite_keys = ['worksite_city', 'worksite_county', 'worksite_state', 'worksite_zip']
h2a_df_final = h2a_df_with_fips.merge(h2a_api_df, how='left', on=worksite_keys).fillna(value='')
fips_still_blank = h2a_df_final['fips'] == ''
h2a_df_final.loc[fips_still_blank, 'fips'] = h2a_df_final.loc[fips_still_blank, 'fips_api']

In [139]:
# Export binary
# fips_api was only a helper for filling blanks in 'fips', so it is dropped before writing
h2a_df_final = h2a_df_final.drop(columns = ['fips_api'])
h2a_df_final.to_parquet("../binaries/h2a_with_fips.parquet", index=False)

In [140]:
h2a_df_final['wage_unit'].drop_duplicates()

0                      HR
18                    MTH
302                    WK
614                   DAI
5490                   BI
10367                    
40691               MONTH
40692                HOUR
40885    SELECT PAY RANGE
41563           BI-WEEKLY
41986                WEEK
50466                YEAR
Name: wage_unit, dtype: string

In [143]:
h2a_df_final[h2a_df_final['fiscal_year'] == '2021']

Unnamed: 0,case_number,case_status,certification_begin_date,certification_end_date,employer_name,employer_city,employer_state,employer_postal_code,nbr_workers_certified,wage_rate,...,worksite_zip,primary_sub,job_begin_date,job_end_date,worksite_county,type_of_employer_application,h2a_labor_contractor,nbr_workers_needed,id,fips
136645,H-300-21109-235767,DETERMINATION ISSUED - CERTIFICATION,,,"LA HACIENDA CITRUS, INC.",UTUADO,PR,611,4,7.25,...,611,,2021-06-28,2022-04-15,UTUADO,INDIVIDUAL EMPLOYER,N,6,19494,72141
136646,H-300-21113-254532,DETERMINATION ISSUED - CERTIFICATION,,,"HACIENDA RAMIREZ, INC.",SAN SEBASTIAN,PR,685,6,7.25,...,685,,2021-06-28,2022-04-15,SAN SEBASTIAN,INDIVIDUAL EMPLOYER,N,10,19495,72131
136647,H-300-21137-318304,DETERMINATION ISSUED - CERTIFICATION,,,"HACIENDA LOS EUCALIPTOS, INC.",LARES,PR,669,12,7.25,...,669,,2021-07-18,2022-05-17,LARES,JOINT EMPLOYER,N,20,19496,72081
136648,H-300-21144-339364,DETERMINATION ISSUED - CERTIFICATION,,,YOMAR RAMOS-GONZALEZ,AGUADA,PR,602,7,7.25,...,602,,2021-07-23,2022-05-22,AGUADA,JOINT EMPLOYER,N,12,19497,72003
136649,H-300-21166-400271,DETERMINATION ISSUED - CERTIFICATION,,,ROBERTO ATIENZA-FIGUEROA,JAYUYA,PR,664,6,7.25,...,664,,2021-08-14,2022-06-13,JAYUYA,INDIVIDUAL EMPLOYER,N,10,19498,72073
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
152974,H-300-21250-569460,DETERMINATION ISSUED - CERTIFICATION,,,TURNER FARMS NURSERY,WILMER,AL,36587,6,11.81,...,36587,,2021-11-01,2022-08-31,MOBILE,INDIVIDUAL EMPLOYER,N,6,14459,01097
152975,H-300-21265-599271,DETERMINATION ISSUED - CERTIFICATION,,,WESTERN RANGE ASSOCIATION,TWIN FALLS,ID,83301,5,1727.75,...,81403,,2021-12-01,2022-01-31,MONTROSE,ASSOCIATION - JOINT EMPLOYER,N,5,10501,08085
152976,H-300-21265-599641,DETERMINATION ISSUED - CERTIFICATION,,,WESTERN RANGE ASSOCIATION,TWIN FALLS,ID,83301,2,1727.75,...,84643,,2021-12-01,2022-02-28,SANPETE,ASSOCIATION - JOINT EMPLOYER,N,2,21211,49039
152977,H-300-21265-600060,DETERMINATION ISSUED - CERTIFICATION,,,WESTERN RANGE ASSOCIATION,TWIN FALLS,ID,83301,7,1727.75,...,84033,,2021-12-01,2022-04-30,SUMMIT,ASSOCIATION - JOINT EMPLOYER,N,7,10414,49043


In [144]:
h2a_df_final[h2a_df_final['fiscal_year'] == '2022']

Unnamed: 0,case_number,case_status,certification_begin_date,certification_end_date,employer_name,employer_city,employer_state,employer_postal_code,nbr_workers_certified,wage_rate,...,worksite_zip,primary_sub,job_begin_date,job_end_date,worksite_county,type_of_employer_application,h2a_labor_contractor,nbr_workers_needed,id,fips
152979,H-300-21162-392244,DETERMINATION ISSUED - CERTIFICATION,,,MIDWEST AG ELECTRIC INC.,ALBERT LEA,MN,56007,6,14.72,...,56039,,2021-08-10,2022-04-01,MARTIN,INDIVIDUAL EMPLOYER,Y,6,15092,27091
152980,H-300-21194-457735,DETERMINATION ISSUED - CERTIFICATION,,,"ALTENBURG CONSTRUCTION, INC.",LEWISVILLE,MN,56060,7,15.37,...,50436,,2021-09-10,2021-12-31,WINNEBAGO,INDIVIDUAL EMPLOYER,Y,7,21320,19189
152981,H-300-21221-510019,DETERMINATION ISSUED - CERTIFICATION,,,PANCOST RANCH LLC,STONEHAM,CO,80754,1,1727.75,...,80754,,2021-11-01,2022-02-28,WELD,ASSOCIATION - AGENT,N,1,17049,08123
152982,H-300-21223-516656,DETERMINATION ISSUED - CERTIFICATION,,,"LA ALIANZA, LP",NIPOMO,CA,93444,80,16.05,...,92233,,2021-10-11,2022-01-29,IMPERIAL,JOINT EMPLOYER,Y,80,21321,06025
152983,H-300-21224-517089,DETERMINATION ISSUED - CERTIFICATION,,,RC PACKING LLC,GONZALEZ,CA,93926,215,13.85,...,85365,,2021-10-25,2022-04-15,YUMA,INDIVIDUAL EMPLOYER,Y,215,10874,04027
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
172062,H-300-22262-479583,DETERMINATION ISSUED - CERTIFICATION,,,"A & R FARMS, LLC",ALAPAHA,GA,31622,1,11.99,...,31622,,2022-12-01,2023-04-01,BERRIEN,INDIVIDUAL EMPLOYER,N,1,17271,13019
172063,H-300-22262-479788,DETERMINATION ISSUED - CERTIFICATION,,,SITTIG CRAWFISH LLC,EUNICE,LA,70535,2,12.45,...,70535,,2022-12-01,2023-08-31,ACADIA,INDIVIDUAL EMPLOYER,N,2,12981,22001
172064,H-300-22262-479833,DETERMINATION ISSUED - CERTIFICATION,,,SITTIG CRAWFISH LLC,EUNICE,LA,70535,4,12.45,...,70535,,2022-12-01,2023-07-15,ACADIA,INDIVIDUAL EMPLOYER,N,4,12981,22001
172065,H-300-22262-480426,DETERMINATION ISSUED - CERTIFICATION,,,GERALD GUILLORY FARMS,EUNICE,LA,70535,4,12.45,...,70535,,2022-12-02,2023-09-02,ST LANDRY,INDIVIDUAL EMPLOYER,N,4,12989,22097
