In [145]:
from pathlib import Path
import pandas as pd
import json
from itertools import islice

# OPM series to SOC crosswalk

In [146]:
# Raw column headers of the MD-715 occupation crosswalk workbook -> short snake_case names.
# The keys reproduce the spreadsheet headers exactly, including the embedded
# non-breaking space (\xa0) and line breaks (\n).
occ_rename_dict = {
    # OPM occupational series
    "OPM SERIES NUMBER (December 2018)": "opm_occ_code",
    "OPM SERIES TITLE\xa0\n(December 2018)": "opm_occ_title",
    # 2018 SOC
    "2018 \nSOC CODE": "2018_soc_code",
    # 2018 Census (EEO tabulation) occupation
    "2014-2018 EEO TABULATION (CENSUS) CODE": "2018_census_occ_code",
    "2014-2018 EEO TABULATION (CENSUS) OCCUPATION TITLE": "2018_census_occ_title",
}

In [147]:
# Load OPM occupation code to census code crosswalk.
# Every column is read as a string (dtype = 'str') and the column names come from
# row index 2 (0-based) of the sheet. Only the columns named in occ_rename_dict are kept.
opm_occ_df = (
    pd.read_excel(
        '../raw_data/MD715-Census Occupation Crosswalk 2022Feb23.xlsx',
        header = 2,
        dtype = 'str',
    )
    .rename(columns = occ_rename_dict)
    .loc[:, list(occ_rename_dict.values())]
    .copy()  # own the data, so the column assignments below never write into a slice
)

# Remove the '#' markers, which indicate deviations from documentation of the 2014-2018 EEO Tabulation.
# Every '#' in a cell is removed (not only a trailing one); regex = False pins the literal
# interpretation regardless of the pandas version's default.
for col in opm_occ_df.columns:
    opm_occ_df[col] = opm_occ_df[col].str.replace('#', '', regex = False)

opm_occ_df = opm_occ_df.reset_index(drop = True)
opm_occ_df.to_feather('../code_output/opm_occ_2018_soc_xwalk.feather')

# SOC 2010 -> SOC 2018 crosswalk: column names are on row index 8 (0-based) of the sheet.
soc_1018_rename_dict = {
    '2010 SOC Code': '2010_soc_code',
    '2010 SOC Title': '2010_soc_title',
    '2018 SOC Code': '2018_soc_code',
    '2018 SOC Title': '2018_soc_title',
}
soc_1018_xwalk_df = (
    pd.read_excel('../raw_data/soc_2010_to_2018_crosswalk.xlsx', header = 8)
    .rename(columns = soc_1018_rename_dict)
    .reset_index(drop = True)
)
soc_1018_xwalk_df.to_feather('../code_output/soc_1018_xwalk.feather')

# SOC 2000 -> SOC 2010 crosswalk: column names are on row index 6 (0-based) of the sheet.
# Rows with a missing value in any column are dropped before saving.
soc_0010_rename_dict = {
    '2000 SOC code': '2000_soc_code',
    '2000 SOC title': '2000_soc_title',
    '2010 SOC code': '2010_soc_code',
    '2010 SOC title': '2010_soc_title',
}
soc_0010_xwalk_df = (
    pd.read_excel('../raw_data/soc_2000_to_2010_crosswalk.xls', header = 6)
    .rename(columns = soc_0010_rename_dict)
    .dropna()
    .reset_index(drop = True)
)
soc_0010_xwalk_df.to_feather('../code_output/soc_0010_xwalk.feather')

# Census OCC 2010 to 2018 crosswalk

In [148]:
# Read the Census 2010 -> 2018 occupation crosswalk sheet as strings.
# Column names are on row index 3 (0-based); the last 6 rows of the sheet are skipped.
# Note that the sheet name literal ends with a space.
occ_1018_xwalk_df = pd.read_excel(
    '../raw_data/2018-occupation-code-list-and-crosswalk.xlsx',
    sheet_name = '2010 to 2018 Crosswalk ',
    header = 3,
    skipfooter = 6,
    dtype = str,
)

In [149]:
# Spreadsheet headers (reproduced exactly, including trailing whitespace / line breaks)
# -> short column names. Only these four columns are kept.
occ_1018_rename_dict = {
    '2010 Census Code': '2010_census_occ_code',
    '2010 Census Title \n': '2010_census_occ_title',
    '2018 Census Code': '2018_census_occ_code',
    '2018 Census Title ': '2018_census_occ_title',
}
occ_1018_xwalk_df = (
    occ_1018_xwalk_df
    .rename(columns = occ_1018_rename_dict)
    .loc[:, list(occ_1018_rename_dict.values())]
)

In [150]:
# Forward-fill the 2010 code and title, drop rows that still have a missing value, and save.
# NOTE(review): the forward fill presumably handles 2010 occupations that map to several
# 2018 codes and are only labelled on their first row — confirm against the workbook.
# The frame is rebuilt with assign() rather than by assigning columns in place, because
# occ_1018_xwalk_df is a column subset of the loaded sheet and in-place column assignment
# on such a subset raises SettingWithCopyWarning.
occ_1018_ffill_cols = ['2010_census_occ_code', '2010_census_occ_title']
occ_1018_xwalk_df = (
    occ_1018_xwalk_df
    .assign(**{col: occ_1018_xwalk_df[col].ffill() for col in occ_1018_ffill_cols})
    .dropna()
    .reset_index(drop = True)
)
occ_1018_xwalk_df.to_feather('../code_output/census_occ_1018_xwalk.feather')

# Census OCC 2000 to 2010 crosswalk

In [151]:
# Read the Census 2002 -> 2010 occupation crosswalk sheet as strings.
# Column names are on row index 3 (0-based); the last 7 rows of the sheet are skipped.
occ_0010_xwalk_df = pd.read_excel(
    '../raw_data/2010-occ-codes-with-crosswalk-from-2002-2011.xls',
    sheet_name = '2002to2010xwalk',
    header = 3,
    skipfooter = 7,
    dtype = str,
)

In [152]:
# Keep only the two code columns, under short names
# (the sheet's '2002 Census code' column is stored as '2000_census_occ_code').
occ_2010_rename_dict = {
    '2002 Census code': '2000_census_occ_code',
    '2010 Census Code': '2010_census_occ_code',
}
occ_0010_xwalk_df = (
    occ_0010_xwalk_df
    .rename(columns = occ_2010_rename_dict)
    .loc[:, list(occ_2010_rename_dict.values())]
)

In [153]:
# Forward-fill the 2000 code, drop rows that still have a missing value, and save.
# NOTE(review): the forward fill presumably handles 2000 occupations that map to several
# 2010 codes and are only labelled on their first row — confirm against the workbook.
# The frame is rebuilt with assign() rather than by assigning the column in place, because
# occ_0010_xwalk_df is a column subset of the loaded sheet and in-place column assignment
# on such a subset raises SettingWithCopyWarning.
occ_0010_xwalk_df = (
    occ_0010_xwalk_df
    .assign(**{'2000_census_occ_code': occ_0010_xwalk_df['2000_census_occ_code'].ffill()})
    .dropna()
    .reset_index(drop = True)
)
occ_0010_xwalk_df.to_feather('../code_output/census_occ_0010_xwalk.feather')

# Census OCC 2018 to SOC 2018 crosswalk

In [154]:
# Read the 2018 Census occupation code list as strings; column names are on row index 4 (0-based).
occ_soc_18_xwalk_df = pd.read_excel(
    '../raw_data/2018-occupation-code-list-and-crosswalk.xlsx',
    sheet_name = '2018 Census Occ Code List',
    header = 4,
    dtype = str,
)

In [155]:
# Build the Census 2018 -> SOC 2018 crosswalk: keep rows whose Census code is exactly
# 4 characters long, keep only the two code columns, drop rows with a missing code, and save.
# NOTE(review): the length filter presumably removes heading / range rows — confirm against the sheet.
occ_soc_18_rename_dict = {
    '2018 Census Code': '2018_census_occ_code',
    '2018 SOC Code': '2018_soc_code',
}
occ_soc_18_xwalk_df = occ_soc_18_xwalk_df.rename(columns = occ_soc_18_rename_dict)
occ_18_code_is_4_chars = occ_soc_18_xwalk_df['2018_census_occ_code'].str.len() == 4
occ_soc_18_xwalk_df = (
    occ_soc_18_xwalk_df
    .loc[occ_18_code_is_4_chars, list(occ_soc_18_rename_dict.values())]
    .dropna()
    .reset_index(drop = True)
)
occ_soc_18_xwalk_df.to_feather('../code_output/occ_soc_18_xwalk.feather')

# Census OCC 2010 to SOC 2010 crosswalk

In [156]:
# Read the 2010 Census occupation code list as strings; column names are on row index 4 (0-based).
occ_soc_10_xwalk_df = pd.read_excel(
    '../raw_data/2010-occ-codes-with-crosswalk-from-2002-2011.xls',
    sheet_name = '2010OccCodeList',
    header = 4,
    dtype = str,
)

In [157]:
# Build the Census 2010 -> SOC 2010 crosswalk: keep rows whose Census code is exactly
# 4 characters long, keep only the two code columns, drop rows with a missing code, and save.
# NOTE(review): the length filter presumably removes heading / range rows — confirm against the sheet.
occ_soc_10_rename_dict = {
    '2010 Census Code': '2010_census_occ_code',
    '2010 SOC Code': '2010_soc_code',
}
occ_soc_10_xwalk_df = occ_soc_10_xwalk_df.rename(columns = occ_soc_10_rename_dict)
occ_10_code_is_4_chars = occ_soc_10_xwalk_df['2010_census_occ_code'].str.len() == 4
occ_soc_10_xwalk_df = (
    occ_soc_10_xwalk_df
    .loc[occ_10_code_is_4_chars, list(occ_soc_10_rename_dict.values())]
    .dropna()
    .reset_index(drop = True)
)
occ_soc_10_xwalk_df.to_feather('../code_output/occ_soc_10_xwalk.feather')


# Census OCC 2000 to SOC 2000 crosswalk

In [158]:
# Read the 2002 Census occupation code list as strings; column names are on row index 2 (0-based).
occ_soc_00_xwalk_df = pd.read_excel(
    '../raw_data/2002-census-occupation-codes.xls',
    sheet_name = 'Occ Codes',
    header = 2,
    dtype = str,
)

In [159]:
# Build the Census 2000 -> SOC 2000 crosswalk: keep rows whose Census code is exactly
# 4 characters long, keep only the two code columns, drop rows with a missing code, and save.
# NOTE(review): the length filter presumably removes heading / range rows — confirm against the sheet.
occ_soc_00_rename_dict = {
    '2002 Census Code': '2000_census_occ_code',
    '2002 SOC Code': '2000_soc_code',
}
occ_soc_00_xwalk_df = occ_soc_00_xwalk_df.rename(columns = occ_soc_00_rename_dict)
occ_00_code_is_4_chars = occ_soc_00_xwalk_df['2000_census_occ_code'].str.len() == 4
occ_soc_00_xwalk_df = (
    occ_soc_00_xwalk_df
    .loc[occ_00_code_is_4_chars, list(occ_soc_00_rename_dict.values())]
    .dropna()
    .reset_index(drop = True)
)
occ_soc_00_xwalk_df.to_feather('../code_output/occ_soc_00_xwalk.feather')

# Duty station to MSA crosswalk

In [160]:
# Column headers of the OPM duty-station workbook -> snake_case names.
# The values of this mapping are also the columns kept after loading.
dutystation_rename_dict = {
    # identifier
    "Code": "duty_station_code",
    # statistical areas
    "CBSA": "cbsa",
    "CSA": "csa",
    # location
    "City": "city",
    "County": "county",
    "State": "state",
    "Country": "country",
}

In [161]:
# Load duty station to county crosswalk file
opm_dutystation_df = pd.read_excel('../raw_data/opm_dutystation.xlsx', dtype = 'str')
opm_dutystation_df.rename(columns = dutystation_rename_dict, inplace = True)
opm_dutystation_df = opm_dutystation_df[list(dutystation_rename_dict.values())]

# Pad duty station code to 9 characters with 0s on the left
opm_dutystation_df['duty_station_code'] = opm_dutystation_df['duty_station_code'].str.pad(9, side = 'left', fillchar = '0')

# Remove duty stations not in a county
opm_dutystation_df = opm_dutystation_df[~opm_dutystation_df['county'].isna()]
opm_dutystation_df['msa_code'] = pd.Series(dtype = 'str') # Add column for MSA code

In [162]:
# Load QCEW county-MSA crosswalk
qcew_df = pd.read_excel('../raw_data/qcew-county-msa-csa-crosswalk-xlsx.xlsx')
#qcew_df = qcew_df[~qcew_df['MSA Code'].isna()]
qcew_df.loc[qcew_df['MSA Code'].isna(), 'MSA Code'] = 'matched'

# Split conty title into county and state columns, capitalized to merge with OPM dutystation code file
qcew_df['county'] = qcew_df['County Title'].str.split(', ').str[0]
qcew_df['state'] = qcew_df['County Title'].str.split(', ').str[1]

qcew_df['state'] = qcew_df['state'].str.upper()
qcew_df.loc[qcew_df['state'].isna(), 'state'] = 'DISTRICT OF COLUMBIA' # DC has blank state
qcew_df.loc[qcew_df['state'] == 'AK', 'state'] = 'ALASKA' # One entry has AK instead of Alaska

qcew_df['county'] = qcew_df['county'].str.upper()

In [163]:
# List of states that are in both QCEW and OPM (50 states + DC)
# Distinct state names on each side, and the states present in both tables
# (the original note describes the overlap as the 50 states + DC)
qcew_state_list = qcew_df['state'].unique().tolist()
opm_dutystation_state_list = opm_dutystation_df['state'].unique().tolist()
state_list = list(set(qcew_state_list).intersection(opm_dutystation_state_list))

In [164]:
# Iterate over states and counties to add MSA code to OPM dutystation df.
#
# For every (state, county) pair in opm_dutystation_df, the county is looked up among the
# QCEW counties of the same state and the first matching row's 'MSA Code' is written to
# opm_dutystation_df['msa_code'].
#
# Matching is done in two steps:
#   1. whole-name match: the QCEW county equals the OPM name or starts with the OPM name
#      followed by a space (QCEW titles carry a suffix after the name);
#   2. fallback: literal (non-regex) substring match, as a last resort.
# Step 1 comes first because a plain substring test picks the first hit in table order, which
# can be a different county whose name merely contains the OPM name (e.g. 'FORD' is a substring
# of 'CRAWFORD COUNTY'). The match is literal so that the '.' in 'ST. ' is not a regex wildcard.

# OPM county spellings that differ from QCEW, applied in order as literal substring replacements
county_name_fixes = [
    ('SAINT ', 'ST. '),
    ('DE KALB', 'DEKALB'),
    ('DU PAGE', 'DUPAGE'),
    ('LA PORTE', 'LAPORTE'),
    # NOTE(review): the replacement below looks like mojibake for 'DOÑA ANA' — confirm it
    # matches the upper-cased county title in the QCEW file
    ('DONA ANA', 'DOÃ‘A ANA'),
    ('LA MOURE', 'LAMOURE'),
    ('SPAULDING', 'SPALDING'),  # Typo in OPM county name
    ('O BRIEN', 'OBRIEN'),
    ('STE GENEVIEVE', 'STE. GENEVIEVE'),
]

# (state, county) pairs with no QCEW match; their msa_code is left missing
unmatched_counties = []

for state_name in state_list:
    in_state = opm_dutystation_df['state'] == state_name
    qcew_state_df = qcew_df[qcew_df['state'] == state_name]
    qcew_state_counties = qcew_state_df['county']

    for county_name in opm_dutystation_df.loc[in_state, 'county'].unique():

        qcew_county_name = county_name
        for opm_spelling, qcew_spelling in county_name_fixes:
            qcew_county_name = qcew_county_name.replace(opm_spelling, qcew_spelling)
        # La Salle has a space in Texas, but not in other states
        if state_name != 'TEXAS':
            qcew_county_name = qcew_county_name.replace('LA SALLE', 'LASALLE')
        # De Soto has a space in Louisiana, but not in other states
        if state_name != 'LOUISIANA':
            qcew_county_name = qcew_county_name.replace('DE SOTO', 'DESOTO')

        # Step 1: whole-name match
        is_whole_name = (
            qcew_state_counties.eq(qcew_county_name)
            | qcew_state_counties.str.startswith(qcew_county_name + ' ', na = False)
        )
        msa_codes = qcew_state_df.loc[is_whole_name, 'MSA Code']

        # Step 2: literal substring match, only when step 1 found nothing
        if msa_codes.empty:
            is_substring = qcew_state_counties.str.contains(qcew_county_name, regex = False, na = False)
            msa_codes = qcew_state_df.loc[is_substring, 'MSA Code']

        # No match at all: record the pair instead of silently skipping it
        if msa_codes.empty:
            unmatched_counties.append((state_name, county_name))
            continue

        opm_dutystation_df.loc[in_state & (opm_dutystation_df['county'] == county_name), 'msa_code'] = msa_codes.iloc[0] # Assign MSA code

In [165]:
# Number of duty-station rows still without an MSA code.
# Counties outside MSAs carry the placeholder code 'matched', so a count of 0 means
# every county was found in the QCEW crosswalk.
opm_dutystation_df['msa_code'].isna().sum()

0

In [166]:
# Renumber the rows 0..n-1 (rows were filtered out earlier) and save the duty station -> MSA crosswalk
opm_dutystation_df = opm_dutystation_df.reset_index(drop = True)
opm_dutystation_df.to_feather('../code_output/dutystation_msa_xwalk.feather')