In [293]:
from pathlib import Path
import pandas as pd
import json
from itertools import islice

In [294]:
# Crosswalk workbook header -> short snake_case column name.
# Keys have to match the spreadsheet headers character for character, including
# the embedded non-breaking space (\xa0) and newline characters.
occ_rename_dict = dict([
    ('OPM SERIES NUMBER (December 2018)', 'opm_occ_code'),
    ('OPM SERIES TITLE\xa0\n(December 2018)', 'opm_occ_title'),
    ('2018 \nSOC CODE', '2018_soc_code'),
    ('2014-2018 EEO TABULATION (CENSUS) CODE', 'census_occ_code'),
    ('2014-2018 EEO TABULATION (CENSUS) OCCUPATION TITLE', 'census_occ_title'),
])

In [295]:
# Read the OPM occupation code -> SOC code crosswalk workbook.
# header = 2 takes the column names from the third row of the sheet, and
# dtype = 'str' reads every column as text. The columns are then renamed via
# occ_rename_dict and only the renamed columns are kept, in the dict's order.
opm_occ_df = (
    pd.read_excel(
        '../raw_data/MD715-Census Occupation Crosswalk 2022Feb23.xlsx',
        header = 2,
        dtype = 'str',
    )
    .rename(columns = occ_rename_dict)
    [list(occ_rename_dict.values())]
)

In [296]:
# Remove the '#' marker, which indicates deviations from documentation of the
# 2014-2018 EEO Tabulation. Note that every '#' in a value is removed, not only
# a trailing one.
# regex = False pins literal matching so the result does not depend on the
# installed pandas version's default for the `regex` argument.
for col in opm_occ_df.columns:
    opm_occ_df[col] = opm_occ_df[col].str.replace('#', '', regex = False)

In [297]:
# Duty station workbook header -> snake_case column name.
# Only the columns listed here are kept after the workbook is loaded.
dutystation_rename_dict = dict([
    ('Code', 'duty_station_code'),
    ('CBSA', 'cbsa'),
    ('CSA', 'csa'),
    ('City', 'city'),
    ('County', 'county'),
    ('State', 'state'),
    ('Country', 'country'),
])

In [298]:
# Read the duty station -> CBSA crosswalk workbook with every column as text,
# rename the columns via dutystation_rename_dict, and keep only the renamed
# columns, in the dict's order.
opm_dutystation_df = (
    pd.read_excel('../raw_data/opm_dutystation.xlsx', dtype = 'str')
    .rename(columns = dutystation_rename_dict)
    [list(dutystation_rename_dict.values())]
)

In [299]:
# Left-pad duty station codes with '0' up to a width of 9 characters
# (right-justifying the text is the same as padding on the left).
opm_dutystation_df['duty_station_code'] = (
    opm_dutystation_df['duty_station_code'].str.rjust(9, fillchar = '0')
)

In [300]:
# Read the JSON dictionaries that hold the paths of the cleaned status binaries.
# The non-DoD dict is consumed below as {year: {quarter: feather path}}.
with open('../code_output/opm_nondod_status_pre2014_feather_path_dict.json', 'r') as nondod_file:
    opm_nondod_status_feather_path_dict = json.loads(nondod_file.read())

# presumably the DoD dict has the same layout; verify against the notebook that writes it
with open('../code_output/opm_dod_status_pre2014_feather_path_dict.json', 'r') as dod_file:
    opm_dod_status_feather_path_dict = json.loads(dod_file.read())

In [301]:
# Directory that receives the collapsed per-year feather files.
binary_path = Path('../cleaned_binaries/')
# Filled by the collapse loop: year -> path (as a string) of that year's collapsed file.
opm_collapsed_feather_path_dict = dict()

In [302]:
# For every year from 1990 onward: stack that year's quarterly status files,
# attach SOC and CBSA codes, collapse to one row per (SOC code, CBSA), write the
# result to a feather file under binary_path, and record the output path in
# opm_collapsed_feather_path_dict.
# NOTE(review): only opm_nondod_status_feather_path_dict is iterated here;
# opm_dod_status_feather_path_dict is loaded above but never used in this loop.
# Confirm whether DoD files are meant to be included.
for year, qtr_dict in opm_nondod_status_feather_path_dict.items():
    if int(year) < 1990:
        continue

    print(f'Collapsing year {year}.')

    # Read all quarterly files first and concatenate once. Growing a frame with
    # pd.concat inside the loop re-copies the accumulated rows on every pass and
    # relied on concat's deprecated special-casing of an empty seed DataFrame.
    qtr_frames = [pd.read_feather(Path(file_path)) for file_path in qtr_dict.values()]
    if not qtr_frames:
        # pd.concat raises on an empty list, and there is nothing to collapse anyway.
        print(f'No quarterly files listed for year {year}; skipping.')
        continue
    year_df = pd.concat(qtr_frames)

    year_df = year_df[['Pseudo-ID', 'Employee Name', 'Duty Station', 'Occupation', 'Adjusted Basic Pay']] # Keep only variables we want

    # Add CBSA and SOC codes. Both merges are inner joins, so rows whose occupation
    # or duty station code is missing from the crosswalks are dropped.
    merged_df = year_df.merge(opm_occ_df, how = 'inner', left_on = ['Occupation'], right_on = ['opm_occ_code'])
    merged_df = merged_df.merge(opm_dutystation_df, how = 'inner', left_on = ['Duty Station'], right_on = ['duty_station_code'])

    # Keep only 1 observation per worker in a given year (the first row encountered
    # for each Pseudo-ID is the one retained)
    merged_df = merged_df.drop_duplicates(subset = ['Pseudo-ID'])

    # Keep only workers that are within a CBSA
    merged_df = merged_df[merged_df['cbsa'].notna()]

    # Collapse by SOC code and CBSA: head count, mean pay, and the first value of
    # each descriptive column within the group
    collapsed_df = merged_df.groupby(by = ['2018_soc_code', 'cbsa']).agg({
        'Pseudo-ID': 'count',
        'Adjusted Basic Pay': 'mean',
        'opm_occ_code': 'first',
        'opm_occ_title': 'first',
        'census_occ_code': 'first',
        'census_occ_title': 'first',
        'county': 'first',
        'state': 'first',
    })

    # Clean up collapsed df and export
    collapsed_df = collapsed_df.reset_index().rename(
        columns = {'Pseudo-ID': 'opm_n_emp', 'Adjusted Basic Pay': 'mean_abp'}
    )

    target_path = Path(binary_path) / f'opm_collapsed_{year}.feather'
    collapsed_df.to_feather(target_path)

    opm_collapsed_feather_path_dict[year] = str(target_path)

Collapsing year 1990.
Collapsing year 1991.
Collapsing year 1992.
Collapsing year 1993.
Collapsing year 1994.
Collapsing year 1995.
Collapsing year 1996.
Collapsing year 1997.
Collapsing year 1998.
Collapsing year 1999.
Collapsing year 2000.
Collapsing year 2001.
Collapsing year 2002.
Collapsing year 2003.
Collapsing year 2004.
Collapsing year 2005.
Collapsing year 2006.
Collapsing year 2007.
Collapsing year 2008.
Collapsing year 2009.
Collapsing year 2010.
Collapsing year 2011.
Collapsing year 2012.
Collapsing year 2013.
Collapsing year 2014.


In [303]:
# Persist the year -> collapsed feather path mapping as indented JSON.
with open('../code_output/opm_collapsed_pre2014_feather_path_dict.json', 'w') as path_dict_file:
    path_dict_file.write(json.dumps(opm_collapsed_feather_path_dict, indent = 4))