In [7]:
from pathlib import Path
import pandas as pd
import json
from itertools import islice

In [8]:
# OPM occupation crosswalk (presumably OPM occupation code -> 2018 SOC, judging by the file name)
opm_occ_soc_xwalk_df = pd.read_feather(Path('../code_output/opm_occ_2018_soc_xwalk.feather'))

In [9]:
# Census occupation-code crosswalks.
# Only the two code columns are retained from the 1018 crosswalk file.
census_occ_1018_xwalk_df = pd.read_feather(Path('../code_output/census_occ_1018_xwalk.feather'))[
    ['2018_census_occ_code', '2010_census_occ_code']
]
census_occ_0010_xwalk_df = pd.read_feather(Path('../code_output/census_occ_0010_xwalk.feather'))

# SOC crosswalks
soc_1018_xwalk_df = pd.read_feather(Path('../code_output/soc_1018_xwalk.feather'))
soc_0010_xwalk_df = pd.read_feather(Path('../code_output/soc_0010_xwalk.feather'))

# Duty-station crosswalk (presumably duty station -> MSA, judging by the file name)
dutystation_msa_xwalk_df = pd.read_feather(Path('../code_output/dutystation_msa_xwalk.feather'))

In [10]:
# Read the JSON dicts that hold the paths of the status-file binaries
opm_nondod_status_feather_path_dict = json.loads(
    Path('../code_output/opm_nondod_status_pre2014_feather_path_dict.json').read_text()
)

opm_dod_status_feather_path_dict = json.loads(
    Path('../code_output/opm_dod_status_pre2014_feather_path_dict.json').read_text()
)

In [11]:
# Directory that receives the collapsed per-year feather files
binary_path = Path('../cleaned_binaries/')

# Filled in by the collapse loop: year -> path of the collapsed file (as a string)
opm_collapsed_feather_path_dict = dict()

In [12]:
# Collapse the non-DoD status files into one (Census occupation code, MSA) table per year.
#
# For every year from 1997 on, the quarterly files are stacked, occupation and MSA
# crosswalks are merged in, each worker is reduced to a single observation, and the
# result is aggregated to head counts and mean pay per (occupation, MSA) cell. Each
# yearly table is written to `binary_path` and its path recorded in
# `opm_collapsed_feather_path_dict`.

# Columns kept from the raw quarterly status files
status_columns = ['Pseudo-ID', 'Employee Name', 'Duty Station', 'Occupation', 'Adjusted Basic Pay']

# Aggregations applied to every (occupation, MSA) cell; identical for all years
collapse_spec = {
    'Pseudo-ID': 'count',
    'Adjusted Basic Pay': 'mean',
    '2018_census_occ_title': 'first',
    'county': 'first',
    'state': 'first',
}

for year, qtr_dict in opm_nondod_status_feather_path_dict.items():
    year_num = int(year)

    if year_num < 1997:
        continue

    # Choose the Census occupation-code vintage used as the grouping key. Years outside
    # the supported range fail loudly here; previously they fell through both branches
    # and re-exported the previous year's table under the new year's file name.
    if year_num < 2010:
        occ_code_col = '2000_census_occ_code'
    elif year_num < 2019:
        occ_code_col = '2010_census_occ_code'
    else:
        raise ValueError(f'No Census occupation code vintage configured for year {year}.')

    print(f'Collapsing year {year}.')

    # Read every quarter first and concatenate once (avoids re-copying the growing frame)
    quarter_dfs = [pd.read_feather(Path(file_path)) for file_path in qtr_dict.values()]
    if not quarter_dfs:
        print(f'No quarterly files listed for year {year}; skipping.')
        continue

    year_df = pd.concat(quarter_dfs)[status_columns]  # Keep only variables we want

    # Add 2018 SOC codes, then add 2010 and 2000 Census OCC codes
    merged_df = year_df.merge(opm_occ_soc_xwalk_df, how = 'inner', left_on = ['Occupation'], right_on = ['opm_occ_code'])
    merged_df = merged_df.merge(census_occ_1018_xwalk_df, how = 'left', on = ['2018_census_occ_code'])
    merged_df = merged_df.merge(census_occ_0010_xwalk_df, how = 'left', on = ['2010_census_occ_code'])

    # Add MSA codes
    merged_df = merged_df.merge(dutystation_msa_xwalk_df, how = 'inner', left_on = ['Duty Station'], right_on = ['duty_station_code'])

    # Drop people who do not have an MSA code.
    # NOTE(review): rows are dropped when msa_code equals 'matched'; given the stated
    # intent, the sentinel may have been meant to be 'unmatched' -- confirm against the
    # values actually present in dutystation_msa_xwalk_df before relying on this filter.
    merged_df = merged_df[merged_df['msa_code'] != 'matched']

    # Keep only 1 observation per worker in a given year (the first row encountered).
    # drop_duplicates returns a new frame, so the result must be assigned; otherwise
    # opm_n_emp below counts worker-quarters instead of workers.
    merged_df = merged_df.drop_duplicates(subset = ['Pseudo-ID'])

    # Collapse by occupation code and MSA, then clean up the collapsed df
    collapsed_df = (
        merged_df
        .groupby(by = [occ_code_col, 'msa_code'])
        .agg(collapse_spec)
        .reset_index()
        .rename(columns = {'Pseudo-ID': 'opm_n_emp', 'Adjusted Basic Pay': 'mean_abp'})
    )

    # Export and remember where the file was written
    target_path = binary_path.joinpath(f'opm_collapsed_{year}.feather')
    collapsed_df.to_feather(target_path)

    opm_collapsed_feather_path_dict[year] = str(target_path)

Collapsing year 1997.
Collapsing year 1998.
Collapsing year 1999.
Collapsing year 2000.
Collapsing year 2001.
Collapsing year 2002.
Collapsing year 2003.
Collapsing year 2004.
Collapsing year 2005.
Collapsing year 2006.
Collapsing year 2007.
Collapsing year 2008.
Collapsing year 2009.
Collapsing year 2010.
Collapsing year 2011.
Collapsing year 2012.
Collapsing year 2013.
Collapsing year 2014.


In [13]:
# Write the dict of collapsed-binary paths to disk as indented JSON
with open('../code_output/opm_collapsed_pre2014_feather_path_dict.json', 'w') as path_dict_file:
    path_dict_file.write(json.dumps(opm_collapsed_feather_path_dict, indent = 4))