In [3]:
from pathlib import Path
import pandas as pd
import json
from pandas.api.types import union_categoricals
from itertools import islice

In [4]:
with open('../code_output/opm_dynamic_fwf_dict.json') as infile:
    opm_dynamic_fwf_dict = json.load(infile)

with open('../code_output/opm_status_fwf_dict.json') as infile:
    opm_status_fwf_dict = json.load(infile)

with open('../code_output/opm_dynamic_dtype_dict.json') as infile:
    opm_dynamic_dtype_dict = json.load(infile)

with open('../code_output/opm_status_dtype_dict.json') as infile:
    opm_status_dtype_dict = json.load(infile)

In [5]:
# Set the relative file path for dynamic and status files
opm_dynamic_path = Path('../raw_data/opm-federal-employment-data/data/1973-09-to-2014-06/non-dod/dynamic')
opm_status_path = Path('../raw_data/opm-federal-employment-data/data/1973-09-to-2014-06/non-dod/status')

In [6]:
# Create dicts of paths of files
opm_dynamic_path_dict = {}
opm_status_path_dict = {}

In [7]:
for file_path in Path(opm_dynamic_path).iterdir():
    file_name = file_path.name
    file_date = pd.to_datetime(file_name[0:7])
    year = str(file_date.year)
    qtr = str(file_date.quarter)
    opm_dynamic_path_dict.setdefault(year, {}) # Create empty dict for that year if not yet created
    opm_dynamic_path_dict[year][qtr] = str(file_path)

In [8]:
for file_path in Path(opm_status_path).iterdir():
    file_name = file_path.name
    file_date = pd.to_datetime('-'.join(file_name[15:22].split('_')))
    year = str(file_date.year)
    qtr = str(file_date.quarter)
    opm_status_path_dict.setdefault(year, {}) # Create empty dict for that year if not yet created
    opm_status_path_dict[year][qtr] = str(file_path)

In [9]:
# Directory we are storing processed dataframes in binary format, for quick access later
binary_path = Path('../cleaned_binaries/')
binary_path.mkdir(parents = True, exist_ok = True)

# Directory we are storing json files
code_output_path = Path('../code_output/')
code_output_path.mkdir(parents = True, exist_ok = True)

# Dict to store paths to processed binaries
opm_dynamic_feather_path_dict = {}
opm_status_feather_path_dict = {}

# Dict to store unique 'Adjusted Basic Pay' (ABP) values that do not convert to numeric for each file
status_abp_error_dict = {}
dynamic_abp_error_dict = {}

In [10]:
for year, qtr_dict in opm_dynamic_path_dict.items():
    for qtr, str_path in qtr_dict.items():
        
        df = pd.read_fwf(Path(str_path), colspecs = list(opm_dynamic_fwf_dict.values()), names = list(opm_dynamic_fwf_dict.keys()), dtype = opm_dynamic_dtype_dict, encoding = 'latin-1')
        
        print(f'Loaded dynamic file for year {year}, quarter {qtr}.')

        # Select ABP that could not be converted, list unique values, then convert to numeric
        numeric_abp = pd.to_numeric(df['Adjusted Basic Pay'], errors = 'coerce')
        dynamic_abp_error_dict.setdefault(year, {})
        dynamic_abp_error_dict[year][qtr] = list(df['Adjusted Basic Pay'][numeric_abp.isna()].unique())

        df['Adjusted Basic Pay'] = pd.to_numeric(df['Adjusted Basic Pay'], errors = 'coerce')

        target_path = binary_path.joinpath(f'opm_nondod_dynamic_{year}_{qtr}.feather')
        df.to_feather(target_path)

        opm_dynamic_feather_path_dict.setdefault(year, {})
        opm_dynamic_feather_path_dict[year][qtr] = str(target_path)

Loaded dynamic file for year 1982, quarter 4.
Loaded dynamic file for year 1982, quarter 2.
Loaded dynamic file for year 1982, quarter 1.
Loaded dynamic file for year 1982, quarter 3.
Loaded dynamic file for year 1983, quarter 4.
Loaded dynamic file for year 1983, quarter 2.
Loaded dynamic file for year 1983, quarter 1.
Loaded dynamic file for year 1983, quarter 3.
Loaded dynamic file for year 1984, quarter 4.
Loaded dynamic file for year 1984, quarter 2.
Loaded dynamic file for year 1984, quarter 1.
Loaded dynamic file for year 1984, quarter 3.
Loaded dynamic file for year 1985, quarter 4.
Loaded dynamic file for year 1985, quarter 2.
Loaded dynamic file for year 1985, quarter 1.
Loaded dynamic file for year 1985, quarter 3.
Loaded dynamic file for year 1986, quarter 4.
Loaded dynamic file for year 1986, quarter 2.
Loaded dynamic file for year 1986, quarter 1.
Loaded dynamic file for year 1986, quarter 3.
Loaded dynamic file for year 1987, quarter 4.
Loaded dynamic file for year 1987,

In [11]:
# Save dictionaries for paths of binaries
with open('../code_output/opm_nondod_dynamic_pre2014_feather_path_dict.json', 'w') as outfile:
    json.dump(opm_dynamic_feather_path_dict, outfile, indent = 4)

In [12]:
for year, qtr_dict in opm_status_path_dict.items():
    for qtr, str_path in qtr_dict.items():
        
        if year == '1982' and qtr == '1':
            df = pd.read_fwf(Path(str_path), colspecs = list(opm_status_fwf_dict.values()), names = list(opm_status_fwf_dict.keys()), dtype = opm_status_dtype_dict, encoding = 'latin-1', skipfooter = 1)
        else:
            df = pd.read_fwf(Path(str_path), colspecs = list(opm_status_fwf_dict.values()), names = list(opm_status_fwf_dict.keys()), dtype = opm_status_dtype_dict, encoding = 'latin-1')
        
        print(f'Loaded status file for year {year}, quarter {qtr}.')

        # Select ABP that could not be converted, list unique values, then convert to numeric
        numeric_abp = pd.to_numeric(df['Adjusted Basic Pay'], errors = 'coerce')
        status_abp_error_dict.setdefault(year, {})
        status_abp_error_dict[year][qtr] = list(df['Adjusted Basic Pay'][numeric_abp.isna()].unique())

        df['Adjusted Basic Pay'] = pd.to_numeric(df['Adjusted Basic Pay'], errors = 'coerce')

        target_path = binary_path.joinpath(f'opm_nondod_status_{year}_{qtr}.feather')
        df.to_feather(target_path)

        opm_status_feather_path_dict.setdefault(year, {})
        opm_status_feather_path_dict[year][qtr] = str(target_path)

Loaded status file for year 1973, quarter 3.
Loaded status file for year 1973, quarter 4.
Loaded status file for year 1974, quarter 1.
Loaded status file for year 1974, quarter 2.
Loaded status file for year 1974, quarter 3.
Loaded status file for year 1974, quarter 4.
Loaded status file for year 1975, quarter 1.
Loaded status file for year 1975, quarter 2.
Loaded status file for year 1975, quarter 3.
Loaded status file for year 1975, quarter 4.
Loaded status file for year 1976, quarter 1.
Loaded status file for year 1976, quarter 2.
Loaded status file for year 1976, quarter 3.
Loaded status file for year 1976, quarter 4.
Loaded status file for year 1977, quarter 1.
Loaded status file for year 1977, quarter 2.
Loaded status file for year 1977, quarter 3.
Loaded status file for year 1977, quarter 4.
Loaded status file for year 1978, quarter 1.
Loaded status file for year 1978, quarter 2.
Loaded status file for year 1978, quarter 3.
Loaded status file for year 1978, quarter 4.
Loaded sta

In [13]:
# Save dictionaries for paths of binaries
with open('../code_output/opm_nondod_status_pre2014_feather_path_dict.json', 'w') as outfile:
    json.dump(opm_status_feather_path_dict, outfile, indent = 4)

In [14]:
# Check list of ABP values that could not be converted to numeric
with open('../code_output/nondod_dynamic_abp_errors.txt', 'w') as outfile:
    for yq, my_list in dynamic_abp_error_dict.items():
        for item in my_list:
            outfile.write(f'{yq}: {item} \n')

with open('../code_output/nondod_status_abp_errors.txt', 'w') as outfile:
    for yq, my_list in status_abp_error_dict.items():
        for item in my_list:
            outfile.write(f'{yq}: {item} \n')