In [1]:
import pandas as pd
from pathlib import Path
import pickle
from pandas.api.types import union_categoricals

In [2]:
# Set the relative file path for dynamic and status files
opm_dynamic_path = Path('../raw_data/opm-federal-employment-data/data/1973-09-to-2014-06/non-dod/dynamic')
opm_status_path = Path('../raw_data/opm-federal-employment-data/data/1973-09-to-2014-06/non-dod/status')
opm_dynamic_path_dask = Path('../raw_data/opm-federal-employment-data/data/1973-09-to-2014-06/non-dod/dynamic/*.NONDOD.FO05M3.TXT')
opm_status_path_dask = Path('../raw_data/opm-federal-employment-data/data/1973-09-to-2014-06/non-dod/status/Status_Non_DoD_*_*.txt')

In [3]:
# Define the fwf dict for reading OPM files
opm_dynamic_fwf_dict = {
    'Pseudo-ID': (0, 9),
    'Employee Name': (9, 32),
    'Agency/Subelement': (32, 36),
    'Accession/Separation Indicator': (36, 38),
    'Effective Date (year)': (38, 42),
    'Effective Date (month)': (42, 44),
    'Effective Date (day)': (44, 46),
    'Age': (46, 52),
    'Pay Plan': (52, 54),
    'Grade': (54, 56),
    'LOS Level': (56, 62),
    'Duty Station': (62, 71),
    'Occupation': (71, 75),
    'Occupational Cateogy (PATCO)': (75, 76),
    'Adjusted Basic Pay': (76, 82),
    'Type of Appointment': (82, 84),
    'Work Schedule': (84, 85)
}

opm_status_fwf_dict = {
    'Pseudo-ID': (0, 9),
    'Employee Name': (9, 32),
    'File Date (year)': (32, 36),
    'File Date (month)': (36, 38),
    'File Date (day)': (38, 40),
    'Agency/Subelement': (40, 44),
    'Duty Station': (44, 53),
    'Age Range': (53, 59),
    'Education Level': (59, 61),
    'Pay Plan': (61, 63),
    'Grade': (63, 65),
    'LOS Level': (65, 71),
    'Occupation': (71, 75),
    'Occupational Category (PATCO)': (75, 76),
    'Adjusted Basic Pay': (76, 82),
    'Supervisory Status': (82, 83),
    'Type of Appointment': (83, 85),
    'Work Schedule': (85, 86),
    'NSFTP Indicator': (86, 87)
}

In [4]:
# Define the dicts for column dtypes
opm_dynamic_dtype_dict = {
    'Pseudo-ID': 'string',
    'Employee Name': 'string',
    'Agency/Subelement': 'category',
    'Accession/Separation Indicator': 'category',
    'Effective Date (year)': 'Int16',
    'Effective Date (month)': 'Int8',
    'Effective Date (day)': 'Int8',
    'Age': 'category',
    'Pay Plan': 'category',
    'Grade': 'category',
    'LOS Level': 'category',
    'Duty Station': 'category',
    'Occupation': 'category',
    'Occupational Category (PATCO)': 'category',
    'Adjusted Basic Pay': 'string',
    'Type of Appointment': 'category',
    'Work Schedule': 'category'
}

opm_status_dtype_dict = {
    'Pseudo-ID': 'string',
    'Employee Name': 'string',
    'File Date (year)': 'Int16',
    'File Date (month)': 'Int8',
    'File Date (day)': 'Int8',
    'Agency/Subelement': 'category',
    'Duty Station': 'category',
    'Age Range': 'category',
    'Education Level': 'category',
    'Pay Plan': 'category',
    'Grade': 'category',
    'LOS Level': 'category',
    'Occupation': 'category',
    'Occupational Category (PATCO)': 'category',
    'Adjusted Basic Pay': 'string',
    'Supervisory Status': 'category',
    'Type of Appointment': 'category',
    'Work Schedule': 'category',
    'NSFTP Indicator': 'category'
}

In [5]:
# Define functions that return a dict of files in the status or dynamic directory, indexed by year and quarter
def produce_opm_dynamic_path_dict(input_path):
    output_dict = {}
    for file_path in Path(input_path).iterdir():
        file_name = file_path.name
        file_date = pd.to_datetime(file_name[0:7])
        year = file_date.year
        quarter = file_date.quarter
        output_dict[(year, quarter)] = file_path

    return output_dict

def produce_opm_status_path_dict(input_path):
    output_dict = {}
    for file_path in Path(input_path).iterdir():
        file_name = file_path.name
        file_date = pd.to_datetime('-'.join(file_name[15:22].split('_')))
        year = file_date.year
        quarter = file_date.quarter
        output_dict[(year, quarter)] = file_path

    return output_dict

In [6]:
# Create dicts of paths of files
opm_dynamic_path_dict = produce_opm_dynamic_path_dict(opm_dynamic_path)
opm_status_path_dict = produce_opm_status_path_dict(opm_status_path)

In [7]:
# Directory we are storing processed dataframes in binary format, for quick access later
binary_path = Path('../cleaned_binaries/')
binary_path.mkdir(parents = True, exist_ok = True)

# Dict to store paths to processed binaries
opm_dynamic_feather_path_dict = {}
opm_status_feather_path_dict = {}

# Dict to store unique 'Adjusted Basic Pay' (ABP) values that do not convert to numeric for each file
status_abp_error_dict = {}
dynamic_abp_error_dict = {}

In [8]:
# Years and quarters we want to load (inclusive)
start_year = 1973
start_qtr = 3
end_year = 2014
end_qtr = 2

In [12]:
# Years and quarters we want to load (inclusive)
start_year = 1982
start_qtr = 1
end_year = 1982
end_qtr = 1

In [16]:
for year in range(start_year, end_year + 1):
    for qtr in range(1, 5):
        if (year == start_year) and (qtr < start_qtr):
            continue
        if (year == end_year) and (qtr > end_qtr):
            continue

        # 1982q1 status file has invalid last row: has file names in last row for some reason
        if year == 1982 and qtr == 1:
            df = pd.read_fwf(opm_status_path_dict[(year, qtr)], colspecs = list(opm_status_fwf_dict.values()), names = list(opm_status_fwf_dict.keys()), dtype = opm_status_dtype_dict, encoding = 'latin-1', skipfooter = 1)
        else:
            df = pd.read_fwf(opm_status_path_dict[(year, qtr)], colspecs = list(opm_status_fwf_dict.values()), names = list(opm_status_fwf_dict.keys()), dtype = opm_status_dtype_dict, encoding = 'latin-1')
        print(f'Loaded status file for year {year}, quarter {qtr}.')

        # Select ABP that could not be converted, list unique values, then convert to numeric
        numeric_abp = pd.to_numeric(df['Adjusted Basic Pay'], errors = 'coerce')
        status_abp_error_dict[(year, qtr)] = list(df['Adjusted Basic Pay'][numeric_abp.isna()].unique())

        df['Adjusted Basic Pay'] = pd.to_numeric(df['Adjusted Basic Pay'], errors = 'coerce')

        target_path = binary_path.joinpath(f'opm_status_{year}_{qtr}.feather')
        df.to_feather(target_path)
        opm_status_feather_path_dict[(year, qtr)] = target_path

Loaded status file for year 1982, quarter 1.


In [22]:
# Save dictionaries for paths of binaries
with open('opm_status_feather_path_dict.pkl', 'wb') as f:
    pickle.dump(opm_status_feather_path_dict, f)

In [23]:
start_year = 1982
start_qtr = 1
end_year = 2014
end_qtr = 2

In [24]:
for year in range(start_year, end_year + 1):
    for qtr in range(1, 5):
        if (year == start_year) and (qtr < start_qtr):
            continue
        if (year == end_year) and (qtr > end_qtr):
            continue

        df = pd.read_fwf(opm_dynamic_path_dict[(year, qtr)], colspecs = list(opm_dynamic_fwf_dict.values()), names = list(opm_dynamic_fwf_dict.keys()), dtype = opm_dynamic_dtype_dict, encoding = 'latin-1')
        print(f'Loaded dynamic file for year {year}, quarter {qtr}.')

        # Select ABP that could not be converted, list unique values, then convert to numeric
        numeric_abp = pd.to_numeric(df['Adjusted Basic Pay'], errors = 'coerce')
        dynamic_abp_error_dict[(year, qtr)] = list(df['Adjusted Basic Pay'][numeric_abp.isna()].unique())

        df['Adjusted Basic Pay'] = pd.to_numeric(df['Adjusted Basic Pay'], errors = 'coerce')
        
        target_path = binary_path.joinpath(f'opm_dynamic_{year}_{qtr}.feather')
        df.to_feather(target_path)
        opm_dynamic_feather_path_dict[(year, qtr)] = target_path

Loaded dynamic file for year 1982, quarter 1.
Loaded dynamic file for year 1982, quarter 2.
Loaded dynamic file for year 1982, quarter 3.
Loaded dynamic file for year 1982, quarter 4.
Loaded dynamic file for year 1983, quarter 1.
Loaded dynamic file for year 1983, quarter 2.
Loaded dynamic file for year 1983, quarter 3.
Loaded dynamic file for year 1983, quarter 4.
Loaded dynamic file for year 1984, quarter 1.
Loaded dynamic file for year 1984, quarter 2.
Loaded dynamic file for year 1984, quarter 3.
Loaded dynamic file for year 1984, quarter 4.
Loaded dynamic file for year 1985, quarter 1.
Loaded dynamic file for year 1985, quarter 2.
Loaded dynamic file for year 1985, quarter 3.
Loaded dynamic file for year 1985, quarter 4.
Loaded dynamic file for year 1986, quarter 1.
Loaded dynamic file for year 1986, quarter 2.
Loaded dynamic file for year 1986, quarter 3.
Loaded dynamic file for year 1986, quarter 4.
Loaded dynamic file for year 1987, quarter 1.
Loaded dynamic file for year 1987,

In [25]:
# Save dictionaries for paths of binaries
with open('opm_dynamic_feather_path_dict.pkl', 'wb') as f:
    pickle.dump(opm_dynamic_feather_path_dict, f)

In [26]:
# Check list of ABP values that could not be converted to numeric
with open('../output/dynamic_abp_errors.txt', 'w') as f:
    for yq, my_list in dynamic_abp_error_dict.items():
        for item in my_list:
            f.write(f'{yq}: {item} \n')

with open('../output/status_abp_errors.txt', 'w') as f:
    for yq, my_list in status_abp_error_dict.items():
        for item in my_list:
            f.write(f'{yq}: {item} \n')

In [27]:
# It appears that the only problematic yq is (1991, 2), which has 4 employees' ABP as '033  0'
df = pd.read_feather(opm_status_feather_path_dict[(1991, 2)])
df[df['Adjusted Basic Pay'] == '033  0']

Unnamed: 0,Pseudo-ID,Employee Name,File Date (year),File Date (month),File Date (day),Agency/Subelement,Duty Station,Age Range,Education Level,Pay Plan,Grade,LOS Level,Occupation,Occupational Category (PATCO),Adjusted Basic Pay,Supervisory Status,Type of Appointment,Work Schedule,NSFTP Indicator
