In [30]:
import pandas as pd
from collections import OrderedDict
import os

In [31]:
# The 'dynamic' and 'status' folders sit side by side under the same relative data root,
# so both paths are built from one expression that differs only in the final component.
opm_dynamic_dir, opm_status_dir = (
    os.path.join('..', 'rawdata', 'opm-federal-employment-data', 'data', '1973-09-to-2014-06', 'non-dod', subfolder)
    for subfolder in ('dynamic', 'status')
)

In [32]:
# Fixed-width layout of the OPM dynamic files.
# Each entry maps a column label to its (start, stop) character span, listed in file order.
# The values are passed to pandas.read_fwf as colspecs, which treats them as half-open
# intervals: start is 0-based and stop is exclusive.
opm_dynamic_fwf_dict = OrderedDict([
    ('Pseudo-ID', (0, 9)),
    ('Employee Name', (9, 32)),
    ('Agency/Subelement', (32, 36)),
    ('Accession/Separation Indicator', (36, 38)),
    ('Effective Date', (38, 46)),
    ('Age', (46, 52)),
    ('Pay Plan', (52, 54)),
    ('Grade', (54, 56)),
    ('LOS Level', (56, 62)),
    ('Duty Station', (62, 71)),
    ('Occupation', (71, 75)),
    # Spelled exactly like the matching entry in opm_status_fwf_dict so the column label
    # is identical in both layouts.
    ('Occupational Category (PATCO)', (75, 76)),
    ('Adjusted Basic Pay', (76, 82)),
    ('Type of Appointment', (82, 84)),
    ('Work Schedule', (84, 85))
])

In [33]:
# Fixed-width layout of the OPM status files: column label -> (start, stop) character span,
# kept in file order. Written as flat (label, start, stop) rows and folded into the mapping.
opm_status_fwf_dict = OrderedDict(
    (label, (start, stop))
    for label, start, stop in (
        ('Pseudo-ID', 0, 9),
        ('Employee Name', 9, 32),
        ('File Date (yyyymmdd)', 32, 40),
        ('Agency/Subelement', 40, 44),
        ('Duty Station', 44, 53),
        ('Age Range', 53, 59),
        ('Education Level', 59, 61),
        ('Pay Plan', 61, 63),
        ('Grade', 63, 65),
        ('LOS Level', 65, 71),
        ('Occupation', 71, 75),
        ('Occupational Category (PATCO)', 75, 76),
        ('Adjusted Basic Pay', 76, 82),
        ('Supervisory Status', 82, 83),
        ('Type of Appointment', 83, 85),
        ('Work Schedule', 85, 86),
        ('NSFTP Indicator', 86, 87),
    )
)

In [36]:
# Define functions that return a dict of files in the status or dynamic directory, indexed by year and quarter
def _index_paths_by_quarter(input_path, date_text_from_name):
    """Map (year, quarter) to the full path of each dated entry inside input_path.

    Parameters
    ----------
    input_path : str
        Directory to scan (not recursive).
    date_text_from_name : callable
        Takes a bare entry name and returns the text handed to pd.to_datetime.

    Returns
    -------
    dict
        {(year, quarter): path}. Entries whose name does not yield a date are left out.
    """
    output_dict = {}
    # os.listdir gives no ordering guarantee; sorting makes the result reproducible if
    # two names ever land in the same quarter (the later name wins).
    for entry_name in sorted(os.listdir(input_path)):
        try:
            entry_date = pd.to_datetime(date_text_from_name(entry_name))
        except (ValueError, OverflowError):
            # Name carries no parseable date (e.g. '.DS_Store', '.ipynb_checkpoints')
            continue
        if pd.isnull(entry_date):
            # An empty slice parses to NaT, which would otherwise produce a (nan, nan) key
            continue
        output_dict[(entry_date.year, entry_date.quarter)] = os.path.join(input_path, entry_name)

    return output_dict


def produce_opm_dynamic_path_dict(input_path):
    """Index the dynamic files in input_path by (year, quarter).

    The date is read from the first seven characters of each file name.
    Entries whose name does not parse as a date are skipped.
    """
    return _index_paths_by_quarter(input_path, lambda name: name[0:7])


def produce_opm_status_path_dict(input_path):
    """Index the status files in input_path by (year, quarter).

    The date is read from characters 15-21 of each file name, with underscores
    turned into hyphens before parsing. Entries whose name does not parse as a
    date are skipped.
    """
    return _index_paths_by_quarter(input_path, lambda name: '-'.join(name[15:22].split('_')))

In [45]:
# Define function that imports files into dict within a certain timeframe
def produce_opm_df_dict(opm_path_dict, opm_fwf_dict, start_year, start_qtr, end_year, end_qtr, **read_fwf_kwargs):
    """Read every quarterly file between two (year, quarter) bounds, inclusive.

    Parameters
    ----------
    opm_path_dict : dict
        {(year, quarter): file path}.
    opm_fwf_dict : ordered mapping
        Column label -> (start, stop) span; keys become the column names and
        values the colspecs given to pd.read_fwf.
    start_year, start_qtr : int
        First quarter to load.
    end_year, end_qtr : int
        Last quarter to load.
    **read_fwf_kwargs
        Extra options forwarded unchanged to pd.read_fwf (for example dtype=str
        to stop digit-only code columns from being inferred as integers).
        Must not contain 'colspecs' or 'names'.

    Returns
    -------
    dict
        {(year, quarter): DataFrame}, filled in chronological order.

    Raises
    ------
    KeyError
        If opm_path_dict has no entry for one or more requested quarters.
        Raised before any file is read, and the message lists every gap.
    """
    first_period = (start_year, start_qtr)
    last_period = (end_year, end_qtr)
    # Tuples compare year first, then quarter, so one chained comparison trims the
    # leading quarters of the first year and the trailing quarters of the last.
    wanted_periods = [
        (year, qtr)
        for year in range(start_year, end_year + 1)
        for qtr in range(1, 5)
        if first_period <= (year, qtr) <= last_period
    ]

    # Check up front so a gap is reported before any (potentially large) file is parsed
    missing_periods = [period for period in wanted_periods if period not in opm_path_dict]
    if missing_periods:
        raise KeyError(
            'no file path for: ' + ', '.join('%s Q%s' % period for period in missing_periods)
        )

    column_names = list(opm_fwf_dict.keys())
    column_spans = list(opm_fwf_dict.values())

    output_df_dict = {}
    for period in wanted_periods:
        output_df_dict[period] = pd.read_fwf(
            opm_path_dict[period],
            colspecs=column_spans,
            names=column_names,
            **read_fwf_kwargs
        )

    return output_df_dict

In [35]:
# Period to load, given as inclusive (year, quarter) bounds
start_year, start_qtr = 1982, 1
end_year, end_qtr = 1983, 4

In [37]:
# Index the files found in each raw-data folder by (year, quarter)
opm_dynamic_path_dict = produce_opm_dynamic_path_dict(input_path=opm_dynamic_dir)
opm_status_path_dict = produce_opm_status_path_dict(input_path=opm_status_dir)

In [47]:
# Load one DataFrame per quarter in the selected period, for each file family
opm_dynamic_df_dict = produce_opm_df_dict(
    opm_path_dict=opm_dynamic_path_dict,
    opm_fwf_dict=opm_dynamic_fwf_dict,
    start_year=start_year,
    start_qtr=start_qtr,
    end_year=end_year,
    end_qtr=end_qtr,
)
opm_status_df_dict = produce_opm_df_dict(
    opm_path_dict=opm_status_path_dict,
    opm_fwf_dict=opm_status_fwf_dict,
    start_year=start_year,
    start_qtr=start_qtr,
    end_year=end_year,
    end_qtr=end_qtr,
)