In [1]:
import pandas as pd
from collections import OrderedDict
import os

In [2]:
# Directories holding the raw OPM non-DoD files; both live under the same
# parent folder and differ only in the final 'dynamic' / 'status' component.
opm_dynamic_dir, opm_status_dir = (
    os.path.join('..', 'rawdata', 'opm-federal-employment-data', 'data', '1973-09-to-2014-06', 'non-dod', subdir)
    for subdir in ('dynamic', 'status')
)

In [3]:
# Fixed-width layout of the OPM dynamic files.
# Maps each column name to its (start, end) character offsets; the ordered
# keys are passed to pandas.read_fwf as `names` and the values as `colspecs`,
# so the insertion order here must match the column order in the files.
opm_dynamic_fwf_dict = OrderedDict([
    ('Pseudo-ID', (0, 9)),
    ('Employee Name', (9, 32)),
    ('Agency/Subelement', (32, 36)),
    ('Accession/Separation Indicator', (36, 38)),
    ('Effective Date', (38, 46)),
    ('Age', (46, 52)),
    ('Pay Plan', (52, 54)),
    ('Grade', (54, 56)),
    ('LOS Level', (56, 62)),
    ('Duty Station', (62, 71)),
    ('Occupation', (71, 75)),
    # Spelling fixed ("Cateogy" -> "Category") so this column has the same
    # name as the corresponding column in opm_status_fwf_dict.
    ('Occupational Category (PATCO)', (75, 76)),
    ('Adjusted Basic Pay', (76, 82)),
    ('Type of Appointment', (82, 84)),
    ('Work Schedule', (84, 85))
])

In [4]:
# Fixed-width layout of the OPM status files: column name -> (start, stop)
# character offsets, kept in file order so keys/values can be handed to
# pandas.read_fwf as `names` / `colspecs`.
opm_status_fwf_dict = OrderedDict(
    (column, (start, stop))
    for column, start, stop in (
        ('Pseudo-ID', 0, 9),
        ('Employee Name', 9, 32),
        ('File Date (yyyymmdd)', 32, 40),
        ('Agency/Subelement', 40, 44),
        ('Duty Station', 44, 53),
        ('Age Range', 53, 59),
        ('Education Level', 59, 61),
        ('Pay Plan', 61, 63),
        ('Grade', 63, 65),
        ('LOS Level', 65, 71),
        ('Occupation', 71, 75),
        ('Occupational Category (PATCO)', 75, 76),
        ('Adjusted Basic Pay', 76, 82),
        ('Supervisory Status', 82, 83),
        ('Type of Appointment', 83, 85),
        ('Work Schedule', 85, 86),
        ('NSFTP Indicator', 86, 87),
    )
)

In [5]:
# Year-month bounds of the date window handed to the read_opm_*_fwf loaders
start_date, end_date = '1982-01', '1983-01'

In [6]:
# Define functions that return a dict of files in the status or dynamic directory, indexed by year and quarter
def produce_opm_dynamic_path_dict(input_path):
    """Index the files in a dynamic-data directory by (year, quarter).

    The first seven characters of each file name are parsed with
    ``pd.to_datetime``; the resulting year and quarter form the key and the
    value is the file's path joined onto ``input_path``. When several files
    fall in the same quarter, the one listed last by ``os.listdir`` wins.
    """
    stamped_names = ((pd.to_datetime(name[:7]), name) for name in os.listdir(input_path))
    return {
        (stamp.year, stamp.quarter): os.path.join(input_path, name)
        for stamp, name in stamped_names
    }

def produce_opm_status_path_dict(input_path):
    """Index the files in a status-data directory by (year, quarter).

    Characters 15-21 of each file name hold the date with an underscore
    separator; the underscore is swapped for '-' before parsing with
    ``pd.to_datetime``. The key is the parsed (year, quarter) and the value is
    the file's path joined onto ``input_path``. When several files fall in the
    same quarter, the one listed last by ``os.listdir`` wins.
    """
    by_quarter = {}
    for name in os.listdir(input_path):
        stamp = pd.to_datetime(name[15:22].replace('_', '-'))
        by_quarter[stamp.year, stamp.quarter] = os.path.join(input_path, name)
    return by_quarter

In [7]:
# Build the (year, quarter) -> file path lookups for both raw data directories
opm_dynamic_path_dict = produce_opm_dynamic_path_dict(input_path=opm_dynamic_dir)
opm_status_path_dict = produce_opm_status_path_dict(input_path=opm_status_dir)

In [None]:
def read_opm_dynamic_fwf(start_date, end_date, path, fwf_dict):
    """Load the OPM dynamic files dated within a window into one DataFrame.

    Parameters
    ----------
    start_date, end_date : anything accepted by ``pd.to_datetime``
        Inclusive bounds of the window. A file is selected when the date
        parsed from the first seven characters of its name falls inside it.
    path : str
        Directory containing the fixed-width files. Every entry in it must
        have a name whose first seven characters parse as a date.
    fwf_dict : ordered mapping of column name -> (start, end)
        Fixed-width layout; keys become the column names and values the
        ``colspecs`` passed to ``pd.read_fwf``.

    Returns
    -------
    pandas.DataFrame
        Rows of all selected files, stacked in date order. The row index is
        not reset, so it restarts for every file. If no file falls inside the
        window, an empty frame with the expected columns is returned.
    """
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    # Parse each file's date once and keep it next to the name, so the range
    # filter and the sort share the same parsed value.
    dated_files = []
    for file_name in os.listdir(path):
        file_date = pd.to_datetime(file_name[0:7])
        if start_date <= file_date <= end_date:
            dated_files.append((file_date, file_name))
    dated_files.sort()  # by date, then by name so ties are deterministic

    names = list(fwf_dict.keys())
    colspecs = list(fwf_dict.values())

    if not dated_files:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame(columns=names)

    # Read every file first and concatenate once: DataFrame.append copied the
    # accumulated frame on each iteration and no longer exists in pandas 2.x.
    frames = [
        pd.read_fwf(os.path.join(path, file_name), colspecs=colspecs, names=names)
        for _, file_name in dated_files
    ]
    return pd.concat(frames)

In [None]:
def read_opm_status_fwf(start_date, end_date, path, fwf_dict):
    """Load the OPM status files dated within a window into one DataFrame.

    Parameters
    ----------
    start_date, end_date : anything accepted by ``pd.to_datetime``
        Inclusive bounds of the window. A file is selected when the date
        parsed from characters 15-21 of its name (underscore-separated, e.g.
        ``1982_03``) falls inside it.
    path : str
        Directory containing the fixed-width files. Every entry in it must
        have a name that carries a parseable date at that position.
    fwf_dict : ordered mapping of column name -> (start, end)
        Fixed-width layout; keys become the column names and values the
        ``colspecs`` passed to ``pd.read_fwf``.

    Returns
    -------
    pandas.DataFrame
        Rows of all selected files, stacked in date order. The row index is
        not reset, so it restarts for every file. If no file falls inside the
        window, an empty frame with the expected columns is returned.

    Notes
    -----
    The files are only ever opened for reading. An earlier version opened the
    first selected file in append mode and wrote other file *names* into it,
    which altered the raw data, dropped the last file and could not be read
    back by ``pd.read_fwf``.
    """
    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    # Parse each file's date once and keep it next to the name, so the range
    # filter and the sort share the same parsed value.
    dated_files = []
    for file_name in os.listdir(path):
        # The date in the file name uses '_' as separator; swap it for '-'
        # so pd.to_datetime can parse it.
        file_date = pd.to_datetime('-'.join(file_name[15:22].split('_')))
        if start_date <= file_date <= end_date:
            dated_files.append((file_date, file_name))
    dated_files.sort()  # by date, then by name so ties are deterministic

    names = list(fwf_dict.keys())
    colspecs = list(fwf_dict.values())

    if not dated_files:
        # pd.concat raises on an empty list; return an empty frame instead.
        return pd.DataFrame(columns=names)

    # Read each selected file (including the last one) by path and stack the
    # results with a single concat.
    frames = [
        pd.read_fwf(os.path.join(path, file_name), colspecs=colspecs, names=names)
        for _, file_name in dated_files
    ]
    return pd.concat(frames)

In [None]:
# Load every dynamic file dated between start_date and end_date
opm_dynamic_df = read_opm_dynamic_fwf(
    start_date=start_date,
    end_date=end_date,
    path=opm_dynamic_dir,
    fwf_dict=opm_dynamic_fwf_dict,
)

In [None]:
# Load every status file dated between start_date and end_date
opm_status_df = read_opm_status_fwf(
    start_date=start_date,
    end_date=end_date,
    path=opm_status_dir,
    fwf_dict=opm_status_fwf_dict,
)