In [99]:
import pandas as pd
from collections import OrderedDict
import os

In [100]:
# Relative paths to the raw OPM non-DoD extracts: the "dynamic" and "status"
# folders share the same parent directory and differ only in the last component.
opm_dynamic_dir, opm_status_dir = (
    os.path.join('..', 'rawdata', 'opm-federal-employment-data', 'data', '1973-09-to-2014-06', 'non-dod', leaf)
    for leaf in ('dynamic', 'status')
)

In [101]:
# Fixed-width layout of the OPM dynamic files: column name -> (start, stop)
# character offsets, kept in file order. The values are later passed to
# pd.read_fwf as colspecs and the keys as column names.
# NOTE(review): 'Occupational Cateogy (PATCO)' is spelled differently from the
# status layout's 'Occupational Category (PATCO)'. The key is left as-is
# because it becomes a DataFrame column name that other code may rely on.
opm_dynamic_fwf_dict = OrderedDict(
    (column, (start, stop))
    for column, start, stop in [
        ('Pseudo-ID', 0, 9),
        ('Employee Name', 9, 32),
        ('Agency/Subelement', 32, 36),
        ('Accession/Separation Indicator', 36, 38),
        ('Effective Date', 38, 46),
        ('Age', 46, 52),
        ('Pay Plan', 52, 54),
        ('Grade', 54, 56),
        ('LOS Level', 56, 62),
        ('Duty Station', 62, 71),
        ('Occupation', 71, 75),
        ('Occupational Cateogy (PATCO)', 75, 76),
        ('Adjusted Basic Pay', 76, 82),
        ('Type of Appointment', 82, 84),
        ('Work Schedule', 84, 85),
    ]
)

In [102]:
# Fixed-width layout of the OPM status files: column name -> (start, stop)
# character offsets, kept in file order. The values are later passed to
# pd.read_fwf as colspecs and the keys as column names.
opm_status_fwf_dict = OrderedDict(
    (column, (start, stop))
    for column, start, stop in [
        ('Pseudo-ID', 0, 9),
        ('Employee Name', 9, 32),
        ('File Date (yyyymmdd)', 32, 40),
        ('Agency/Subelement', 40, 44),
        ('Duty Station', 44, 53),
        ('Age Range', 53, 59),
        ('Education Level', 59, 61),
        ('Pay Plan', 61, 63),
        ('Grade', 63, 65),
        ('LOS Level', 65, 71),
        ('Occupation', 71, 75),
        ('Occupational Category (PATCO)', 75, 76),
        ('Adjusted Basic Pay', 76, 82),
        ('Supervisory Status', 82, 83),
        ('Type of Appointment', 83, 85),
        ('Work Schedule', 85, 86),
        ('NSFTP Indicator', 86, 87),
    ]
)

In [103]:
# Define functions that return a dict of files in the status or dynamic directory, indexed by year and quarter
def produce_opm_dynamic_path_dict(input_path):
    """Map (year, quarter) to the path of each dynamic file in a directory.

    The date of a file is read from the first seven characters of its name
    (presumably a year-month stamp such as ``1982-03`` -- confirm against the
    raw data) and parsed with ``pd.to_datetime``.

    Directory entries that cannot be data files are skipped instead of
    aborting the scan: hidden names starting with ``.`` (for example
    ``.DS_Store``) and names whose prefix does not parse as a date.

    Parameters
    ----------
    input_path : str
        Directory to list.

    Returns
    -------
    dict
        ``{(year, quarter): path}``. If several files fall in the same
        quarter, the one that sorts last by name wins.
    """
    output_dict = {}
    # Sort the listing so the result does not depend on the OS listing order.
    for file_name in sorted(os.listdir(input_path)):
        if file_name.startswith('.'):
            continue

        # errors='coerce' yields NaT for an unparseable prefix instead of raising.
        file_date = pd.to_datetime(file_name[0:7], errors='coerce')
        if pd.isna(file_date):
            continue

        output_dict[(file_date.year, file_date.quarter)] = os.path.join(input_path, file_name)

    return output_dict

def produce_opm_status_path_dict(input_path):
    """Map (year, quarter) to the path of each status file in a directory.

    The date of a file is read from characters 15-21 of its name, with
    underscores turned into dashes before parsing with ``pd.to_datetime``
    (presumably names look like ``Status_Non_DoD_1982_03...`` -- confirm
    against the raw data).

    Directory entries that cannot be data files are skipped: hidden names
    starting with ``.`` and names whose date slice is empty or does not parse.
    Without this, a short name produces an empty date string, which parses to
    ``NaT`` and ends up stored under a ``(nan, nan)`` key.

    Parameters
    ----------
    input_path : str
        Directory to list.

    Returns
    -------
    dict
        ``{(year, quarter): path}``. If several files fall in the same
        quarter, the one that sorts last by name wins.
    """
    output_dict = {}
    # Sort the listing so the result does not depend on the OS listing order.
    for file_name in sorted(os.listdir(input_path)):
        if file_name.startswith('.'):
            continue

        date_text = file_name[15:22].replace('_', '-')
        # errors='coerce' yields NaT for an empty or unparseable slice.
        file_date = pd.to_datetime(date_text, errors='coerce')
        if pd.isna(file_date):
            continue

        output_dict[(file_date.year, file_date.quarter)] = os.path.join(input_path, file_name)

    return output_dict

In [104]:
# Define function that imports files into dict within a certain timeframe
def produce_opm_df_dict(opm_path_dict, opm_fwf_dict, start_year, start_qtr, end_year, end_qtr, **read_fwf_kwargs):
    """Read the fixed-width files for every quarter in an inclusive window.

    Parameters
    ----------
    opm_path_dict : dict
        ``{(year, quarter): path}`` of the files available to read.
    opm_fwf_dict : mapping
        Column name -> ``(start, stop)`` offsets; its values are passed to
        ``pd.read_fwf`` as ``colspecs`` and its keys as ``names``.
    start_year, start_qtr : int
        First quarter to load (inclusive).
    end_year, end_qtr : int
        Last quarter to load (inclusive).
    **read_fwf_kwargs
        Extra keyword arguments forwarded unchanged to ``pd.read_fwf``, for
        example ``dtype=str`` or ``usecols=[...]``.

    Returns
    -------
    dict
        ``{(year, quarter): DataFrame}`` for each quarter in the window.

    Raises
    ------
    KeyError
        If a quarter inside the window has no entry in ``opm_path_dict``.
    """
    # The layout is identical for every file, so unpack it once.
    colspecs = list(opm_fwf_dict.values())
    names = list(opm_fwf_dict.keys())

    output_df_dict = {}
    for year in range(start_year, end_year + 1):
        for qtr in range(1, 5):
            before_start = (year == start_year) and (qtr < start_qtr)
            after_end = (year == end_year) and (qtr > end_qtr)
            if before_start or after_end:
                continue

            output_df_dict[(year, qtr)] = pd.read_fwf(
                opm_path_dict[(year, qtr)],
                colspecs=colspecs,
                names=names,
                **read_fwf_kwargs
            )

    return output_df_dict

In [105]:
# Window of quarters to load; both the first and the last quarter are included
start_year, start_qtr = 1982, 1
end_year, end_qtr = 1985, 1

In [106]:
# Index the raw files on disk by (year, quarter): one lookup table for the
# status directory and one for the dynamic directory.
opm_status_path_dict = produce_opm_status_path_dict(opm_status_dir)
opm_dynamic_path_dict = produce_opm_dynamic_path_dict(opm_dynamic_dir)

In [107]:
opm_educ_df_dict = {}
opm_dynamic_df_dict = {}

# The column layouts are the same for every quarter, so unpack them once
# instead of rebuilding the lists on every iteration.
status_colspecs = list(opm_status_fwf_dict.values())
status_names = list(opm_status_fwf_dict.keys())
dynamic_colspecs = list(opm_dynamic_fwf_dict.values())
dynamic_names = list(opm_dynamic_fwf_dict.keys())

# Load dataframes we want: for each quarter in the inclusive window, the
# education columns of the status file and the full dynamic file.
for year in range(start_year, end_year + 1):
    for qtr in range(1, 5):
        if (year == start_year) and (qtr < start_qtr):
            continue
        if (year == end_year) and (qtr > end_qtr):
            continue

        print(f'Loading year {year}, quarter {qtr}.')

        # Everything is read as str so values are kept exactly as they appear in the file.
        opm_educ_df_dict[(year, qtr)] = pd.read_fwf(
            opm_status_path_dict[(year, qtr)],
            colspecs=status_colspecs,
            names=status_names,
            dtype=str,
            usecols=['Pseudo-ID', 'Education Level'],
        )
        opm_dynamic_df_dict[(year, qtr)] = pd.read_fwf(
            opm_dynamic_path_dict[(year, qtr)],
            colspecs=dynamic_colspecs,
            names=dynamic_names,
            dtype=str,
        )

# The 1982 Q1 status file ends with a stray row holding the names of other
# files, so drop it. Only do so when that quarter was actually loaded, so that
# changing the window settings does not raise a KeyError here.
if (1982, 1) in opm_educ_df_dict:
    opm_educ_df_dict[(1982, 1)] = opm_educ_df_dict[(1982, 1)].iloc[:-1]

Loading year 1982, quarter 1.
Loading year 1982, quarter 2.
Loading year 1982, quarter 3.
Loading year 1982, quarter 4.
Loading year 1983, quarter 1.
Loading year 1983, quarter 2.
Loading year 1983, quarter 3.
Loading year 1983, quarter 4.
Loading year 1984, quarter 1.
Loading year 1984, quarter 2.
Loading year 1984, quarter 3.
Loading year 1984, quarter 4.
Loading year 1985, quarter 1.


In [108]:
# Attach education information from the status files to each quarter's dynamic
# file, matching on 'Pseudo-ID'.
# NOTE(review): this assigns the merged frame back into opm_dynamic_df_dict, so
# running the cell a second time merges onto already-merged frames -- reload
# the data before re-running.
# NOTE(review): a 'Pseudo-ID' present in several status files inside the window
# contributes one row per appearance, so the left merge can repeat dynamic
# rows -- confirm this is intended or de-duplicate downstream.
for year in range(start_year, end_year + 1):
    for qtr in range(1, 5):
        if (year == start_year) and (qtr < start_qtr):
            continue
        if (year == end_year) and (qtr > end_qtr):
            continue

        # Quarter ordinal: consecutive quarters differ by exactly 1.
        yq_current = year*4 + qtr

        print(f'Concatenating year {year}, quarter {qtr}.')

        # Want to merge education info from status files within the year before
        # and after the relevant dynamic file date (a window of +/- 4 quarters).
        # Collect the frames first and concatenate once, rather than growing a
        # DataFrame with one pd.concat per matching frame.
        educ_frames_in_window = [
            educ_df
            for (educ_year, educ_qtr), educ_df in opm_educ_df_dict.items()
            if -4 <= (educ_year*4 + educ_qtr) - yq_current <= 4
        ]
        educ_df_to_merge = pd.concat(educ_frames_in_window, ignore_index = True)

        print(f'Merging year {year}, quarter {qtr}.')
        # Merge education info to dynamic dfs
        opm_dynamic_df_dict[(year, qtr)] = opm_dynamic_df_dict[(year, qtr)].merge(educ_df_to_merge, how = 'left', on = ['Pseudo-ID'])

Concatenating year 1982, quarter 1.
Merging year 1982, quarter 1.
Concatenating year 1982, quarter 2.
Merging year 1982, quarter 2.
Concatenating year 1982, quarter 3.
Merging year 1982, quarter 3.
Concatenating year 1982, quarter 4.
Merging year 1982, quarter 4.
Concatenating year 1983, quarter 1.
Merging year 1983, quarter 1.
Concatenating year 1983, quarter 2.
Merging year 1983, quarter 2.
Concatenating year 1983, quarter 3.
Merging year 1983, quarter 3.
Concatenating year 1983, quarter 4.
Merging year 1983, quarter 4.
Concatenating year 1984, quarter 1.
Merging year 1984, quarter 1.
Concatenating year 1984, quarter 2.
Merging year 1984, quarter 2.
Concatenating year 1984, quarter 3.
Merging year 1984, quarter 3.
Concatenating year 1984, quarter 4.
Merging year 1984, quarter 4.
Concatenating year 1985, quarter 1.
Merging year 1985, quarter 1.
