In [1]:
import pandas as pd
from collections import OrderedDict
import os
import dask.dataframe as ddf

In [2]:
# Directories that hold the raw non-DoD dynamic and status files.
opm_dynamic_dir, opm_status_dir = (
    os.path.join('..', 'rawdata', 'opm-federal-employment-data', 'data', '1973-09-to-2014-06', 'non-dod', subdir)
    for subdir in ('dynamic', 'status')
)
# Glob patterns over the same directories, used by the dask readers.
opm_dynamic_dir_dask = os.path.join(opm_dynamic_dir, '*.NONDOD.FO05M3.TXT')
opm_status_dir_dask = os.path.join(opm_status_dir, 'Status_Non_DoD_*_*.txt')

In [3]:
# Column layout of the OPM dynamic fixed-width files.
# Maps column name -> (start, end) character offsets; the spans are contiguous
# and are passed as half-open `colspecs` intervals to read_fwf. The insertion
# order is the column order, so keys() and values() stay aligned.
opm_dynamic_fwf_dict = OrderedDict([
    ('Pseudo-ID', (0, 9)),
    ('Employee Name', (9, 32)),
    ('Agency/Subelement', (32, 36)),
    ('Accession/Separation Indicator', (36, 38)),
    ('Effective Date', (38, 46)),
    ('Age', (46, 52)),
    ('Pay Plan', (52, 54)),
    ('Grade', (54, 56)),
    ('LOS Level', (56, 62)),
    ('Duty Station', (62, 71)),
    ('Occupation', (71, 75)),
    # Spelled the same way as in the status layout (was 'Cateogy').
    ('Occupational Category (PATCO)', (75, 76)),
    ('Adjusted Basic Pay', (76, 82)),
    ('Type of Appointment', (82, 84)),
    ('Work Schedule', (84, 85))
])

In [4]:
# Column layout of the OPM status fixed-width files:
# column name -> (start, end) character offsets, in file order.
opm_status_fwf_dict = OrderedDict()
opm_status_fwf_dict['Pseudo-ID'] = (0, 9)
opm_status_fwf_dict['Employee Name'] = (9, 32)
opm_status_fwf_dict['File Date (yyyymmdd)'] = (32, 40)
opm_status_fwf_dict['Agency/Subelement'] = (40, 44)
opm_status_fwf_dict['Duty Station'] = (44, 53)
opm_status_fwf_dict['Age Range'] = (53, 59)
opm_status_fwf_dict['Education Level'] = (59, 61)
opm_status_fwf_dict['Pay Plan'] = (61, 63)
opm_status_fwf_dict['Grade'] = (63, 65)
opm_status_fwf_dict['LOS Level'] = (65, 71)
opm_status_fwf_dict['Occupation'] = (71, 75)
opm_status_fwf_dict['Occupational Category (PATCO)'] = (75, 76)
opm_status_fwf_dict['Adjusted Basic Pay'] = (76, 82)
opm_status_fwf_dict['Supervisory Status'] = (82, 83)
opm_status_fwf_dict['Type of Appointment'] = (83, 85)
opm_status_fwf_dict['Work Schedule'] = (85, 86)
opm_status_fwf_dict['NSFTP Indicator'] = (86, 87)

In [5]:
# Dask dataframe over every status file matching the glob, restricted to the
# ID, name and education columns; all values are read as strings.
status_ddf = ddf.read_fwf(
    opm_status_dir_dask,
    names = [*opm_status_fwf_dict.keys()],
    colspecs = [*opm_status_fwf_dict.values()],
    usecols = ['Pseudo-ID', 'Employee Name', 'Education Level'],
    dtype = str,
)

In [6]:
# Dask dataframe over every dynamic file matching the glob; all columns of the
# dynamic layout are kept and read as strings.
dynamic_ddf = ddf.read_fwf(
    opm_dynamic_dir_dask,
    names = [*opm_dynamic_fwf_dict.keys()],
    colspecs = [*opm_dynamic_fwf_dict.values()],
    dtype = str,
)

In [7]:
# Define functions that return a dict of files in the status or dynamic directory, indexed by year and quarter
def produce_opm_dynamic_path_dict(input_path):
    """Index the dynamic files in a directory by (year, quarter).

    The date of each file is parsed from the first seven characters of its
    name. Directory entries whose prefix is not a date (hidden files, notes,
    ...) are skipped instead of aborting the whole scan.

    Parameters
    ----------
    input_path : str
        Directory containing the dynamic files.

    Returns
    -------
    dict
        ``{(year, quarter): path}`` with ``path`` being ``input_path`` joined
        with the file name. If several files fall into the same quarter, the
        one that sorts last by name is kept.
    """
    output_dict = {}
    # Sorted so the result does not depend on the OS directory listing order.
    for file_name in sorted(os.listdir(input_path)):
        try:
            file_date = pd.to_datetime(file_name[0:7])
        except (ValueError, OverflowError):
            # Prefix is not a date: not one of the data files.
            continue
        if pd.isna(file_date):
            # Some strings parse to NaT rather than raising.
            continue
        output_dict[(file_date.year, file_date.quarter)] = os.path.join(input_path, file_name)

    return output_dict

def produce_opm_status_path_dict(input_path):
    """Index the status files in a directory by (year, quarter).

    The date of each file is parsed from characters 15-21 of its name, with
    underscores turned into dashes (presumably the ``YYYY_MM`` part of names
    shaped like ``Status_Non_DoD_YYYY_MM.txt`` -- confirm against the data).
    Entries whose slice is empty or not a date are skipped; previously a short
    name such as ``.DS_Store`` produced an empty slice, parsed to NaT, and was
    stored under a ``(nan, nan)`` key.

    Parameters
    ----------
    input_path : str
        Directory containing the status files.

    Returns
    -------
    dict
        ``{(year, quarter): path}`` with ``path`` being ``input_path`` joined
        with the file name. If several files fall into the same quarter, the
        one that sorts last by name is kept.
    """
    output_dict = {}
    # Sorted so the result does not depend on the OS directory listing order.
    for file_name in sorted(os.listdir(input_path)):
        date_text = '-'.join(file_name[15:22].split('_'))
        try:
            file_date = pd.to_datetime(date_text)
        except (ValueError, OverflowError):
            # Slice is not a date: not one of the data files.
            continue
        if pd.isna(file_date):
            # Empty slices parse to NaT rather than raising.
            continue
        output_dict[(file_date.year, file_date.quarter)] = os.path.join(input_path, file_name)

    return output_dict

In [8]:
# Define function that imports files into dict within a certain timeframe
def produce_opm_df_dict(opm_path_dict, opm_fwf_dict, start_year, start_qtr, end_year, end_qtr):
    """Read the fixed-width file of every quarter in an inclusive window.

    Parameters
    ----------
    opm_path_dict : dict
        ``{(year, quarter): file path}``; must contain every quarter in the
        window, otherwise the lookup raises ``KeyError``.
    opm_fwf_dict : mapping
        Column name -> ``(start, end)`` offsets; keys become the column names
        and values the ``colspecs`` handed to ``pd.read_fwf``.
    start_year, start_qtr : int
        First quarter to load (inclusive).
    end_year, end_qtr : int
        Last quarter to load (inclusive).

    Returns
    -------
    dict
        ``{(year, quarter): DataFrame}`` with every column read as ``str``.
    """
    def in_window(yr, q):
        # Only the first and last year can have quarters outside the window.
        before_start = (yr == start_year) and (q < start_qtr)
        after_end = (yr == end_year) and (q > end_qtr)
        return not (before_start or after_end)

    return {
        (yr, q): pd.read_fwf(
            opm_path_dict[(yr, q)],
            colspecs = list(opm_fwf_dict.values()),
            names = list(opm_fwf_dict.keys()),
            dtype = str,
        )
        for yr in range(start_year, end_year + 1)
        for q in range(1, 5)
        if in_window(yr, q)
    }

In [9]:
# Inclusive (year, quarter) bounds of the files we want to load
start_year, start_qtr = 1988, 3
end_year, end_qtr = 1989, 4

In [10]:
# Index the files found in each raw directory by (year, quarter)
opm_dynamic_path_dict = produce_opm_dynamic_path_dict(input_path = opm_dynamic_dir)
opm_status_path_dict = produce_opm_status_path_dict(input_path = opm_status_dir)

In [11]:
# Containers for the per-quarter dataframes and for the (year, quarter) keys
# of files that fail to load. Nothing in this cell fills them.
opm_status_df_dict = {}
opm_dynamic_df_dict = {}
error_status_df_list = []
error_dynamic_df_list = []

# Scan every status file in the requested window for 0xFF bytes and report
# any line whose first 0xFF is not at offset 85 (the start of the
# 'Work Schedule' field in opm_status_fwf_dict).
for year in range(start_year, end_year + 1):
    for qtr in range(1, 5):
        if (year == start_year) and (qtr < start_qtr):
            continue
        if (year == end_year) and (qtr > end_qtr):
            continue

        print(f'Loading year {year}, quarter {qtr}.')

        # latin-1 maps every byte to exactly one character, so the character
        # offset equals the byte offset and the result does not depend on the
        # platform's default encoding. (With the locale default plus
        # errors='backslashreplace', a cp1252 locale decodes 0xFF silently and
        # each earlier undecodable byte shifts the offset by three.)
        with open(opm_status_path_dict[(year, qtr)], 'r', encoding = 'latin-1') as status_file:
            for line in status_file:
                first_bad = line.find('\xff')
                if first_bad not in (-1, 85):
                    print('Error not at position 85')

        #opm_status_df_dict[(year, qtr)].to_csv('test_latin1.csv')

#opm_status_df_dict[(1982, 1)] = opm_status_df_dict[(1982, 1)][:-1] # Drop last row which has name of other files for some reason

Loading year 1988, quarter 3.
Loading year 1988, quarter 4.
Loading year 1989, quarter 1.
Loading year 1989, quarter 2.
Loading year 1989, quarter 3.
Loading year 1989, quarter 4.


Problem loading year 1988, Q3.

# Attach education levels from the status files to each dynamic dataframe.
# NOTE: expects opm_status_df_dict and opm_dynamic_df_dict to already hold the
# per-quarter dataframes; a missing (year, qtr) key in opm_dynamic_df_dict
# raises KeyError. Re-running the cell merges again into the already merged
# frames, so run it once per load.
for year in range(start_year, end_year + 1):
    for qtr in range(1, 5):
        if (year == start_year) and (qtr < start_qtr):
            continue
        if (year == end_year) and (qtr > end_qtr):
            continue

        # Quarters counted on a single linear axis so that windows can span years
        yq_current = year*4 + qtr

        print(f'Concatenating year {year}, quarter {qtr}.')

        # Want to merge education info from status files within the year
        # (4 quarters) before and after the relevant dynamic file date
        educ_frames = [
            status_df[['Pseudo-ID', 'Education Level']]
            for status_key, status_df in opm_status_df_dict.items()
            if -4 <= status_key[0]*4 + status_key[1] - yq_current <= 4
        ]

        if educ_frames:
            # Concatenate once instead of growing a frame inside the loop.
            # The same (ID, education) pair appears in every quarterly
            # snapshot; dropping exact repeats keeps the left merge from
            # multiplying each dynamic row by the number of snapshots. IDs
            # with several distinct education values still yield several rows.
            educ_df_to_merge = pd.concat(educ_frames, ignore_index = True).drop_duplicates(ignore_index = True)
        else:
            # No status file in the window: keep the merge keys so the merge
            # below adds an all-missing 'Education Level' column instead of
            # failing on a frame without 'Pseudo-ID'.
            educ_df_to_merge = pd.DataFrame(columns = ['Pseudo-ID', 'Education Level'])

        print(f'Merging year {year}, quarter {qtr}.')
        # Merge education info to dynamic dfs
        opm_dynamic_df_dict[(year, qtr)] = opm_dynamic_df_dict[(year, qtr)].merge(educ_df_to_merge, how = 'left', on = ['Pseudo-ID'])