In [1]:
import pandas as pd
from collections import OrderedDict
from pathlib import Path
import dask.dataframe as ddf

opm_dynamic_dir = os.path.join('..', 'rawdata', 'opm-federal-employment-data', 'data', '1973-09-to-2014-06', 'non-dod', 'dynamic')
opm_status_dir = os.path.join('..', 'rawdata', 'opm-federal-employment-data', 'data', '1973-09-to-2014-06', 'non-dod', 'status')
opm_dynamic_dir_dask = os.path.join('..', 'rawdata', 'opm-federal-employment-data', 'data', '1973-09-to-2014-06', 'non-dod', 'dynamic', '*.NONDOD.FO05M3.TXT')
opm_status_dir_dask = os.path.join('..', 'rawdata', 'opm-federal-employment-data', 'data', '1973-09-to-2014-06', 'non-dod', 'status', 'Status_Non_DoD_*_*.txt')

In [2]:
opm_dynamic_dir = Path('../rawdata/opm-federal-employment-data/data/1973-09-to-2014-06/non-dod/dynamic')
opm_status_dir = Path('../rawdata/opm-federal-employment-data/data/1973-09-to-2014-06/non-dod/status')
opm_dynamic_dir_dask = Path('../rawdata/opm-federal-employment-data/data/1973-09-to-2014-06/non-dod/dynamic/*.NONDOD.FO05M3.TXT')
opm_status_dir_dask = Path('../rawdata/opm-federal-employment-data/data/1973-09-to-2014-06/non-dod/status/Status_Non_DoD_*_*.txt')

In [3]:
# Define the dict for reading OPM dynamic files
opm_dynamic_fwf_dict = OrderedDict([
    ('Pseudo-ID', (0, 9)),
    ('Employee Name', (9, 32)),
    ('Agency/Subelement', (32, 36)),
    ('Accession/Separation Indicator', (36, 38)),
    ('Effective Date', (38, 46)),
    ('Age', (46, 52)),
    ('Pay Plan', (52, 54)),
    ('Grade', (54, 56)),
    ('LOS Level', (56, 62)),
    ('Duty Station', (62, 71)),
    ('Occupation', (71, 75)),
    ('Occupational Cateogy (PATCO)', (75, 76)),
    ('Adjusted Basic Pay', (76, 82)),
    ('Type of Appointment', (82, 84)),
    ('Work Schedule', (84, 85))
])

In [4]:
# Define the dict for reading OPM status files
opm_status_fwf_dict = OrderedDict([
    ('Pseudo-ID', (0, 9)),
    ('Employee Name', (9, 32)),
    ('File Date (yyyymmdd)', (32, 40)),
    ('Agency/Subelement', (40, 44)),
    ('Duty Station', (44, 53)),
    ('Age Range', (53, 59)),
    ('Education Level', (59, 61)),
    ('Pay Plan', (61, 63)),
    ('Grade', (63, 65)),
    ('LOS Level', (65, 71)),
    ('Occupation', (71, 75)),
    ('Occupational Category (PATCO)', (75, 76)),
    ('Adjusted Basic Pay', (76, 82)),
    ('Supervisory Status', (82, 83)),
    ('Type of Appointment', (83, 85)),
    ('Work Schedule', (85, 86)),
    ('NSFTP Indicator', (86, 87))
])

In [5]:
# Define functions that return a dict of files in the status or dynamic directory, indexed by year and quarter
def produce_opm_dynamic_path_dict(input_path):
    output_dict = {}
    for file_path in Path(input_path).iterdir():
        file_name = file_path.name
        file_date = pd.to_datetime(file_name[0:7])
        year = file_date.year
        quarter = file_date.quarter
        output_dict[(year, quarter)] = file_path

    return output_dict

def produce_opm_status_path_dict(input_path):
    output_dict = {}
    for file_path in Path(input_path).iterdir():
        file_name = file_path.name
        file_date = pd.to_datetime('-'.join(file_name[15:22].split('_')))
        year = file_date.year
        quarter = file_date.quarter
        output_dict[(year, quarter)] = file_path

    return output_dict

In [6]:
# years and quarters we want to load (inclusive)
start_year = 1973
start_qtr = 3
end_year = 1973
end_qtr = 3

In [15]:
for path in Path(opm_status_dir).iterdir():
    file = path.name

In [16]:
file[15:22]

'2014_06'

In [97]:
# Create dict of paths of files
opm_dynamic_path_dict = produce_opm_dynamic_path_dict(opm_dynamic_dir)
opm_status_path_dict = produce_opm_status_path_dict(opm_status_dir)

TypeError: 'WindowsPath' object is not subscriptable

In [84]:
Path('../data/converted_to_csv').mkdir(parents = True, exist_ok = True)
    
# Convert fwf to csv for faster access
for year in range(start_year, end_year + 1):
    for qtr in range(1, 5):
        if (year == start_year) and (qtr < start_qtr):
            continue
        if (year == end_year) and (qtr > end_qtr):
            continue
        
        try:
            df = pd.read_fwf(opm_status_path_dict[(year, qtr)], colspecs = list(opm_status_fwf_dict.values()), names = list(opm_status_fwf_dict.keys()), dtype = str)
            target_path = f'../data/converted_to_csv/status_{year}_{qtr}.csv'
            df.to_csv(target_path)
            print(f'Imported status file for year {year}, quarter {qtr}.')
        except:
            print(f'Status file for year {year}, quarter {qtr} does not exist.')
        try:
            df = pd.read_fwf(opm_dynamic_path_dict[(year, qtr)], colspecs = list(opm_dynamic_fwf_dict.values()), names = list(opm_dynamic_fwf_dict.keys()), dtype = str)
            target_path = f'../data/converted_to_csv/dynamic_{year}_{qtr}.csv'
            df.to_csv(target_path)
            print(f'Imported dynamic file for year {year}, quarter {qtr}.')
        except:
            print(f'Dynamic file for year {year}, quarter {qtr} does not exist.')

AttributeError: 'str' object has no attribute '_accessor'

In [10]:
# Define function that imports files into dict within a certain timeframe
def produce_opm_df_dict(opm_path_dict, opm_fwf_dict, start_year, start_qtr, end_year, end_qtr):
    output_df_dict = {}
    for year in range(start_year, end_year + 1):
        for qtr in range(1, 5):
            if (year == start_year) and (qtr < start_qtr):
                continue
            if (year == end_year) and (qtr > end_qtr):
                continue

            output_df_dict[(year, qtr)] = pd.read_fwf(opm_path_dict[(year, qtr)], colspecs = list(opm_fwf_dict.values()), names = list(opm_fwf_dict.keys()), dtype = str)

    return output_df_dict

In [20]:
pd.read_fwf(opm_status_path_dict[(2014, 1)], colspecs = list(opm_status_fwf_dict.values()), names = list(opm_status_fwf_dict.keys()), dtype = str).to_csv('test.csv')

In [22]:
pd.read_csv('test.csv', dtype = str)

Unnamed: 0.1,Unnamed: 0,Pseudo-ID,Employee Name,File Date (yyyymmdd),Agency/Subelement,Duty Station,Age Range,Education Level,Pay Plan,Grade,LOS Level,Occupation,Occupational Category (PATCO),Adjusted Basic Pay,Supervisory Status,Type of Appointment,Work Schedule,NSFTP Indicator
0,0,000898655,"JACOBS,GRETCHEN E",20140331,AA00,110010001,50-54,15,ES,00,20-24,0905,P,161600,8,50,F,1
1,1,001103456,"WIENER,MATTHEW LEE",20140331,AA00,110010001,40-44,15,ES,00,3-4,0301,A,161600,2,50,F,1
2,2,001745142,"BULL,REEVE T",20140331,AA00,110010001,30-34,15,GS,14,3-4,0905,P,113346,8,30,F,1
3,3,001981032,"SEIDMAN,HARRY M",20140331,AA00,110010001,30-34,13,GS,15,10-14,0341,A,141660,8,10,F,1
4,4,002168537,"TATHAM,STEPHANIE J",20140331,AA00,110010001,30-34,15,GS,13,1-2,0905,P,089924,8,30,F,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1317710,1317710,006365025,"POORMAN,JOSHUA W",20140331,ZU00,110010001,20-24,13,AD,00,< 1,0301,A,032486,8,48,F,2
1317711,1317711,006489582,"CRAMER,ALBERT H",20140331,ZU00,110010001,20-24,13,AD,00,1-2,0301,A,041410,8,48,F,2
1317712,1317712,007382526,NAME WITHHELD BY AGENCY,20140331,ZU00,#########,35-39,17,AD,00,3-4,0301,A,095785,8,48,F,2
1317713,1317713,009317335,"JACOBSON,JOYCE T",20140331,ZU00,110010001,65-69,07,AD,00,5-9,0301,A,131053,2,48,F,2


opm_status_df_dict = {}
opm_dynamic_df_dict = {}
error_status_df_list = []
error_dynamic_df_list = []

# Load dataframes we want
for year in range(start_year, end_year + 1):
    for qtr in range(1, 5):
        if (year == start_year) and (qtr < start_qtr):
            continue
        if (year == end_year) and (qtr > end_qtr):
            continue

        print(f'Loading year {year}, quarter {qtr}.')

        #try:
        #    pd.read_fwf(opm_status_path_dict[(year, qtr)], colspecs = list(opm_status_fwf_dict.values()), names = list(opm_status_fwf_dict.keys()), dtype = str)
        #except:
        #    error_status_df_list.append((year, qtr))
        #    print(f'Error for status file in {year},q{qtr}.')

        #try:
        #    pd.read_fwf(opm_dynamic_path_dict[(year, qtr)], colspecs = list(opm_dynamic_fwf_dict.values()), names = list(opm_dynamic_fwf_dict.keys()), dtype = str)
        #except:
        #    error_dynamic_df_list.append((year, qtr))
        #    print(f'Error for dynamic file in {year},q{qtr}.')
        
        line_counter = 0
        with open(opm_dynamic_path_dict[(year, qtr)], 'r', errors = 'backslashreplace') as f:
            for line in f:
                line_counter = line_counter + 1
                if '\\xff' in line:
                    print(f'Error found')
                    #print(f'Error found at line {line_counter}.')
                    #if line.find('\\xff') != 85:
                    #    print('Error not at position 85')

        #opm_status_df_dict[(year, qtr)].to_csv('test_latin1.csv')

#opm_status_df_dict[(1982, 1)] = opm_status_df_dict[(1982, 1)][:-1] # Drop last row which has name of other files for some reason

In [12]:
status_ddf = ddf.read_fwf(opm_status_dir_dask, colspecs = list(opm_status_fwf_dict.values()), names = list(opm_status_fwf_dict.keys()), dtype = str, encoding = 'latin-1', usecols = ['Pseudo-ID, Education Level, Work Schedule'])

In [13]:
dynamic_ddf = ddf.read_fwf(opm_dynamic_dir_dask, colspecs = list(opm_dynamic_fwf_dict.values()), names = list(opm_dynamic_fwf_dict.keys()), dtype = str)

In [14]:
status_ddf.compute()

Unnamed: 0,Pseudo-ID,Employee Name,File Date (yyyymmdd),Agency/Subelement,Duty Station,Age Range,Education Level,Pay Plan,Grade,LOS Level,Occupation,Occupational Category (PATCO),Adjusted Basic Pay,Supervisory Status,Type of Appointment,Work Schedule,NSFTP Indicator
0,000148862,"STUCKEY,JAMES E,",19730930,AA00,110010001,50-54,03,GS,05,15-19,0301,C,008465,8,10,F,1
1,000278418,PALMER BETTY M,19730930,AA00,110010001,30-34,07,GS,07,10-14,0318,C,010471,8,10,F,1
2,003141713,SHAW MARY J,19730930,AA00,110010001,35-39,04,GS,07,UNSP,****,,009520,8,15,F,1
3,003733130,NAGI SAAD Z,19730930,AA00,391800049,45-49,21,EC,00,1-2,0000,,026000,8,**,I,2
4,003882030,WOLF JAMES L,19730930,AA00,110010001,20-24,**,GS,11,UNSP,0904,P,013996,*,40,F,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1331047,006365025,"POORMAN,JOSHUA W",20140630,ZU00,110010001,25-29,13,AD,00,< 1,0301,A,032486,8,48,F,2
1331048,006489582,"CRAMER,ALBERT H",20140630,ZU00,110010001,25-29,13,AD,00,1-2,0301,A,041410,8,48,F,2
1331049,007382526,NAME WITHHELD BY AGENCY,20140630,ZU00,#########,30-34,17,AD,00,3-4,0301,A,095785,8,48,F,2
1331050,009317335,"JACOBSON,JOYCE T",20140630,ZU00,110010001,70-74,07,AD,00,5-9,0301,A,131053,2,48,F,2


In [15]:
dynamic_ddf.compute()

KeyboardInterrupt: 

Problem in loading year 1988 q3

for year in range(start_year, end_year + 1):
    for qtr in range(1, 5):
        if (year == start_year) and (qtr < start_qtr):
            continue
        if (year == end_year) and (qtr > end_qtr):
            continue

        yq_current = year*4 + qtr

        print(f'Concatenating year {year}, quarter {qtr}.')

        # Produce df to merge
        educ_df_to_merge = pd.DataFrame()
        for key, df in opm_status_df_dict.items():
            yq_df = key[0]*4 + key[1]

            # Want to merge education info from status file within the year before and after the relevant dynamic file date
            if -4 <= yq_df - yq_current <= 4:
                educ_df_to_merge = pd.concat([educ_df_to_merge, df[['Pseudo-ID', 'Education Level']]], ignore_index = True)

        print(f'Merging year {year}, quarter {qtr}.')
        # Merge education info to dynamic dfs
        opm_dynamic_df_dict[(year, qtr)] = opm_dynamic_df_dict[(year, qtr)].merge(educ_df_to_merge, how = 'left', on = ['Pseudo-ID'])