In [1]:
from pathlib import Path
import pandas as pd
import json
from pandas.api.types import union_categoricals
from itertools import islice
import re

In [2]:
oews_path = Path('../raw_data/oews')

In [3]:
# Create dict with list of MSA files within each folder-year
oews_path_dict = {}
for folder in Path(oews_path).iterdir():
    folder_name = folder.name
    year = re.findall('\d+', folder_name)[0]
    
    if year == '97' or year == '98' or year == '99':
        year = '19' + year
    else:
        year = '20' + year

    file_list = []
    for file in Path(folder).iterdir():
        file_name = file.name
        file_prefix = file_name[0:3]

        # All MSA files is prefixed with 'msa' or 'MSA', except for 1997, which has 'oes' prefix
        if file_prefix == 'msa' or file_prefix == 'MSA' or file_prefix == 'oes':
            file_list.append(file)

    oews_path_dict[year] = file_list

In [4]:
# Directory we are storing processed dataframes in binary format, for quick access later
binary_path = Path('../cleaned_binaries/')
binary_path.mkdir(parents = True, exist_ok = True)

# Dict to store paths to processed binaries
oews_feather_path_dict = {}

In [5]:
# These variables we want to store as strings
oews_dtype_dict = {
    'area': 'string',
    'area_name': 'string',
    'occ_code': 'string',
    'occ_title': 'string'
}

In [6]:
for year, file_list in oews_path_dict.items():
    file_list_df_dict = {} # Dict to store all relevant dfs in each year-folder

    print(f'Loading files for year {year}.')
    
    for file_path in file_list:

        # Prior to 2001, field descriptions were placed in the header rows

        if int(year) < 2001:
            df = pd.read_excel(file_path, dtype = str)

            header_row = df['Unnamed: 0'].isna().values.argmin() # Find first row that is non-empty in first column, which should be the header row
            header = df.iloc[header_row]

            # Create new df without field description and header rows, and new headers
            new_df = df.iloc[header_row + 1:]
            new_df = new_df.rename(columns = header)
            
        else:
            new_df = pd.read_excel(file_path, header = 0, dtype = str)

        # Keep only columns we want and store in df dict
        # For year 2000, occ_title has a typo
        if int(year) == 2000:
            new_df.rename(columns = {'occ_titl': 'occ_title'}, inplace = True)

        new_df.columns = new_df.columns.str.lower() # Convert to lowercase column titles for all years

        # Starting in 2019, area_name was changed to area_title
        if int(year) >= 2019:
            new_df.rename(columns = {'area_title': 'area_name'}, inplace = True)

        new_df = new_df[['area', 'area_name', 'occ_code', 'occ_title', 'tot_emp', 'a_mean']]
        file_list_df_dict[file_path.name] = new_df

    combined_df = pd.concat([df for df in file_list_df_dict.values()]) # Concat all dfs into one
    combined_df.reset_index(drop = True, inplace = True)

    combined_df['tot_emp'] = pd.to_numeric(combined_df['tot_emp'], errors = 'coerce')
    combined_df['a_mean'] = pd.to_numeric(combined_df['a_mean'], errors = 'coerce')

    target_path = binary_path.joinpath(f'oews_{year}.feather')
    combined_df.to_feather(target_path)

    oews_feather_path_dict[year] = str(target_path)


Loading files for year 2000.
Loading files for year 2001.
Loading files for year 2002.
Loading files for year 1997.
Loading files for year 1998.
Loading files for year 1999.
Loading files for year 2003.
Loading files for year 2004.
Loading files for year 2005.
Loading files for year 2006.
Loading files for year 2007.
Loading files for year 2008.
Loading files for year 2009.
Loading files for year 2010.
Loading files for year 2011.
Loading files for year 2012.
Loading files for year 2013.
Loading files for year 2014.
Loading files for year 2015.
Loading files for year 2016.
Loading files for year 2017.
Loading files for year 2018.
Loading files for year 2019.
Loading files for year 2020.
Loading files for year 2021.


In [7]:
# Save dictionary for paths of binaries
with open('../code_output/oews_feather_path_dict.json', 'w') as outfile:
    json.dump(oews_feather_path_dict, outfile, indent = 4)
