In [1]:
import pandas as pd
from pandas.api.types import union_categoricals
import numpy as np
from pathlib import Path
import pickle

In [2]:
# Load the two pickled dicts of feather-file paths.
# The per-quarter loops further down index these dicts by (year, qtr).
# NOTE(review): pickle executes arbitrary code on load -- only use trusted .pkl files.
opm_dynamic_feather_path_dict = pickle.loads(
    Path('opm_dynamic_feather_path_dict.pkl').read_bytes()
)
opm_status_feather_path_dict = pickle.loads(
    Path('opm_status_feather_path_dict.pkl').read_bytes()
)

In [3]:
# Columns of the dynamic files that are handled as categoricals.
opm_dynamic_cat_col_list = [
    'Agency/Subelement', 'Accession/Separation Indicator',
    'Age', 'Pay Plan', 'Grade', 'LOS Level',
    'Duty Station', 'Occupation',
    'Type of Appointment', 'Work Schedule',
]

In [4]:
# Columns of the status files that are handled as categoricals.
opm_status_cat_col_list = [
    'Agency/Subelement', 'Duty Station',
    'Age Range', 'Education Level',
    'Pay Plan', 'Grade', 'LOS Level',
    'Occupation', 'Occupational Category (PATCO)',
    'Supervisory Status', 'Type of Appointment',
    'Work Schedule', 'NSFTP Indicator',
]

In [5]:
# Nested lookup per dataset: column name -> {(year, qtr): CategoricalIndex of that column}.
# Each column starts with an empty inner dict; the per-quarter loops fill them in.
opm_dynamic_cat_index_dict = {name: {} for name in opm_dynamic_cat_col_list}
opm_status_cat_index_dict = {name: {} for name in opm_status_cat_col_list}

In [6]:
# Inclusive (year, quarter) window walked by the loop that reads the dynamic files.
start_year, start_qtr = 1982, 1
end_year, end_qtr = 2014, 2

In [7]:
# Visit every quarter in the inclusive window (start_year, start_qtr)..(end_year, end_qtr)
# and keep each categorical column of that quarter's dynamic file as a CategoricalIndex.
for year in range(start_year, end_year + 1):
    for qtr in range(1, 5):
        # Lexicographic tuple comparison drops the quarters before the first period
        # and after the last one; every other (year, qtr) pair passes.
        in_window = (start_year, start_qtr) <= (year, qtr) <= (end_year, end_qtr)
        if not in_window:
            continue

        df = pd.read_feather(opm_dynamic_feather_path_dict[year, qtr])

        for col in opm_dynamic_cat_col_list:
            opm_dynamic_cat_index_dict[col][year, qtr] = pd.CategoricalIndex(df[col])


In [8]:
# For each categorical column, merge the per-quarter CategoricalIndex objects into a
# single categorical whose categories span every quarter that was read.
opm_dynamic_uc_dict = {
    column: union_categoricals(list(per_quarter.values()))
    for column, per_quarter in opm_dynamic_cat_index_dict.items()
}

In [9]:
# Inclusive (year, quarter) window walked by the loop that reads the status files.
# These rebind the same names the dynamic window used, so the dynamic loop must
# already have run before this cell executes.
start_year, start_qtr = 1973, 3
end_year, end_qtr = 2014, 2

In [10]:
# Visit every quarter in the inclusive window (start_year, start_qtr)..(end_year, end_qtr)
# and keep each categorical column of that quarter's status file as a CategoricalIndex.
for year in range(start_year, end_year + 1):
    for qtr in range(1, 5):
        # Lexicographic tuple comparison drops the quarters before the first period
        # and after the last one; every other (year, qtr) pair passes.
        in_window = (start_year, start_qtr) <= (year, qtr) <= (end_year, end_qtr)
        if not in_window:
            continue

        df = pd.read_feather(opm_status_feather_path_dict[year, qtr])

        for col in opm_status_cat_col_list:
            opm_status_cat_index_dict[col][year, qtr] = pd.CategoricalIndex(df[col])

In [14]:
# For each categorical column, merge the per-quarter CategoricalIndex objects into a
# single categorical whose categories span every quarter that was read.
opm_status_uc_dict = {
    column: union_categoricals(list(per_quarter.values()))
    for column, per_quarter in opm_status_cat_index_dict.items()
}

In [19]:
# Update categorical mapping for all binaries, to enable type retention when concatenating.
#
# pd.Categorical(values, categories=...) converts every value that is missing from
# `categories` into NaN without warning, and the recoded frame is written back over its
# source file. This loop covers every entry of the path dict, while the unions were built
# only from the explicit year/quarter window, so check that recoding loses nothing
# before anything is overwritten.
for yq, path in opm_dynamic_feather_path_dict.items():
    df = pd.read_feather(path)

    for col, uc in opm_dynamic_uc_dict.items():
        recoded = pd.Categorical(df[col], categories=uc.categories)

        # True where a value existed before recoding but is NaN afterwards.
        dropped = recoded.isna() & df[col].notna().to_numpy()
        if dropped.any():
            examples = list(df.loc[dropped, col].unique()[:5])
            raise ValueError(
                f"Recoding column {col!r} of {path} (period {yq}) would turn "
                f"{int(dropped.sum())} value(s) into NaN because they are not in the "
                f"union categories, e.g. {examples}; file left untouched."
            )

        df[col] = recoded

    # Only reached when every column was recoded without loss.
    df.to_feather(path)

In [25]:
# Recode the categorical columns of every status binary to the shared union categories.
#
# pd.Categorical(values, categories=...) converts every value that is missing from
# `categories` into NaN without warning, and the recoded frame is written back over its
# source file. This loop covers every entry of the path dict, while the unions were built
# only from the explicit year/quarter window, so check that recoding loses nothing
# before anything is overwritten.
for yq, path in opm_status_feather_path_dict.items():
    df = pd.read_feather(path)

    for col, uc in opm_status_uc_dict.items():
        recoded = pd.Categorical(df[col], categories=uc.categories)

        # True where a value existed before recoding but is NaN afterwards.
        dropped = recoded.isna() & df[col].notna().to_numpy()
        if dropped.any():
            examples = list(df.loc[dropped, col].unique()[:5])
            raise ValueError(
                f"Recoding column {col!r} of {path} (period {yq}) would turn "
                f"{int(dropped.sum())} value(s) into NaN because they are not in the "
                f"union categories, e.g. {examples}; file left untouched."
            )

        df[col] = recoded

    # Only reached when every column was recoded without loss.
    df.to_feather(path)