In [63]:
from pathlib import Path
import pandas as pd
import json
from pandas.api.types import union_categoricals
from itertools import islice

In [2]:
# Read the JSON files that map each year and quarter to the path of its
# feather binary (one mapping for the dynamic files, one for the status files).
opm_dynamic_feather_path_dict = json.loads(
    Path('opm_dynamic_feather_path_dict.json').read_text()
)
opm_status_feather_path_dict = json.loads(
    Path('opm_status_feather_path_dict.json').read_text()
)

In [3]:
# Names of the columns in the dynamic files that are handled as categoricals.
opm_dynamic_cat_col_list = json.loads(
    Path('opm_dynamic_cat_col_list.json').read_text()
)

In [4]:
# Names of the columns in the status files that are handled as categoricals.
opm_status_cat_col_list = json.loads(
    Path('opm_status_cat_col_list.json').read_text()
)

In [5]:
# Nested containers for the categorical indexes, laid out as
# column -> year -> quarter. The innermost (per-quarter) level starts empty.
opm_dynamic_cat_index_dict = {
    cat_col: {yr: {} for yr in opm_dynamic_feather_path_dict}
    for cat_col in opm_dynamic_cat_col_list
}

opm_status_cat_index_dict = {
    cat_col: {yr: {} for yr in opm_status_feather_path_dict}
    for cat_col in opm_status_cat_col_list
}

In [6]:
# Build a CategoricalIndex for every categorical column of every dynamic
# feather file and store it under column -> year -> quarter.
for year, qtr_dict in opm_dynamic_feather_path_dict.items():
    for qtr, str_path in qtr_dict.items():
        # Only the categorical columns are used in this cell, so load just
        # those columns instead of reading the whole file into memory.
        df = pd.read_feather(Path(str_path), columns=opm_dynamic_cat_col_list)

        for col in opm_dynamic_cat_col_list:
            cat_index = pd.CategoricalIndex(df[col])
            opm_dynamic_cat_index_dict[col][year][qtr] = cat_index

In [7]:
# Build a CategoricalIndex for every categorical column of every status
# feather file and store it under column -> year -> quarter.
for year, qtr_dict in opm_status_feather_path_dict.items():
    for qtr, str_path in qtr_dict.items():
        # Only the categorical columns are used in this cell, so load just
        # those columns instead of reading the whole file into memory.
        df = pd.read_feather(Path(str_path), columns=opm_status_cat_col_list)

        for col in opm_status_cat_col_list:
            cat_index = pd.CategoricalIndex(df[col])
            opm_status_cat_index_dict[col][year][qtr] = cat_index

In [8]:
# For each categorical column, pool the per-quarter indexes of every year and
# store their union under the column name.
opm_dynamic_uc_dict = {}
for col, year_dict in opm_dynamic_cat_index_dict.items():
    pooled_indexes = [
        qtr_index
        for qtr_map in year_dict.values()
        for qtr_index in qtr_map.values()
    ]
    opm_dynamic_uc_dict[col] = union_categoricals(pooled_indexes)

In [9]:
# For each categorical column, pool the per-quarter indexes of every year and
# store their union under the column name.
opm_status_uc_dict = {}
for col, year_dict in opm_status_cat_index_dict.items():
    pooled_indexes = [
        qtr_index
        for qtr_map in year_dict.values()
        for qtr_index in qtr_map.values()
    ]
    opm_status_uc_dict[col] = union_categoricals(pooled_indexes)

In [10]:
# Re-encode each categorical column of every dynamic binary with the unioned
# categories and overwrite the file in place, so that all files share the same
# categories per column (enables type retention when concatenating).
for qtr_dict in opm_dynamic_feather_path_dict.values():
    for str_path in qtr_dict.values():
        feather_path = Path(str_path)
        df = pd.read_feather(feather_path)

        for col, uc in opm_dynamic_uc_dict.items():
            shared_categories = uc.categories
            df[col] = pd.Categorical(df[col], categories=shared_categories)

        df.to_feather(feather_path)

In [11]:
# Re-encode each categorical column of every status binary with the unioned
# categories and overwrite the file in place, so that all files share the same
# categories per column.
for qtr_dict in opm_status_feather_path_dict.values():
    for str_path in qtr_dict.values():
        feather_path = Path(str_path)
        df = pd.read_feather(feather_path)

        for col, uc in opm_status_uc_dict.items():
            shared_categories = uc.categories
            df[col] = pd.Categorical(df[col], categories=shared_categories)

        df.to_feather(feather_path)

In [12]:
# Write the sorted category values of each dynamic categorical column to its
# own text file, one value per line.
for col, cats in opm_dynamic_uc_dict.items():
    # '/' is replaced so the column name cannot act as a path separator
    # inside the output file name.
    file_stem = col.replace('/', '-')
    # Bind the sorted values to a descriptive name rather than `list`, which
    # would shadow the builtin for every cell run afterwards in this kernel.
    sorted_categories = sorted(cats.categories.values.tolist())
    with open(f'../output/dynamic_{file_stem}_cats.txt', 'w') as outfile:
        for item in sorted_categories:
            outfile.write(item + '\n')

In [13]:
# Write the sorted category values of each status categorical column to its
# own text file, one value per line.
for col, cats in opm_status_uc_dict.items():
    # '/' is replaced so the column name cannot act as a path separator
    # inside the output file name.
    file_stem = col.replace('/', '-')
    # Bind the sorted values to a descriptive name rather than `list`, which
    # would shadow the builtin for every cell run afterwards in this kernel.
    sorted_categories = sorted(cats.categories.values.tolist())
    with open(f'../output/status_{file_stem}_cats.txt', 'w') as outfile:
        for item in sorted_categories:
            outfile.write(item + '\n')