In [7]:
import numpy as np
import pandas as pd
import re
from unicodedata import normalize
import copy
import json
import os

In [2]:
# Folder the input CSV files are read from (one `<name>.csv` per entry in `filenames`).
csv_path = '../files/02_dictionaries/01_csv_load'
# Folder the generated JSON files are written to (one sub-folder per input CSV).
json_path = '../files/02_dictionaries/02_json_mapping'

In [3]:
def transform_spanish(s):
    """Turn a Spanish label into a lowercase, accent-free identifier.

    Steps, in order:
      1. Decompose to NFD and strip the combining marks from every base
         character, except an "n"/"N" that carries a lone tilde (ñ/Ñ).
      2. Recompose to NFC and lowercase.
      3. Transliterate "ñ" to "nh", turn spaces into underscores and
         drop "¿".

    Parameters
    ----------
    s : str
        Text to normalise.

    Returns
    -------
    str
        The normalised text, e.g. "¿Cuántos AÑOS?" -> "cuantos_anhos?".
    """
    decomposed = normalize("NFD", s)

    # Group 1 is the base character that is kept; the trailing run of
    # combining marks (U+0300-U+036F) is dropped. The lookahead protects
    # n + U+0303 (ñ) unless further marks follow the tilde. `flags` is passed
    # by keyword: positional count/flags are deprecated in recent Pythons.
    stripped = re.sub(
        r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+",
        r"\1",
        decomposed,
        flags=re.I,
    )

    recomposed = normalize("NFC", stripped)

    # Lowercase BEFORE transliterating so an uppercase "Ñ" also becomes "nh";
    # doing it the other way round left "Ñ" in the output as "ñ".
    lowered = recomposed.lower()
    return lowered.replace("ñ", "nh").replace(" ", "_").replace("¿", "")

In [26]:
def get_labels(info_df, basename_f):
    """Build the value -> label mapping of every coded variable.

    Side effect: adds a ``DESC_CLEAN`` column to ``info_df`` in place (the
    variable description passed through ``transform_spanish``).
    ``get_desc_columns`` reads that column, so this function has to be called
    first on the same frame.

    Parameters
    ----------
    info_df : pandas.DataFrame
        Frame with the columns ``VARIABLE``, ``DESCRIPCIÓN DE LAS VARIABLES``,
        ``VALORES`` and ``DESCRIPCIÓN DE LAS ALTERNATIVAS``.
    basename_f : str
        Prefix used in every key of the result.

    Returns
    -------
    dict
        ``{f"{basename_f}-{variable}-{desc_clean}": {value: label, ...}}`` with
        one entry per variable that has at least one non-null ``VALORES``.
        Values that ``int()`` accepts are stored as integer strings
        (``1.0`` -> ``"1"``); any other value is used as the key unchanged.
    """
    info_df["DESC_CLEAN"] = info_df["DESCRIPCIÓN DE LAS VARIABLES"].apply(transform_spanish)

    dict_mapping = {}
    # Group by the column name itself, not a one-element list: with a list,
    # pandas 2.x yields 1-tuples as group keys and the f-string below would
    # render them as "('VAR',)", no longer matching get_desc_columns' keys.
    for variable, group in info_df.groupby("VARIABLE"):
        # Variables without any coded value have nothing to map.
        if group["VALORES"].isnull().all():
            continue

        desc_clean = group["DESC_CLEAN"].values[0]
        dict_valor = {}
        for valor, desc in zip(group["VALORES"], group["DESCRIPCIÓN DE LAS ALTERNATIVAS"]):
            try:
                valor = str(int(valor))
            except ValueError:
                # Non-numeric codes (and NaN) are kept exactly as read.
                # NOTE(review): a missing VALORES inside an otherwise coded
                # group therefore ends up as a NaN key - confirm this is wanted.
                pass
            dict_valor[valor] = desc

        dict_mapping[f"{basename_f}-{variable}-{desc_clean}"] = dict_valor

    return dict_mapping

In [27]:
def get_desc_columns(info_df, basename_f):
    """Map each variable id to its descriptive column name.

    Requires ``info_df`` to already contain a ``DESC_CLEAN`` column next to
    ``VARIABLE``.

    Parameters
    ----------
    info_df : pandas.DataFrame
        Frame holding the ``VARIABLE`` and ``DESC_CLEAN`` columns.
    basename_f : str
        Prefix used in every generated name.

    Returns
    -------
    dict
        ``{variable: f"{basename_f}-{variable}-{desc_clean}"}``. When a
        variable appears with several distinct descriptions, the last
        distinct pair wins.
    """
    # Distinct (variable id, cleaned description) pairs, first occurrence kept.
    unique_pairs = info_df[['VARIABLE', 'DESC_CLEAN']].drop_duplicates(keep='first')

    return {
        variable: f'{basename_f}-{variable}-{desc_clean}'
        for variable, desc_clean in zip(unique_pairs['VARIABLE'], unique_pairs['DESC_CLEAN'])
    }

In [9]:
# Base names (extension removed) of the CSV files found in `csv_path`.
# Only `.csv` entries are kept, because every name is later opened as
# f'{csv_path}/{name}.csv'; stray entries such as checkpoint folders or OS
# metadata files would otherwise make that read fail. os.listdir gives no
# ordering guarantee, so the list is sorted (case-insensitively) to make the
# processing order reproducible.
filenames = sorted(
    (
        stem
        for stem, ext in map(os.path.splitext, os.listdir(csv_path))
        if ext.lower() == '.csv'
    ),
    key=str.lower,
)
filenames

['CSALUD01',
 'CSALUD08',
 'Programas_Sociales_x_Hogar',
 'PS_BECA_18',
 'PS_COMEDOR',
 'PS_PENSION65',
 'PS_QALIWARMA',
 'PS_TRABAJA',
 'PS_VL',
 'PS_WAWAWASI',
 'RECH0',
 'RECH23',
 'RECH5',
 'RECH6']

In [28]:
# For every input CSV: build the label and column mappings and dump them as
# JSON under f'{json_path}/{f}/'.
for f in filenames:
    print(f'Processing {f}')
    info_df = pd.read_csv(f'{csv_path}/{f}.csv', encoding="ISO-8859-1", engine='python')

    # Order matters: get_labels adds the DESC_CLEAN column to info_df, and
    # get_desc_columns reads it.
    dict_mapping = get_labels(info_df, f)
    dict_columnas = get_desc_columns(info_df, f)

    dump_path = f'{json_path}/{f}'
    # exist_ok=True replaces the separate exists() check, which could race
    # with another process creating the folder in between.
    os.makedirs(dump_path, exist_ok=True)

    # json.dump escapes non-ASCII characters by default; the explicit encoding
    # just keeps the files independent of the platform's default.
    with open(f'{dump_path}/labels.json', 'w', encoding='utf-8') as fp:
        json.dump(dict_mapping, fp)
    print(f" -> Labels saved for {f}")

    with open(f'{dump_path}/columns.json', 'w', encoding='utf-8') as fp:
        json.dump(dict_columnas, fp)
    print(f" -> Columns saved for {f}")
    

Processing CSALUD01
 -> Labels saved for CSALUD01
 -> Columns saved for CSALUD01
Processing CSALUD08
 -> Labels saved for CSALUD08
 -> Columns saved for CSALUD08
Processing Programas_Sociales_x_Hogar
 -> Labels saved for Programas_Sociales_x_Hogar
 -> Columns saved for Programas_Sociales_x_Hogar
Processing PS_BECA_18
 -> Labels saved for PS_BECA_18
 -> Columns saved for PS_BECA_18
Processing PS_COMEDOR
 -> Labels saved for PS_COMEDOR
 -> Columns saved for PS_COMEDOR
Processing PS_PENSION65
 -> Labels saved for PS_PENSION65
 -> Columns saved for PS_PENSION65
Processing PS_QALIWARMA
 -> Labels saved for PS_QALIWARMA
 -> Columns saved for PS_QALIWARMA
Processing PS_TRABAJA
 -> Labels saved for PS_TRABAJA
 -> Columns saved for PS_TRABAJA
Processing PS_VL
 -> Labels saved for PS_VL
 -> Columns saved for PS_VL
Processing PS_WAWAWASI
 -> Labels saved for PS_WAWAWASI
 -> Columns saved for PS_WAWAWASI
Processing RECH0
 -> Labels saved for RECH0
 -> Columns saved for RECH0
Processing RECH23
 -> 