In [1]:
# Imports
import sys
import os
sys.path.append('../..')

# Data Manipulation
import pandas as pd
import numpy as np
import openpyxl

In [2]:
# Global Variables
# Base data directory, given as a relative path; the microdata and log paths
# used further down are built by appending to it.
DATA_PATH = "../../data/"

## 1. Arquivos de Layout

In [3]:
# Load the layout (labels) workbook as an Excel file; its sheets are read later.
# NOTE(review): the file name is relative to the working directory and does not
# go through DATA_PATH - confirm where the workbook is expected to live.
layout_2010 = pd.ExcelFile('2010_Layout_microdados_Amostra_.xlsx')

In [4]:
# Funções Auxiliares

def remove_unnamed_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop every column whose label starts with "Unnamed".

    According to the original note, these columns come from cells that were
    merged in the source .xlsx file.

    Each label is converted with ``str()`` before the check, so a frame that
    also has non-string labels (numbers, NaN) is handled instead of failing
    when the boolean mask is negated.

    Args:
        df (pd.DataFrame): DataFrame that may contain "Unnamed" columns.

    Returns:
        pd.DataFrame: The same rows, without the "Unnamed" columns.
    """
    # One boolean per column, in column order; True means "keep".
    keep = [not str(label).startswith('Unnamed') for label in df.columns]
    return df.loc[:, keep]


def get_values(split: list) -> dict:
    """
    Build the key -> label mapping of a variable from its description lines.

    Args:
        split (list): Lines of one description. The first element is skipped
            (the caller uses it as the variable name); every other element is
            expected to look like "<key>-<label>".

    Returns:
        dict: Maps each key (text before the first hyphen) to its label (text
            after the first hyphen). Neither part is stripped, so surrounding
            whitespace is kept exactly as written. A line without any hyphen
            is stored under the single-space key ' ' with the whole line as
            its label.
    """
    values = {}
    for entry in split[1:]:
        # Only the FIRST hyphen separates key from label; later hyphens belong
        # to the label and must not truncate it.
        key, separator, label = entry.partition('-')
        if not separator:
            key, label = ' ', entry
        values[key] = label
    return values


def split_col_names_values(col: pd.Series) -> pd.DataFrame:
    """
    Expand the layout descriptions into one row per variable category.

    Each description is split on newlines: the first line is the variable
    name and the remaining lines, if any, are parsed by ``get_values`` into
    key/label pairs.

    Args:
        col (pd.Series): Descriptions; the Series index supplies the value
            written to the VAR column.

    Returns:
        pd.DataFrame: Columns VAR, NOME, CHAVE, VALOR. A variable with
            categories yields one row per category; a variable without
            categories yields a single row with CHAVE and VALOR missing.
            VAR is filled in for every row. Row labels restart at 0 for each
            variable, so they are not unique.
    """
    columns = ['VAR', 'NOME', 'CHAVE', 'VALOR']
    records = []
    row_labels = []
    for var_code, description in col.items():
        if not isinstance(description, str):
            # Missing / non-text description: keep the variable, nothing to parse.
            records.append({'VAR': var_code, 'NOME': description})
            row_labels.append(0)
            continue

        lines = description.split('\n')
        name = lines[0]
        if len(lines) > 1:
            for position, (key, label) in enumerate(get_values(lines).items()):
                records.append({'VAR': var_code, 'NOME': name, 'CHAVE': key, 'VALOR': label})
                row_labels.append(position)
        else:
            records.append({'VAR': var_code, 'NOME': name})
            row_labels.append(0)

    # Build the frame once instead of concatenating inside the loop.
    return pd.DataFrame(records, columns=columns, index=row_labels)


# def prepare_df(df: pd.DataFrame):
#     df_main = pd.DataFrame(columns = df.columns)
#     df = remove_unnamed_columns(df) 
#     for index, row in df.iterrows():
#         df_row = pd.DataFrame(columns = df.columns)
#         # for col in df.columns.drop('NOME'):
#         #     # print(entry)
#         #     df_row[col] = df[col]
#         df_row = pd.concat([df_row, split_col_names_values(row['NOME'])])
#         # print(df_entry)
#         df_aux = pd.concat([df_aux, df_row])
#     return df, df_aux


def prepare_df(df: pd.DataFrame)-> tuple:
    """
    Clean a layout sheet and derive its variable/category table.

    Args:
        df (pd.DataFrame): Layout sheet; must contain the columns 'VAR' and
            'NOME'.

    Returns:
        tuple: ``(layout, variables)`` where ``layout`` is the result of
            ``remove_unnamed_columns(df)`` and ``variables`` is the result of
            ``split_col_names_values`` applied to the 'NOME' column indexed
            by 'VAR'.
    """
    layout = remove_unnamed_columns(df)
    names_by_var = layout.set_index('VAR')['NOME']
    return layout, split_col_names_values(names_by_var)

In [5]:
# Load the labels for the household ("domicílios") microdata from the 'DOMI' sheet
df_domi = pd.read_excel(layout_2010, sheet_name='DOMI')
# df_domi: layout without "Unnamed" columns; df_domi_vars: variable/category table
df_domi, df_domi_vars = prepare_df(df_domi)
df_domi.head()

Unnamed: 0,VAR,NOME,POSIÇÃO INICIAL,POSIÇÃO FINAL,INT,DEC,TIPO
0,V0001,UNIDADE DA FEDERAÇÃO:\n11- Rondônia\n12- Acre\...,1,2,2,,A
1,V0002,CÓDIGO DO MUNICÍPIO,3,7,5,,A
2,V0011,ÁREA DE PONDERAÇÃO,8,20,13,,A
3,V0300,CONTROLE,21,28,8,,N
4,V0010,PESO AMOSTRAL,29,44,3,13.0,N


In [6]:
df_domi_vars.head()

Unnamed: 0,VAR,NOME,CHAVE,VALOR
0,V0001,UNIDADE DA FEDERAÇÃO:,11,Rondônia
1,V0001,UNIDADE DA FEDERAÇÃO:,12,Acre
2,V0001,UNIDADE DA FEDERAÇÃO:,13,Amazonas
3,V0001,UNIDADE DA FEDERAÇÃO:,14,Roraima
4,V0001,UNIDADE DA FEDERAÇÃO:,15,Pará


In [7]:
# # Dataframe de pessoas
# df_pess = pd.read_excel(layout_2010, sheet_name='PESS')
# # Dataframe de emigração  
# df_emig = pd.read_excel(layout_2010, sheet_name='EMIG')
# # Dataframe de mortalidade
# df_mort = pd.read_excel(layout_2010, sheet_name='MORT')

In [8]:
# Directory holding the microdata text files
MICRODATA_PATH = DATA_PATH + "microdados/"

# Layout DataFrame for each data set, keyed by the data set name
# (only the household layout is enabled; the others are commented out)
dfs = {
    'amostra_domicilios_2010': df_domi,
    # 'amostra_pessoas_2010': df_pess,
    # 'amostra_emigracao_2010': df_emig,
    # 'amostra_mortalidade_2010': df_mort
}

# Variable/category table for each data set, using the same keys as `dfs`
dfs_vars = {
    'amostra_domicilios_2010': df_domi_vars,
    # 'amostra_pessoas_2010': df_pess_vars,
    # 'amostra_emigracao_2010': df_emig_vars,
    # 'amostra_mortalidade_2010': df_mort_vars
}

In [10]:
# Funções auxiliares


def save_log(log: str, log_file = DATA_PATH + "logs/translate_microdata.log"):
    """
    Append one entry to the log file.

    An entry is a line of 20 dashes, the message, and a closing newline, so
    the separator of the next entry always starts on its own line.

    Args:
        log (str): Message to record.
        log_file (str): Path of the log file. Its parent directory is created
            when it does not exist yet.
    """
    log_dir = os.path.dirname(log_file)
    if log_dir:
        os.makedirs(log_dir, exist_ok=True)
    # Explicit UTF-8: the messages contain accented characters and the
    # platform default encoding is not the same everywhere.
    with open(log_file, 'a', encoding='utf-8') as f:
        f.write(20*"-" + "\n" + log + "\n")

def replace_null(values: list):
    """
    Return the first usable entry of ``values``.

    An entry is skipped when it is ``None`` or when its string form is empty
    or whitespace only.

    Args:
        values (list): Candidates, in priority order.

    Returns:
        The first entry that is not skipped, or ``None`` when there is none.
    """
    usable = (
        candidate
        for candidate in values
        if candidate is not None and str(candidate).strip() != ''
    )
    return next(usable, None)


def split_by_index(
        input_string: str,
        indexes: list[int]
    ) -> list[str]:
    """
    Cut a string into consecutive pieces at the given start offsets.

    Piece ``k`` runs from ``indexes[k]`` up to (not including)
    ``indexes[k + 1]``; the last piece runs to the end of the string.

    Args:
        input_string (str): Text to cut.
        indexes (list[int]): Start offset of every piece.

    Returns:
        list[str]: One piece per offset; empty when ``indexes`` is empty.
    """
    # The stop of each piece is the start of the next one; None = "to the end".
    stops = indexes[1:] + [None]
    pieces = []
    for start, stop in zip(indexes, stops):
        pieces.append(input_string[start:stop])
    return pieces


def extract_line_values(
        file: str, df: pd.DataFrame
    ) -> list[list[str]]:
    """
    Read a microdata file and cut every line into its raw (untranslated) fields.

    Args:
        file (str): File name, resolved relative to ``MICRODATA_PATH``.
        df (pd.DataFrame): Layout whose 'POSIÇÃO INICIAL' column holds the
            1-based start position of each field, in field order.

    Returns:
        list[list[str]]: One list of field strings per non-empty line, in file
            order. The trailing newline is removed before the line is cut, and
            completely empty lines are skipped.
    """
    # Convert the 1-based layout positions to 0-based slice starts ONCE;
    # they are the same for every line of the file.
    starts = [int(position) - 1 for position in df['POSIÇÃO INICIAL']]

    values = []
    with open(MICRODATA_PATH + file, 'r') as f:
        # Iterate the file lazily instead of loading all lines up front.
        for line in f:
            record = line.rstrip('\n')
            if not record:
                continue
            values.append(split_by_index(record, starts))
    return values


def translate_categorical_microdata(value: str, df_line: pd.DataFrame, df_vars: pd.DataFrame):
    """
    Replace a raw categorical code by its label.

    Args:
        value (str): Raw field text.
        df_line: Layout entry of the field; only ``df_line['VAR']`` is read
            (``translate_line_microdata`` passes a dict).
        df_vars (pd.DataFrame): Table with columns 'VAR', 'CHAVE' and 'VALOR'.

    Returns:
        The 'VALOR' of the first row whose 'VAR' equals ``df_line['VAR']`` and
        whose 'CHAVE' equals ``value``. When no row matches, or when the lookup
        raises, ``value`` is returned unchanged; a raised error is also written
        to the log through ``save_log``.
    """
    try:
        # Positional boolean masks, so repeated row labels in df_vars do not matter.
        same_var = (df_vars['VAR'] == df_line['VAR']).to_numpy()
        same_key = (df_vars['CHAVE'] == value).to_numpy()
        labels = df_vars.loc[same_var & same_key, 'VALOR'].values
        if len(labels) > 0:
            return labels[0]
        # Key not listed for this variable: keep the raw value.
        return value
    except Exception as e:
        save_log(f"\t\t\tFunção translate_categorical_microdata:\n\t\t\tErro: {e}\n\t\t\tValue: {value}\tVAR: {df_line['VAR']}\t")
        return value


def translate_numerical_microdata(value: str, df_line: dict, df_vars: pd.DataFrame):
    """
    Convert a raw fixed-width numeric field to a number.

    Args:
        value (str): Raw field text.
        df_line (dict): Layout entry of the field. 'DEC' is the number of
            trailing characters of ``value`` that form the fractional part
            (missing or 0 means the field is an integer); 'VAR' is only used
            in log messages.
        df_vars (pd.DataFrame): Not used by this function.

    Returns:
        ``np.nan`` for a blank field, an ``int`` for a field without decimals,
        a ``float`` otherwise. When the conversion fails, the error is written
        to the log through ``save_log`` and ``value`` is returned unchanged.
    """
    # Blank field: Not a Number
    if value.strip() == '':
        return np.nan

    decimals = df_line['DEC']
    # pd.isna also accepts None, unlike np.isnan. A DEC of 0 must take the
    # integer path: value[:-0] is '' and would turn the whole number into the
    # fractional part.
    if pd.isna(decimals) or decimals == 0:
        try:
            return int(value.strip())
        except Exception as e:
            save_log(f"\t\t\tFunção translate_numerical_microdata / Fluxo de int:\n\t\t\tErro: {e}\n\t\t\tValue: {value}\tVAR: {df_line['VAR']}\t")
            return value

    try:
        # The last `separator` characters are the fractional digits.
        separator = int(decimals)
        int_part, dec_part = value[:-separator], value[-separator:]
        # A blank half counts as zero.
        int_part = replace_null([int_part, 0])
        dec_part = replace_null([dec_part, 0])
        return float(f"{int_part}.{dec_part}")
    except Exception as e:
        save_log(f"\t\t\tFunção translate_numerical_microdata / Fluxo de Float:\n\t\t\tErro: {e}\n\t\t\tValue: {value}\tVAR: {df_line['VAR']}\t")
        return value

def translate_line_microdata(line_values: str, df: pd.DataFrame, df_vars: pd.DataFrame):
    """
    Translate the raw fields of one microdata record.

    Field ``i`` is described by row ``i`` of ``df``:

    * TIPO 'C', or variable 'V0001', and a variable code that does not start
      with 'M' -> ``translate_categorical_microdata``;
    * otherwise TIPO 'N' -> ``translate_numerical_microdata``;
    * anything else is kept as the raw text.

    Args:
        line_values: Raw field values of one record, in layout order.
        df (pd.DataFrame): Layout with at least the columns 'VAR' and 'TIPO'.
        df_vars (pd.DataFrame): Variable/category table handed to the translators.

    Returns:
        pd.DataFrame: A single-row frame with one column per variable code.
    """
    translated = {}
    for position, raw_value in enumerate(line_values):
        # Layout entry of this field, as a plain dict
        spec = dict(df.iloc[position])
        is_categorical = (spec['TIPO'] == 'C' or spec['VAR'] == 'V0001') and spec['VAR'][0] != 'M'
        if is_categorical:
            translator = translate_categorical_microdata
        elif spec['TIPO'] == 'N':
            translator = translate_numerical_microdata
        else:
            # Not translatable: keep the raw text
            translator = None
        cell = raw_value if translator is None else translator(raw_value, spec, df_vars)
        translated[spec['VAR']] = [cell]
    return pd.DataFrame(translated)


def process_microdata_files(
        file_list: list[str],
        df: pd.DataFrame,
        df_vars: pd.DataFrame
    ) -> pd.DataFrame:
    """
    WORK IN PROGRESS. Translate every record of every file in ``file_list``.

    Args:
        file_list (list[str]): Microdata file names (see ``extract_line_values``).
        df (pd.DataFrame): Layout of the files.
        df_vars (pd.DataFrame): Variable/category table of the files.

    Returns:
        pd.DataFrame: The translated records of ALL files, in processing
            order; an empty DataFrame when nothing was translated. Each record
            keeps the row label of its single-row frame, so labels repeat.

    Progress and per-line errors are written through ``save_log``; a line that
    fails to translate is logged and skipped.
    """
    translated_lines = []
    for file in file_list:
        save_log(f"Iniciando a extração do arquivo [{file}]")
        values = extract_line_values(file, df)
        for line in values:
            save_log(f"\tExtraindo os valores da linha [{line}]")
            try:
                translated_lines.append(translate_line_microdata(line, df, df_vars))
            except Exception as e:
                save_log(f"\t\tErro na linha [{line}]: {e}")
        save_log(f"Finalizando a extração do arquivo [{file}]")

    # Return only after every file was handled, and concatenate once:
    # growing a DataFrame line by line copies all previous rows each time.
    if not translated_lines:
        return pd.DataFrame()
    return pd.concat(translated_lines)

    # for file in os.listdir(dir_path):
    #     if "amostra" not in file:
    #         continue
    #     # get the name and year of the research from the file name
    #     # the format is amostra_domicilios_YYYY_UF.txt
    #     research = '_'.join(file.split('_')[:3])
    #     try:
    #         # extract the lines from the file
    #         values = extract_line_values(file, dfs[research])
    #         # translate the values
    #         translate_microdata(values, dfs[research])
    #     except KeyError:
    #         # print(f"O arquivo {file} não foi processado.");
    #         continue

In [23]:
# file_list = os.listdir(MICRODATA_PATH)
# file_list = [file for file in file_list if 'amostra_domicilios' in file]
# a = process_microdata_files(file_list, dfs['amostra_domicilios_2010'], dfs_vars['amostra_domicilios_2010'])
# a

In [11]:
%%time
# Cut every line of one household file (the "SC" file) into its raw fields
file = "amostra_domicilios_2010_SC.txt"
sc_line_values = extract_line_values(file, dfs['amostra_domicilios_2010'])

CPU times: total: 41.3 s
Wall time: 44 s


In [20]:
# Translate only the first extracted record, as a quick check of the result
a = translate_line_microdata(sc_line_values[0], dfs['amostra_domicilios_2010'], dfs_vars['amostra_domicilios_2010'])
a

Unnamed: 0,V0001,V0002,V0011,V0300,V0010,V1001,V1002,V1003,V1004,V1006,...,M0218,M0219,M0220,M0221,M0222,M0301,M0401,M0402,M0701,V1005
0,Santa Catarina,51,4200051001001,745,3.486869,Região sul (uf=41 a 43),3,9,0,2,...,2,2,2,2,2,2,2,2,2,8


In [11]:
%%time
# Full extraction + translation of a single file (the "AM" file).
# The timing recorded below this cell shows a wall time of about one hour.
a = process_microdata_files(["amostra_domicilios_2010_AM.txt"], dfs['amostra_domicilios_2010'], dfs_vars['amostra_domicilios_2010'])
a

CPU times: total: 49min 55s
Wall time: 59min 37s


Unnamed: 0,V0001,V0002,V0011,V0300,V0010,V1001,V1002,V1003,V1004,V1006,...,M0218,M0219,M0220,M0221,M0222,M0301,M0401,M0402,M0701,V1005
0,Amazonas,00029,1300029001001,22464,4.047602,Região norte (uf=11 a 17),03,005,00,1,...,2,2,2,2,2,2,2,2,2,1
0,Amazonas,00029,1300029001001,55516,5.378923,Região norte (uf=11 a 17),03,005,00,2,...,2,2,2,2,2,2,2,2,2,8
0,Amazonas,00029,1300029001001,58181,5.535143,Região norte (uf=11 a 17),03,005,00,2,...,2,2,2,2,2,2,2,2,2,8
0,Amazonas,00029,1300029001001,58620,5.071826,Região norte (uf=11 a 17),03,005,00,2,...,2,2,2,2,2,2,2,2,2,8
0,Amazonas,00029,1300029001001,64387,5.912039,Região norte (uf=11 a 17),03,005,00,1,...,2,2,2,2,2,2,2,2,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
0,Amazonas,04401,1304401001001,6088380,4.641105,Região norte (uf=11 a 17),03,009,00,1,...,2,2,2,2,2,2,2,2,2,1
0,Amazonas,04401,1304401001001,6105259,5.834696,Região norte (uf=11 a 17),03,009,00,1,...,2,2,2,2,2,2,2,2,2,1
0,Amazonas,04401,1304401001001,6120120,5.504584,Região norte (uf=11 a 17),03,009,00,1,...,2,2,2,2,2,2,2,2,2,1
0,Amazonas,04401,1304401001001,6121832,6.652051,Região norte (uf=11 a 17),03,009,00,2,...,2,2,2,2,2,2,2,2,2,8


In [None]:
df_domi

Unnamed: 0,VAR,NOME,POSIÇÃO INICIAL,POSIÇÃO FINAL,INT,DEC,TIPO
0,V0001,UNIDADE DA FEDERAÇÃO:\n11- Rondônia\n12- Acre\...,1,2,2,,A
1,V0002,CÓDIGO DO MUNICÍPIO,3,7,5,,A
2,V0011,ÁREA DE PONDERAÇÃO,8,20,13,,A
3,V0300,CONTROLE,21,28,8,,N
4,V0010,PESO AMOSTRAL,29,44,3,13.0,N
...,...,...,...,...,...,...,...
71,M0301,MARCA DE IMPUTAÇÃO NA V0301: \n1- Sim\n2- Não,168,168,1,,C
72,M0401,MARCA DE IMPUTAÇÃO NA V0401: \n1- Sim\n2- Não,169,169,1,,C
73,M0402,MARCA DE IMPUTAÇÃO NA V0402: \n1- Sim\n2- Não,170,170,1,,C
74,M0701,MARCA DE IMPUTAÇÃO NA V0701: \n1- Sim\n2- Não,171,171,1,,C


In [None]:
# with open(MICRODATA_PATH + file, 'r') as f:
#     lines = f.readlines()
#     print(lines[0], lines[1], lines[2], lines[3], lines[4], sep='')