# Generate .do file

The following script generates a `.do` file to be used in Stata to convert a `.dat` file into a `.dta` file.

In [3]:
# Imports
import sys
import os
sys.path.append('../..')

# Data Manipulation
import pandas as pd
import numpy as np


In [4]:
# Global variables: file-system locations used by this notebook.

# Raw data file handed to DoFile as the file the generated .do script reads.
TEST_DATA_FILE = "../data/microdados/amostra_domicilios_2010_SC.txt"

# Excel workbook describing the record layout (opened with pd.ExcelFile).
LAYOUT_FILE = "../datasets/microdados/2010_Layout_microdados_Amostra_.xlsx"

# NOTE(review): not referenced anywhere in the visible code - confirm it is still needed.
DATA_PATH = '../../data/'

## 1. Read the layout file

In [5]:
# Open the layout workbook as an Excel file so its sheets can be read by name later
layout_2010 = pd.ExcelFile(LAYOUT_FILE)

In [6]:
# Helper functions

def remove_unnamed_columns(df: pd.DataFrame) -> pd.DataFrame:
    """
    Drop every column whose label starts with "Unnamed".

    pandas gives such labels to header cells that were empty, which happens
    for cells that were merged in the original .xlsx.

    Labels are compared through ``str()`` so that non-string headers
    (e.g. a numeric header cell) are kept instead of breaking the filter.

    Args:
        df (pd.DataFrame): DataFrame that may contain "Unnamed" columns

    Returns:
        pd.DataFrame: the same data without the "Unnamed" columns

    """
    # A plain boolean list selects columns positionally, so duplicated or
    # mixed-type labels are handled without relying on the .str accessor.
    keep = [not str(label).startswith('Unnamed') for label in df.columns]
    return df.loc[:, keep]


def get_values(split: list) -> dict:
    """
    Build the key -> label mapping from the lines of a layout description.

    ``split`` is a description cell already split on newlines: the first
    item is the variable name and is ignored here; every other item is
    expected to look like ``KEY- LABEL``.

    Only the first hyphen separates key from label, so labels that contain
    hyphens themselves are kept whole. A line without any hyphen is stored
    under the key ``' '``. Blank lines are skipped.

    Args:
        split (list): lines of the description, variable name first

    Returns:
        dict: mapping of key text to label text (whitespace is preserved)
    """
    values = {}
    for line in split[1:]:
        # A trailing newline in the cell yields an empty line: nothing to map
        if not line.strip():
            continue
        key, separator, label = line.partition('-')
        if not separator:
            # No hyphen: keep the whole line as the label of a blank key
            key, label = ' ', line
        values[key] = label
    return values


def split_col_names_values(col: pd.Series) -> pd.DataFrame:
    """
    Expand a series of layout descriptions into one row per (variable, key).

    Each value of ``col`` is a description whose first line is the variable
    name and whose following lines, if any, are ``KEY- LABEL`` pairs (parsed
    by ``get_values``). The index of ``col`` supplies the variable code.

    A variable without value lines - or whose cell is not a string, such as
    the NaN pandas produces for an empty cell - yields a single row with
    NaN in CHAVE and VALOR.

    Args:
        col (pd.Series): descriptions indexed by variable code

    Returns:
        pd.DataFrame: columns VAR, NOME, CHAVE, VALOR; the row index restarts
        at 0 for every variable.
    """
    columns = ['VAR', 'NOME', 'CHAVE', 'VALOR']
    records = []
    row_index = []

    for var, text in col.items():
        # Non-string cells (NaN for blanks) have no .split; keep them as-is
        lines = text.split('\n') if isinstance(text, str) else [text]
        name = lines[0]
        values = get_values(lines) if len(lines) > 1 else {}

        if values:
            new_rows = [
                {'VAR': var, 'NOME': name, 'CHAVE': key, 'VALOR': label}
                for key, label in values.items()
            ]
        else:
            new_rows = [
                {'VAR': var, 'NOME': name, 'CHAVE': np.nan, 'VALOR': np.nan}
            ]

        records.extend(new_rows)
        row_index.extend(range(len(new_rows)))

    # Build the frame once instead of concatenating inside the loop
    return pd.DataFrame(records, columns=columns, index=row_index)

def prepare_df(df: pd.DataFrame) -> tuple:
    """
    Clean a layout sheet and derive its per-variable value table.

    Args:
        df (pd.DataFrame): layout sheet with at least the columns VAR and NOME

    Returns:
        tuple: (layout without "Unnamed" columns, table produced by
        split_col_names_values from the NOME column indexed by VAR)
    """
    cleaned = remove_unnamed_columns(df)
    descriptions = cleaned.set_index('VAR')['NOME']
    return cleaned, split_col_names_values(descriptions)

### 1.X. Testes

## 2. Generate .do file

In [7]:
class DoFile:
    """
    Generate a Stata .do file that reads a fixed-column data file.

    The script is assembled in ``self.file_str`` in four parts: a header,
    an ``infix`` column mapping taken from the layout sheet, the ``using``
    clause naming the data file, and the variable / value labels. The
    constructor builds all parts and writes the result to disk.

    NOTE(review): value labels are only *defined* (``label define``); nothing
    attaches them to the variables (``label values``). Confirm whether that
    is intended.
    """

    def __init__(
            self, 
            file_name: str,
            data_file: str, 
            layout_file: str,
            layout_file_sheet = 0
        ) -> None:
        """
        Build the .do content and write it to ``file_name``.

        Args:
            file_name: path of the .do file to create (overwritten if present)
            data_file: path written into the ``using`` clause of ``infix``
            layout_file: layout workbook; passed unchanged to ``pd.read_excel``
            layout_file_sheet: sheet name or position inside the workbook

        Raises:
            TypeError: if ``layout_file_sheet`` is neither ``str`` nor ``int``
        """
        # Validate arguments before doing any work
        if not isinstance(layout_file_sheet, (str, int)):
            raise TypeError("layout_file_sheet deve ser um inteiro ou uma string")

        # Accumulates the text of the .do file
        self.file_str = str()
        self.add_header()
        self.add_var_mapping(layout_file, layout_file_sheet)
        self.add_data_file(data_file)
        self.add_labels(layout_file, layout_file_sheet)

        # The context manager closes the file; no explicit close() is needed
        with open(file_name, 'w', encoding='UTF-8') as file:
            file.write(self.file_str)

    def add_header(
        self
    ) -> None:
        """
        Append the fixed header of the .do file.
        """

        self.file_str += """* NOTE: You need to set the Stata working directory to the path
* where the data file is located.

set more off

clear
        """

    @staticmethod
    def _integer_type(width: int) -> str:
        """Return the smallest Stata type that holds any ``width``-digit integer."""
        if width <= 1:
            return 'byte'
        if width <= 4:
            return 'int'        # Stata int tops out at 32,740
        if width <= 9:
            return 'long'       # Stata long tops out at 2,147,483,620
        # Wider than long: double is the widest numeric type available
        return 'double'

    @staticmethod
    def _storage_type(pos_ini, pos_end, tipo, dec) -> str:
        """
        Choose the Stata storage type of one layout row.

        Args:
            pos_ini: first column of the field (1-based, inclusive)
            pos_end: last column of the field (inclusive)
            tipo: layout type code; 'A' and 'C' are read as integers and
                'N' as a number. Any other code falls back to ``double``.
            dec: number of decimal places; missing (NaN/None) or 0 means none
        """
        width = int(pos_end) - int(pos_ini) + 1
        # A single-column field always fits in a byte, whatever its type
        if width <= 1:
            return 'byte'
        if tipo == 'A' or tipo == 'C':
            return DoFile._integer_type(width)
        if tipo == 'N':
            # Empty spreadsheet cells are NaN, never None, so test with isna
            has_decimals = not (pd.isna(dec) or dec == 0)
            return 'double' if has_decimals else DoFile._integer_type(width)
        # Unknown type code: use the widest numeric type rather than reuse
        # whatever the previous row happened to get
        return 'double'

    @staticmethod
    def _infix_lines(layout: pd.DataFrame) -> list:
        """Return one ``infix`` specification line per usable layout row."""
        lines = []
        for _, row in layout.iterrows():
            col_name = row['VAR']
            col_pos_ini = row['POSIÇÃO INICIAL']
            col_pos_end = row['POSIÇÃO FINAL']
            # Blank spreadsheet rows carry no variable to map
            if pd.isna(col_name) or pd.isna(col_pos_ini) or pd.isna(col_pos_end):
                continue
            # Positions may arrive as floats when the column has blanks
            col_pos_ini, col_pos_end = int(col_pos_ini), int(col_pos_end)
            col_type = DoFile._storage_type(
                col_pos_ini, col_pos_end, row.get('TIPO'), row.get('DEC')
            )
            lines.append(
                f"\n  {col_type}\t\t{col_name}\t\t{col_pos_ini}-{col_pos_end}\t\t///"
            )
        return lines

    def add_var_mapping(
        self, 
        layout_file: str, 
        layout_file_sheet = 0
    ) -> None:
        """
        Append the ``infix`` command with one line per layout variable.

        The sheet must provide the columns VAR, POSIÇÃO INICIAL and
        POSIÇÃO FINAL; TIPO and DEC are used to pick the storage type.
        """

        layout = pd.read_excel(layout_file, sheet_name = layout_file_sheet)
        self.file_str += """\nquietly infix                ///"""
        self.file_str += ''.join(self._infix_lines(layout))

    def add_data_file(
        self,
        data_file: str
    ) -> None:
        """
        Append the ``using`` clause that names the data file.
        """

        self.file_str += f'\n  using `"{data_file}"\', clear\n'

    @staticmethod
    def _label_var_lines(layout: pd.DataFrame) -> list:
        """Return one ``label var`` line per variable that has a description."""
        lines = []
        for var, nome in zip(layout['VAR'], layout['NOME']):
            if pd.isna(var) or pd.isna(nome):
                continue
            # Only the first line of the cell is the name; the remaining
            # lines list the categories and would break the command
            name = str(nome).split('\n')[0].strip()
            lines.append(f'\nlabel var {var}\t\t`"{name}"\'')
        return lines

    @staticmethod
    def _label_define_lines(df_vars: pd.DataFrame) -> list:
        """
        Return the ``label define`` lines for a VAR/CHAVE/VALOR table.

        Rows whose key is not an integer (NaN for variables without
        categories, blank or free-text keys) are skipped. The first
        definition of each label creates it; later ones use ``, add``.
        """
        lines = []
        defined = set()
        for var, key, value in zip(df_vars['VAR'], df_vars['CHAVE'], df_vars['VALOR']):
            try:
                code = int(str(key).strip())
            except ValueError:
                continue
            suffix = ', add' if var in defined else ''
            defined.add(var)
            text = str(value).strip()
            lines.append(f'\nlabel define {var} {code} `"{text}"\'{suffix}')
        return lines

    def add_labels(
        self,
        layout_file: str, 
        layout_file_sheet = 0
    ) -> None:
        """
        Append variable labels and value-label definitions.

        The NOME column of the layout has the format
        ``VARIABLE_NAME \\n 1- VALUE_1 \\n 2- VALUE_2 \\n ...``; the first line
        becomes the variable label and the remaining lines the value labels.
        """

        layout = pd.read_excel(layout_file, sheet_name = layout_file_sheet)
        self.file_str += ''.join(self._label_var_lines(layout))

        # One row per (variable, key, label)
        df_vars = split_col_names_values(layout.set_index('VAR')['NOME'])
        self.file_str += ''.join(self._label_define_lines(df_vars))
    

In [8]:
# Build the .do file from the 'DOMI' sheet of the layout workbook; the constructor writes it to test.do
dofile = DoFile('test.do', TEST_DATA_FILE, layout_2010, layout_file_sheet='DOMI')
# print(dofile.file_str)

In [9]:
# Scratch cell: shows the type of an empty DataFrame
type(pd.DataFrame())

pandas.core.frame.DataFrame