# Scraping dos arquivos de Microdados

este notebook realiza o download de todos os arquivos de microdados disponíveis (no site do IBGE)[https://www.ibge.gov.br/estatisticas/sociais/trabalho/22827-censo-demografico-2022.html?edicao=37225&t=microdados]. Os microdados estão separados por censo (2000 e 2010) e por UF.

In [89]:
import sys
import os
import shutil
import requests
import zipfile
sys.path.append('../..')

# Data Manipulation
import pandas as pd

# from utils import *

from bs4 import BeautifulSoup as bs
import urllib.request

In [36]:
url = "https://www.ibge.gov.br/estatisticas/sociais/trabalho/22827-censo-demografico-2022.html?edicao=37225&t=microdados"

DATA_PATH = "../../data/"

# The following variables determine which cells will be run, default is true for all variables
config_vars = {
    'download_zips': True # Download zips from IBGE website
    ,'unzip_files': True # Unzip the .zip files
    ,'delete_zips': False # Delete zips after unzip
    ,'move_up_files': True # Move files from subfolders to the root folder
    ,'rename_files': True # Rename files according to the definition {research}_{year}_{UF}.txt
}

## 1- Download dos Arquivos .zip

In [37]:
%%time
if config_vars['download_zips'] == True:
    census_list = []
    
    # scrape the html from IBGE's website
    soup = bs(requests.get(url).text)

    # find all unordered lists and iterate over them
    uls = soup.find_all("ul")
    for ul in uls:
    # find all links in the unordered list and iterate over them
        ul_as = ul.find_all("a")
        for ul_a in ul_as:
            # if one of the links refers to the state name "Acre", the list contains the census data
            if ul_a.text == "Acre":
                census_list.append(ul)

    # iterate over the census data list
    for census in census_list:
        # find all links in the census data list and iterate over them
        census_as = census.find_all("a")
        for census_a in census_as:
            # get the text of the link
            href = census_a['href']
            # get the year from the link's text
            idx_year = int(href.index("Censo_Demografico_") + len("Censo_Demografico_"))
            nm_year = href[idx_year:int(idx_year) + 4]
            # get the uf from the link's text
            nm_uf = href[href.rindex("/") + 1:-4]
            # define the filename
            nm_file = f"{DATA_PATH}/{nm_year}_{nm_uf}.zip"
            # download the file
            urllib.request.urlretrieve(href, nm_file)

CPU times: total: 15.5 s
Wall time: 2min 26s


## 2- Unzip dos arquivos

In [91]:
%%time
if config_vars['unzip_files'] == True:
    for file_path in os.listdir(DATA_PATH):
        if file_path.endswith('.zip'):
            with zipfile.ZipFile(DATA_PATH + file_path, 'r') as zip_ref:
                try:
                    zip_ref.extractall(DATA_PATH)
                    zip_ref.close()                    
                except Exception as e:
                    # print(f'File: {file_path} not processed due to Error: \n{e}')
                    pass
                

CPU times: total: 44.8 s
Wall time: 2min 9s


In [95]:
if config_vars['delete_zip_files'] == True:
    for file_path in os.listdir(DATA_PATH):
        if file_path.endswith('.zip'):
            os.remove(DATA_PATH + file_path)

# 3- Renomeia os arquivos

In [92]:
new_file_name = {
    # 2010 files
    'Amostra_Domicilios_': 'amostra_domicilios_2010_',
    'Amostra_Emigracao_': 'amostra_emigracao_2010_',
    'Amostra_Mortalidade_': 'amostra_mortalidade_2010_',
    'Amostra_Pessoas_': 'amostra_pessoas_2010_',
    # 2000 files
    'DOM': 'amostra_domicilios_2000_',
    'FAM': 'amostra_familias_2000_',
    'PES': 'amostra_pessoas_2000_',
}

def rename_file(file_name, UF):
    for old, new in new_file_name.items():
        if old.upper() in file_name.upper():
            return new + UF + '.txt'
    return file_name


In [93]:
if config_vars['move_up_files'] == True:
    for dir in os.listdir(DATA_PATH):
        dir_path = os.path.join(DATA_PATH, dir)
        if os.path.isdir(dir_path) and dir != 'microdados' and dir != 'outros':
            for file in os.listdir(dir_path):
                file_path = dir_path + '/' + file
                try:
                    if file.lower().endswith('.txt'):
                        new_file_path = DATA_PATH + 'microdados/' + rename_file(file, dir)
                    else:
                        new_file_path = DATA_PATH + 'outros/' + file
                    os.rename(file_path, new_file_path)
                except Exception as e:
                    print(e)
            shutil.rmtree(dir_path)

[WinError 183] Não é possível criar um arquivo já existente: '../../data/RN/DOM25.txt' -> '../../data/microdados/amostra_domicilios_2000_RN.txt'
[WinError 183] Não é possível criar um arquivo já existente: '../../data/RN/FAMI25.TXT' -> '../../data/microdados/amostra_familias_2000_RN.txt'
[WinError 183] Não é possível criar um arquivo já existente: '../../data/RN/PES25.txt' -> '../../data/microdados/amostra_pessoas_2000_RN.txt'
