# Download data

In [3]:
import os
import requests
import zipfile

In [5]:
def ensure_directory_exists(directory):
    """Create ``directory`` when it is missing and report what happened.

    Returns:
        bool: True if the directory was already present, False if this
        call had to create it.
    """
    already_present = os.path.exists(directory)
    if already_present:
        print(f"Directory '{directory}' already exists.")
    else:
        os.makedirs(directory)
        print(f"Directory '{directory}' created.")
    return already_present

In [7]:
def download_file_if_not_exists(file_path, url, timeout=60):
    """Download ``url`` to ``file_path`` unless that file already exists.

    The body is streamed into a temporary ``<file_path>.part`` file and only
    renamed to ``file_path`` once the transfer has finished, so a failed or
    interrupted download never leaves a truncated file that a later run
    would mistake for a complete one.

    Args:
        file_path: Destination path on disk.
        url: Address to fetch.
        timeout: Value forwarded to ``requests.get`` as ``timeout`` (seconds).

    Raises:
        Any exception raised by ``requests.get`` or by
        ``response.raise_for_status()`` (HTTP error answers); the partial
        file is removed before the exception propagates.
    """
    if os.path.exists(file_path):
        print(f"File '{file_path}' already exists.")
        return

    partial_path = f"{file_path}.part"
    try:
        # The context manager releases the connection even when an error occurs.
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail loudly instead of saving an HTML error page as the file.
            response.raise_for_status()
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=1024):
                    file.write(chunk)
        # Publish the file only after every chunk was written.
        os.replace(partial_path, file_path)
    except BaseException:
        # Drop the incomplete download, then let the caller see the error.
        if os.path.exists(partial_path):
            os.remove(partial_path)
        raise
    print(f"Downloaded '{file_path}'.")

In [9]:
def unzip(zip_path, extract_to, delete_zip=False):
    """Extract every member of a zip archive into ``extract_to``.

    The archive is kept by default, so a later existence check on
    ``zip_path`` still finds it. Pass ``delete_zip=True`` to remove the
    archive after a successful extraction.

    Args:
        zip_path: Path of the archive to extract.
        extract_to: Directory that receives the extracted members.
        delete_zip: When True, delete ``zip_path`` once it was extracted.

    Returns:
        None. A message is printed for each outcome; a path that is not a
        valid zip file is reported and left untouched.
    """
    if not zipfile.is_zipfile(zip_path):
        print(f"'{zip_path}' is not a valid zip file.")
        return

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"Extracted '{zip_path}' to '{extract_to}'.")

    if delete_zip:
        os.remove(zip_path)
        print(f"Deleted '{zip_path}'.")

In [38]:
# Remote folder that holds the per-municipality zip archives.
base_url = (
    "https://ftp.ibge.gov.br/Censos/Censo_Demografico_2022/"
    "Agregados_por_Setores_Censitarios/Agregados_por_Municipio_csv/"
)

# Archive names to fetch from `base_url`, one per topic.
zips = [
    "Agregados_por_municipios_{}_BR.zip".format(topic)
    for topic in (
        "alfabetizacao",
        "basico",
        "caracteristicas_domicilio1",
        "caracteristicas_domicilio2",
        "caracteristicas_domicilio3",
        "cor_ou_raca",
        "demografia",
        "domicilios_indigenas",
        "domicilios_quilombolas",
        "obitos",
        "parentesco",
        "pessoas_indigenas",
        "pessoas_quilombolas",
    )
]

In [44]:
# Folders for the downloaded archives and for the extracted files.
# They are defined in this cell so it also works on a fresh kernel
# (Restart Kernel -> Run All); the values match the paths shown in the
# cell output.
directory = "datasets/IBGE"
directory_out = "datasets/IBGE"

# Create each distinct folder once before anything is written into it.
for folder in dict.fromkeys((directory, directory_out)):
    ensure_directory_exists(folder)

for zip_ in zips:
    download_url = "{}{}".format(base_url, zip_)
    zip_file_path = "{}/{}".format(directory, zip_)
    # Download the archive only if it is not already present locally
    download_file_if_not_exists(zip_file_path, download_url)

    # Extract the archive; the .zip stays on disk so later runs skip the download
    unzip(zip_file_path, directory_out)

File 'datasets/IBGE/Agregados_por_municipios_alfabetizacao_BR.zip' already exists.
Extracted 'datasets/IBGE/Agregados_por_municipios_alfabetizacao_BR.zip' to 'datasets/IBGE'.
Downloaded 'datasets/IBGE/Agregados_por_municipios_basico_BR.zip'.
Extracted 'datasets/IBGE/Agregados_por_municipios_basico_BR.zip' to 'datasets/IBGE'.
Downloaded 'datasets/IBGE/Agregados_por_municipios_caracteristicas_domicilio1_BR.zip'.
Extracted 'datasets/IBGE/Agregados_por_municipios_caracteristicas_domicilio1_BR.zip' to 'datasets/IBGE'.
Downloaded 'datasets/IBGE/Agregados_por_municipios_caracteristicas_domicilio2_BR.zip'.
Extracted 'datasets/IBGE/Agregados_por_municipios_caracteristicas_domicilio2_BR.zip' to 'datasets/IBGE'.
Downloaded 'datasets/IBGE/Agregados_por_municipios_caracteristicas_domicilio3_BR.zip'.
Extracted 'datasets/IBGE/Agregados_por_municipios_caracteristicas_domicilio3_BR.zip' to 'datasets/IBGE'.
Downloaded 'datasets/IBGE/Agregados_por_municipios_cor_ou_raca_BR.zip'.
Extracted 'datasets/IBGE/

# Load data

In [47]:
import os
import pandas as pd
import sqlite3

In [49]:
# Open the SQLite database that the following cells read from and write to.
db_path = "sql/eleicao.db"
connection = sqlite3.connect(db_path)

In [51]:
# Read the whole `municipio_nome` table into a DataFrame and preview it.
querry = """
SELECT * 
FROM municipio_nome
"""

municipios = pd.read_sql_query(sql=querry, con=connection)
municipios.head()

Unnamed: 0,CD_MUN,NM_MUN
0,2907806,CÍCERO DANTAS
1,2923902,PAU BRASIL
2,2923209,OLIVEIRA DOS BREJINHOS
3,2921500,MONTE SANTO
4,2921203,MIGUEL CALMON


In [53]:
def get_dataset(text):
    """Parse semicolon-separated text into a DataFrame of numeric columns.

    The first line is the header. Double quotes are removed everywhere and
    rows whose field count differs from the header are skipped. Each column
    is converted to ``int``, or to ``float`` when any of its values contains
    a decimal comma (which is replaced by a dot). Columns that cannot be
    converted, such as free text, are left out of the result.

    Args:
        text: Full content of a ``;``-delimited file, header included.

    Returns:
        pandas.DataFrame: the columns that could be converted to numbers,
        in header order.
    """
    # A leading byte-order mark would otherwise end up in the first column name.
    text = text.lstrip("\ufeff").replace('"', "")

    # rstrip("\r") tolerates Windows line endings in already-decoded text.
    lines = [line.rstrip("\r") for line in text.split("\n")]
    columns = lines[0].split(";")

    rows = []
    for line in lines[1:]:
        features = line.split(";")
        # Skips blank lines (e.g. the trailing one) and malformed rows.
        if len(features) == len(columns):
            rows.append(features)
    raw = pd.DataFrame(rows, columns=columns)

    numeric = {}
    for name in raw.columns:
        values = raw[name]
        try:
            # Inspect the whole column: looking only at the first value sends
            # a column such as ["10", "2,5"] down the int path and loses it.
            if values.str.contains(",", regex=False).any():
                numeric[name] = values.str.replace(",", ".", regex=False).astype(float)
            else:
                numeric[name] = values.astype(int)
        except (ValueError, TypeError, OverflowError):
            # Not a numeric column: leave it out of the result.
            continue

    return pd.DataFrame(numeric, index=raw.index)

In [55]:
def clean_municipios(df, reference=None):
    """Keep only the rows of ``df`` whose ``CD_MUN`` appears in a reference table.

    Args:
        df: DataFrame with a ``CD_MUN`` column of municipality codes.
        reference: DataFrame with a ``CD_MUN`` column listing the codes to
            keep. Defaults to the notebook-level ``municipios`` frame.

    Returns:
        pandas.DataFrame: the matching rows of ``df`` with a fresh 0..n-1 index.
    """
    if reference is None:
        reference = municipios

    # Select the code column by name; relying on it being the first column
    # breaks as soon as the reference table gains or reorders columns.
    known_codes = set(reference['CD_MUN'].astype(int))

    return df[df['CD_MUN'].isin(known_codes)].reset_index(drop=True)

In [57]:
# CSV files to load, listed in the same order as `csv_names`.
csv_files = [
    "Agregados_por_municipios_{}_BR.csv".format(topic)
    for topic in (
        "alfabetizacao",
        "basico",
        "caracteristicas_domicilio1",
        "caracteristicas_domicilio2",
        "caracteristicas_domicilio3",
        "cor_ou_raca",
        "demografia",
        "domicilios_indigenas",
        "domicilios_quilombolas",
        "obitos",
        "parentesco",
        "pessoas_indigenas",
        "pessoas_quilombolas",
    )
]

# Name paired (by position) with each entry of `csv_files`.
csv_names = [
    "alfabetizacao",
    "basico",
    "domicilio1",
    "domicilio2",
    "domicilio3",
    "cor_raca",
    "demografia",
    "domicilios_indigenas",
    "domicilios_quilombolas",
    "obitos",
    "parentesco",
    "pessoas_indigenas",
    "pessoas_quilombolas",
]

In [59]:
for file_name, table_name in zip(csv_files, csv_names):
    # Distinct names for the file name and the handle: the original reused
    # `file` for both, so the `with` statement overwrote the loop variable.
    # NOTE(review): no encoding is passed, so the platform default applies -
    # confirm it matches the encoding of the CSV files.
    with open("datasets/IBGE/{}".format(file_name), "r") as csv_file:
        text = csv_file.read()

    dataset_numeric = get_dataset(text)
    dataset_filtered = clean_municipios(dataset_numeric)

    # Store the DataFrame as a SQL table, replacing any existing table of that name
    dataset_filtered.to_sql(table_name, connection, if_exists='replace', index=False)

In [61]:
# Close the database connection
connection.close()