# Download data

In [81]:
import os
import requests
import zipfile

In [82]:
def ensure_directory_exists(directory):
    """Make sure ``directory`` exists, creating it (and any parents) if needed.

    Args:
        directory: Path of the directory to check or create.

    Returns:
        bool: ``True`` if the directory was already present, ``False`` if this
        call had to create it.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    if os.path.isdir(directory):
        print(f"Directory '{directory}' already exists.")
        return True

    # exist_ok=True tolerates the directory appearing between the check above
    # and this call; a regular file sitting at the path still raises.
    os.makedirs(directory, exist_ok=True)
    print(f"Directory '{directory}' created.")
    return False

In [83]:
def download_file_if_not_exists(file_path, url, timeout=60):
    """Download ``url`` to ``file_path`` unless that file is already present.

    The body is streamed into a temporary ``<file_path>.part`` file and only
    renamed to ``file_path`` once the transfer finished, so an interrupted
    download is never mistaken for a complete file on the next run.

    Args:
        file_path: Destination path of the downloaded file.
        url: Address to download from.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if os.path.exists(file_path):
        print(f"File '{file_path}' already exists.")
        return

    partial_path = f"{file_path}.part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail loudly instead of saving an HTML error page as the dataset
            response.raise_for_status()
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    file.write(chunk)
        os.replace(partial_path, file_path)
    finally:
        # After a successful rename the partial file is gone; this only
        # cleans up when the download failed part-way.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Downloaded '{file_path}'.")

In [84]:
def unzip(zip_path, extract_to):
    """Extract every member of the archive at ``zip_path`` into ``extract_to``.

    The archive itself is left on disk. If ``zip_path`` is not a valid zip
    file, a message is printed and nothing is extracted.

    Args:
        zip_path: Path of the .zip archive.
        extract_to: Directory that receives the extracted files.
    """
    if not zipfile.is_zipfile(zip_path):
        print(f"'{zip_path}' is not a valid zip file.")
        return

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"Extracted '{zip_path}' to '{extract_to}'.")

In [85]:
# Main execution: fetch and unpack the 2022 party-vote archive from the TSE CDN
directory = "datasets"
zip_file_path = os.path.join(directory, "votacao_partido_munzona_2022.zip")
download_url = "https://cdn.tse.jus.br/estatistica/sead/odsele/votacao_partido_munzona/votacao_partido_munzona_2022.zip"
# splitext removes only the final extension, so a dot anywhere else in the
# path (e.g. "./datasets") cannot truncate the output directory name
directory_out = os.path.splitext(zip_file_path)[0]

# Make sure both the download folder and the extraction folder exist
ensure_directory_exists(directory)
ensure_directory_exists(directory_out)

# Download the archive unless a local copy is already present
download_file_if_not_exists(zip_file_path, download_url)

# Extract the archive; the .zip stays on disk so later runs skip the download
unzip(zip_file_path, directory_out)

Directory 'datasets' created.
Directory 'datasets/votacao_partido_munzona_2022' created.
Downloaded 'datasets/votacao_partido_munzona_2022.zip'.
Extracted 'datasets/votacao_partido_munzona_2022.zip' to 'datasets/votacao_partido_munzona_2022'.


# Carregando dados

In [86]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [87]:
# Load the raw CSV as one string. UTF-8 is tried first and Latin-1 is the
# fallback; both encodings are spelled out so the result does not depend on
# the platform's default encoding, and both paths open the file in text mode
# so line endings are normalised the same way.
csv_path = "datasets/votacao_partido_munzona_2022/votacao_partido_munzona_2022_BA.csv"
try:
    with open(csv_path, "r", encoding="utf-8") as file:
        text = file.read()
except UnicodeDecodeError:
    with open(csv_path, "r", encoding="latin1") as file:
        text = file.read()

In [88]:
# Split the raw text into records. Only lines with exactly EXPECTED_FIELDS
# ';'-separated fields are kept; anything else is counted and reported
# instead of being dropped without a trace.
EXPECTED_FIELDS = 36

rows = []
skipped = 0
for line in text.split("\n"):
    line = line.rstrip("\r")  # tolerate CRLF line endings
    if not line:
        continue  # blank lines (e.g. after the final newline) are not records
    campos = line.split(";")
    if len(campos) == EXPECTED_FIELDS:
        rows.append(campos)
    else:
        skipped += 1

if not rows:
    raise ValueError(f"No line with {EXPECTED_FIELDS} fields was found in the input text.")
if skipped:
    print(f"Skipped {skipped} line(s) that did not have {EXPECTED_FIELDS} fields.")

# The first kept line is the header; its names arrive wrapped in double quotes
columns = [name.strip('"') for name in rows[0]]
data = pd.DataFrame(rows[1:], columns=columns)

In [89]:
# A column with a single distinct value cannot distinguish any rows, so
# collect all such columns first and remove them in one call.
constant_columns = [name for name in data.columns if len(data[name].unique()) == 1]
data.drop(constant_columns, axis=1, inplace=True)

# Passar nulo para pd.NA

In [90]:
# Take the first row's SG_FEDERACAO value as the "null" marker.
# NOTE(review): this assumes the first record has no federation — confirm,
# otherwise a real federation abbreviation would be treated as missing.
nulo = data['SG_FEDERACAO'].iloc[0]

In [91]:
# Swap every occurrence of the null marker for pandas' missing-value scalar
data = data.replace(to_replace=nulo, value=pd.NA)

In [92]:
# Fraction of missing values in each column
data.isna().mean()

NR_TURNO                      0.000000
CD_ELEICAO                    0.000000
DT_ELEICAO                    0.000000
CD_MUNICIPIO                  0.000000
NM_MUNICIPIO                  0.000000
NR_ZONA                       0.000000
CD_CARGO                      0.000000
DS_CARGO                      0.000000
TP_AGREMIACAO                 0.000000
NR_PARTIDO                    0.000000
SG_PARTIDO                    0.000000
NM_PARTIDO                    0.000000
NR_FEDERACAO                  0.000000
NM_FEDERACAO                  0.742857
SG_FEDERACAO                  0.742857
DS_COMPOSICAO_FEDERACAO       0.742857
SQ_COLIGACAO                  0.000000
NM_COLIGACAO                  0.000000
DS_COMPOSICAO_COLIGACAO       0.000000
QT_VOTOS_LEGENDA_VALIDOS      0.000000
QT_TOTAL_VOTOS_LEG_VALIDOS    0.000000
QT_VOTOS_NOMINAIS_VALIDOS     0.000000
dtype: float64

# Tirar alguns redundantes

In [93]:
# Show the distinct values of every column that has fewer than 10 of them,
# followed by the column name and a blank separator line.
for column_name in data.columns:
    distinct_values = data[column_name].unique()
    if len(distinct_values) >= 10:
        continue
    print(distinct_values)
    print(column_name)
    print()
    

['1' '2']
NR_TURNO

['546' '547']
CD_ELEICAO

['"02/10/2022"' '"30/10/2022"']
DT_ELEICAO

['7' '6' '3' '5']
CD_CARGO

['"Deputado Estadual"' '"Deputado Federal"' '"Governador"' '"Senador"']
DS_CARGO

['"Partido isolado"' '"Federação"' '"Coligação"']
TP_AGREMIACAO

['-1' '2' '1' '3']
NR_FEDERACAO

[<NA> '"Federação Brasil da Esperança - FE BRASIL"'
 '"Federação PSDB Cidadania"' '"Federação PSOL REDE"']
NM_FEDERACAO

[<NA> '"PT/PC do B/PV"' '"PSDB/CIDADANIA"' '"PSOL/REDE"']
SG_FEDERACAO

[<NA> '"PC do B / PT / PV"' '"CIDADANIA / PSDB"' '"PSOL / REDE"']
DS_COMPOSICAO_FEDERACAO

['"PARTIDO ISOLADO"' '"FEDERAÇÃO"' '"PRA MUDAR A BAHIA"'
 '"PELA BAHIA, PELO BRASIL"' '"Agora é a vez do povo"'
 '" PELA BAHIA,PELO BRASIL"' '"BAHIA DE MÃOS DADAS COM O BRASIL "'
 '"Juntos Vamos Governar"' '"BAHIA DE MÃOS DADAS COM O BRASIL"']
NM_COLIGACAO



In [94]:
# Columns judged redundant after inspecting the low-cardinality values above
redundant_columns = ['CD_ELEICAO', 'DT_ELEICAO', 'NR_FEDERACAO', 'CD_CARGO']
data = data.drop(columns=redundant_columns)

# Corrigir strings

In [95]:
def remove_aspas(x):
    """Return ``x`` with every double-quote character removed.

    Args:
        x: Any value. Non-string values are returned unchanged.

    Returns:
        The string without ``"`` characters, or ``x`` itself when it is not
        a string.
    """
    if not isinstance(x, str):
        return x
    return x.replace('"', '')
    
# Strip the quote characters from every cell, one column at a time
for column_name in data.columns:
    unquoted = data[column_name].apply(remove_aspas)
    data[column_name] = unquoted

# Tipos

In [96]:
# Convert every column that parses cleanly as integers; other columns keep
# their current dtype.
for column_name in data.columns:
    try:
        # int64 is requested explicitly so the integer width does not depend
        # on the platform's default
        data[column_name] = data[column_name].astype("int64")
    except (ValueError, TypeError, OverflowError):
        # Non-numeric text or missing values: leave the column untouched.
        # Only conversion errors are caught, so interrupts and unrelated
        # failures still surface.
        continue

# Alinhar municipios

In [97]:
import geopandas

In [98]:
# NOTE(review): this cell re-declares a helper of the same name that is
# already defined earlier in the notebook; the redefinition is redundant.
def ensure_directory_exists(directory):
    """Make sure ``directory`` exists, creating it (and any parents) if needed.

    Args:
        directory: Path of the directory to check or create.

    Returns:
        bool: ``True`` if the directory was already present, ``False`` if this
        call had to create it.

    Raises:
        FileExistsError: If ``directory`` exists but is not a directory.
    """
    if os.path.isdir(directory):
        print(f"Directory '{directory}' already exists.")
        return True

    # exist_ok=True tolerates the directory appearing between the check above
    # and this call; a regular file sitting at the path still raises.
    os.makedirs(directory, exist_ok=True)
    print(f"Directory '{directory}' created.")
    return False

In [99]:
# NOTE(review): this cell re-declares a helper of the same name that is
# already defined earlier in the notebook; the redefinition is redundant.
def download_file_if_not_exists(file_path, url, timeout=60):
    """Download ``url`` to ``file_path`` unless that file is already present.

    The body is streamed into a temporary ``<file_path>.part`` file and only
    renamed to ``file_path`` once the transfer finished, so an interrupted
    download is never mistaken for a complete file on the next run.

    Args:
        file_path: Destination path of the downloaded file.
        url: Address to download from.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``.

    Raises:
        requests.HTTPError: If the server answers with an error status.
    """
    if os.path.exists(file_path):
        print(f"File '{file_path}' already exists.")
        return

    partial_path = f"{file_path}.part"
    try:
        with requests.get(url, stream=True, timeout=timeout) as response:
            # Fail loudly instead of saving an HTML error page as the dataset
            response.raise_for_status()
            with open(partial_path, 'wb') as file:
                for chunk in response.iter_content(chunk_size=64 * 1024):
                    file.write(chunk)
        os.replace(partial_path, file_path)
    finally:
        # After a successful rename the partial file is gone; this only
        # cleans up when the download failed part-way.
        if os.path.exists(partial_path):
            os.remove(partial_path)
    print(f"Downloaded '{file_path}'.")

In [100]:
# NOTE(review): this cell re-declares a helper of the same name that is
# already defined earlier in the notebook; the redefinition is redundant.
def unzip(zip_path, extract_to):
    """Extract every member of the archive at ``zip_path`` into ``extract_to``.

    The archive itself is left on disk. If ``zip_path`` is not a valid zip
    file, a message is printed and nothing is extracted.

    Args:
        zip_path: Path of the .zip archive.
        extract_to: Directory that receives the extracted files.
    """
    if not zipfile.is_zipfile(zip_path):
        print(f"'{zip_path}' is not a valid zip file.")
        return

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_to)
    print(f"Extracted '{zip_path}' to '{extract_to}'.")

In [101]:
# Main execution: fetch and unpack the 2022 BA municipal boundaries from the
# IBGE geoftp server
directory = "datasets"
zip_file_path = os.path.join(directory, "BA_Municipios_2022.zip")
download_url = "https://geoftp.ibge.gov.br/organizacao_do_territorio/malhas_territoriais/malhas_municipais/municipio_2022/UFs/BA/BA_Municipios_2022.zip"
# splitext removes only the final extension, so a dot anywhere else in the
# path (e.g. "./datasets") cannot truncate the output directory name
directory_out = os.path.splitext(zip_file_path)[0]

# Make sure both the download folder and the extraction folder exist
ensure_directory_exists(directory)
ensure_directory_exists(directory_out)

# Download the archive unless a local copy is already present
download_file_if_not_exists(zip_file_path, download_url)

# Extract the archive; the .zip stays on disk so later runs skip the download
unzip(zip_file_path, directory_out)

Directory 'datasets' already exists.
Directory 'datasets/BA_Municipios_2022' created.
Downloaded 'datasets/BA_Municipios_2022.zip'.
Extracted 'datasets/BA_Municipios_2022.zip' to 'datasets/BA_Municipios_2022'.


In [102]:
# Read the extracted boundary file and keep an upper-cased copy of the
# municipality names for comparison with the election data.
mun = geopandas.read_file('datasets/BA_Municipios_2022/BA_Municipios_2022.dbf')
upper_names = mun['NM_MUN'].apply(lambda name: name.upper())
mun_names = upper_names.values
mun

Unnamed: 0,CD_MUN,NM_MUN,SIGLA_UF,AREA_KM2,geometry
0,2900108,Abaíra,BA,538.677,"POLYGON ((-41.74667 -13.32509, -41.74679 -13.3..."
1,2900207,Abaré,BA,1604.923,"POLYGON ((-39.23481 -8.70419, -39.23427 -8.705..."
2,2900306,Acajutiba,BA,181.475,"POLYGON ((-38.02809 -11.75007, -38.02811 -11.7..."
3,2900355,Adustina,BA,629.099,"POLYGON ((-37.97867 -10.652, -37.97874 -10.652..."
4,2900405,Água Fria,BA,742.775,"POLYGON ((-38.60169 -11.92432, -38.59707 -11.9..."
...,...,...,...,...,...
412,2933307,Vitória da Conquista,BA,3254.186,"POLYGON ((-40.71779 -14.88926, -40.71774 -14.8..."
413,2933406,Wagner,BA,522.370,"POLYGON ((-41.06276 -12.28494, -41.07128 -12.2..."
414,2933455,Wanderley,BA,2920.579,"POLYGON ((-43.82392 -12.12517, -43.82324 -12.1..."
415,2933505,Wenceslau Guimarães,BA,655.057,"POLYGON ((-39.49336 -13.60308, -39.49335 -13.6..."


In [103]:
# List election-data municipality names that have no match in the boundary file
unmatched_names = [
    name for name in data['NM_MUNICIPIO'].unique() if name not in mun_names
]
for name in unmatched_names:
    print(name)

CAEM
SANTO ESTEVÃO
DIAS D ÁVILA
CAMACÃ


In [104]:
# Number of distinct municipality names (missing values, if any, count as one)
data['NM_MUNICIPIO'].nunique(dropna=False)

417

In [105]:
# Align the four spellings that differ between the election data and the
# boundary file. The replacement is limited to NM_MUNICIPIO so that an equal
# string in any other column is never rewritten, and it runs in one pass.
municipio_name_fixes = {
    'CAMACÃ': 'CAMACAN',
    'SANTO ESTEVÃO': 'SANTO ESTÊVÃO',
    'CAEM': 'CAÉM',
    'DIAS D ÁVILA': 'DIAS D\'ÁVILA',
}
data['NM_MUNICIPIO'] = data['NM_MUNICIPIO'].replace(municipio_name_fixes)

In [106]:
# Lookup from upper-cased municipality name to its CD_MUN code
mun_dict = {}
for cd_mun, nm_mun in zip(mun['CD_MUN'], mun['NM_MUN']):
    mun_dict[nm_mun.upper()] = cd_mun

In [107]:
# Attach the boundary-file code to each record (an unknown name raises
# KeyError) and expose the name under the boundary file's column name.
municipality_names = data['NM_MUNICIPIO']
data['CD_MUN'] = [mun_dict[name] for name in municipality_names]
data['NM_MUN'] = municipality_names

In [108]:
# The original municipality columns are superseded by CD_MUN / NM_MUN
data = data.drop(columns=['CD_MUNICIPIO', 'NM_MUNICIPIO'])

# SQL

In [109]:
import sqlite3

In [110]:
db_path = 'sql/eleicao.db'
# sqlite3.connect creates the database file but not its parent folder, so
# make sure the folder exists before connecting
os.makedirs(os.path.dirname(db_path), exist_ok=True)
connection = sqlite3.connect(db_path)

## votos_partido

In [111]:
# Show the columns available at this point, before they are split into tables
data.columns

Index(['NR_TURNO', 'NR_ZONA', 'DS_CARGO', 'TP_AGREMIACAO', 'NR_PARTIDO',
       'SG_PARTIDO', 'NM_PARTIDO', 'NM_FEDERACAO', 'SG_FEDERACAO',
       'DS_COMPOSICAO_FEDERACAO', 'SQ_COLIGACAO', 'NM_COLIGACAO',
       'DS_COMPOSICAO_COLIGACAO', 'QT_VOTOS_LEGENDA_VALIDOS',
       'QT_TOTAL_VOTOS_LEG_VALIDOS', 'QT_VOTOS_NOMINAIS_VALIDOS', 'CD_MUN',
       'NM_MUN'],
      dtype='object')

In [112]:
# Columns that make up the per-party vote table
votos_partido_columns = [
    'CD_MUN',
    'NR_TURNO',
    'NR_ZONA',
    'DS_CARGO',
    'NR_PARTIDO',
    'QT_VOTOS_LEGENDA_VALIDOS',
    'QT_TOTAL_VOTOS_LEG_VALIDOS',
    'QT_VOTOS_NOMINAIS_VALIDOS',
    'SQ_COLIGACAO',
]
votos_partido = data[votos_partido_columns]

In [113]:
# Write the DataFrame to a SQL table, replacing any table of the same name
table_name = 'votos_partido'
votos_partido.to_sql(name=table_name, con=connection, index=False, if_exists='replace')

31500

## partido

In [114]:
# One row per distinct combination of the party columns, without missing values.
# NOTE(review): TP_AGREMIACAO is part of the uniqueness key, so one party
# number may appear in more than one row — confirm that is intended.
partido_columns = ['NR_PARTIDO', 'NM_PARTIDO', 'TP_AGREMIACAO', 'SG_PARTIDO']
partido = (
    data[partido_columns]
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)
partido.head()

Unnamed: 0,NR_PARTIDO,NM_PARTIDO,TP_AGREMIACAO,SG_PARTIDO
0,44,UNIÃO BRASIL,Partido isolado,UNIÃO
1,11,PROGRESSISTAS,Partido isolado,PP
2,35,Partido da Mulher Brasileira,Partido isolado,PMB
3,20,Partido Social Cristão,Partido isolado,PSC
4,43,Partido Verde,Federação,PV


In [115]:
# Write the DataFrame to a SQL table, replacing any table of the same name
table_name = 'partido'
partido.to_sql(name=table_name, con=connection, index=False, if_exists='replace')

37

## federação

In [116]:
# One row per distinct federation, skipping records without federation data
federacao_columns = ['SG_FEDERACAO', 'NM_FEDERACAO', 'DS_COMPOSICAO_FEDERACAO']
federacao = (
    data[federacao_columns]
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)
federacao

Unnamed: 0,SG_FEDERACAO,NM_FEDERACAO,DS_COMPOSICAO_FEDERACAO
0,PT/PC do B/PV,Federação Brasil da Esperança - FE BRASIL,PC do B / PT / PV
1,PSDB/CIDADANIA,Federação PSDB Cidadania,CIDADANIA / PSDB
2,PSOL/REDE,Federação PSOL REDE,PSOL / REDE


In [117]:
# Write the DataFrame to a SQL table, replacing any table of the same name
table_name = 'federacao'
federacao.to_sql(name=table_name, con=connection, index=False, if_exists='replace')

3

## coligação

In [118]:
# One row per distinct combination of the coalition columns, without missing values
coligacao_columns = ['NM_COLIGACAO', 'SQ_COLIGACAO', 'DS_COMPOSICAO_COLIGACAO']
coligacao = (
    data[coligacao_columns]
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)
coligacao.head()

Unnamed: 0,NM_COLIGACAO,SQ_COLIGACAO,DS_COMPOSICAO_COLIGACAO
0,PARTIDO ISOLADO,50001683240,UNIÃO
1,PARTIDO ISOLADO,50001682242,PP
2,PARTIDO ISOLADO,50001683104,PMB
3,PARTIDO ISOLADO,50001681251,PSC
4,FEDERAÇÃO,50001681283,Federação Brasil da Esperança - FE BRASIL(PT/P...


In [119]:
# Write the DataFrame to a SQL table, replacing any table of the same name
table_name = 'coligacao'
coligacao.to_sql(name=table_name, con=connection, index=False, if_exists='replace')

60

## municipio

In [120]:
# One row per distinct (code, name) municipality pair, without missing values
municipio_columns = ['CD_MUN', 'NM_MUN']
municipio = (
    data[municipio_columns]
    .drop_duplicates()
    .dropna()
    .reset_index(drop=True)
)
municipio.head()

Unnamed: 0,CD_MUN,NM_MUN
0,2926004,REMANSO
1,2905404,CAIRU
2,2901106,AMÉLIA RODRIGUES
3,2926202,RIACHÃO DAS NEVES
4,2924801,PIRITIBA


In [121]:
# Write the DataFrame to a SQL table, replacing any table of the same name
table_name = 'municipio_nome'
municipio.to_sql(name=table_name, con=connection, index=False, if_exists='replace')

417

In [122]:
# Close the database connection opened for writing the tables
connection.close()