In [67]:
import pandas as pd
import pathlib
import math
from collections import defaultdict

# --- Configuration -----------------------------------------------------------
# Folder holding one processed CSV per state (input).
DATA_FOLDER = "processed_zonas_eleitorais/"
# Folder that receives one lat/long CSV per state (output).
NEW_DATA_FOLDER = "zonas_lat_long_per_state/"
# Single CSV that receives every state stacked together (output).
NEW_FILE = "zonas_lat_long.csv"

# Files listing the zonas reported as errors (one file per pass).
ERROR_FILES = [
    "empty_first.csv",
    "empty_second.csv",
    "empty_third.csv",
]
# Files listing the zonas that have more than one coordinate (one file per pass).
DUPLICATE_COORS_FILES = [
    "more_than_one_first.csv",
    "more_than_one_second.csv",
    "more_than_one_third.csv",
]

# Running count of zonas left out; later cells add to it.
TOTAL_DESCONSIDERED = 0
# When True the error zonas are dropped instead of being mapped manually.
SKIP_ERROS = True

# Maps each file name found in DATA_FOLDER to its dataframe. Later cells look
# frames up with the lower-cased state initials, so the file names are expected
# to be exactly those initials.
all_data = {}

# Read every regular file in DATA_FOLDER, indexing each frame by "numero_zona"
for csv_file in pathlib.Path(DATA_FOLDER).iterdir():
    if not csv_file.is_file():
        continue
    # Path.name is the last path component on any OS; splitting the string on
    # "/" only works with POSIX separators.
    name = csv_file.name
    all_data[name] = pd.read_csv(csv_file, encoding="latin1", index_col="numero_zona")

In [68]:
def read_list_files(file_list, read=True):
    """
    Read a list of CSV files and stack them into a single dataframe.

    Every file is read with latin1 encoding and its first column is renamed to
    "numero_zona". The frames are stacked in the order given, keeping each
    file's own row index (the index is not reset) and without sorting columns.

    Parameters
    ----------
    file_list : iterable
        Paths (or anything `pd.read_csv` accepts) of the files to read.
    read : bool, optional
        Currently ignored; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame or None
        The stacked data, or None when `file_list` is empty.
    """
    frames = []
    for file in file_list:
        frame = pd.read_csv(file, encoding="latin1")
        frame.columns = ["numero_zona"] + list(frame.columns[1:])
        frames.append(frame)

    # pd.concat raises on an empty list, so keep returning None in that case
    if not frames:
        return None

    # One concat replaces repeated DataFrame.append (removed in pandas 2.0)
    return pd.concat(frames, sort=False)

# Check the total number of errors

In [69]:
# Read the zonas listed in the error files
error_data = read_list_files(ERROR_FILES)

# Report how many error rows each state has, plus the grand total.
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
total_count = 0
for state, count in error_data.groupby("sigla_uf.1").size().items():
    total_count += count
    print(f"There were {count} errors in state '{state}'")
print(f"\nIn total there were {total_count} errors")

# NOTE: re-running this cell without resetting TOTAL_DESCONSIDERED adds the
# errors again.
TOTAL_DESCONSIDERED += total_count

There were 1 errors in state 'MA'
There were 2 errors in state 'MG'
There were 2 errors in state 'PB'
There were 1 errors in state 'PR'
There were 2 errors in state 'RS'

In total there were 8 errors


# Check whether the multiple addresses always map to the same lat/long

In [70]:
# Read the zonas listed in the duplicate-coordinates files
duplicates_data = read_list_files(DUPLICATE_COORS_FILES)

# Report how many duplicate rows each state has, plus the grand total.
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
total_count = 0
for state, count in duplicates_data.groupby("sigla_uf.1").size().items():
    total_count += count
    print(f"There were {count} duplicates in state '{state}'")
print(f"\nIn total there were {total_count} duplicates")

There were 1 duplicates in state 'AC'
There were 2 duplicates in state 'AL'
There were 1 duplicates in state 'AM'
There were 2 duplicates in state 'AP'
There were 18 duplicates in state 'MA'
There were 19 duplicates in state 'MG'
There were 6 duplicates in state 'MT'
There were 20 duplicates in state 'PA'
There were 11 duplicates in state 'PB'
There were 13 duplicates in state 'PE'
There were 7 duplicates in state 'PI'
There were 30 duplicates in state 'PR'
There were 20 duplicates in state 'RJ'
There were 9 duplicates in state 'RN'
There were 1 duplicates in state 'RR'
There were 8 duplicates in state 'RS'
There were 1 duplicates in state 'SC'
There were 7 duplicates in state 'SE'
There were 19 duplicates in state 'SP'
There were 6 duplicates in state 'TO'

In total there were 201 duplicates


In [71]:
total_duplicates = 0
# state key (lower-cased initials) -> zona numbers whose coordinates disagree
duplicates = defaultdict(list)

# A zona only counts as a real duplicate when one of its extra lat/long pairs
# differs from the first pair (lat_0, lng_0).
for state, state_data in duplicates_data.groupby("sigla_uf.1"):
    state_frame = all_data[state.lower()]
    column_names = list(state_frame.columns)
    # Number of (lat_i, lng_i) pairs that come after lng_0
    n_extra_pairs = (len(column_names) - (column_names.index("lng_0") + 1)) // 2

    duplicate_state_count = 0
    for num_zona in state_data["numero_zona"]:
        zona_row = state_frame.query("numero_zona == @num_zona").iloc[0]
        lat, lng = zona_row["lat_0"], zona_row["lng_0"]

        # Walk the extra pairs and stop at the first one that disagrees
        for pair_number in range(1, n_extra_pairs + 1):
            extra_lat, extra_lng = zona_row[f"lat_{pair_number}"], zona_row[f"lng_{pair_number}"]
            # A pair with a missing value is not compared
            if math.isnan(extra_lat) or math.isnan(extra_lng):
                continue
            # An identical pair is not a real duplicate
            if extra_lat == lat and extra_lng == lng:
                continue
            print(f"Different! {extra_lat}, {extra_lng}, {lat}, {lng}")
            duplicate_state_count += 1
            duplicates[state.lower()].append(num_zona)
            total_duplicates  += 1
            break
    print(f"{state} in practice had {duplicate_state_count} duplicates")

print("\n\n")
print(f"In total we will desconsider {total_duplicates} zonas because of duplicate address\n\n")
TOTAL_DESCONSIDERED += total_duplicates

print(f"--> Considering BOTH duplicate and errors, "\
        f"we are desconsidering {TOTAL_DESCONSIDERED} zonas")

AC in practice had 0 duplicates
AL in practice had 0 duplicates
AM in practice had 0 duplicates
AP in practice had 0 duplicates
MA in practice had 0 duplicates
MG in practice had 0 duplicates
MT in practice had 0 duplicates
PA in practice had 0 duplicates
PB in practice had 0 duplicates
PE in practice had 0 duplicates
PI in practice had 0 duplicates
PR in practice had 0 duplicates
RJ in practice had 0 duplicates
RN in practice had 0 duplicates
RR in practice had 0 duplicates
RS in practice had 0 duplicates
SC in practice had 0 duplicates
SE in practice had 0 duplicates
SP in practice had 0 duplicates
TO in practice had 0 duplicates



In total we will desconsider 0 zonas because of duplicate address


--> Considering BOTH duplicate and errors, we are desconsidering 8 zonas


In [72]:
# Remove the zonas that turned out to be real duplicates.
# "numero_zona" is the index of every frame in all_data (set when the files
# were read), so rows are filtered on the index and not on a column.
for state, zonas in duplicates.items():
    state_frame = all_data[state]
    all_data[state] = state_frame[~state_frame.index.isin(zonas)]
    for num_zona in zonas:
        print(num_zona)

In [78]:
# Manually located coordinates for the error zonas, keyed by the value found
# in the "sigla_uf" column of the error files. Values are (lat, lng).
mapping = {
    # NOTE(review): this latitude is positive while every other entry is
    # negative -- possibly a missing minus sign; confirm before enabling mapping.
    "10-0061": (4.877643, -44.8823997),
    "13-0014": (-21.738108, -44.30558),
    "13-0089": (-19.169828, -41.469617),
    "15-0011": (-6.970878, -35.696102),
    "15-0031": (-6.772458, -37.795260),
    "14-0120": (-24.294004, -53.311368),
    "21-0123": (-31.866516, -52.825239),
    "21-0149": (-29.567182, -50.798952),
}

if not SKIP_ERROS:
    # Fill in the manually mapped coordinates.
    # The earlier version assigned with all_data[state][zona] = ..., which
    # creates a *column* named after the zona instead of writing a row; rows
    # must be written through .loc.
    for idx, row in error_data.iterrows():
        state = row["sigla_uf.1"].lower()
        if state not in all_data:
            print("Something went wrong with ", row)
            continue
        num_zona = row["numero_zona"]
        lat, lng = mapping[row["sigla_uf"]]
        state_frame = all_data[state]
        if num_zona not in state_frame.index:
            # Add the missing zona, aligned to the state's columns
            state_frame.loc[num_zona] = row.drop("numero_zona").reindex(state_frame.columns)
        state_frame.loc[num_zona, ["lat_0", "lng_0"]] = [lat, lng]
else:
    # Delete the errors
    for idx, row in error_data.iterrows():
        state = row["sigla_uf.1"].lower()
        if state not in all_data:
            print("Something went wrong with ", row)
        else:
            # errors="ignore" lets the cell be re-run after the zona is gone
            all_data[state] = all_data[state].drop(row["numero_zona"], errors="ignore")
        


In [79]:
# Save our data, both as individual files and as a single big file.
# Create the per-state output folder first so to_csv does not fail when it
# does not exist yet.
pathlib.Path(NEW_DATA_FOLDER).mkdir(parents=True, exist_ok=True)

for state, state_frame in all_data.items():
    print(state)
    state_frame.to_csv(f"{NEW_DATA_FOLDER}{state}.csv", header=True,
                       index=True, columns=["lat_0", "lng_0"])

# Stack every state in one call; DataFrame.append was removed in pandas 2.0.
# Rows keep the order in which the states appear in all_data.
temp = pd.concat(list(all_data.values()), sort=False)
temp.to_csv(NEW_FILE, header=True,
            index=True, columns=["sigla_uf.1", "lat_0", "lng_0"])


mt
ce
al
ro
pe
pi
rj
pa
pr
ms
pb
sc
am
ap
go
ba
mg
df
to
sp
rr
es
rs
se
ma
ac
rn


In [80]:
# Display the frame stored under the "pb" key (a bare last expression is
# rendered by the notebook).
all_data["pb"]

Unnamed: 0_level_0,sigla_uf,endereco,cep,bairro,nome_municipio,sigla_uf.1,lat_0,lng_0,lat_1,lng_1
numero_zona,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
65,15-0065,"AV. BARÃO DO RIO BRANCO, 455",58700370,BRASÃLIA,PATOS,PB,-21.743304,-43.354250,-21.743304,-43.354250
36,15-0036,"AV. DEP. AMÃRICO MAIA, S/N",58884000,JOÃO SERAFIM,CATOLÃ DO ROCHA,PB,-6.350704,-37.746299,,
10,15-0010,"AV. JUSCELINO KUBITSCHEK, S/N",58200000,JUÃ,GUARABIRA,PB,-6.848561,-35.486431,,
61,15-0061,"AV. LIBERDADE, N. 3423",58306000,CENTRO,BAYEUX,PB,-7.124843,-34.923883,,
72,15-0072,"AV. RIO GRANDE DO SUL, S/N -",58414025,LIBERDADE,CAMPINA GRANDE,PB,-7.231572,-35.892178,,
53,15-0053,"AVENIDA FRANCISCO LEÃO VELOSO, N 438-A",58915000,ALGASA,UIRAÃNA,PB,-6.524157,-38.414876,,
28,15-0028,"AVENIDA RIO BRANCO, 455",58700370,CENTRO,PATOS,PB,-7.024857,-37.272872,,
57,15-0057,BR 230 KM 01- B,58310000,CENTRO,CABEDELO,PB,-6.978847,-34.830734,,
26,15-0026,FÃRUM ELEITORAL DES. LUIZ SILVIO RAMALHO - RU...,58600000,BAIRRO SAO JOSE,SANTA LUZIA,PB,-6.871427,-36.912480,,
35,15-0035,FÃRUM ELEITORAL DES. WALTER SARMENTO DE SÃ -...,58804718,RACHEL GADELHA,SOUSA,PB,-6.762525,-38.224935,,
