In [177]:
import pandas as pd
import pathlib
import math
from collections import defaultdict

# --- Paths -------------------------------------------------------------------
# Folder scanned for the per-state input CSV files.
DATA_FOLDER = "processed_zonas_eleitorais/"
# Folder that receives one output CSV per state.
NEW_DATA_FOLDER = "zonas_lat_long_per_state/"
# Name of the single, combined output CSV.
NEW_FILE = "zonas_lat_long.csv"

# --- Report files ------------------------------------------------------------
# NOTE: an earlier revision also listed "empty_first.csv" and
# "more_than_one_first.csv"; only the "second" and "third" files are read now.
ERROR_FILES = [
    "empty_second.csv",
    "empty_third.csv",
]
DUPLICATE_COORS_FILES = [
    "more_than_one_second.csv",
    "more_than_one_third.csv",
]

# Running total of zonas left out of the final data; the cells that count
# errors and duplicates add to it.
TOTAL_DESCONSIDERED = 0

# Load every regular file found in DATA_FOLDER into a dictionary keyed by the
# file's name, using "numero_zona" as the index of each dataframe.
# NOTE(review): the keys are presumably lowercase state initials, which means
# the input files would carry no extension -- confirm against the folder.
all_data = {}

# iterdir() yields entries in arbitrary order; sorting keeps the load order
# (and therefore the row order of the combined output) reproducible.
for csv_file in sorted(pathlib.Path(DATA_FOLDER).iterdir()):
    if not csv_file.is_file():
        continue
    # Path.name is the last path component on every OS, whereas splitting
    # str(path) on "/" only works with POSIX separators.
    all_data[csv_file.name] = pd.read_csv(
        csv_file, encoding="latin1", index_col="numero_zona"
    )

In [164]:
def read_list_files(file_list, read=True):
    """
    Read a list of CSV files and stack them into a single dataframe.

    Every file is read with latin1 encoding and its first column is renamed
    to "numero_zona"; all other column names are kept as read. Rows keep the
    index they had in their own file, so index values may repeat.

    Parameters
    ----------
    file_list : iterable of str or path-like
        CSV files to read, in the order they should be stacked.
    read : bool, optional
        Currently unused; kept only so existing calls remain valid.

    Returns
    -------
    pandas.DataFrame or None
        The stacked data, or None when file_list is empty.
    """
    frames = []
    for file in file_list:
        frame = pd.read_csv(file, encoding="latin1")
        frame.columns = ["numero_zona"] + list(frame.columns[1:])
        frames.append(frame)

    if not frames:
        return None

    # DataFrame.append was removed in pandas 2.0; one concat over all frames
    # yields the same rows, index and column order without repeated copying.
    return pd.concat(frames, sort=False)

# Checks the total number of errors

In [165]:
# Read the files listed in ERROR_FILES into one dataframe and report how many
# rows each state has in them.
# NOTE(review): these rows are presumably zonas for which no location was
# found -- confirm against the step that produced the "empty_*.csv" files.
error_data = read_list_files(ERROR_FILES)

total_count = 0
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for state, count in error_data.groupby("sigla_uf.1").size().items():
    total_count += count
    print(f"There were {count} errors in state '{state}'")
print(f"\nIn total there were {total_count} errors")

# NOTE: this accumulates in place, so re-running this cell without first
# re-running the cell that resets TOTAL_DESCONSIDERED counts the errors twice.
TOTAL_DESCONSIDERED += total_count

There were 2 errors in state 'MG'
There were 2 errors in state 'PB'
There were 1 errors in state 'PR'
There were 2 errors in state 'RS'

In total there were 7 errors


# Checks whether the multiple addresses always had the same lat/long

In [166]:
# Read the files listed in DUPLICATE_COORS_FILES into one dataframe and report
# how many rows each state has in them.
duplicates_data = read_list_files(DUPLICATE_COORS_FILES)

total_count = 0
# Series.items() replaces Series.iteritems(), which was removed in pandas 2.0.
for state, count in duplicates_data.groupby("sigla_uf.1").size().items():
    total_count += count
    print(f"There were {count} duplicates in state '{state}'")
print(f"\nIn total there were {total_count} duplicates")

There were 19 duplicates in state 'MG'
There were 6 duplicates in state 'MT'
There were 20 duplicates in state 'PA'
There were 11 duplicates in state 'PB'
There were 13 duplicates in state 'PE'
There were 7 duplicates in state 'PI'
There were 30 duplicates in state 'PR'
There were 20 duplicates in state 'RJ'
There were 9 duplicates in state 'RN'
There were 1 duplicates in state 'RR'
There were 8 duplicates in state 'RS'
There were 1 duplicates in state 'SC'
There were 7 duplicates in state 'SE'
There were 19 duplicates in state 'SP'
There were 6 duplicates in state 'TO'

In total there were 177 duplicates


In [168]:
# Count the zonas whose extra coordinates really differ from the first pair
# ("lat_0"/"lng_0"); repeats of the same lat/long are not treated as duplicates.
duplicates = defaultdict(list)
total_duplicates = 0

for state, state_data in duplicates_data.groupby("sigla_uf.1"):
    state_key = state.lower()
    state_frame = all_data[state_key]

    # Number of lat/lng pairs that come after "lng_0".
    # NOTE(review): assumes every column after "lng_0" belongs to such a pair.
    column_names = list(state_frame.columns)
    trailing_columns = len(column_names) - (column_names.index("lng_0") + 1)
    extra_cols = trailing_columns // 2

    duplicate_state_count = 0
    for num_zona in state_data["numero_zona"]:
        # Only the first row matching this zona number is inspected.
        zona_row = state_frame.query("numero_zona == @num_zona").iloc[0]
        lat, lng = zona_row["lat_0"], zona_row["lng_0"]

        # Walk the extra pairs (lat_1/lng_1, lat_2/lng_2, ...) and stop at the
        # first one that disagrees with the reference pair.
        for pair_number in range(1, extra_cols + 1):
            extra_lat = zona_row[f"lat_{pair_number}"]
            extra_lng = zona_row[f"lng_{pair_number}"]

            # A pair with a missing coordinate carries no information.
            if math.isnan(extra_lat) or math.isnan(extra_lng):
                continue

            if extra_lat != lat or extra_lng != lng:
                print(f"Different! {extra_lat}, {extra_lng}, {lat}, {lng}")
                duplicates[state_key].append(num_zona)
                duplicate_state_count += 1
                total_duplicates += 1
                break

    print(f"{state} in practice had {duplicate_state_count} duplicates")

print("\n\n")
print(f"In total we will desconsider {total_duplicates} zonas because of duplicate address\n\n")
TOTAL_DESCONSIDERED += total_duplicates

print(
    f"--> Considering BOTH duplicate and errors, "
    f"we are desconsidering {TOTAL_DESCONSIDERED} zonas"
)

MG in practice had 0 duplicates
MT in practice had 0 duplicates
PA in practice had 0 duplicates
PB in practice had 0 duplicates
PE in practice had 0 duplicates
PI in practice had 0 duplicates
PR in practice had 0 duplicates
RJ in practice had 0 duplicates
RN in practice had 0 duplicates
RR in practice had 0 duplicates
RS in practice had 0 duplicates
SC in practice had 0 duplicates
SE in practice had 0 duplicates
SP in practice had 0 duplicates
TO in practice had 0 duplicates



In total we will desconsider 0 zonas because of duplicate address


--> Considering BOTH duplicate and errors, we are desconsidering 7 zonas


In [169]:
# Remove real duplicates: drop every zona recorded in `duplicates` from the
# dataframe of its state.
for state, zonas_to_drop in duplicates.items():
    state_frame = all_data[state]
    # "numero_zona" is the index of the per-state dataframes (they are loaded
    # with index_col="numero_zona"), so the filter has to run on the index;
    # selecting it as a column would raise KeyError. Filtering the whole list
    # at once also avoids depending on a loop variable from another cell.
    all_data[state] = state_frame[~state_frame.index.isin(zonas_to_drop)]

In [178]:
# Save our data, both as individual files and as a single big file.

# to_csv does not create missing folders, so make sure the target exists.
pathlib.Path(NEW_DATA_FOLDER).mkdir(parents=True, exist_ok=True)

# One file per state, holding the index plus the first lat/long pair.
for state, state_frame in all_data.items():
    state_frame.to_csv(
        f"{NEW_DATA_FOLDER}{state}.csv",
        header=True,
        index=True,
        columns=["lat_0", "lng_0"],
    )

# DataFrame.append was removed in pandas 2.0; a single concat stacks all the
# states in dictionary order without re-copying the data on every iteration.
combined_data = pd.concat(list(all_data.values()), sort=False)
combined_data.to_csv(
    NEW_FILE,
    header=True,
    index=True,
    columns=["sigla_uf.1", "lat_0", "lng_0"],
)