In [2]:
#Import necessary packages
import pandas as pd

from data_cleaning.country_match import country_code_map as geomap

In [8]:
def extract_dataframe(path: str, remove_extra_header=False):
    '''
    Reads a CSV file into a Pandas dataframe and makes the column names uniform
    (lower case, with spaces replaced by underscores).
    :param path - A path to a CSV File
    :param remove_extra_header - When truthy, the line directly below the header row
           (the extra, Portuguese header) is skipped while reading
    :return a Pandas dataframe with the normalized column names
    '''

    #skiprows=[1] drops only the second line of the file; None (the default) skips nothing
    rows_to_skip = [1] if remove_extra_header else None
    frame = pd.read_csv(path, skiprows=rows_to_skip)

    frame.columns = [header.lower().replace(' ', '_') for header in frame.columns]
    return frame

In [9]:
#Load one dataframe per election year from the electorate profile CSV files.
#Every file except 2010 is read with its extra header row removed.
df_1998, df_2002, df_2006, df_2014, df_2018 = (
    extract_dataframe(csv_path, remove_extra_header=True)
    for csv_path in (
        '../data/perfil_eleitorado_1998.csv',
        '../data/perfil_eleitorado_2002.csv',
        '../data/perfil_eleitorado_2006.csv',
        '../data/perfil_eleitorado_2014.csv',
        '../data/perfil_eleitorado_2018.csv',
    )
)
df_2010 = extract_dataframe('../data/perfil_eleitorado_2010.csv', remove_extra_header=False)

In [13]:
def standardize_country_code(df_orig):
    '''
    Standardizes post-2010 municipality codes to pre-2010 country codes
    :param df_orig - A Pandas dataframe with a 'municipality_code' column whose values are keys of geomap
    :return a copy of the dataframe in which 'municipality_code' and 'municipality_name' hold the
            (code, name) pair that geomap gives for each row's original code; df_orig is not modified
    :raises KeyError if a municipality code has no entry in geomap

    '''
    df = df_orig.copy() #Create a copy of the dataframe

    #Nothing to translate for a frame without rows; returning early leaves its columns as they are
    if len(df) == 0:
        return df

    #Look up the (new code, name) pair for every row in one pass.
    #An unknown code raises KeyError here, before any column is changed.
    replacements = [geomap[code] for code in df['municipality_code'].tolist()]

    #Assign whole columns by position instead of cell-by-cell via df.at, which
    #returns a Series (and breaks the lookup) when index labels are not unique.
    df['municipality_code'] = [new_code for new_code, _ in replacements]
    df['municipality_name'] = [name for _, name in replacements]

    return df

In [14]:
#Translate the 2010, 2014 and 2018 frames to the pre-2010 country codes
df_2010_std, df_2014_std, df_2018_std = map(
    standardize_country_code,
    (df_2010, df_2014, df_2018),
)

In [22]:
def combine_data(*dfs):
    '''
    Combines multiple dataframes into a single Pandas dataframe
    :param dfs - one or more dataframes, passed as separate positional arguments
    :return a new Pandas dataframe holding the rows of every input in argument order,
            with a fresh 0..n-1 index; the inputs are not modified
    :raises IndexError if no dataframes are given

    '''
    if not dfs:
        raise IndexError('combine_data requires at least one dataframe')

    #DataFrame.append returned a new frame instead of extending in place (and was
    #removed in pandas 2.0), so its result must not be discarded; concatenating
    #all inputs in one call keeps every frame and avoids repeated copying.
    return pd.concat(list(dfs), ignore_index=True)

In [23]:
#Combine every election year (post-2010 frames in their standardized form) into one dataframe
election_frames = (df_1998, df_2002, df_2006, df_2010_std, df_2014_std, df_2018_std)
df_combined = combine_data(*election_frames)

#Preview the first rows of the combined data
print(df_combined.head())

   election_year place_(zz_=_exterior)  municipality_code municipality_name  \
0           1998                    ZZ              11487           POLONIA   
1           1998                    ZZ              98361           NIGERIA   
2           1998                    ZZ              98965            ANGOLA   
3           1998                    ZZ              98507        COSTA RICA   
4           1998                    ZZ              98620          BULGARIA   

   gender_code gender_description  marital_status_code  \
0            2          MASCULINO                   -3   
1            2          MASCULINO                   -3   
2            2          MASCULINO                   -3   
3            4           FEMININO                   -3   
4            4           FEMININO                   -3   

  marital_status_description  age_group_code age_group_description  \
0                        #NE              -3                   #NE   
1                        #NE        