## Context
The previous dataset didn't provide sufficient detail for the feature.
This web scraper creates a dataset with the results for each party in each parish ('parish') in Portugal (mainland and autonomous territories), as well as the votes from the Europe and Outside of Europe electoral circles.

### Running this Notebook
You can run all cells to get the full dataset, or execute group 1 or group 2 separately:

- Group 1 - National Territory (3,000+ files, can take several minutes to run)
- Group 2 - Europe and Out of Europe

### Setup

In [None]:
#%pip install pandas
import pandas as pd
import os

In [None]:
# Base URL of the results site.
# Two mirrors exist and the active one sometimes changes; the alternative is:
# site = 'https://www.legislativas2024.mai.gov.pt/assets/static'
site = 'https://www.eleicoes.mai.gov.pt/legislativas2024/assets/static'

# URL prefixes (not local paths): get_children / get_result append a
# territory key and a file suffix to these.
tchild = f'{site}/territory-children/territory-children-'  # children territory codes url
tresult = f'{site}/territory-results/territory-results-'  # voting results url

# Root entry for the national territory. List values become a one-row
# column; the scalar False values are broadcast by pd.DataFrame.
portugal = dict(
    compensation=[False],
    name=['Portugal'],
    territoryKey=['LOCAL-500000'],
    district=False,
    county=False,
    level=['pais'],
)

# Root entry for the foreign territories, same layout as `portugal`.
foreign = dict(
    compensation=[False],
    name=['Estrangeiro'],
    territoryKey=['FOREIGN-600000'],
    district=False,
    county=False,
    level=['pais'],
)

# Parish names containing "/" are mapped to a comma-separated spelling,
# because the names are later used as file names.
parish_dict = {
    'Longueira/Almograve': 'Longueira, Almograve',
    'Alverca da Beira/Bouça Cova': 'Alverca da Beira, Bouça Cova',
    'Valbom/Bogalhal': 'Valbom, Bogalhal',
    'Santa Cruz/Trindade e Sanjurge': 'Santa Cruz, Trindade e Sanjurge',
    'Vila Cova do Covelo/Mareco': 'Vila Cova do Covelo, Mareco',
}

# Codes dataframe, seeded with the highest-tier (country) entry
codes = pd.DataFrame(portugal)

### Methods

In [None]:
# Loads the children json of a territory from its url
def get_children(territoryKey):
    """Read the children listing of ``territoryKey`` with ``pd.read_json``.

    The URL is the ``tchild`` prefix followed by the territory key and
    ``.json``. Whatever ``pd.read_json`` returns for that URL is returned
    unchanged.
    """
    children_url = ''.join((tchild, territoryKey, '.json'))
    return pd.read_json(children_url)

# Loads the result json of a territory from its url
def get_result(territoryKey):
    """Read the voting results of ``territoryKey`` with ``pd.read_json``.

    The URL is the ``tresult`` prefix followed by the territory key and
    ``-AR.json``. Whatever ``pd.read_json`` returns for that URL is
    returned unchanged.
    """
    result_url = ''.join((tresult, territoryKey, '-AR.json'))
    return pd.read_json(result_url)

# Builds the column mapping for a codes dataframe
def get_codes(row, district, county, level):
    """Return a dict with the six code fields, in a fixed key order.

    ``compensation``, ``name`` and ``territoryKey`` are looked up in
    ``row`` (anything indexable by those keys); ``district``, ``county``
    and ``level`` are stored as passed in.

    Raises:
        KeyError: if ``row`` lacks one of the three looked-up keys.
    """
    fields = {key: row[key] for key in ('compensation', 'name', 'territoryKey')}
    fields['district'] = district
    fields['county'] = county
    fields['level'] = level
    return fields

# Creates directories
def mkdir(lv2_codes):
    """Create one output directory per row of ``lv2_codes``.

    For every row the directory
    ``<cwd>/datasets/crawler/<Loc>/<district>/<name>`` is created, where
    ``<Loc>`` is the capitalized part of ``territoryKey`` before the first
    ``-`` and ``<name>`` is the row's ``name`` column.

    Args:
        lv2_codes: DataFrame with ``territoryKey``, ``district`` and
            ``name`` columns holding strings.

    Raises:
        FileExistsError: if a target path already exists but is not a
            directory.
    """
    # The working directory does not change inside the loop; resolve it once.
    base_dir = os.path.join(os.getcwd(), 'datasets', 'crawler')

    for _, row in lv2_codes.iterrows():
        loc = row['territoryKey'].split('-')[0].capitalize()
        target = os.path.join(base_dir, loc, row['district'], row['name'])
        # exist_ok avoids the separate exists() check and makes reruns a no-op
        os.makedirs(target, exist_ok=True)

In [None]:
# Drill down to districts
def drill_lv1(parent_lv0_codes):
    """Return the codes dataframe for the children of a root entry.

    The first ``territoryKey`` of ``parent_lv0_codes`` is passed to
    ``get_children``; the result is run through ``get_codes`` with
    ``district`` and ``county`` set to False and ``level`` set to
    ``'district'``, then wrapped in a DataFrame.
    """
    root_key = parent_lv0_codes['territoryKey'][0]
    children = get_children(root_key)
    columns = get_codes(children, False, False, 'district')
    return pd.DataFrame(columns)

# Drill down to counties
def drill_lv2(parent_lv1_codes):
    """Return the codes of all children of the given level-1 rows.

    For each row of ``parent_lv1_codes`` the children of its
    ``territoryKey`` are fetched with ``get_children`` and converted with
    ``get_codes``, using the row's ``name`` as ``district``, False as
    ``county`` and ``'county'`` as ``level``.

    Args:
        parent_lv1_codes: DataFrame with ``territoryKey`` and ``name``
            columns, as produced by ``drill_lv1``.

    Returns:
        One DataFrame with all child rows and a fresh 0..n-1 index; an
        empty DataFrame when ``parent_lv1_codes`` has no rows.
    """
    frames = []

    for _, parent in parent_lv1_codes.iterrows():
        tkey = parent['territoryKey']
        dist = parent['name']

        children = get_children(tkey)
        # Rows at this level have no county of their own, hence False.
        frames.append(pd.DataFrame(get_codes(children, dist, False, 'county')))

    # pd.concat rejects an empty list, so keep the empty-input result explicit.
    if not frames:
        return pd.DataFrame()

    # Concatenate once: concat inside the loop re-copies every earlier row
    # and mixes in an empty seed frame.
    return pd.concat(frames, ignore_index=True)

# Drill down to parishes
def drill_lv3(parent_lv2_codes):
    """Return the codes of all children of the given level-2 rows.

    For each row of ``parent_lv2_codes`` the children of its
    ``territoryKey`` are fetched with ``get_children`` and converted with
    ``get_codes``, using the row's ``district`` as ``district``, its
    ``name`` as ``county`` and ``'parish'`` as ``level``.

    Args:
        parent_lv2_codes: DataFrame with ``territoryKey``, ``district``
            and ``name`` columns, as produced by ``drill_lv2``.

    Returns:
        One DataFrame with all child rows and a fresh 0..n-1 index; an
        empty DataFrame when ``parent_lv2_codes`` has no rows.
    """
    frames = []

    for _, parent in parent_lv2_codes.iterrows():
        tkey = parent['territoryKey']
        dist = parent['district']
        county = parent['name']

        children = get_children(tkey)
        frames.append(pd.DataFrame(get_codes(children, dist, county, 'parish')))

    # pd.concat rejects an empty list, so keep the empty-input result explicit.
    if not frames:
        return pd.DataFrame()

    # Concatenate once: concat inside the loop re-copies every earlier row
    # and mixes in an empty seed frame.
    return pd.concat(frames, ignore_index=True)

# Saves the results
def save_results(parent_lv3_codes):
    """Download and store the results of every row as a CSV file.

    For each row, ``get_result`` is called with the row's ``territoryKey``
    and the table is written to
    ``datasets/crawler/<Loc>/<district>/<county>/<name>.csv`` (relative to
    the working directory), where ``<Loc>`` is the capitalized part of the
    key before the first ``-``. The CSV index column is labelled
    ``index``. Target directories are not created here.
    """
    for _, entry in parent_lv3_codes.iterrows():
        territory_key = entry['territoryKey']
        district_name = entry['district']
        county_name = entry['county']
        parish_name = entry['name']

        table = pd.DataFrame(get_result(territory_key))
        table.index.name = 'index'

        loc = territory_key.split('-')[0].capitalize()
        relative = '/'.join((loc, district_name, county_name, parish_name))
        table.to_csv('datasets/crawler/' + relative + '.csv')

# Saves the codes
def save_codes(codes):
    """Write the codes table to ``datasets/crawler/<loc>_codes.csv``.

    ``<loc>`` is the lower-cased part of the first row's ``territoryKey``
    before the first ``-``. The ``level`` and ``compensation`` columns are
    dropped, ``name`` is renamed to ``parish``, and the file has the
    columns ``territoryKey`` (as index), ``parish``, ``county`` and
    ``district``. The input DataFrame is not modified and the target
    directory must already exist.

    Args:
        codes: non-empty DataFrame with ``territoryKey``, ``name``,
            ``county``, ``district``, ``level`` and ``compensation``
            columns.
    """
    # .iloc[0] reads the first row by position, so a frame whose index does
    # not contain the label 0 (e.g. a filtered subset) still works.
    loc = codes['territoryKey'].iloc[0].split('-')[0].lower()
    path = 'datasets/crawler/' + loc + '_codes.csv'
    (
        codes.drop(columns=['level', 'compensation'])
        .rename(columns={'name': 'parish'})
        .reindex(columns=['territoryKey', 'parish', 'county', 'district'])
        .set_index('territoryKey')
        .to_csv(path)
    )
    

### Scraping National Territory

In [None]:
# Level 1: children of the Portugal root entry
districts = drill_lv1(portugal)
districts  # bare expression so the notebook displays the table

In [None]:
# Level 2: children of every district (one request per district row)
counties = drill_lv2(districts)
counties  # bare expression so the notebook displays the table

In [None]:
# One directory per county; save_results later writes its CSV files into them
mkdir(counties)

In [None]:
# Level 3: children of every county (one request per county row).
# Names listed in parish_dict contain "/" and are swapped for their
# comma-separated spelling before they are used as file names.
parishes = (
    drill_lv3(counties)
    .replace(parish_dict)
    .reset_index(drop=True)
)

parishes  # bare expression so the notebook displays the table

In [None]:
# Write the parish code table to a CSV file under datasets/crawler/
save_codes(parishes)

In [None]:
# Fetch the results of every parish and write one CSV file per parish
# (one request per row, so this is the slow step)
save_results(parishes)

### Scraping Europe and Out of Europe

In [None]:
# Level 1: children of the foreign root entry
circles = drill_lv1(foreign)
circles  # bare expression so the notebook displays the table

In [None]:
# Level 2: children of every circle (one request per circle row)
countries = drill_lv2(circles)
countries  # bare expression so the notebook displays the table

In [None]:
# Creates one directory per row of `countries`; save_results later writes
# its CSV files into them
mkdir(countries)

In [None]:
# Level 3: children of every row of `countries` (one request per row).
# Unlike the national run, no parish_dict name replacement is applied here.
embassies = drill_lv3(countries)
embassies  # bare expression so the notebook displays the table

In [None]:
# Write the level-3 foreign code table to a CSV file under datasets/crawler/
save_codes(embassies)

In [None]:
# Fetch the results of every row of `embassies` and write one CSV file each
save_results(embassies)