https://devhints.io/xpath
https://selenium-python.readthedocs.io/api.html


In [25]:
import scrapper_utils as scrapper_utils
import re
# Scraper configuration: browser binaries, page URL template and XPath lookups.
# Browser-console check used for the district dropdown:
#   $x('//*[@id="select-distriro"]').map(x=>x.textContent)

# Chromium binary and its chromedriver, relative to the working directory.
chromium_path = "drivers/chrome-linux64/chrome"
service_path = "drivers/chromedriver-linux64/chromedriver"

# Results page template; `{state}` takes the state id (1 to 32).
url = "https://prep2024.ine.mx/publicacion/nacional/presidencia/nacional/entidad/{state}/candidatura"

# XPath expressions keyed by purpose.
XPATH = {
    # Dropdown listing the states.
    'XPATH_STATES': '//select[@id="select-entidad"]',
    # Dropdown listing the districts.
    'XPATH_DISTRICT': '//select[@id="select-distriro"]',
    # One candidate panel; `{candidate}` takes the panel position (1 to 5).
    'XPATH_INFO': '/html/body/app-root/app-federal/div/div/div[3]/app-entidad/div/div[1]/div[2]/app-partido-politico/div[2]/div/div[{candidate}]',
    # Paragraph inside the participation-percentage component.
    'XPATH_PART': '/html/body/app-root/app-federal/div/div/div[3]/app-entidad/div/app-porcentaje-participacion/div/div/div/div/div[2]/div[2]/p',
}

In [4]:
def parse_election_data(data: str) -> dict:
    """Parse the text of one candidate panel into a result record.

    Each field is located by its label (``Total de votos``, ``Porcentaje``,
    ``En Territorio Nacional``, ``En el Extranjero``) followed directly by
    its digits. The name is the run of letters and whitespace that precedes
    ``Total de votos``.

    Args:
        data: Raw panel text. ``None`` or an empty string yields all defaults.

    Returns:
        A dict with the keys ``name`` (str), ``total_votes`` (int),
        ``percent`` (float, the captured percentage divided by 100),
        ``national_votes`` (int) and ``abroad_votes`` (int). A field that is
        missing or cannot be converted falls back to ``''``, ``0`` or ``0.0``.
    """
    data = data or ''

    def _count(pattern: str) -> int:
        """Return the integer captured by `pattern` (commas stripped), or 0."""
        found = re.search(pattern, data)
        if not found:
            return 0
        try:
            return int(found.group(1).replace(',', ''))
        except ValueError:
            # The capture held only separators (e.g. ","): treat it as missing.
            return 0

    def _fraction(pattern: str) -> float:
        """Return the percentage captured by `pattern` divided by 100, or 0.0."""
        found = re.search(pattern, data)
        if not found:
            return 0.0
        try:
            # A decimal comma is normalised to a dot before conversion.
            return float(found.group(1).replace(',', '.')) / 100
        except ValueError:
            # Malformed number such as "." or "1.2.3": treat it as missing.
            return 0.0

    name_match = re.search(r'([A-Za-zÁÉÍÓÚáéíóúÑñ\s]+)Total de votos', data)
    name = name_match.group(1).strip() if name_match else ''

    return {
        'name': name,
        'total_votes': _count(r'Total de votos([0-9,]+)'),
        # The character class accepts "," so the comma-to-dot normalisation
        # in _fraction is reachable for decimal-comma percentages.
        'percent': _fraction(r'Porcentaje([0-9.,]+)\s%'),
        'national_votes': _count(r'En Territorio Nacional([0-9,]+)'),
        'abroad_votes': _count(r'En el Extranjero([0-9,]+)'),
    }

In [18]:
import time
import os 
import json
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait       
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver import ActionChains
import lxml.html as html
from selenium.webdriver.support.ui import Select


def get_info(chromium_path, service_path, XPATH, url, settle_seconds=10, cooldown_seconds=5):
    """Open a results page in Chromium and parse the five candidate panels.

    Args:
        chromium_path: Path to the Chromium binary.
        service_path: Path to the chromedriver executable.
        XPATH: Mapping that provides at least ``'XPATH_STATES'`` and
            ``'XPATH_INFO'``; the latter must contain a ``{candidate}``
            placeholder.
        url: Address of the page to load.
        settle_seconds: Fixed pause after the states dropdown is present and
            before the panels are read. Default 10.
        cooldown_seconds: Pause after the browser has been closed. Default 5.

    Returns:
        dict mapping the panel position (1 to 5) to the record produced by
        ``parse_election_data`` for that panel's ``textContent``.

    Raises:
        selenium.common.exceptions.TimeoutException: if the states dropdown
            or a candidate panel is not present within 10 seconds.
    """
    service = Service(service_path)
    chrome_options = Options()
    chrome_options.binary_location = chromium_path

    json_parsed = {}
    # The context manager quits the browser on exit, even if scraping fails.
    with webdriver.Chrome(service=service, options=chrome_options) as driver:
        driver.get(url)
        driver.maximize_window()
        wait = WebDriverWait(driver, 10)

        # Poll for the dropdown rather than looking it up once right after
        # the page load, so a slow render does not fail the lookup.
        wait.until(EC.presence_of_element_located((By.XPATH, XPATH['XPATH_STATES'])))

        # NOTE(review): presumably the figures are filled in after the first
        # render; the fixed pause is kept because presence alone does not
        # prove the numbers are loaded.
        time.sleep(settle_seconds)

        for num in range(1, 6):
            locator = (By.XPATH, XPATH['XPATH_INFO'].format(candidate=num))
            panel = wait.until(EC.presence_of_element_located(locator))
            # get_attribute may return None; the parser expects a string.
            json_parsed[num] = parse_election_data(panel.get_attribute('textContent') or '')
        print(json_parsed)

    time.sleep(cooldown_seconds)
    return json_parsed

In [21]:
def create_folder_if_not_exists(folder_name):
    """Create `folder_name`, including missing parents, unless the path exists.

    Prints one line saying whether the folder was created or was already
    there.
    """
    if os.path.exists(folder_name):
        print(f"The folder '{folder_name}' already exists.")
        return

    os.makedirs(folder_name)
    print(f"The folder '{folder_name}' has been created.")

def create_json_file(json_data, file_name):
    """Write `json_data` to ``<file_name>.json`` as indented UTF-8 JSON.

    The file is opened in ``'w'`` mode, so an existing file is overwritten.
    Non-ASCII characters are written as-is rather than escaped.
    """
    target_path = f'{file_name}.json'
    with open(target_path, mode='w', encoding='utf-8') as handle:
        json.dump(json_data, handle, ensure_ascii=False, indent=4)

In [24]:
# Scrape each state page (ids 1 to 32) and save it as states/state_<id>.json.
create_folder_if_not_exists('states')
for id_state in range(1, 33):
    create_json_file(
        get_info(chromium_path, service_path, XPATH, url.format(state=id_state)),
        f'states/state_{id_state}',
    )

The folder 'states' already exists.
{1: {'name': 'Jorge Álvarez Máynez', 'total_votes': 56998, 'percent': 0.089398, 'national_votes': 56934, 'abroad_votes': 64}, 2: {'name': 'Bertha Xóchitl Gálvez Ruíz', 'total_votes': 294218, 'percent': 0.461466, 'national_votes': 293070, 'abroad_votes': 1148}, 3: {'name': 'Claudia Sheinbaum Pardo', 'total_votes': 270252, 'percent': 0.423876, 'national_votes': 269391, 'abroad_votes': 861}, 4: {'name': 'Candidaturas no registradas', 'total_votes': 1959, 'percent': 0.0030719999999999996, 'national_votes': 1953, 'abroad_votes': 6}, 5: {'name': 'Votos nulos', 'total_votes': 14145, 'percent': 0.022185, 'national_votes': 14128, 'abroad_votes': 17}}
{1: {'name': 'Jorge Álvarez Máynez', 'total_votes': 132459, 'percent': 0.10122, 'national_votes': 132356, 'abroad_votes': 103}, 2: {'name': 'Bertha Xóchitl Gálvez Ruíz', 'total_votes': 290133, 'percent': 0.221708, 'national_votes': 288804, 'abroad_votes': 1329}, 3: {'name': 'Claudia Sheinbaum Pardo', 'total_votes