In [1]:
import pandas as pd
import time
import random
import json
from datetime import datetime
from pytz import timezone

# scraping
from selenium import webdriver
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import Select
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import bs4
import requests
from requests.exceptions import RequestException



from CHN.general import Scrapping


import warnings
warnings.filterwarnings('ignore')

In [2]:
# Helper object from the project-local CHN.general module; the cells below use
# its genRequest method to issue the HTTP requests.
scrap = Scrapping()

In [3]:
# Base URL of the site and the query-string prefix; the search term is appended
# right after '?s=' when the request URL is built.
url = 'https://voxpopuliguate.com/'
busqueda = '?s='

In [4]:
# Search term.
nombre = 'bernardo arevalo'
# Join the whitespace-separated words with '+' for use in the query string
# (the same value is later reused in the output file name).
# NOTE(review): no percent-encoding is applied here — confirm whether terms with
# special characters (e.g. '&') need urllib.parse.quote_plus.
name = '+'.join(nombre.split())

In [5]:
# Request the search results page (URL = base + '?s=' + term). The returned
# object is used below through its status_code and content attributes.
response = scrap.genRequest(url + busqueda + name)

In [6]:
# Display the HTTP status code of the search request.
response.status_code

200

In [7]:
# Parse the response body with the 'html.parser' backend.
soup = BeautifulSoup(response.content, 'html.parser')

In [8]:
# Save a pretty-printed snapshot of the parsed search page for offline inspection.
# The encoding is explicit because the page contains non-ASCII (Spanish) text and
# the platform default encoding (e.g. cp1252 on Windows) can fail on or corrupt it.
with open('../data/voxpopuli/search_test_bernardo.html', 'w', encoding='utf-8') as search:
    search.write(soup.prettify())

### Extraemos la información de los artículos

In [9]:
def extraer_info_articulo(soup: BeautifulSoup) -> dict:
    """Extract the title, link and publication date from one article element.

    Args:
        soup: Parsed element for a single article (the notebook passes each
            match of ``div.loop article.l-post``).

    Returns:
        A dict with the keys ``"titulo"``, ``"link"`` and ``"fecha_entrada"``.
        A value is ``None`` when the corresponding tag or attribute is absent.
        ``"fecha_entrada"`` keeps only the part of the ``datetime`` attribute
        that precedes the ``'T'`` separator.
    """
    link_tag = soup.find("a", class_="image-link")
    # .get() avoids a KeyError when the tag exists but lacks the attribute.
    link = link_tag.get("href") if link_tag else None

    # A CSS selector matches any <h2> that carries both classes. Passing the
    # string "is-title post-title" to class_ only matches when the class
    # attribute is exactly that string, so headings with additional classes are
    # missed — presumably why some titles came back as None.
    title_tag = soup.select_one("h2.is-title.post-title")
    title = title_tag.get_text(strip=True) if title_tag else None

    fecha_entrada_tag = soup.find("time", class_="post-date")
    fecha_entrada = fecha_entrada_tag.get("datetime") if fecha_entrada_tag else None

    return {
        "titulo": title,
        "link": link,
        "fecha_entrada": fecha_entrada.split('T')[0] if fecha_entrada else None
    }

In [10]:
# Build one info dict per article element found in the results list.
articles = list(map(extraer_info_articulo, soup.select("div.loop article.l-post")))

In [11]:
# Display the extracted articles.
articles

[{'titulo': None,
  'link': 'https://voxpopuliguate.com/de-reprimir-protestas-a-jefe-de-seguridad-del-congreso/',
  'fecha_entrada': '2025-01-29'},
 {'titulo': None,
  'link': 'https://voxpopuliguate.com/el-misterioso-blindado-que-usa-el-presidente-del-congreso/',
  'fecha_entrada': '2025-01-27'},
 {'titulo': None,
  'link': 'https://voxpopuliguate.com/archivo-viejo-la-zona-insalubre-y-olvidada-del-hospital-san-juan-de-dios/',
  'fecha_entrada': '2025-01-21'},
 {'titulo': 'Contraloría denuncia a funcionarios por compra irregular de pasaportes, pero protege a otros implicados',
  'link': 'https://voxpopuliguate.com/contraloria-denuncia-a-funcionarios-por-compra-irregulares-de-pasaportes-pero-protege-a-otros-implicados/',
  'fecha_entrada': '2024-11-15'},
 {'titulo': 'La ruta del millonario negocio de los pasaportes',
  'link': 'https://voxpopuliguate.com/la-ruta-del-millonario-negocio-de-los-pasaportes/',
  'fecha_entrada': '2024-11-07'},
 {'titulo': 'Hermana de Miguel Martínez vive en 

## Páginas

In [12]:
def obtener_max_pags(soup : bs4.BeautifulSoup) -> int:
    """Return the highest page number found among the pagination links.

    Every ``<a>`` element with class ``page-numbers`` is inspected; only those
    whose stripped text consists of digits are considered. When none qualifies
    the result is 1.
    """
    etiquetas = (enlace.text.strip() for enlace in soup.find_all('a', class_='page-numbers'))
    return max((int(etiqueta) for etiqueta in etiquetas if etiqueta.isdigit()), default=1)

In [13]:
# Number of result pages, read from the pagination links of the parsed search page.
max_pages = obtener_max_pags(soup)

In [14]:
# Walk every results page, collect its articles, and dump everything to a JSON file.
full = []
# Browser-like User-Agent sent with every page request; it never changes, so it is built once.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/132.0.0.0 Safari/537.36'}

print('Inicio...')
for i in range(1, max_pages + 1):
    # path segment that selects the results page
    page = f'page/{i}/'
    print(f'Solicitando info pag {i}')

    try:
        # fetch the page
        response = scrap.genRequest(url + page + busqueda + name, headers=headers)
        print(f'Estado: {response.status_code}')

        # only successful responses are parsed; anything else is skipped
        if response.status_code != 200:
            continue

        soup = BeautifulSoup(response.content, 'html.parser')

        # one info dict per article on this page
        articles = list(map(extraer_info_articulo, soup.select("div.loop article.l-post")))

        # record the page number together with its articles
        data = {"page": i, "articulos": articles}
        full.append(data)
    except RequestException as e:
        # report the failure and move on to the next page
        print(f'Error al procesar pagina {i}: {e}')

# Wrap the per-page results under a single key and write them as UTF-8 JSON,
# keeping non-ASCII characters unescaped.
info = {'voxPopuli': full}
with open(f"../data/voxpopuli/resultados_1_{name}.json", 'w', encoding='utf-8') as ai:
    ai.write(json.dumps(info, indent=4, ensure_ascii=False))

Inicio...
Solicitando info pag 1
Estado: 200
Solicitando info pag 2
Estado: 200
Solicitando info pag 3
Estado: 200
