In [None]:
# Install required packages from requirement.txt
import sys
!{sys.executable} -m pip install -r requirements.txt

Collecting beautifulsoup4 (from -r requirements.txt (line 2))
  Downloading beautifulsoup4-4.13.4-py3-none-any.whl.metadata (3.8 kB)
Collecting tqdm (from -r requirements.txt (line 3))
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
     ---------------------------------------- 0.0/57.7 kB ? eta -:--:--
     ---------------------------------------- 0.0/57.7 kB ? eta -:--:--
     ------------- ------------------------ 20.5/57.7 kB 330.3 kB/s eta 0:00:01
     -------------------- ----------------- 30.7/57.7 kB 330.3 kB/s eta 0:00:01
     --------------------------------- ---- 51.2/57.7 kB 375.8 kB/s eta 0:00:01
     -------------------------------------- 57.7/57.7 kB 338.8 kB/s eta 0:00:00
Collecting soupsieve>1.2 (from beautifulsoup4->-r requirements.txt (line 2))
  Downloading soupsieve-2.7-py3-none-any.whl.metadata (4.6 kB)
Collecting typing-extensions>=4.0.0 (from beautifulsoup4->-r requirements.txt (line 2))
  Using cached typing_extensions-4.14.0-py3-none-any.whl.metada


[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: c:\Users\franc\Documents\GitHub\nlp_project\.venv\Scripts\python.exe -m pip install --upgrade pip


In [10]:
import os
import requests
from bs4 import BeautifulSoup
import csv
import time
import random
from tqdm import tqdm
from colorama import Fore, Style

In [5]:
# Constants
BASE_URL = "https://www.casarosada.gob.ar/informacion/discursos"
OUTPUT_DIR = "scraped_data"
CSV_FILE = os.path.join(OUTPUT_DIR, "speeches.csv")
PROGRESS_FILE = os.path.join(OUTPUT_DIR, "progress.txt")

HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
}

In [6]:
# Ensure output directory exists
os.makedirs(OUTPUT_DIR, exist_ok=True)


def safe_get(url):
    try:
        response = requests.get(url, headers=HEADERS, timeout=10)
        response.raise_for_status()
        return response
    except requests.exceptions.RequestException as e:
        print(Fore.RED + f"[ERROR] No se pudo acceder a: {url}\n{e}" + Style.RESET_ALL)
        return None


def get_speech_links(page_url):
    response = safe_get(page_url)
    if not response:
        return []
    soup = BeautifulSoup(response.content, "html.parser")
    links = soup.find_all("a", class_="panel")
    return [link["href"] for link in links]


def scrape_speech(speech_url):
    response = safe_get(speech_url)
    if not response:
        return "No Title", "No Date", "No Content"

    soup = BeautifulSoup(response.content, "html.parser")

    title_tag = soup.find("h2", class_="panel-title")
    date_tag = soup.find("time")
    content_tag = soup.find("div", class_="item-page")

    title = title_tag.get_text(strip=True) if title_tag else "No Title"
    date = date_tag.get_text(strip=True) if date_tag else "No Date"
    content = content_tag.get_text(strip=True) if content_tag else "No Content"

    return title, date, content


def save_speech_to_txt(speech_id, date, content):
    formatted_date = date.replace(" ", "_").replace(",", "").replace(":", "")
    filename = os.path.join(OUTPUT_DIR, f"{speech_id}_{formatted_date}.txt")
    with open(filename, "w", encoding="utf-8") as file:
        file.write(content)


def load_progress():
    if os.path.exists(PROGRESS_FILE):
        with open(PROGRESS_FILE, "r") as file:
            return set(line.strip() for line in file)
    return set()


def save_progress(speech_id):
    with open(PROGRESS_FILE, "a") as file:
        file.write(f"{speech_id}\n")


def main():
    all_speeches = []
    page_number = 1
    speech_id = 1
    completed_speeches = load_progress()

    while True:
        page_url = f"{BASE_URL}?start={(page_number - 1) * 40}"
        print(Fore.GREEN + f"\nScraping página {page_number}..." + Style.RESET_ALL)
        speech_links = get_speech_links(page_url)

        if not speech_links:
            print(Fore.YELLOW + "No se encontraron más discursos. Fin del scraping." + Style.RESET_ALL)
            break

        for link in tqdm(speech_links, desc="Discursos en la página"):
            full_url = f"https://www.casarosada.gob.ar{link}"
            if str(speech_id) in completed_speeches:
                speech_id += 1
                continue

            title, date, content = scrape_speech(full_url)
            all_speeches.append([title, date, full_url, content])
            save_speech_to_txt(speech_id, date, content)
            save_progress(speech_id)
            speech_id += 1

            # Pausa aleatoria para evitar bloqueo
            time.sleep(random.uniform(1.5, 3.0))

        page_number += 1

    # Guardar CSV
    with open(CSV_FILE, "w", newline="", encoding="utf-8") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["Title", "Date", "URL", "Content"])
        writer.writerows(all_speeches)

    print(Fore.BLUE + "\nScraping completado exitosamente." + Style.RESET_ALL)


if __name__ == "__main__":
    main()

[32m
Scraping página 1...[0m


Discursos en la página:   2%|▎         | 1/40 [00:06<03:58,  6.11s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50997-palabras-del-presidente-de-la-nacion-javier-milei-luego-de-recibir-el-premio-escuela-de-salamanca-desde-el-centro-riojano-espana
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  15%|█▌        | 6/40 [00:30<03:03,  5.39s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50992-palabras-del-presidente-de-la-nacion-javier-milei-en-jerusalen
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  18%|█▊        | 7/40 [00:34<02:38,  4.80s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50989-discurso-del-presidente-de-la-nacion-javier-milei-en-el-madrid-economic-forum-espana
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  25%|██▌       | 10/40 [00:51<02:49,  5.65s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50959-discurso-del-presidente-javier-milei-en-la-11-edicion-del-latam-economic-forum-2025-en-la-ciudad-de-buenos-aires
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  38%|███▊      | 15/40 [01:20<02:32,  6.10s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50926-palabras-a-la-prensa-del-presidente-javier-milei-en-declaracion-conjunta-durante-la-visita-oficial-a-la-republica-del-paraguay-desde-el-palacio-de-gobierno
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  50%|█████     | 20/40 [01:45<01:41,  5.08s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50868-discurso-del-presidente-javier-milei-en-el-cpac-de-washington-d-c-2025
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  52%|█████▎    | 21/40 [01:48<01:26,  4.55s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50866-discurso-del-presidente-javier-milei-en-el-banco-interamericano-de-desarrollo-bid-en-washington-d-c
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  60%|██████    | 24/40 [02:02<01:20,  5.02s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50848-discurso-del-presidente-de-la-nacion-javier-milei-desde-el-foro-de-davos-suiza
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  62%|██████▎   | 25/40 [02:07<01:13,  4.90s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50844-palabras-del-presidente-de-la-nacion-javier-milei-en-el-milken-center-en-washington-d-c
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  68%|██████▊   | 27/40 [02:15<00:57,  4.41s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50841-palabras-del-presidente-javier-milei-en-la-gala-inaugural-hispanica-tras-recibir-el-premio-lws-2025-titan-de-la-reforma-economica-washington-dc
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  70%|███████   | 28/40 [02:19<00:51,  4.25s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50826-el-presidente-de-la-nacion-realiza-anuncios-en-materia-nuclear
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  72%|███████▎  | 29/40 [02:22<00:45,  4.14s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50825-palabras-del-presidente-de-la-nacion-javier-milei-en-la-inauguracion-de-la-nueva-sede-de-la-bolsa-de-comercio-de-cordoba
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  82%|████████▎ | 33/40 [02:45<00:38,  5.48s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50820-discurso-del-presidente-javier-milei-tras-recibir-el-premio-internacional-milton-friedman-en-roma
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  85%|████████▌ | 34/40 [02:49<00:30,  5.01s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50817-discurso-del-presidente-javier-milei-en-cadena-nacional-por-el-ano-de-gestion
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  88%|████████▊ | 35/40 [02:53<00:23,  4.69s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50813-palabras-del-presidente-de-la-nacion-javier-milei-luego-de-recibir-un-reconocimiento-de-la-asociacion-de-dirigentes-de-marketing-de-uruguay-en-montevideo
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  92%|█████████▎| 37/40 [03:01<00:13,  4.36s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50811-palabras-del-presidente-de-la-nacion-javier-milei-en-la-lxv-cumbre-del-mercosur-en-montevideo-uruguay
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  95%|█████████▌| 38/40 [03:05<00:08,  4.31s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50808-palabras-del-presidente-de-la-nacion-javier-milei-en-el-encuentro-de-los-lideres-en-el-cronista
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página: 100%|██████████| 40/40 [03:13<00:00,  4.83s/it]


[32m
Scraping página 2...[0m


Discursos en la página:   5%|▌         | 2/40 [00:10<03:19,  5.25s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50792-palabras-del-presidente-de-la-nacion-javier-milei-en-la-entrega-de-sables-a-las-fuerzas-de-seguridad-en-el-salon-blanco-de-la-casa-rosada
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  10%|█         | 4/40 [00:19<02:59,  5.00s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50785-declaracion-conjunta-del-presidente-de-la-nacion-javier-milei-y-la-primer-ministro-de-italia-giorgia-meloni-en-visita-de-estado-a-nuestro-pais-desde-casa-rosada
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  12%|█▎        | 5/40 [00:23<02:42,  4.65s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50780-intervencion-del-presidente-de-la-republica-argentina-javier-milei-en-la-sesion-ii-de-la-cumbre-de-lideres-del-g20
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  18%|█▊        | 7/40 [00:30<02:17,  4.15s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50777-palabras-del-presidente-de-la-nacion-javier-milei-ante-inversores-de-la-cpac-en-mar-a-lago-florida-estados-unidos
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  25%|██▌       | 10/40 [00:45<02:19,  4.66s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50769-palabras-del-presidente-de-la-nacion-javier-milei-en-la-gala-anual-de-la-fundacion-endeavor
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  30%|███       | 12/40 [00:53<02:01,  4.34s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50767-palabras-del-presidente-de-la-nacion-javier-milei-despues-de-visitar-la-sede-corporativa-de-uala-en-caba
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  45%|████▌     | 18/40 [01:27<01:59,  5.45s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50729-palabras-del-presidente-de-la-nacion-javier-milei-en-el-60-coloquio-de-idea-2024-mar-del-plata
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  50%|█████     | 20/40 [01:36<01:38,  4.94s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50713-palabras-del-presidente-de-la-nacion-javier-milei-en-la-presentacion-del-centro-cultural-palacio-libertad-domingo-faustino-sarmiento-ex-cck
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  75%|███████▌  | 30/40 [02:26<00:52,  5.23s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50614-palabras-del-presidente-javier-milei-en-la-cena-de-camaraderia-de-las-fuerzas-armadas
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  78%|███████▊  | 31/40 [02:30<00:43,  4.85s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50612-palabras-del-presidente-de-la-nacion-en-congreso-de-inversiones-inmobiliarias
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página:  85%|████████▌ | 34/40 [02:47<00:33,  5.53s/it]

[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos/50593-palabras-del-presidente-de-la-nacion-javier-milei-en-la-exposicion-de-ganaderia-agricultura-e-industria-internacional-en-la-rural
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m


Discursos en la página: 100%|██████████| 40/40 [03:18<00:00,  4.95s/it]


[32m
Scraping página 3...[0m
[31m[ERROR] No se pudo acceder a: https://www.casarosada.gob.ar/informacion/discursos?start=80
('Connection aborted.', RemoteDisconnected('Remote end closed connection without response'))[0m
[33mNo se encontraron más discursos. Fin del scraping.[0m
[34m
Scraping completado exitosamente.[0m
