In [13]:
# Install required packages from requirement.txt
import sys
!{sys.executable} -m pip install -r requirements.txt




[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: c:\Users\franc\Documents\GitHub\nlp_project\.venv\Scripts\python.exe -m pip install --upgrade pip


In [14]:
import os
import pandas as pd
import datetime
import openpyxl
import re

In [15]:
def month_to_number(date):
    """
    Converts a date written in Spanish (e.g., '14 de mayo de 2025') into a numeric date string (e.g., '2025-5-14').

    The expected layout is "<day> de <month name> de <year>". Matching of the month name is
    case-insensitive, and any tokens after the year are ignored. Day and month are not
    zero-padded in the result.

    Parameters:
        date (str): A date string in Spanish, e.g., "14 de Mayo de 2025".

    Returns:
        str: The date in numeric form as "YYYY-M-D" if parsing is successful. Otherwise the
        original input is returned unchanged (this includes non-string inputs such as None).
    """

    months = {
        "enero": 1, "febrero": 2, "marzo": 3, "abril": 4,
        "mayo": 5, "junio": 6, "julio": 7, "agosto": 8,
        # "setiembre" is an accepted alternative spelling of "septiembre"
        "septiembre": 9, "setiembre": 9,
        "octubre": 10, "noviembre": 11, "diciembre": 12
    }

    # Non-string values cannot be tokenised; hand them back untouched instead of raising
    if not isinstance(date, str):
        return date

    tokens = date.lower().split()

    try:
        day = int(tokens[0])        # ValueError if the first token is not a number
        month = months[tokens[2]]   # KeyError if the month name is unknown
        year = int(tokens[4])       # IndexError if the string has fewer than five tokens
    except (ValueError, IndexError, KeyError):
        return date

    return f"{year}-{month}-{day}"

In [16]:
# The two folders holding the speech files; each folder uses a different file-name format.
# NOTE(review): these are absolute paths on one specific machine - adjust before running elsewhere.
_speeches_root = r"C:\Users\franc\Documents\GitHub\nlp_project\1. Scraping Speeches"
folders = [
    _speeches_root + "\\" + subfolder
    for subfolder in ("own_scraped_data", "ghithub_scraped_data")
]

In [17]:
# Accumulator for the whole dataset: one dict per speech is appended while the files are read
all_data = list()

In [18]:
# Create a list with the date of the speech and the whole content
def _date_from_filename(raw_name):
    """Return the date encoded in a file name (without extension), or the name itself if no known format matches."""
    parts = raw_name.split("_")  # Split name into components

    # Case: format like "2_Viernes_12_de_junio_de_2025"
    if "de" in parts:
        idx = parts.index("de")
        if idx == 0:
            # Nothing precedes the first "de"; parts[idx - 1:] would wrap around to the last token
            return raw_name
        # Everything from the day number onwards, e.g. "12 de junio de 2025"
        return month_to_number(" ".join(parts[idx - 1:]))  # Convert to numeric date

    # Case: format like "2018_10_05_921"
    if len(parts) >= 3 and len(parts[0]) == 4 and len(parts[1]) == 2:
        return f"{parts[0]}-{parts[1]}-{parts[2]}"

    return raw_name


# Start from an empty list so that re-running this cell does not append every speech a second time
all_data.clear()

for folder in folders:
    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)

        if not os.path.isfile(file_path):
            continue

        # Remove only a trailing .txt extension; str.replace would also strip ".txt" from the middle of a name
        raw_name = filename[:-len(".txt")] if filename.endswith(".txt") else filename

        # Try to extract date from filename
        date_str = _date_from_filename(raw_name)

        # Read file content and flatten it to a single line
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read().strip().replace("\n", " ")
        except (OSError, UnicodeDecodeError) as e:
            # Report the unreadable file and carry on with the remaining ones
            print(f"Error reading {filename}: {e}")
            continue

        all_data.append({
            "date": date_str,
            "content": content
        })

In [19]:
# Transform all_data into a DataFrame.
# The columns are named explicitly so the frame keeps its 'date' and 'content' columns even
# when all_data is empty; otherwise the date conversion below would fail with a KeyError.
df_speeches = pd.DataFrame(all_data, columns=["date", "content"])

In [20]:
# Parse the date strings into datetime values; anything that does not match the format becomes NaT
parsed_dates = pd.to_datetime(df_speeches['date'], format='%Y-%m-%d', errors='coerce')
df_speeches['date'] = parsed_dates

In [21]:
# Display the DataFrame to inspect the parsed dates and the loaded speech text
df_speeches

Unnamed: 0,date,content
0,2025-05-14,Discurso del Presidente Javier Milei en el 42°...
1,2025-05-08,Discurso del Presidente Javier Milei en la 11°...
2,2025-04-30,Discurso del Presidente Javier Milei en la Exp...
3,2025-04-27,"Palabras del Presidente de la Nación, Javier M..."
4,2025-04-14,Declaración conjunta del Presidente Javier Mil...
...,...,...
972,2024-10-12,"Palabras del Presidente de la Nación, Javier M..."
973,2024-10-15,Palabras del Presidente en Jornadas Monetarias...
974,2024-10-18,Palabras del Presidente de la Nación Javier Mi...
975,2024-10-19,"Palabras del Presidente de la Nación, Javier M..."


In [22]:
# Export the raw dataset to Excel so it can be checked manually if needed
df_speeches.to_excel(excel_writer="df_speeches.xlsx")

In [23]:
# Work on an independent copy for further modification.
# A plain assignment would only create a second name for the same DataFrame, so the columns
# added to df_speeches_final in the following cells would silently appear on df_speeches too.
df_speeches_final = df_speeches.copy()

In [24]:
def title_content_split(text):
    '''
    Separates the heading of a speech from its body.

    Texts are not formatted consistently, so the split relies on a heuristic: the first
    occurrence of a standard greeting ("Buenos días", "Buenas tardes" or "Buenas noches",
    in any letter case) is taken as the start of the speech. Everything before it is the
    title, and the greeting plus everything after it is the content.

    Before searching, every span enclosed in square brackets or parentheses, such as
    [APLAUSOS] or (RISAS), is deleted from the text.

    Parameters:
        text (str): The full raw text of a speech, including both title and content.

    Returns:
        pandas.Series: A Series with two elements:
            - title (str): The text before the greeting (empty string if no greeting is found).
            - content (str): The body of the speech (the whole cleaned text if no greeting is found).
    '''
    # Drop bracketed / parenthesised annotations (APLAUSOS = applause, RISAS = laughter, etc.)
    without_notes = re.sub(r"[\[\(][^\]\)]+[\]\)]", "", text, flags=re.IGNORECASE).strip()

    # Locate the first greeting; it marks where the title ends and the speech begins
    greeting = re.search(r"(Buenos días|Buenas noches|Buenas tardes)", without_notes, flags=re.IGNORECASE)

    if greeting is None:
        # No anchor found: the title stays empty and the whole text is the content
        return pd.Series(["", without_notes.strip()])

    cut = greeting.start()
    return pd.Series([without_notes[:cut].strip(), without_notes[cut:].strip()])

In [25]:
def cleaned_content(text):
    '''
    Cleans a transcript by extracting only the president's speech.

    A speaker label is recognised as a run of capital letters followed by an optional dot
    and a hyphen (e.g. "PERIODISTA.-"). If any label other than PRESIDENTE is present, the
    text is treated as a dialogue and only the following is kept:
        - whatever precedes the first speaker label (a possible opening speech), and
        - every segment introduced by a standalone "PRESIDENTE.-" label, up to the next label.
    Labels that merely end in PRESIDENTE, such as "VICEPRESIDENTE.-", belong to other speakers
    and are not treated as the president. Empty president segments are skipped.

    Each kept president segment (or the whole text, for a monologue) gets a final "." if it
    does not already end with ".", "!" or "?". Runs of whitespace are collapsed to one space.

    Note: the label pattern also matches any capitalised token directly followed by a hyphen
    (e.g. "COVID-" in "COVID-19"), which makes the text be handled as a dialogue.

    Parameters:
        text (str): The full raw transcript of a speech, potentially with multiple speakers.

    Returns:
        str: The cleaned speech containing only the president's words, properly punctuated.
        An empty or whitespace-only input yields an empty string.
    '''
    terminators = ('.', '!', '?')

    # Check if there are additional speakers
    other_speakers = re.findall(r"\b(?!PRESIDENTE)[A-ZÁÉÍÓÚÑ]+\.?-", text)

    if not other_speakers:
        # Monologue: normalise whitespace and add the final punctuation if needed
        text = re.sub(r"\s+", " ", text).strip()
        if not text:
            # Nothing to punctuate; avoid returning a lone "."
            return ""
        return text if text.endswith(terminators) else text + "."

    # Extract the president's parts. The lookbehind rejects labels that only end in
    # PRESIDENTE (e.g. "VICEPRESIDENTE.-"), which would otherwise be captured as well.
    president_part = re.findall(
        r"(?<![A-ZÁÉÍÓÚÑ])PRESIDENTE\.-(.*?)(?=(?:[A-ZÁÉÍÓÚÑ]+\.?-)|$)", text, re.DOTALL
    )

    # Add punctuation if it is missing, to separate clear sentences; skip empty segments
    # so they do not turn into a stray "."
    segments = []
    for raw_segment in president_part:
        segment = raw_segment.strip()
        if not segment:
            continue
        segments.append(segment if segment.endswith(terminators) else segment + ".")

    # Keep the words before the first speaker label, in case the president gave an
    # opening speech before the other speakers joined in
    intro = re.split(r"[A-ZÁÉÍÓÚÑ]+\.?-", text)[0].strip()

    # Combine both results to have the final speech
    speech = " ".join(filter(None, [intro] + segments))
    return re.sub(r"\s+", " ", speech).strip()

In [26]:
# Step 1: separate the title from the body; both are stored as extra columns for later manual control
title_and_body = df_speeches_final['content'].apply(title_content_split)
df_speeches_final[['sep_title', 'sep_content']] = title_and_body

In [27]:
# Step 2: keep only the president's words in case the transcript contains additional speakers
speech_only = df_speeches_final['sep_content'].apply(cleaned_content)
df_speeches_final['cleaned_content'] = speech_only

In [28]:
# Display the DataFrame, now including the sep_title, sep_content and cleaned_content columns
df_speeches_final

Unnamed: 0,date,content,sep_title,sep_content,cleaned_content
0,2025-05-14,Discurso del Presidente Javier Milei en el 42°...,,Discurso del Presidente Javier Milei en el 42°...,Discurso del Presidente Javier Milei en el 42°...
1,2025-05-08,Discurso del Presidente Javier Milei en la 11°...,Discurso del Presidente Javier Milei en la 11°...,"Buenos días a todos. Gracias Rab, gracias Darí...","Buenos días a todos. Gracias Rab, gracias Darí..."
2,2025-04-30,Discurso del Presidente Javier Milei en la Exp...,Discurso del Presidente Javier Milei en la Exp...,Buenos días. Muchísimas gracias a las autorida...,Buenos días. Muchísimas gracias a las autorida...
3,2025-04-27,"Palabras del Presidente de la Nación, Javier M...","Palabras del Presidente de la Nación, Javier M...",Buenos días a todos. Es para mí un placer y un...,Buenos días a todos. Es para mí un placer y un...
4,2025-04-14,Declaración conjunta del Presidente Javier Mil...,Declaración conjunta del Presidente Javier Mil...,Buenas tardes a todos. Antes de comenzar quier...,Buenas tardes a todos. Antes de comenzar quier...
...,...,...,...,...,...
972,2024-10-12,"Palabras del Presidente de la Nación, Javier M...",,"Palabras del Presidente de la Nación, Javier M...","Palabras del Presidente de la Nación, Javier M..."
973,2024-10-15,Palabras del Presidente en Jornadas Monetarias...,Palabras del Presidente en Jornadas Monetarias...,"buenas tardes. En primer lugar, muchas gracias...","buenas tardes. En primer lugar, muchas gracias..."
974,2024-10-18,Palabras del Presidente de la Nación Javier Mi...,Palabras del Presidente de la Nación Javier Mi...,"Buenas tardes a todos. En primer lugar, deseo ...","Buenas tardes a todos. En primer lugar, deseo ..."
975,2024-10-19,"Palabras del Presidente de la Nación, Javier M...",,"Palabras del Presidente de la Nación, Javier M...","Palabras del Presidente de la Nación, Javier M..."


In [29]:
# Export the processed dataset to Excel for the final manual control and cleaning
df_speeches_final.to_excel(excel_writer="df_speeches_final_II.xlsx")