In [12]:
# Install the packages listed in requirements.txt using pip from the interpreter that runs this kernel
# NOTE(review): in a notebook, `%pip install -r requirements.txt` is the usual way to target the kernel's environment.
import sys
!{sys.executable} -m pip install -r requirements.txt




[notice] A new release of pip is available: 24.0 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [13]:
import os
import pandas as pd
import datetime
import openpyxl
import re

In [14]:
def month_to_number(date):
    """
    Converts a date written in Spanish (e.g., '14 de mayo de 2025') into a numeric format string (e.g., '2025-5-14').

    The input is lower-cased and split on whitespace; the day is read from the
    first token, the month name from the third and the year from the fifth
    (the "de" tokens in between are skipped by position, not checked).

    Parameters:
        date (str): A date string in Spanish, e.g., "14 de Mayo de 2025".

    Returns:
        str: The date in numeric form as "YYYY-M-D" (month and day are NOT zero-padded)
             if parsing is successful. Otherwise the original input is returned unchanged;
             this includes non-string inputs such as None or NaN.
    """

    months = {
        "enero": 1, "febrero": 2, "marzo": 3, "abril": 4,
        "mayo": 5, "junio": 6, "julio": 7, "agosto": 8,
        "septiembre": 9, "setiembre": 9,  # both spellings are in use
        "octubre": 10, "noviembre": 11, "diciembre": 12
    }

    # Non-string values cannot be parsed; hand them back instead of raising,
    # as the documented contract promises.
    if not isinstance(date, str):
        return date

    tokens = date.lower().split()

    try:
        day = int(tokens[0])
        month = months[tokens[2]]
        year = int(tokens[4])
    except (IndexError, ValueError, KeyError):
        # Too few tokens, non-numeric day/year, or unknown month name
        return date

    return f"{year}-{month}-{day}"

In [15]:
# The two folders that hold the speech files; each folder uses a different filename format
# NOTE(review): these are absolute, machine-specific paths — the notebook will not run elsewhere without editing them.
folders = [
    r"C:\Users\franc\Documents\GitHub\nlp_project\1. Scraping Speeches\own_scraped_data",
    r"C:\Users\franc\Documents\GitHub\nlp_project\1. Scraping Speeches\ghithub_scraped_data"
]

In [16]:
# Empty list that will collect the whole dataset (one dict per speech file)
all_data = []

In [17]:
# Create a list with the date of the speech and the whole content
def _date_from_filename(raw_name):
    """
    Derive a date string from a speech filename (extension already removed).

    Two filename layouts are recognised:
        - "2_Viernes_12_de_junio_de_2025": the tokens from the one just before the
          first "de" onwards are joined with spaces and passed to month_to_number.
        - "2018_10_05_921": the first three tokens are joined as "2018-10-05".

    Returns:
        str: The derived date string, or raw_name unchanged when no layout matches.
    """
    if "_" not in raw_name:
        return raw_name

    parts = raw_name.split("_")  # Split name into components

    # Case: format like "2_Viernes_12_de_junio_de_2025"
    if "de" in parts:
        idx = parts.index("de")
        if idx == 0:
            # No day token before "de"; parts[idx - 1:] would wrap around to the last token
            return raw_name
        return month_to_number(" ".join(parts[idx - 1:]))  # Convert to numeric date

    # Case: format like "2018_10_05_921"
    if len(parts[0]) == 4 and len(parts[1]) == 2 and len(parts) >= 3:
        return f"{parts[0]}-{parts[1]}-{parts[2]}"

    return raw_name


# Start from an empty list so that re-running this cell does not append duplicate rows
all_data.clear()

for folder in folders:
    for filename in os.listdir(folder):
        file_path = os.path.join(folder, filename)

        if not os.path.isfile(file_path):
            continue

        # Remove the .txt extension only when it is the actual suffix of the name
        raw_name = filename[:-len(".txt")] if filename.endswith(".txt") else filename
        date_str = _date_from_filename(raw_name)

        # Read file content; an unreadable file is reported and skipped
        try:
            with open(file_path, "r", encoding="utf-8") as file:
                content = file.read().strip().replace("\n", " ")
        except (OSError, UnicodeError) as e:
            print(f"Error reading {filename}: {e}")
            continue

        all_data.append({
            "date": date_str,
            "content": content
        })

In [18]:
# Transform all_data into a DataFrame.
# Naming the columns explicitly keeps 'date' and 'content' present even when no file was read,
# so the later date conversion does not fail with a KeyError on an empty dataset.
df_speeches = pd.DataFrame(all_data, columns=["date", "content"])

In [19]:
# Convert the 'date' column to datetime.
# errors='coerce' means any value that cannot be parsed with the given format becomes NaT instead of raising.
df_speeches['date'] = pd.to_datetime(df_speeches['date'], format='%Y-%m-%d', errors='coerce')

In [20]:
# Display the resulting DataFrame
df_speeches

Unnamed: 0,date,content
0,2025-05-14,Discurso del Presidente Javier Milei en el 42°...
1,2025-05-08,Discurso del Presidente Javier Milei en la 11°...
2,2025-04-30,Discurso del Presidente Javier Milei en la Exp...
3,2025-04-27,"Palabras del Presidente de la Nación, Javier M..."
4,2025-04-14,Declaración conjunta del Presidente Javier Mil...
...,...,...
972,2024-10-12,"Palabras del Presidente de la Nación, Javier M..."
973,2024-10-15,Palabras del Presidente en Jornadas Monetarias...
974,2024-10-18,Palabras del Presidente de la Nación Javier Mi...
975,2024-10-19,"Palabras del Presidente de la Nación, Javier M..."


In [21]:
# Save the DataFrame to Excel so the data can be checked manually if needed
df_speeches.to_excel("df_speeches.xlsx")

In [22]:
# Work on an independent copy: a plain assignment would only alias df_speeches,
# so columns added to df_speeches_II would silently appear in df_speeches as well.
df_speeches_II = df_speeches.copy()

In [23]:
def title_content_split(text):
    '''
    Separate the title of a speech from its body on a best-effort basis.

    Texts are inconsistent, so the split relies on a heuristic: the first
    occurrence (case-insensitive) of one of the greetings "Buenos días",
    "Buenas tardes" or "Buenas noches" is taken as the start of the body.
    Everything before it is the title.

    Parameters:
        text (str): The full raw text of a speech, including both title and content.

    Returns:
        pandas.Series: A Series with two elements:
            - title (str): The text before the greeting (empty string if no greeting is found).
            - content (str): The text from the greeting onwards, or the whole text if no greeting is found.
    '''

    # Locate the first greeting; it marks where the speech body begins
    greeting = re.search(r"(Buenos días|Buenas noches|Buenas tardes)", text, flags=re.IGNORECASE)

    if greeting is None:
        return pd.Series(["", text.strip()])

    cut = greeting.start()
    return pd.Series([text[:cut].strip(), text[cut:].strip()])

In [24]:
# Split title from content and store the two parts in new columns ('sep_title', 'sep_content') for later manual review
df_speeches_II[['sep_title', 'sep_content']] = df_speeches_II['content'].apply(title_content_split)

In [25]:
# Save the dataset to Excel for manual review and cleaning
df_speeches_II.to_excel("df_speeches_II.xlsx")

In [26]:
# Load the dataset that was edited by hand after the first cleaning pass
# NOTE(review): absolute, machine-specific path — the notebook will not run elsewhere without editing it.
df_speeches_III = pd.read_excel(r"C:\Users\franc\Documents\GitHub\nlp_project\Not using\Final Dataset.xlsx")

In [27]:
def cleaned_content(text):
    '''
    Cleans a transcript by extracting only the president's speech.

    Steps:
        1. Bracketed or parenthesised annotations such as [APLAUSOS] or (RISAS) are removed.
        2. The text is scanned for speaker labels: one or more upper-case words, an optional
           period, then one or more dashes (- or –), e.g. "PERIODISTA.-".
        3. If a label other than PRESIDENTE is found, only the segments introduced by
           "PRESIDENTE" are kept, preceded by any text that appears before the first label.
           Each kept segment is given a final period if it lacks terminal punctuation.
        4. If no other speaker is found, the whole text is kept and a final period is added
           if it lacks terminal punctuation.

    Whitespace runs are collapsed to single spaces in every returned string.

    Parameters:
        text (str): The full raw transcript of a speech, potentially with multiple speakers.

    Returns:
        str: The cleaned speech containing only the president's words, properly punctuated.
             If other speakers exist but neither a non-empty presidential segment nor an
             intro is found, returns the annotation-free full text.
             An empty (or annotation-only) text returns "".
             Non-string input (e.g. NaN for an empty spreadsheet cell) is returned unchanged.
    '''
    # Non-string values cannot be processed by the regexes below; pass them through untouched
    if not isinstance(text, str):
        return text

    # Remove annotations like [APLAUSOS], (RISAS), etc.
    text = re.sub(r"[\[\(][^\]\)]+[\]\)]", "", text).strip()

    # Pattern to match speakers (one or more uppercase words), allowing multiple-word names
    # Accepts one or more short or long dashes (- or –) as separator
    speaker_pattern = r"[A-ZÁÉÍÓÚÑ]+(?:\s+[A-ZÁÉÍÓÚÑ]+)*\.?\s*[-–]+"
    terminal_punctuation = ('.', '!', '?')

    # Detect any speaker label that does not start with PRESIDENTE
    has_other_speakers = re.search(rf"\b(?!PRESIDENTE){speaker_pattern}", text) is not None

    if not has_other_speakers:
        # No other speakers — return whole text with proper punctuation
        text = re.sub(r"\s+", " ", text).strip()
        if not text:
            # Nothing to punctuate; avoid returning a lone "."
            return ""
        return text if text.endswith(terminal_punctuation) else text + "."

    # Extract PRESIDENTE's parts: everything after the label up to the next label or end of text
    segments = re.findall(
        rf"PRESIDENTE\.?\s*[-–]+\s*(.*?)(?={speaker_pattern}|$)",
        text,
        re.DOTALL
    )

    # Drop empty segments first (otherwise they would turn into a stray "."),
    # then add punctuation where missing
    president_part = [
        segment if segment.endswith(terminal_punctuation) else segment + "."
        for segment in (raw.strip() for raw in segments)
        if segment
    ]

    # Extract possible intro before first speaker
    extra = re.split(speaker_pattern, text, maxsplit=1)[0].strip()

    # If no presidential speech found, return cleaned full text
    if not president_part and not extra:
        return re.sub(r"\s+", " ", text).strip()

    # Combine intro and president parts
    speech = " ".join(filter(None, [extra] + president_part))
    return re.sub(r"\s+", " ", speech).strip()

In [28]:
# Apply cleaned_content to every speech and store the result in a new 'cleaned_content' column
df_speeches_III['cleaned_content'] = df_speeches_III['content'].apply(cleaned_content)

In [29]:
# Display the dataset with the new 'cleaned_content' column
df_speeches_III

Unnamed: 0,date,president,content,cleaned_content
0,2024-01-17,Javier Milei,"Buenas tardes, muchas gracias: hoy estoy acá p...","Buenas tardes, muchas gracias: hoy estoy acá p..."
1,2024-04-02,Javier Milei,Hoy estamos aquí reunidos a 42 años del inicio...,Hoy estamos aquí reunidos a 42 años del inicio...
2,2024-02-24,Javier Milei,Hola a todos. Yo soy el león. Yo también los a...,Hola a todos. Yo soy el león. Yo también los a...
3,2024-01-26,Javier Milei,"En primer lugar, quiero comenzar por agradecer...","En primer lugar, quiero comenzar por agradecer..."
4,2024-02-08,Javier Milei,Buenos días. Quiero compartir con ustedes una ...,Buenos días. Quiero compartir con ustedes una ...
...,...,...,...,...
1306,2023-12-08,Alberto Fernández,Querido Pueblo Argentino: Hace exactamente 40 ...,Querido Pueblo Argentino: Hace exactamente 40 ...
1307,2023-12-10,Javier Milei,"Hola a todos. Señores ministros de la Corte, s...","Hola a todos. Señores ministros de la Corte, s..."
1308,2023-12-10,Javier Milei,"Hola a todos. ¡Viva la libertad, carajo! ¡Viva...","Hola a todos. ¡Viva la libertad, carajo! ¡Viva..."
1309,2023-12-20,Javier Milei,"Argentinos, hoy es un día histórico para nuest...","Argentinos, hoy es un día histórico para nuest..."


In [30]:
# Save the dataset to Excel for the final manual review and cleaning
df_speeches_III.to_excel("df_speeches_III.xlsx")