# AMLO Exploratory Data Analysis

In [29]:
import re
import os
import pandas as pd


In [80]:
PATH = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/text_files/"
REGEX_PATTERNS = [
    r"copyright derechos reservados 2011-2020 - sitio oficial de andrés manuel lópez obrador",
    r"versión estenográfica de la conferencia de prensa matutina del presidente de méxico, andrés manuel lópez obrador – amlo \| \d+/\d+/\d+",
    r" descarga audio: \d+-\d+-\d+ audio conferencia de prensa presidente de méxico palacio nacional,",
]

STOPWORDS = [
    "el",
    "ella",
    "ellos",
    "ellas",
    "con",
    "contra",
    "como",
    "de",
    "por",
    "para",
    "a",
    "ante",
    "bajo",
    "cabe",
    "con",
    "contra",
    "de",
    "desde",
    "durante",
    "en",
    "entre",
    "hacia",
    "hasta",
    "mediante",
    "para",
    "por",
    "según",
    "sin",
    "so",
    "sobre",
    "tras",
    "versus",
    "vía",
    "y",
    "e",
    "ni",
    "o",
    "u",
    "pero",
    "aunque",
    "la",
    "las",
    "los",
    "lo",
    "un",
    "una",
    "unos",
    "unas",
    "al",
    "del",
    "lo",
    "le",
    "les",
    "me",
    "te",
    "se",
    "nos",
    "os",
    "les",
    "le",
    "me",
    "te",
    "se",
    "nos",
    "que",
    "esta",
    "este",
    "estas",
    "estos",
    "porque",
    "si",
    "yo",
]


#### Defining classes and functions

In [141]:
class TextParser:
    REGEX_PATTERNS = REGEX_PATTERNS
    STOPWORDS = STOPWORDS

    def __init__(self, path):
        self.path = path
        self.president_split = "presidente andrés manuel lópez obrador:"

    def txt_to_list(self, filename):
        """
        Add each line of a text file to a list
        """

        file_path = os.path.join(self.path, filename)
        lines = []
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                line = line.strip().split()
                lines.append(line)

        return lines

    def file_to_string(self, filename):
        """
        Add each line of a text file to a string
        """
        text = ""
        file_path = os.path.join(self.path, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            for line in f:
                text += line

        text = text.strip()
        text = text.lower()
        text = re.sub(r"\s+", " ", text)

        for pattern in self.REGEX_PATTERNS:
            text = re.sub(pattern, "", text)

        return text

    def remove_stopwords(self, text):
        """
        Removes predefined stopwords from a string
        """
        text = text.split()
        text = [word for word in text if word not in self.STOPWORDS]
        text = " ".join(text)
        return text

    def get_presidents_dialogues(self, filename):
        """
        Get the president's dialogues from a text file
        """
        text = self.file_to_string(filename)
        text = text.split(self.president_split)

        # Apply regex to only get the president's dialogues
        text = [re.sub(r"pregunta:s+.+", "", line) for line in text]
        text = [re.sub(r"intervención:s+.+", "", line) for line in text]

        text = [re.sub(r"interlocutor:s+.+", "", line) for line in text]
        text = [re.sub(r"interlocutora:s+.+", "", line) for line in text]

        text = [
            "presidente andrés manuel lópez obrador:" + line
            for line in text
            if line != ""
        ]
        text = text[1:]
        return text

    def save_all_presidents_dialogues(self, filename):
        """
        Save the president's dialogues to a text file
        """

        all_files = os.listdir(self.path)

        for file in all_files:
            if file.endswith(".txt"):
                text = self.get_presidents_dialogues(file)
                file_path = os.path.join(self.path, file)
                file_path = file_path.replace(".txt", "_president_dialogues.txt")
                with open(file_path, "w", encoding="utf-8") as f:
                    for line in text:
                        f.write(line)
                        f.write("\n")


In [142]:
text_parser = TextParser(PATH)
presidents_dialogues = text_parser.get_presidents_dialogues("20230202903.txt")


In [143]:
presidents_dialogues

['presidente andrés manuel lópez obrador: buenos días. ánimo, ánimo. vamos a informar a partir de lo que ustedes pregunten, porque no hay nada que exponer el día de hoy. pregunta: el cero impunidad. ',
 'presidente andrés manuel lópez obrador: sí, el cero impunidad. es que salió el general bucio a guatemala, fue a un congreso y entonces decidimos esperarlo. lo que podemos hacer es el video de lo del juicio de garcía luna y ya después abrimos para preguntas. (inicia video) voz hombre: el día de ayer, 1º de febrero, concluyó la sexta audiencia del juicio contra genaro garcía luna en la corte del distrito este de brooklyn, nueva york. la audiencia comenzó con el contrainterrogatorio al testigo protegido israel ávila, contador de los hermanos pineda. el abogado defensor, florian miedel, enfrentó dificultades para obtener respuestas concretas del testigo. el juez brian cogan incluso tuvo que intervenir y pedir a ávila que se limitara a responder con un ‘sí’ o un ‘no’. ávila afirmó no conoce

In [120]:
text = "hpla andrés manuel lópez obrador: hola"
text.split("andrés manuel lópez obrador:")


['hpla ', ' hola']

In [92]:
# Use TF-IDF to find the most important words in the text

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

text = " ".join(presidents_dialogues)

# Create a TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()

# Apply the vectorizer
tfidf_matrix = tfidf_vectorizer.fit_transform([text])

# Print the result
scores = tfidf_matrix.toarray()[0]
words = tfidf_vectorizer.get_feature_names_out()

# Create a DataFrame with the result
df = pd.DataFrame({"Word": words, "Score": scores})
df.sort_values("Score", ascending=False, inplace=True)
print(df.head(100))


           Word     Score
306          de  0.555621
964         que  0.419671
662          la  0.329038
388          el  0.273870
1043         se  0.267959
...         ...       ...
943   propósito  0.013792
1148     tiempo  0.013792
1151     tienen  0.013792
315    decisión  0.013792
464       están  0.013792

[100 rows x 2 columns]


In [84]:
df.to_csv("C:/Users/fdmol/Desktop/AMLO-NLP/src/data/word_scores.csv", index=False)