# AMLO Exploratory Data Analysis

In [2]:
import re
import os
import pandas as pd

In [3]:
# Directory that holds the raw transcript .txt files. The default is the original
# author's local path; set the AMLO_TEXT_DIR environment variable to point the
# notebook at another location without editing this cell.
PATH = os.environ.get(
    "AMLO_TEXT_DIR", "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/text_files/"
)

# Boilerplate fragments to delete from each transcript. The patterns are written in
# lowercase with single spaces, so they only match text that has already been
# lowercased and whitespace-collapsed.
REGEX_PATTERNS = [
    # Site copyright footer
    r"copyright derechos reservados 2011-2020 - sitio oficial de andrés manuel lópez obrador",
    # Transcript title followed by "| <d>/<m>/<y>"
    r"versión estenográfica de la conferencia de prensa matutina del presidente de méxico, andrés manuel lópez obrador – amlo \| \d+/\d+/\d+",
    # Audio download line followed by "<n>-<n>-<n> audio conferencia ..."
    r" descarga audio: \d+-\d+-\d+ audio conferencia de prensa presidente de méxico palacio nacional,",
]

# Spanish function words to drop from tokenized text.
# Each word appears exactly once (the earlier list repeated several entries); the
# order follows the first occurrence of every word in that earlier list.
STOPWORDS = [
    # Subject pronouns / masculine article
    "el", "ella", "ellos", "ellas",
    # Frequent prepositions and "como"
    "con", "contra", "como", "de", "por", "para",
    # Remaining prepositions
    "a", "ante", "bajo", "cabe", "desde", "durante", "en", "entre",
    "hacia", "hasta", "mediante", "según", "sin", "so", "sobre", "tras",
    "versus", "vía",
    # Conjunctions
    "y", "e", "ni", "o", "u", "pero", "aunque",
    # Articles
    "la", "las", "los", "lo", "un", "una", "unos", "unas",
    # Contractions
    "al", "del",
    # Object / reflexive pronouns
    "le", "les", "me", "te", "se", "nos", "os",
    # Relatives, demonstratives and other connectors
    "que", "esta", "este", "estas", "estos", "porque", "si", "yo",
]


# One pattern per speaker label: each matches the label, its colon, and the rest
# of the line that follows it.
PRESIDENT_REGEXES = [
    rf"{speaker_label}:.*"
    for speaker_label in (
        "pregunta",
        "interlocutor",
        "intervención",
        "interlocutora",
    )
]

## Defining classes and functions

In [4]:
class TextParser:
    """Read transcript text files and extract the president's turns from them.

    Class attributes:
        REGEX_PATTERNS: patterns deleted from the normalized transcript text.
        STOPWORDS: words removed by ``remove_stopwords``.
        PRESIDENT_REGEXES: patterns deleted from each segment that follows the
            president's speaker label.
        OUTPUT_SUFFIX: text appended to a transcript's stem when its dialogues
            are written back to disk.
    """

    REGEX_PATTERNS = REGEX_PATTERNS
    STOPWORDS = STOPWORDS
    PRESIDENT_REGEXES = PRESIDENT_REGEXES
    OUTPUT_SUFFIX = "_president_dialogues"

    def __init__(self, path):
        """
        Store the transcript directory and the president's speaker label.

        Args:
            path: directory that contains the transcript ``.txt`` files.
        """
        self.path = path
        # Lowercase because the text is lowercased before it is split on this label.
        self.president_split = "presidente andrés manuel lópez obrador:"

    def txt_to_list(self, filename):
        """
        Tokenize a text file line by line.

        Args:
            filename: name of a file inside ``self.path``.

        Returns:
            A list with one entry per line; each entry is the list of
            whitespace-separated tokens of that line (empty for blank lines).
        """
        file_path = os.path.join(self.path, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            return [line.strip().split() for line in f]

    def file_to_string(self, filename):
        """
        Load a text file as one normalized string.

        The content is stripped, lowercased, every whitespace run is collapsed
        to a single space, and each pattern in ``REGEX_PATTERNS`` is deleted.

        Args:
            filename: name of a file inside ``self.path``.

        Returns:
            The normalized text.
        """
        file_path = os.path.join(self.path, filename)
        with open(file_path, "r", encoding="utf-8") as f:
            text = f.read()

        text = re.sub(r"\s+", " ", text.strip().lower())

        for pattern in self.REGEX_PATTERNS:
            text = re.sub(pattern, "", text)

        return text

    def remove_stopwords(self, text):
        """
        Remove the predefined stopwords from a string.

        Args:
            text: whitespace-separated words; matching is exact and
                case-sensitive.

        Returns:
            The remaining words joined by single spaces.
        """
        # Set lookup keeps the cost per word constant regardless of list length.
        stopwords = set(self.STOPWORDS)
        return " ".join(word for word in text.split() if word not in stopwords)

    def get_presidents_dialogues(self, filename):
        """
        Get the president's dialogues from a text file.

        The normalized text is split on the president's speaker label. The
        segment before the first label is discarded, and in every remaining
        segment each ``PRESIDENT_REGEXES`` match is deleted.

        Args:
            filename: name of a file inside ``self.path``.

        Returns:
            A list of non-empty, stripped segments; empty when the label never
            occurs in the file.
        """
        segments = self.file_to_string(filename).split(self.president_split)

        # Drop the preamble by position BEFORE filtering out empty segments.
        # Filtering first would shift the list whenever the preamble is empty
        # and the first real dialogue would be thrown away instead.
        segments = segments[1:]

        # Apply regex to only get the president's dialogues
        for regex in self.PRESIDENT_REGEXES:
            segments = [re.sub(regex, "", segment) for segment in segments]

        stripped = (segment.strip() for segment in segments)
        return [segment for segment in stripped if segment]

    def save_all_presidents_dialogues(self, filename=None):
        """
        Write the president's dialogues of every transcript to a sibling file.

        For each ``<stem>.txt`` in ``self.path`` a ``<stem>_president_dialogues.txt``
        file is written next to it with one dialogue per line. Files that are
        themselves such outputs are skipped so the method can be run repeatedly.

        Args:
            filename: unused; accepted only so existing callers keep working.
        """
        output_tail = self.OUTPUT_SUFFIX + ".txt"

        for file in sorted(os.listdir(self.path)):
            if not file.endswith(".txt") or file.endswith(output_tail):
                continue

            dialogues = self.get_presidents_dialogues(file)
            # Rename only the file's own extension, never other parts of the path.
            stem, extension = os.path.splitext(file)
            output_path = os.path.join(self.path, stem + self.OUTPUT_SUFFIX + extension)
            with open(output_path, "w", encoding="utf-8") as f:
                f.writelines(dialogue + "\n" for dialogue in dialogues)

In [5]:
# Build a parser over the transcript directory and extract the president's
# dialogue segments from one transcript file.
text_parser = TextParser(PATH)
presidents_dialogues = text_parser.get_presidents_dialogues("202401025.txt")

In [12]:
# Number of dialogue segments (not echoed: only the cell's last expression is shown)
len(presidents_dialogues)

# Total number of characters across all dialogue segments
total_length = sum(map(len, presidents_dialogues))

total_length


66124

In [11]:
# Use TF-IDF to find the most important words in the text

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Merge every dialogue segment into one document
text = " ".join(presidents_dialogues)

# NOTE(review): the vectorizer is fitted on a single document, so every term has
# the same document frequency and the IDF factor is identical for all of them;
# the scores therefore rank words by normalized frequency only.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform([text])

# Row 0 is the only document; its columns line up with the feature names
scores = tfidf_matrix.toarray()[0]
words = tfidf_vectorizer.get_feature_names_out()

# One row per word, highest score first
df = pd.DataFrame({"Word": words, "Score": scores}).sort_values(
    "Score", ascending=False
)
print(df)

           Word     Score
664          de  0.537529
1944        que  0.377017
856          el  0.353687
1351         la  0.304227
886          en  0.286496
...         ...       ...
214    aparatos  0.000933
1140     guerra  0.000933
215   apartados  0.000933
216    apegados  0.000933
2471      único  0.000933

[2472 rows x 2 columns]


# Save the word/score table as CSV, leaving out the DataFrame index.
# NOTE(review): absolute, machine-specific path — adjust before running elsewhere.
df.to_csv("C:/Users/fdmol/Desktop/AMLO-NLP/src/data/word_scores.csv", index=False)

## Possible models to use

Summarization:

https://huggingface.co/mrm8488/bert2bert_shared-spanish-finetuned-summarization


Sentiment Analysis:

https://huggingface.co/finiteautomata/beto-sentiment-analysis


Emotion Analysis:

https://huggingface.co/finiteautomata/beto-emotion-analysis


In [16]:
# Requires the `transformers` package; install it with: %pip install transformers

In [30]:
# Inspect the dialogue segment at index 9
presidents_dialogues[9]

'jurados ciudadanos. bueno, pero tenemos que seguir avanzando en todo esto. y sí es importante iniciando el año recordar eso, que se necesita una reforma constitucional y que, para llevar a cabo una reforma constitucional se necesita contar con el apoyo de mayoría calificada; es decir, no sólo es la mayoría simple en las cámaras, 50 más uno, 50 por ciento más uno, sino se requieren dos terceras partes, a eso se le llama mayoría calificada. si no se tiene esa mayoría calificada, no se puede llevar a cabo ninguna reforma constitucional; tomar eso en cuenta, informarlo, porque ni modo que, a los manipuladores, a los que les conviene que siga este régimen de injusticias vayan a informarlo. ¿cuánto tiempo ha pasado, y quién sabía que para reformar la constitución se necesitaba de mayoría calificada? ¿quién sabía qué cosa es una mayoría calificada? ¿quién sabía cuántos votos de los 500 de la cámara de diputados son mayoría calificada? no son 251, sino son 374. jesús ramírez cuevas: trescient

In [24]:
import torch
from transformers import BertTokenizerFast, EncoderDecoderModel

# Run on the GPU when CUDA is available, otherwise on the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Load the tokenizer and the encoder-decoder weights from the same checkpoint
ckpt = "mrm8488/bert2bert_shared-spanish-finetuned-summarization"
tokenizer = BertTokenizerFast.from_pretrained(ckpt)
model = EncoderDecoderModel.from_pretrained(ckpt)
model = model.to(device)


def generate_summary(
    text,
    max_length=512,
    *,
    summary_tokenizer=None,
    summary_model=None,
    target_device=None,
):
    """
    Generate a summary of ``text``.

    The text is tokenized as a one-element batch, padded and truncated to
    ``max_length`` tokens, passed to the model's ``generate`` method, and the
    first generated sequence is decoded without special tokens.

    Args:
        text: the string to summarize. Anything beyond ``max_length`` tokens is
            truncated by the tokenizer and does not reach the model.
        max_length: padded/truncated input length in tokens (default 512).
        summary_tokenizer: tokenizer to use; defaults to the notebook-level
            ``tokenizer``.
        summary_model: model to use; defaults to the notebook-level ``model``.
        target_device: device the input tensors are moved to; defaults to the
            notebook-level ``device``.

    Returns:
        The decoded summary string.

    Note:
        The notebook-level ``tokenizer`` and ``model`` names can be rebound by
        other cells. Pass the summarization objects explicitly to make the call
        independent of cell execution order.
    """
    # Globals are only looked up for the pieces the caller did not supply.
    active_tokenizer = tokenizer if summary_tokenizer is None else summary_tokenizer
    active_model = model if summary_model is None else summary_model
    active_device = device if target_device is None else target_device

    inputs = active_tokenizer(
        [text],
        padding="max_length",
        truncation=True,
        max_length=max_length,
        return_tensors="pt",
    )
    input_ids = inputs.input_ids.to(active_device)
    attention_mask = inputs.attention_mask.to(active_device)
    output = active_model.generate(input_ids, attention_mask=attention_mask)
    return active_tokenizer.decode(output[0], skip_special_tokens=True)


# Summarize the dialogue segment at index 8 (this rebinds the global `text`)
text = presidents_dialogues[8]
generate_summary(text)


The following encoder weights were not tied to the decoder ['bert/pooler']
The following encoder weights were not tied to the decoder ['bert/pooler']
The following encoder weights were not tied to the decoder ['bert/pooler']
The following encoder weights were not tied to the decoder ['bert/pooler']


'¿ Cómo gestionar la libertad de cualquier persona, aun tratándose de delincuentes peligrosos, que hacen lo que les da la gana, se enriquecen?'

In [28]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Tokenizer and classifier are loaded from the same checkpoint.
# NOTE(review): this rebinds the global names `tokenizer` and `model`, which
# generate_summary also reads — re-run the summarization cell before calling it.
emotion_ckpt = "finiteautomata/beto-emotion-analysis"
tokenizer = AutoTokenizer.from_pretrained(emotion_ckpt)
model = AutoModelForSequenceClassification.from_pretrained(emotion_ckpt)


SequenceClassifierOutput(loss=None, logits=tensor([[ 4.6140, -0.8551, -0.1549,  0.8921, -1.5799, -2.1146, -1.5987]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
