# AMLO Exploratory Data Analysis

In [25]:
import re
import os
import pandas as pd
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer

from amlo_parser import AMLOParser

#### CONSTANTS

In [26]:
# Directory of input text files: handed to AMLOParser and listed with
# os.listdir in the pipeline below.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
PATH = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/text_files/"

#### Defining some classes and functions to help us with the analysis

In [27]:
# Use TF-IDF to find the most important words in the text
def get_most_important_words(text, filename, save=False):
    """Rank the vocabulary of ``text`` by TF-IDF score.

    Args:
        text: Iterable of raw document strings, or a single string, which
            is treated as a one-document corpus.
        filename: Destination of the CSV; only used when ``save`` is true.
        save: When true, write the ranked table to ``filename`` without
            the index column.

    Returns:
        A DataFrame with the columns ``"Word"`` and ``"Score"``, sorted by
        descending score.

    Note:
        Only the FIRST document is scored. Any further documents in
        ``text`` still take part in fitting the vectorizer, but their own
        rows are not returned.
    """
    # TfidfVectorizer expects an iterable of documents and rejects a bare
    # string, so promote a single string to a one-document corpus.
    if isinstance(text, str):
        text = [text]

    # Create a TfidfVectorizer
    tfidf_vectorizer = TfidfVectorizer()

    # Apply the vectorizer
    tfidf_matrix = tfidf_vectorizer.fit_transform(text)

    # Densify only the first document's row instead of the whole matrix
    scores = tfidf_matrix[0].toarray().ravel()
    words = tfidf_vectorizer.get_feature_names_out()

    # Create a DataFrame with the result, highest score first
    df = pd.DataFrame({"Word": words, "Score": scores})
    df = df.sort_values("Score", ascending=False)

    if save:
        df.to_csv(filename, index=False)

    return df

### Pipeline

#### Word Scoring

In [28]:
text_parser = AMLOParser(PATH)

# List of all the entries in the input directory
all_files = os.listdir(PATH)

print(all_files)

# Directory that receives one word-score CSV per input text file
new_path = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/word_scores/"

# DataFrame.to_csv does not create missing directories, so make sure the
# target exists before the loop starts writing into it.
os.makedirs(new_path, exist_ok=True)

for file in tqdm(all_files):
    # Skip anything that is not a .txt file
    if not file.endswith(".txt"):
        continue

    # Get the president's dialogues from a text file
    text = text_parser.get_presidents_dialogues(file, remove_stopwords=True)

    # Swap only the trailing extension of the file name; str.replace on the
    # joined path would rewrite every ".txt" occurring anywhere in it.
    new_file_path = os.path.join(
        new_path, os.path.splitext(file)[0] + "_word_scores.csv"
    )
    get_most_important_words(text, new_file_path, save=True)

['20181207.txt', '20181210.txt', '20181211.txt', '20181212.txt', '20181217.txt', '20181218.txt', '20181219.txt', '20181224.txt', '20181226.txt', '20181227.txt', '20181228.txt', '20190102.txt', '20190103.txt', '20190104.txt', '20190107.txt', '20190108.txt', '20190109.txt', '20190110.txt', '20190111.txt', '20190114.txt', '20190115.txt', '20190116.txt', '20190117.txt', '20190118.txt', '20190119.txt', '20190120.txt', '20190121.txt', '20190122.txt', '20190123.txt', '20190124.txt', '20190125.txt', '20190128.txt', '20190129.txt', '20190130.txt', '20190131.txt', '20190201.txt', '20190204.txt', '20190205.txt', '20190206.txt', '20190207.txt', '20190208.txt', '20190211.txt', '20190212.txt', '20190213.txt', '20190214.txt', '20190215.txt', '20190218.txt', '20190219.txt', '20190220.txt', '20190221.txt', '20190222.txt', '20190225.txt', '20190226.txt', '20190227.txt', '20190228.txt', '20190301.txt', '20190304.txt', '20190305.txt', '20190307.txt', '20190311.txt', '20190312.txt', '20190313.txt', '201903

  0%|          | 0/1241 [00:00<?, ?it/s]

100%|██████████| 1241/1241 [00:41<00:00, 29.89it/s]


### Possible models to use:

Summarization:

https://huggingface.co/mrm8488/bert2bert_shared-spanish-finetuned-summarization


Sentiment Analysis:

https://huggingface.co/finiteautomata/beto-sentiment-analysis


Emotion Analysis:

https://huggingface.co/finiteautomata/beto-emotion-analysis


['buenos días. ánimo. feliz año todas todos. da mucho gusto estar aquí nuevo tabasco, nuestra tierra, nuestra agua; aquí, teniendo frente todas ceibas, son sagradas época mayas. era fertilidad tierra sostén cielo. ayer estuvimos palenque, precisamente pakal; está renaciendo selva, digo, ceiba. vamos informar. primera conferencia año vamos dar palabra gobernador tabasco, carlos merino. carlos manuel merino campos, gobernador tabasco: muy buenos días, señor presidente. bienvenido su tierra, su agua su gente. saludo agradezco presencia integrantes gabinete federal, así amigos medios comunicación hoy acompañan. expreso, nombre tabasqueños, nuestros mejores deseos año nuevo 2024, sirva renovar ánimos, fe, esperanza, seguir consolidando cuarta transformación méxico. quiero decirles todos sentimos muy honrados su visita. agradecemos distinción presidente república, licenciado andrés manuel lópez obrador, permitirnos ser anfitriones reanudación espacio expresión, diálogo debate tabasco. siempr

In [28]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Single checkpoint id shared by tokenizer and model so the two cannot drift apart
MODEL_NAME = "finiteautomata/beto-emotion-analysis"

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME)

# Tokenize the text
# NOTE(review): truncation=True cuts each input at the tokenizer's maximum
# length, so for long inputs only the beginning reaches the model -- confirm
# this is intended or split the text into chunks first.
inputs = tokenizer(
    presidents_dialogues, return_tensors="pt", padding=True, truncation=True
)

# Get the model's output. This is inference only, so disable autograd to
# avoid building a gradient graph for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

SequenceClassifierOutput(loss=None, logits=tensor([[ 4.6140, -0.8551, -0.1549,  0.8921, -1.5799, -2.1146, -1.5987]],
       grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)
