# AMLO Exploratory Data Analysis

In [1]:
import re
import os
import pandas as pd
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer

from amlo_parser import AMLOParser

#### CONSTANTS

In [2]:
# Directory of .txt files handed to AMLOParser and listed with os.listdir below.
# NOTE(review): absolute, machine-specific path; the notebook will not run
# elsewhere without editing it.
PATH = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/text_files/"
# Excel workbook with the labeling sheets (the same file that
# TrainingSet.LABELED_PATH points to).
LABELED_PATH = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/amlo_labeling.xlsx"


#### Defining some classes and functions to help us with the analysis

In [3]:
def get_most_important_words(text, filename, save=False):
    """
    Ranks the vocabulary of ``text`` by TF-IDF score.

    The vectorizer is fitted on every document in ``text``, but the scores
    returned are those of the FIRST document only (row 0 of the TF-IDF matrix).
    NOTE(review): confirm that scoring only the first document is intended; if
    ``text`` holds one entry per dialogue, the remaining entries only influence
    the IDF weights.

    Parameters
    ----------
    text : str or iterable of str
        Documents to score. A single string is treated as one document.
    filename : str
        CSV path written when ``save`` is True.
    save : bool, default False
        Whether to write the resulting table to ``filename``.

    Returns
    -------
    pandas.DataFrame
        Columns "Word" and "Score", sorted by descending score. Empty when
        ``text`` contains no documents or only blank strings.
    """
    # TfidfVectorizer rejects a bare string, so wrap it as a one-document corpus.
    docs = [text] if isinstance(text, str) else list(text)

    # Fitting on no text makes the vectorizer raise (empty vocabulary); return
    # an empty table instead so one empty file does not abort a whole batch.
    has_no_text = all(isinstance(doc, str) and not doc.strip() for doc in docs)

    if has_no_text:
        words, scores = [], []
    else:
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform(docs)

        # Densify only the row that is reported instead of the whole
        # documents x vocabulary matrix.
        scores = tfidf_matrix[0].toarray().ravel()
        words = tfidf_vectorizer.get_feature_names_out()

    df = pd.DataFrame({"Word": words, "Score": scores}).sort_values(
        "Score", ascending=False
    )

    if save:
        df.to_csv(filename, index=False)

    return df

### Pipeline

#### Word Scoring

In [4]:
text_parser = AMLOParser(PATH)

# Every entry in the text-files directory; non-.txt entries are skipped below.
all_files = os.listdir(PATH)

# Destination directory for the per-file word-score CSVs.
new_path = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/word_scores/"
# Create it up front so DataFrame.to_csv does not fail when it is missing.
os.makedirs(new_path, exist_ok=True)

for file in tqdm(all_files):
    if not file.endswith(".txt"):
        continue

    # Get the president's dialogues from the text file
    text = text_parser.get_presidents_dialogues(file, remove_stopwords=True)

    # Swap only the trailing extension; str.replace on the joined path would
    # rewrite every ".txt" occurrence, not just the suffix.
    new_file_path = os.path.join(
        new_path, file[: -len(".txt")] + "_word_scores.csv"
    )
    get_most_important_words(text, new_file_path, save=True)

100%|██████████| 1241/1241 [00:37<00:00, 32.75it/s]


### Classification Task

In [5]:
# Builds the training text files from the labeled workbook
class TrainingSet:
    """
    Writes one training text file per labeled conference.

    The labels come from the "labels" sheet of the workbook at LABELED_PATH
    (columns "conference_id" and "is_agressive" are read here) and the phrases
    from its "frases_odio" sheet (columns "conference_id" and "phrase").
    Conferences labeled 1 get ``<id>_agressive_phrases.txt``; every other
    labeled conference gets ``<id>_non_agressive.txt``. Both are written under
    TRAINING_PATH.
    """

    # NOTE(review): absolute, machine-specific paths.
    TRAINING_PATH = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/training_data/"
    LABELED_PATH = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/amlo_labeling.xlsx"
    TEXT_FILES_PATH = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/text_files/"
    DIALOGUES_PATH = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/presidents_dialogues/"

    def __init__(self, remove_stopwords):
        """
        Loads the labeled workbook and lists the available text files.

        remove_stopwords is forwarded to AMLOParser.clean_text for every text
        that gets written.
        """
        self.amlo_parser = AMLOParser(self.TEXT_FILES_PATH)
        # Use the class's own constant instead of a notebook-level global so
        # the class does not depend on another cell having been executed.
        self.path = self.LABELED_PATH
        self.labeled_data, self.agressive_phrases = self.read_labeled_data()
        self.remove_stopwords = remove_stopwords

        self.labeled_conference_ids = self.labeled_data["conference_id"].unique()
        self.all_files = os.listdir(self.TEXT_FILES_PATH)

    def read_labeled_data(self):
        """
        Reads the labels and the aggressive phrases from the excel file.

        Returns a tuple (labeled_data, agressive_phrases). Rows with any
        missing value are dropped from labeled_data only.
        """
        labeled_data = (
            pd.read_excel(self.path, sheet_name="labels")
            .dropna()
            .reset_index(drop=True)
        )
        agressive_phrases = pd.read_excel(self.path, sheet_name="frases_odio")

        return labeled_data, agressive_phrases

    def agressive_phrases_to_txt(self, agressive_phrases, conference_id):
        """
        Writes the cleaned phrases of one conference to a txt file, one phrase
        per line, at TRAINING_PATH/<conference_id>_agressive_phrases.txt.
        """
        # Blank cells in the sheet arrive as NaN; skip them instead of handing
        # a non-string value to clean_text.
        phrases = agressive_phrases.loc[
            agressive_phrases["conference_id"] == conference_id, "phrase"
        ].dropna()

        os.makedirs(self.TRAINING_PATH, exist_ok=True)
        new_file_path = os.path.join(
            self.TRAINING_PATH, f"{conference_id}_agressive_phrases.txt"
        )

        with open(new_file_path, "w", encoding="utf-8") as f:
            for raw_phrase in phrases:
                phrase = self.amlo_parser.clean_text(
                    raw_phrase, remove_stopwords=self.remove_stopwords
                )
                f.write(phrase + "\n")

    def non_agressive_to_txt(self, conference_id):
        """
        Cleans the president's dialogues of one conference and writes them to
        TRAINING_PATH/<conference_id>_non_agressive.txt. The dialogues are read
        from DIALOGUES_PATH/<conference_id>_president_dialogues.txt.
        """
        # tqdm.write keeps the message from breaking an active progress bar.
        tqdm.write(f"Conference {conference_id} is not agressive")

        dialogue_path = os.path.join(
            self.DIALOGUES_PATH, f"{conference_id}_president_dialogues.txt"
        )
        with open(dialogue_path, "r", encoding="utf-8") as f:
            raw_text = f.read()

        text = self.amlo_parser.clean_text(
            raw_text, remove_stopwords=self.remove_stopwords
        )

        os.makedirs(self.TRAINING_PATH, exist_ok=True)
        new_file_path = os.path.join(
            self.TRAINING_PATH, f"{conference_id}_non_agressive.txt"
        )
        with open(new_file_path, "w", encoding="utf-8") as f:
            f.write(text)

    def create_training_set(self):
        """
        Walks the listed text files and writes a training file for every one
        whose conference id appears in the labeled data.

        The conference id is the first run of digits in the file name. Files
        that are not .txt, carry no digits, or are unlabeled are skipped.
        """
        for file in tqdm(self.all_files):
            if not file.endswith(".txt"):
                continue

            id_match = re.search(r"\d+", file)
            if id_match is None:
                # A .txt file without a numeric id cannot be matched to a label.
                continue
            conference_id = int(id_match.group())

            if conference_id not in self.labeled_conference_ids:
                continue

            # First label recorded for this conference
            conference_label = self.labeled_data.loc[
                self.labeled_data["conference_id"] == conference_id,
                "is_agressive",
            ].values[0]

            if conference_label == 1:
                self.agressive_phrases_to_txt(self.agressive_phrases, conference_id)
            else:
                self.non_agressive_to_txt(conference_id)


In [6]:
# NOTE(review): this listing is not passed to TrainingSet, which builds its own
# listing of TEXT_FILES_PATH in __init__; it only rebinds the notebook-level name.
all_files = os.listdir(PATH)

# Build the training set with stop-word removal enabled and write its files.
training_set = TrainingSet(remove_stopwords=True)
training_set.create_training_set()


 39%|███▉      | 490/1241 [00:00<00:00, 4556.83it/s]

Conference 20181207 is not agressive
Conference 20190102 is not agressive
Conference 20190111 is not agressive
Conference 20190227 is not agressive
Conference 20200128 is not agressive
Conference 20210510 is not agressive
Conference 20221125 is not agressive


100%|██████████| 1241/1241 [00:00<00:00, 5084.57it/s]


### Possible models to use:

Summarization:

https://huggingface.co/mrm8488/bert2bert_shared-spanish-finetuned-summarization


Sentiment Analysis:

https://huggingface.co/finiteautomata/beto-sentiment-analysis


Emotion Analysis:

https://huggingface.co/finiteautomata/beto-emotion-analysis
