# AMLO Exploratory Data Analysis

In [4]:
import re
import os
import pandas as pd
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer

from amlo_parser import AMLOParser

#### CONSTANTS

In [7]:
# Directory of conference transcript text files; listed with os.listdir further
# down. Same location as TrainingSet.TEXT_FILES_PATH.
# NOTE(review): absolute, machine-specific paths — adjust before running elsewhere.
PATH = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/text_files/"
# Excel workbook with the manual labels. Same file as TrainingSet.LABELED_PATH,
# whose "labels" and "frases_odio" sheets are read by TrainingSet.read_labeled_data.
LABELED_PATH = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/amlo_labeling.xlsx"


#### Defining some classes and functions to help us with the analysis

In [3]:
# Use TF-IDF to find the most important words in the text
def get_most_important_words(text, filename, save=False):
    """
    Ranks the corpus vocabulary by the TF-IDF scores of the first document.

    Parameters
    ----------
    text : str or iterable of str
        Raw documents. A single string is treated as a one-document corpus
        (TfidfVectorizer itself rejects a bare string).
    filename : str
        Destination CSV path; only used when `save` is True.
    save : bool, default False
        When True, the ranked table is written to `filename` without the index.

    Returns
    -------
    pandas.DataFrame
        Columns "Word" and "Score", sorted by descending score. The scores are
        those of the FIRST document only; the other documents only influence
        the vocabulary and the IDF weights.
    """
    # Accept a single document as well as a collection of documents
    if isinstance(text, str):
        text = [text]

    # Fit the vectorizer on the corpus
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(text)

    # Only the first row is reported, so densify that row alone instead of
    # materialising the whole document-term matrix
    scores = tfidf_matrix[0].toarray().ravel()
    words = tfidf_vectorizer.get_feature_names_out()

    # Create a DataFrame with the result, highest scores first
    df = pd.DataFrame({"Word": words, "Score": scores}).sort_values(
        "Score", ascending=False
    )

    if save:
        df.to_csv(filename, index=False)

    return df

### Pipeline

### Classification Task

In [8]:
# Build the training text files from the labeled data
class TrainingSet:
    """
    Builds the training corpus for the aggressive / non-aggressive classifier.

    For every labeled conference one text file is written to TRAINING_PATH:
    the labeled aggressive phrases when the conference is marked aggressive,
    otherwise the cleaned president dialogue of that conference.

    The spelling "agressive" is kept in identifiers, sheet columns and file
    names because other cells and the data files depend on it.
    """

    # Output folder for the generated training files
    TRAINING_PATH = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/training_data/"
    # Excel workbook with the "labels" and "frases_odio" sheets
    LABELED_PATH = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/amlo_labeling.xlsx"
    # Folder whose .txt file names provide the conference ids to process
    TEXT_FILES_PATH = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/text_files/"
    # Folder with the "<conference_id>_president_dialogues.txt" files
    DIALOGUES_PATH = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/presidents_dialogues/"

    def __init__(self, remove_stopwords):
        """
        Loads the labeled data and lists the available text files.

        remove_stopwords is forwarded to AMLOParser.clean_text for every text
        that is written to the training folder.
        """
        self.remove_stopwords = remove_stopwords
        self.amlo_parser = AMLOParser(self.TEXT_FILES_PATH)
        # Use the class attribute: relying on a module-level LABELED_PATH made
        # the class depend on another notebook cell having been executed
        self.path = self.LABELED_PATH
        self.labeled_data, self.agressive_phrases = self.read_labeled_data()

        self.labeled_conference_ids = self.labeled_data["conference_id"].unique()
        self.all_files = os.listdir(self.TEXT_FILES_PATH)

    def read_labeled_data(self):
        """
        Reads the labeled data and aggressive phrases from the excel file.

        Returns a tuple (labeled_data, agressive_phrases). Rows of the "labels"
        sheet containing any missing value are dropped and the index is reset;
        the "frases_odio" sheet is returned as read.
        """
        labeled_data = pd.read_excel(self.path, sheet_name="labels")
        agressive_phrases = pd.read_excel(self.path, sheet_name="frases_odio")

        labeled_data = labeled_data.dropna().reset_index(drop=True)

        return labeled_data, agressive_phrases

    def agressive_phrases_to_txt(self, agressive_phrases, conference_id):
        """
        Writes the cleaned aggressive phrases of one conference to
        "<conference_id>_agressive_phrases.txt" in TRAINING_PATH, one per line.
        """
        # Keep only the phrases that belong to the requested conference
        belongs_to_conference = agressive_phrases["conference_id"] == conference_id
        phrases = agressive_phrases.loc[belongs_to_conference, "phrase"]

        # Build the file name directly; replacing ".txt" inside the full path
        # would also rewrite any directory component containing ".txt"
        new_file_path = os.path.join(
            self.TRAINING_PATH, f"{conference_id}_agressive_phrases.txt"
        )

        with open(new_file_path, "w", encoding="utf-8") as f:
            for raw_phrase in phrases:
                phrase = self.amlo_parser.clean_text(
                    raw_phrase, remove_stopwords=self.remove_stopwords
                )
                f.write(phrase + "\n")

    def non_agressive_to_txt(self, conference_id):
        """
        Copies the cleaned president dialogue of a non-aggressive conference
        from DIALOGUES_PATH to "<conference_id>_non_agressive.txt" in
        TRAINING_PATH.
        """
        print(f"Conference {conference_id} is not agressive")

        dialogue_path = os.path.join(
            self.DIALOGUES_PATH, f"{conference_id}_president_dialogues.txt"
        )

        with open(dialogue_path, "r", encoding="utf-8") as f:
            text = f.read()

        text = self.amlo_parser.clean_text(
            text, remove_stopwords=self.remove_stopwords
        )

        new_file_path = os.path.join(
            self.TRAINING_PATH, f"{conference_id}_non_agressive.txt"
        )

        with open(new_file_path, "w", encoding="utf-8") as f:
            f.write(text)

    def create_training_set(self):
        """
        Writes one training file per labeled conference found in all_files.

        Files that are not .txt, whose name holds no digits, or whose
        conference id is not labeled are skipped.
        """
        for file in tqdm(self.all_files):
            if not file.endswith(".txt"):
                continue

            # The conference id is the first run of digits in the file name
            id_match = re.search(r"\d+", file)
            if id_match is None:
                # Stray .txt file that cannot be mapped to a conference
                continue
            conference_id = int(id_match.group())

            if conference_id not in self.labeled_conference_ids:
                continue

            # Label of the first matching row in the "labels" sheet
            conference_label = self.labeled_data.loc[
                self.labeled_data["conference_id"] == conference_id,
                "is_agressive",
            ].values[0]

            if conference_label == 1:
                self.agressive_phrases_to_txt(self.agressive_phrases, conference_id)
            else:
                self.non_agressive_to_txt(conference_id)


In [9]:
# Listing of the transcript folder. This module-level name is not referenced
# again in the visible cells; TrainingSet keeps its own listing in `all_files`.
all_files = os.listdir(PATH)

# Build the training files; texts are cleaned with stopword removal enabled.
# This writes files into TrainingSet.TRAINING_PATH as a side effect.
training_set = TrainingSet(remove_stopwords=True)
training_set.create_training_set()


 21%|██▏       | 266/1241 [00:00<00:00, 2450.23it/s]

Conference 20181207 is not agressive
Conference 20190102 is not agressive
Conference 20190111 is not agressive
Conference 20190227 is not agressive
Conference 20200128 is not agressive
Conference 20210510 is not agressive


100%|██████████| 1241/1241 [00:00<00:00, 4295.63it/s]

Conference 20221125 is not agressive





### REGRESSION TASK (logistic-regression classifier on TF-IDF features)

In [50]:
import os
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

In [49]:
def create_classification_training_df(
    folder_path="C:/Users/fdmol/Desktop/AMLO-NLP/src/data/training_data/",
):
    """
    Creates a DataFrame with the training data for the classification model.

    Parameters
    ----------
    folder_path : str
        Folder with the training .txt files. Defaults to the project's
        training_data folder.

    Returns
    -------
    pandas.DataFrame
        Columns "id" (first run of digits in the file name), "text" (file
        content) and "label" (0 when the file name contains "non", else 1).
        Rows follow the sorted file names; the columns are present even when
        no file was found.
    """
    # Collect one record per training file
    data = []

    # Sort the listing: os.listdir returns names in arbitrary order, which
    # would make the row order (and any later split) platform dependent
    for file in sorted(os.listdir(folder_path)):
        if not file.endswith(".txt"):
            continue

        # The conference id is the first run of digits in the file name
        id_match = re.search(r"\d+", file)
        if id_match is None:
            # A .txt file without digits cannot be mapped to a conference
            continue
        conference_id = int(id_match.group())

        # Determine the label based on the file name
        label = 0 if "non" in file else 1

        # Read the text file
        with open(os.path.join(folder_path, file), "r", encoding="utf-8") as f:
            text = f.read()

        data.append({"id": conference_id, "text": text, "label": label})

    # Convert the list to a DataFrame; explicit columns keep it well-formed
    # for an empty folder
    df = pd.DataFrame(data, columns=["id", "text", "label"])

    return df

In [52]:
def train_classification_model(df, return_vectorizer=False):
    """
    Trains a simple classification model using TF-IDF and logistic regression.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a "text" column (documents) and a "label" column (target).
    return_vectorizer : bool, default False
        When True, the fitted TfidfVectorizer is returned together with the
        model so new texts can be transformed with the same vocabulary.

    Returns
    -------
    The fitted LogisticRegression model, or the tuple (model, vectorizer) when
    `return_vectorizer` is True.

    Accuracy and the classification report on the 20% hold-out split are
    printed as a side effect.
    """
    # Split the raw texts first so the held-out documents never influence the
    # vocabulary or the IDF weights (fitting before the split leaks test data)
    texts_train, texts_test, y_train, y_test = train_test_split(
        df["text"], df["label"], test_size=0.2, random_state=42
    )

    # Fit TF-IDF on the training texts only, then apply it to the test texts
    tfidf_vectorizer = TfidfVectorizer(max_features=1000)
    X_train = tfidf_vectorizer.fit_transform(texts_train)
    X_test = tfidf_vectorizer.transform(texts_test)

    # Initialize and train the logistic regression model
    model = LogisticRegression(max_iter=1000, class_weight="balanced")
    model.fit(X_train, y_train)

    # Make predictions on the testing set
    y_pred = model.predict(X_test)

    # Print out the evaluation metrics
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
    print(classification_report(y_test, y_pred))

    if return_vectorizer:
        return model, tfidf_vectorizer
    return model


In [45]:
# Now, let us do it for the entire dataset: collect every president-dialogue
# file whose conference id is not among the labeled conferences.
# NOTE(review): the training texts pass through AMLOParser.clean_text, while
# these files are read as-is — confirm the dialogue files are already cleaned.

data = []

# Sorted so the row order of `unseen_df` does not depend on the platform's
# directory listing order
for file in sorted(os.listdir(training_set.DIALOGUES_PATH)):
    if not file.endswith(".txt"):
        continue

    # The conference id is the first run of digits in the file name
    id_match = re.search(r"\d+", file)
    if id_match is None:
        # A .txt file without digits cannot be mapped to a conference
        continue
    conference_id = int(id_match.group())

    if conference_id not in training_set.labeled_conference_ids:
        # Read the text file
        with open(
            os.path.join(training_set.DIALOGUES_PATH, file), "r", encoding="utf-8"
        ) as f:
            text = f.read()
        data.append({"id": conference_id, "text": text})


# Explicit columns keep the frame well-formed even when nothing was collected
unseen_df = pd.DataFrame(data, columns=["id", "text"])


In [46]:
# Predict the labels for the unseen data
# NOTE(review): no visible top-level cell assigns `tfidf_vectorizer` or `model`;
# both must already exist in the kernel and must be the fitted vectorizer/model
# pair from training, otherwise this cell fails on a fresh Restart & Run All.

# Transform the new texts with the already-fitted vectorizer (no re-fitting)
X_new = tfidf_vectorizer.transform(unseen_df["text"])

# Make predictions
y_new = model.predict(X_new)

# Add the predictions to the DataFrame (adds a "label" column to unseen_df in place)
unseen_df["label"] = y_new

# Print the results
unseen_df


Unnamed: 0,id,text,label
0,20181210,buenas tardes sí son buenos días llegué tarde ...,0
1,20181211,caso sindicatos hemos hecho compromiso impulsa...,0
2,20181212,buenos días hoy vamos presentarles plan genera...,0
3,20181217,buenos díasestamos iniciando semana terminamos...,0
4,20181218,buenos días informo haya dudas falta informaci...,0
...,...,...,...
1207,20240212,buenos días ánimo bueno vamos iniciar semana t...,0
1208,20240213,buenos días ánimo ánimo pues vamos informar ho...,0
1209,20240214,buenos días ánimo\nbien bien mejor mejor ayer ...,0
1210,20240215,da mucho gusto estar nuevo aquí ustedes acapul...,0


In [48]:
# Show only the conferences predicted as label 1 (the aggressive class in the training data)
print(unseen_df.loc[unseen_df["label"] == 1, :])

            id                                               text  label
38    20190211  buenos días día hoy vamos dar conocer venido s...      1
536   20210303  buenos díastardes alargó mucho reunión tenemos...      1
538   20210305  buenos días estamos finalizando semana vamos g...      1
552   20210326  buenos días bueno vamos iniciar hoy vamos gira...      1
578   20210507  buenos días bueno vamos informar vio ayer fina...      1
675   20211006  buenos días ánimo bueno pues día hoy muy impor...      1
694   20211105  buenos días está haciendo frío bueno ayer acor...      1
725   20211229  buenos días vamos informar hoy quién quién men...      1
784   20220401  buenos días bueno pues vamos dedicar día conte...      1
789   20220408  muy buenos días hablemos más vámonos preguntas...      1
963   20221228  buenos días ánimo muy bien empezaron poner muñ...      1
983   20230203  buenos días ánimo bueno pues vamos informar do...      1
995   20230221  buenos días ánimo bueno día hoy vam

### Possible models to use:

Summarization:

https://huggingface.co/mrm8488/bert2bert_shared-spanish-finetuned-summarization


Sentiment Analysis:

https://huggingface.co/finiteautomata/beto-sentiment-analysis


Emotion Analysis:

https://huggingface.co/finiteautomata/beto-emotion-analysis
