# AMLO Exploratory Data Analysis

In [1]:
import re
import os
import pandas as pd
from tqdm import tqdm

from sklearn.feature_extraction.text import TfidfVectorizer

from amlo_parser import AMLOParser

#### CONSTANTS

In [2]:
# Directory that holds the conference transcripts; the cells below list it with
# os.listdir and only process entries ending in ".txt".
# NOTE(review): both paths are absolute and machine-specific — consider deriving
# them from a configurable data directory.
PATH = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/text_files/"
# Excel workbook with the manual labels; read_labeled_data reads its "labels"
# and "frases_odio" sheets.
LABELED_PATH = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/amlo_labeling.xlsx"


#### Defining some classes and functions to help us with the analysis

In [27]:
# Use TF-IDF to find the most important words in the text
def get_most_important_words(text, filename, save=False):
    """Rank the fitted vocabulary by the TF-IDF scores of the first document.

    Args:
        text: Collection of raw text documents passed unchanged to
            ``TfidfVectorizer.fit_transform``.
        filename: Destination CSV path; only used when ``save`` is true.
        save: When true, write the ranked table to ``filename`` without the
            index column.

    Returns:
        pandas.DataFrame with the columns ``Word`` and ``Score``, sorted by
        ``Score`` in descending order.
    """
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_matrix = tfidf_vectorizer.fit_transform(text)

    # Only row 0 is reported, so densify just that row instead of the whole
    # (documents x vocabulary) matrix.
    # NOTE(review): when `text` holds several documents, the scores of every
    # document after the first never reach the output — confirm this is intended.
    scores = tfidf_matrix[0].toarray().ravel()
    words = tfidf_vectorizer.get_feature_names_out()

    df = pd.DataFrame({"Word": words, "Score": scores}).sort_values(
        "Score", ascending=False
    )

    if save:
        df.to_csv(filename, index=False)

    return df

### Pipeline

#### Word Scoring

In [28]:
text_parser = AMLOParser(PATH)

# List of all the text files
all_files = os.listdir(PATH)

# Directory that receives one "<name>_word_scores.csv" per transcript
new_path = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/word_scores/"

# Create the output directory up front: DataFrame.to_csv does not create
# missing folders, so a fresh checkout would otherwise fail on the first file.
os.makedirs(new_path, exist_ok=True)

for file in tqdm(all_files):
    if file.endswith(".txt"):
        # Get the president's dialogues from a text file
        text = text_parser.get_presidents_dialogues(file, remove_stopwords=True)
        # Swap only the trailing extension; str.replace would also rewrite any
        # other ".txt" occurrence inside the path.
        new_file_path = os.path.join(
            new_path, os.path.splitext(file)[0] + "_word_scores.csv"
        )
        get_most_important_words(text, new_file_path, save=True)

['20181207.txt', '20181210.txt', '20181211.txt', '20181212.txt', '20181217.txt', '20181218.txt', '20181219.txt', '20181224.txt', '20181226.txt', '20181227.txt', '20181228.txt', '20190102.txt', '20190103.txt', '20190104.txt', '20190107.txt', '20190108.txt', '20190109.txt', '20190110.txt', '20190111.txt', '20190114.txt', '20190115.txt', '20190116.txt', '20190117.txt', '20190118.txt', '20190119.txt', '20190120.txt', '20190121.txt', '20190122.txt', '20190123.txt', '20190124.txt', '20190125.txt', '20190128.txt', '20190129.txt', '20190130.txt', '20190131.txt', '20190201.txt', '20190204.txt', '20190205.txt', '20190206.txt', '20190207.txt', '20190208.txt', '20190211.txt', '20190212.txt', '20190213.txt', '20190214.txt', '20190215.txt', '20190218.txt', '20190219.txt', '20190220.txt', '20190221.txt', '20190222.txt', '20190225.txt', '20190226.txt', '20190227.txt', '20190228.txt', '20190301.txt', '20190304.txt', '20190305.txt', '20190307.txt', '20190311.txt', '20190312.txt', '20190313.txt', '201903

  0%|          | 0/1241 [00:00<?, ?it/s]

100%|██████████| 1241/1241 [00:41<00:00, 29.89it/s]


### Classification Task

In [9]:
# Read the labeled data
def read_labeled_data(path):
    """Load the conference labels and the phrase sheet from one workbook.

    Args:
        path: Location of the Excel workbook; it must contain the sheets
            ``labels`` and ``frases_odio``.

    Returns:
        Tuple ``(labels, phrases)``: ``labels`` is the ``labels`` sheet with
        every row containing a missing value removed and the index renumbered
        from zero; ``phrases`` is the ``frases_odio`` sheet exactly as read.
    """
    labels_sheet = pd.read_excel(path, sheet_name="labels")
    phrases_sheet = pd.read_excel(path, sheet_name="frases_odio")

    # Drop incomplete label rows, then renumber so positions are contiguous
    complete_labels = labels_sheet.dropna().reset_index(drop=True)

    return complete_labels, phrases_sheet


In [10]:
# Load the label table and the phrase table from the workbook at LABELED_PATH
labeled_data, agressive_phrases = read_labeled_data(LABELED_PATH)

In [8]:
# Build the training files for the classification task. For every labeled
# conference: aggressive ones contribute only their labeled phrases, the
# others contribute the whole transcript.
labeled_conference_ids = labeled_data["conference_id"].unique()
all_files = os.listdir(PATH)


TRAINING_PATH = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/training_data/"

# Create the destination up front so the writes below work on a fresh checkout.
os.makedirs(TRAINING_PATH, exist_ok=True)

for file in tqdm(all_files):
    if not file.endswith(".txt"):
        continue

    # The conference id is the first run of digits in the file name; skip
    # names without digits instead of failing with an IndexError.
    id_match = re.search(r"\d+", file)
    if id_match is None:
        continue
    conference_id = int(id_match.group())

    if conference_id not in labeled_conference_ids:
        continue

    conference_label = labeled_data.loc[
        labeled_data["conference_id"] == conference_id, "is_agressive"
    ].values[0]

    if conference_label == 1:
        print(f"Conference {conference_id} is agressive")

        # Get text from dataframe. Rows without a phrase are dropped because
        # they cannot be written as text.
        agressive_phrases_df = (
            agressive_phrases.loc[
                agressive_phrases["conference_id"] == conference_id, ["phrase"]
            ]
            .dropna()
            .reset_index(drop=True)
        )

        # Write the text to a file.
        # NOTE(review): this previously wrote into `new_path` (the word-scores
        # directory defined in another cell); TRAINING_PATH looks like the
        # intended destination — confirm.
        new_file_path = os.path.join(
            TRAINING_PATH, os.path.splitext(file)[0] + "_agressive_phrases.txt"
        )
        with open(new_file_path, "w", encoding="utf-8") as f:
            for phrase in agressive_phrases_df["phrase"]:
                f.write(str(phrase) + "\n")

        # Show the phrases only for the conference they belong to; in the
        # other branch this frame is undefined or left over from an earlier
        # iteration.
        print(agressive_phrases_df)

    else:
        print(f"Conference {conference_id} is not agressive")
        # Copy file to training data
        new_file_path = os.path.join(TRAINING_PATH, file)

        if not os.path.exists(new_file_path):
            # Copy the file
            with open(os.path.join(PATH, file), "r", encoding="utf-8") as f:
                text = f.read()

            with open(new_file_path, "w", encoding="utf-8") as f:
                f.write(text)


100%|██████████| 1241/1241 [00:00<00:00, 21773.14it/s]

Conference 20190325 is agressive
     conference_id       date   
232       20190325  3/25/2019  \
233       20190325  3/25/2019   
234       20190325  3/25/2019   

                                                phrase  
232  Existe una prensa fifi, no es una invención, e...  
233             A ver, tecnócratas, a ver prensa fifí.  
234  Fox, ¿que no era el cambio? Resultó un traidor...  
Conference 20190430 is agressive
     conference_id       date   
137       20190430  4/30/2019  \
138       20190430  4/30/2019   
139       20190430  4/30/2019   
140       20190430  4/30/2019   
141       20190430  4/30/2019   
142       20190430  4/30/2019   
143       20190430  4/30/2019   

                                                phrase  
137  Jesús nos explicaba lo que ha significado la p...  
138  porque lo que pasaba es que todo el presupuest...  
139  porque antes había consigna de parte del Ejecu...  
140  Eso no existía, antes había hasta teléfono dir...  
141  Porque resulta, co




### Possible models to use:

Summarization:

https://huggingface.co/mrm8488/bert2bert_shared-spanish-finetuned-summarization


Sentiment Analysis:

https://huggingface.co/finiteautomata/beto-sentiment-analysis


Emotion Analysis:

https://huggingface.co/finiteautomata/beto-emotion-analysis
