# AMLO Exploratory Data Analysis

In [9]:
import os
import pandas as pd
from tqdm import tqdm
import torch


from amlo_parser import AMLOParser
from training_set import TrainingSet


from torch.utils.data import Dataset, DataLoader
import torch
import numpy as np
import pandas as pd
import os
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from collections import Counter
from torch.nn.utils.rnn import pad_sequence

# Local imports
from xgb_model import XGBoost

#### CONSTANTS

In [3]:
# Root directory of the project's data. The default is the original author's
# local checkout; set the AMLO_DATA_DIR environment variable to run the
# notebook on another machine without editing this cell.
_DEFAULT_DATA_DIR = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data"
DATA_DIR = (os.environ.get("AMLO_DATA_DIR") or _DEFAULT_DATA_DIR).rstrip("/\\")

# Directory listed by the pipeline cell below (trailing slash kept on purpose,
# matching the original constant).
PATH = f"{DATA_DIR}/text_files/"
# Excel workbook with the manual labels.
LABELED_PATH = f"{DATA_DIR}/amlo_labeling.xlsx"

### Pipeline

Create the training set, along with its corresponding txt files.

In [4]:
# List every entry found in the raw text-file directory (PATH).
all_files = os.listdir(PATH)

# Instantiate the project's TrainingSet with remove_stopwords=True and run its
# build step. What create_training_set() reads and writes is defined in
# training_set.py; it presumably produces the labelled set and the per-conference
# txt files mentioned in the note above — TODO confirm against that module.
training_set = TrainingSet(remove_stopwords=True)
training_set.create_training_set()

100%|██████████| 1246/1246 [00:00<00:00, 6523.12it/s]

Conference 20181207 is not agressive
Conference 20190102 is not agressive
Conference 20190111 is not agressive
Conference 20190227 is not agressive
Conference 20200128 is not agressive
Conference 20210510 is not agressive
Conference 20221125 is not agressive





In [16]:
# Folder handed to the XGBoost wrapper as its training-data location
folder_path = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/training_data/"

# Booster settings: squared-error regression objective, evaluated with RMSE
param = dict(
    max_depth=8,
    eta=0.15,
    objective="reg:squarederror",
    eval_metric="rmse",
)

# Wire the wrapper to the training folder, the dialogues location exposed by
# the training set, and the booster settings above
xgb_model = XGBoost(
    xgb_params=param,
    dialogues_path=training_set.DIALOGUES_PATH,
    folder_path=folder_path,
)

# Build the two frames that the next cell reads back from the wrapper
# (training_df and unseen_df)
xgb_model.create_regression_training_df()
xgb_model.create_unseen_df()

In [17]:
# Pull both frames off the wrapper so later cells can use short names
training_df, unseen_df = xgb_model.training_df, xgb_model.unseen_df

### Trying to implement a NNet model

In [21]:
from collections import Counter


def build_vocab(texts, tokenizer, min_freq=1):
    """
    Builds a vocabulary from the given texts based on frequency.

    Indices are contiguous: ``<pad>`` is 0, ``<unk>`` is 1, and the kept words
    receive 2, 3, ... in first-seen order. Because there are no gaps, every
    index is strictly smaller than ``len(vocab)``, so ``len(vocab)`` can be
    used directly as the size of an embedding table.

    Args:
    - texts (list of str): List of text samples.
    - tokenizer (callable): Function to tokenize text.
    - min_freq (int): Minimum frequency for a word to be included in the vocab.

    Returns:
    - vocab (dict): Mapping of word to unique index.
    """
    special_tokens = ("<pad>", "<unk>")

    # Tokenize all texts and count word frequencies
    counter = Counter(token for text in texts for token in tokenizer(text))

    # Filter FIRST, then number the survivors. Numbering before filtering
    # leaves holes in the index range whenever min_freq drops a word.
    # Literal "<pad>"/"<unk>" tokens in the corpus are skipped so they do not
    # consume an index that is about to be overwritten by the special tokens.
    kept_words = [
        word
        for word, freq in counter.items()
        if freq >= min_freq and word not in special_tokens
    ]
    vocab = {
        word: index
        for index, word in enumerate(kept_words, start=len(special_tokens))
    }

    # Special tokens
    vocab["<pad>"] = 0  # Padding token
    vocab["<unk>"] = 1  # Unknown word token

    return vocab


# Example tokenizer function
def tokenizer(text):
    """
    Lowercase ``text`` and split it on whitespace.

    The lowercasing matches AggressivityDataset.numericalize, which lowercases
    before looking tokens up. Without it, any vocabulary entry containing an
    uppercase letter could never be matched at lookup time.

    Args:
    - text (str): Raw text sample.

    Returns:
    - list of str: Lowercased whitespace-delimited tokens (empty for blank text).
    """
    return text.lower().split()


In [22]:
class AggressivityDataset(Dataset):
    """
    Torch dataset that yields (token-index tensor, score) pairs.

    Texts are converted to index lists once, at construction time. Sequences
    are NOT padded here, so tensors returned for different items can have
    different lengths.
    """

    def __init__(self, texts, vocab, scores=None):
        """
        texts: List of text data
        vocab: A dictionary mapping tokens to indices
        scores: List of aggressivity scores (for training data); None for unseen data
        """
        self.texts = [self.numericalize(text, vocab) for text in texts]
        self.scores = scores

    def numericalize(self, text, vocab):
        """
        Convert ``text`` into a list of vocabulary indices.

        The text is lowercased and split on whitespace. Tokens missing from
        ``vocab`` map to the index of ``"<unk>"``; if the vocabulary has no
        ``"<unk>"`` entry the fallback index is 0.
        """
        # Out-of-vocabulary words must not share the padding index, otherwise
        # they become indistinguishable from padding.
        unk_index = vocab.get("<unk>", 0)
        tokenized = text.lower().split()  # Simple whitespace tokenization
        return [vocab.get(token, unk_index) for token in tokenized]

    def __len__(self):
        """Number of text samples in the dataset."""
        return len(self.texts)

    def __getitem__(self, idx):
        """
        Return ``(indices, score)`` for sample ``idx``.

        ``indices`` is a long tensor. ``score`` is a float tensor when scores
        were provided, otherwise the plain integer 0 as a placeholder.
        """
        indices = torch.tensor(self.texts[idx], dtype=torch.long)
        if self.scores is None:
            # Return 0 as a placeholder score for unseen data
            return indices, 0
        return indices, torch.tensor(self.scores[idx], dtype=torch.float)


# Example usage (assuming 'vocab' is your vocabulary dictionary mapping words to indices):


In [23]:
# Vocabulary is derived from the "text" column of the training frame
vocab = build_vocab(texts=training_df["text"].tolist(), tokenizer=tokenizer)

# Pair each training text with its "score" value
training_dataset = AggressivityDataset(
    texts=training_df["text"].tolist(),
    vocab=vocab,
    scores=training_df["score"].tolist(),
)
