# AMLO Exploratory Data Analysis

In [61]:
import os
from collections import Counter

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, Dataset
from tqdm import tqdm

# Local imports
from training_set import TrainingSet
from xgb_model import XGBoost

#### CONSTANTS

In [62]:
PATH = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/text_files/"
LABELED_PATH = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/amlo_labeling.xlsx"

### Pipeline

Create training set, along with its correspoding txt files

In [63]:
all_files = os.listdir(PATH)

training_set = TrainingSet(remove_stopwords=True)
training_set.create_training_set()

 47%|████▋     | 587/1246 [00:00<00:00, 5863.95it/s]

Conference 20181207 is not agressive
Conference 20190102 is not agressive
Conference 20190111 is not agressive
Conference 20190227 is not agressive
Conference 20200128 is not agressive
Conference 20210510 is not agressive
Conference 20221125 is not agressive


100%|██████████| 1246/1246 [00:00<00:00, 5468.31it/s]


In [64]:
# Specify the path to your training data folder
folder_path = "C:/Users/fdmol/Desktop/AMLO-NLP/src/data/training_data/"

param = {
    "max_depth": 8,
    "eta": 0.15,
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
}


xgb_model = XGBoost(
    folder_path=folder_path,
    dialogues_path=training_set.DIALOGUES_PATH,
    xgb_params=param,
)


xgb_model.create_regression_training_df()
xgb_model.create_unseen_df()


In [65]:
training_df = xgb_model.training_df
unseen_df = xgb_model.unseen_df


### Trying to implement a NNet model

In [66]:
def build_vocab(texts, tokenizer, min_freq=1):
    """
    Builds a vocabulary from the given texts based on frequency.

    Args:
    - texts (list of str): List of text samples.
    - tokenizer (callable): Function to tokenize text.
    - min_freq (int): Minimum frequency for a word to be included in the vocab.

    Returns:
    - vocab (dict): Mapping of word to unique index.
    """
    # Tokenize all texts and count word frequencies
    counter = Counter(token for text in texts for token in tokenizer(text))

    # Filter words by min_freq and assign unique indices
    vocab = {
        word: i + 2
        for i, (word, freq) in enumerate(counter.items())
        if freq >= min_freq
    }  # Start indexing from 2

    # Special tokens
    vocab["<pad>"] = 0  # Padding token
    vocab["<unk>"] = 1  # Unknown word token

    return vocab


# Example tokenizer function
def tokenizer(text):
    return text.split()

In [67]:
class AggressivityDataset(Dataset):
    def __init__(self, texts, vocab, scores=None):
        """
        texts: List of text data
        vocab: A dictionary mapping tokens to indices
        scores: List of aggressivity scores (for training data); None for unseen data
        """
        self.texts = [self.numericalize(text, vocab) for text in texts]
        self.scores = scores

    def numericalize(self, text, vocab):
        # Simple tokenization and numericalization based on the provided vocab
        tokenized = (
            text.lower().split()
        )  # Simple whitespace tokenization, adjust as needed
        return [
            vocab.get(token, 0) for token in tokenized
        ]  # 0 as the index for unknown words

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        item = {"text": self.texts[idx]}
        if self.scores is not None:  # Handle cases where scores might not be available
            item["score"] = self.scores[idx]
        return item


def collate_fn(batch):
    text_sequences = [
        torch.tensor(item["text"], dtype=torch.long) for item in batch
    ]  # Convert each sequence to a tensor
    scores = torch.tensor([item["score"] for item in batch], dtype=torch.float)

    # Pad the sequences to have the same length
    text_sequences_padded = pad_sequence(
        text_sequences, batch_first=True, padding_value=0
    )

    return {"text": text_sequences_padded, "score": scores}

In [68]:
# Build vocab from your training_df['text']
vocab = build_vocab(training_df["text"].tolist(), tokenizer)

training_dataset = AggressivityDataset(
    training_df["text"].tolist(), vocab, training_df["score"].tolist()
)


unseen_dataset = AggressivityDataset(unseen_df["text"].tolist(), vocab, scores=None)

batch_size = 32  # Adjust batch size according to your need

In [69]:
train_loader = DataLoader(
    training_dataset, batch_size=batch_size, shuffle=True, collate_fn=collate_fn
)
unseen_loader = DataLoader(
    unseen_dataset, batch_size=32, shuffle=False, collate_fn=collate_fn
)

## Define Neural Network model

In [70]:
class NeuralRegressor(nn.Module):
    def __init__(self, vocab_size, embedding_dim):
        super(NeuralRegressor, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)
        self.fc = nn.Linear(embedding_dim, 1)

    def forward(self, x):
        embedded = self.embedding(x)
        pooled = F.avg_pool2d(embedded, (embedded.shape[1], 1)).squeeze(
            1
        )  # Average pooling
        output = self.fc(pooled)
        return output

In [60]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = NeuralRegressor(vocab_size=len(vocab) + 1, embedding_dim=100).to(device)

criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

num_epochs = 100

for epoch in range(num_epochs):
    model.train()
    for batch in train_loader:
        inputs, targets = batch["text"].to(device), batch["score"].to(device)

        optimizer.zero_grad()
        outputs = model(inputs).squeeze(1)
        loss = criterion(outputs, targets)
        loss.backward()
        optimizer.step()

    print(f"Epoch {epoch+1}, Loss: {loss.item()}")

Epoch 1, Loss: 0.0055724638514220715
Epoch 2, Loss: 0.005368649493902922
Epoch 3, Loss: 0.0051779067143797874
Epoch 4, Loss: 0.004998402204364538
Epoch 5, Loss: 0.004828650038689375
Epoch 6, Loss: 0.004667455796152353
Epoch 7, Loss: 0.004513794556260109
Epoch 8, Loss: 0.00436673779040575
Epoch 9, Loss: 0.004225453827530146
Epoch 10, Loss: 0.0040892343968153
Epoch 11, Loss: 0.003957508131861687
Epoch 12, Loss: 0.0038298284634947777
Epoch 13, Loss: 0.003705863608047366
Epoch 14, Loss: 0.003585378173738718
Epoch 15, Loss: 0.003468212904408574
Epoch 16, Loss: 0.003354267915710807
Epoch 17, Loss: 0.0032434870954602957
Epoch 18, Loss: 0.0031358483247458935
Epoch 19, Loss: 0.0030313502065837383
Epoch 20, Loss: 0.0029300020541995764
Epoch 21, Loss: 0.002831819234415889
Epoch 22, Loss: 0.00273680966347456
Epoch 23, Loss: 0.002644962863996625
Epoch 24, Loss: 0.002556247403845191
Epoch 25, Loss: 0.0024706004187464714
Epoch 26, Loss: 0.002387929242104292
Epoch 27, Loss: 0.002308111870661378
Epoch 

### Predict score

In [71]:
model.eval()

# Predict on the unseen data
predictions = []
with torch.no_grad():
    for batch in unseen_loader:
        inputs = batch["text"].to(device)
        outputs = model(inputs).squeeze(1)
        predictions.extend(outputs.cpu().numpy())

unseen_df["predicted_score"] = predictions


KeyError: 'score'