In [1]:
import os
import sys
import random
from datetime import datetime

import numpy as np
import polars as pl
import plotly.express as px

from sklearn.model_selection import train_test_split

import torch
from torch import nn, optim, cuda
from torch.utils.data import DataLoader

In [2]:
# Default project root (relative to this notebook) and the Google Drive
# sub-folder that replaces it when the notebook runs on Colab.
ROOT_PATH = '../'
DRIVE_PATH = 'Colab/ToxicityClassification'

# On Colab, mount Google Drive and use it as the root path (and working
# directory) so that data and models are persisted and can be loaded back.
IN_COLAB = 'google.colab' in sys.modules
if IN_COLAB:
    from google.colab import drive

    drive.mount('/content/drive')
    drive_root = '/content/drive/My Drive/'
    ROOT_PATH = os.path.join(drive_root, DRIVE_PATH)
    os.makedirs(ROOT_PATH, exist_ok=True)
    os.chdir(ROOT_PATH)

In [3]:
# Make `<ROOT_PATH>/src` importable so that the project imports that follow
# can be resolved. The membership check keeps the cell idempotent: re-running
# it does not keep appending duplicate entries to sys.path.
SRC_PATH = os.path.abspath(os.path.join(ROOT_PATH, 'src'))
if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)

from toxicity.training import train_epochs, model_metrics
from toxicity.embeddings_rnn.training import trainer, validate
from toxicity.embeddings_rnn.model import EmbeddingModel, EmbeddingDataset

## Setup

In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
PYTORCH_DEVICE = 'cuda' if cuda.is_available() else 'cpu'

# Seed shared by every random number generator and by the data split
RANDOM_SEED = 777

# Data split and batching
TRAIN_RATIO = 0.8
TRAIN_BATCH_SIZE = 8
TEST_BATCH_SIZE = 8

# Optimisation
EPOCHS = 20
LEARNING_RATE = 1.5e-05
# Passed as `pos_weight` to the loss function
POS_WEIGHT = 1.663

# Pre-trained embeddings: the text file is named after the embedding set
EMBEDDING_NAME = 'cbow_s100'
EMBEDDING_FILE = os.path.join(ROOT_PATH, f'{EMBEDDING_NAME}.txt')
# Sequence length handed to both the model and the datasets
MAX_LEN = 60

print(f'Using device: {PYTORCH_DEVICE}')

Using device: cuda


In [5]:

def reseed(seed: int = RANDOM_SEED):
    """Seed every random number generator used by this notebook.

    Seeds Python's `random`, NumPy's global generator and PyTorch (the
    default generator plus all CUDA devices), then puts cuDNN in
    deterministic mode with benchmarking turned off.

    Args:
        seed: Value applied to every generator; defaults to RANDOM_SEED.
    """
    seeders = (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all)
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False


reseed()

## Data Loading

In [6]:
# Load the pre-processed dataset and re-encode both label columns as
# fixed-width Array(Int32, 1) values (Int32 -> List(Int32) -> Array(Int32, 1)).
DATA_FILE = os.path.join(ROOT_PATH, 'data', 'joint', 'pre_processed_data.parquet.zstd')
LABEL_DTYPE = pl.Array(pl.Int32, 1)

df = pl.read_parquet(DATA_FILE)
df = df.with_columns([
    df[label].cast(pl.Int32).cast(pl.List(pl.Int32)).cast(LABEL_DTYPE)
    for label in ('off_relaxed', 'off_strict')
])

# Preview a reproducible random sample of rows
df.sample(5, seed=RANDOM_SEED)

dataset,id,text,off_strict,off_relaxed,base_clean,base_clean_lower,tokenized,lemmatized,no_accents,lemma_no_accents,no_stop_words,lemma_no_stop_words,no_stop_words_no_accents,lemma_no_stop_words_no_accents
str,str,str,"array[i32, 1]","array[i32, 1]",str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str]
"""ToLD-Br""","""17643984771725418028""","""caralho q vergonha kkkkk""",[1],[0],"""caralho q vergonha kkkkk""","""caralho q vergonha kkkkk""","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]"
"""ToLD-Br""","""3886050625220892585""","""foda-se, vou encher o cu de po…",[1],[0],"""foda se vou encher o cu de por…","""foda se vou encher o cu de por…","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""vou"", … ""lol""]","[""foda"", ""ir"", … ""lol""]","[""foda"", ""vou"", … ""lol""]","[""foda"", ""ir"", … ""lol""]"
"""ToLD-Br""","""14936095030342170465""","""USER USER USER Vc só pensa no …",[1],[1],"""USER USER USER Vc só pensa no …","""user user user vc só pensa no …","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]"
"""ToLD-Br""","""18279259074216789411""","""família""",[0],[0],"""família""","""família""","[""família""]","[""família""]","[""familia""]","[""familia""]","[""família""]","[""família""]","[""familia""]","[""familia""]"
"""OLID-Br""","""7f36b160e8624968a32e82b1c6750f…","""RT USER: vey a juliette veio c…",[0],[0],"""RT USER vey a juliette veio co…","""rt user vey a juliette veio co…","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]"


### Load Embeddings

In [7]:
# Directory and file used to cache the parsed embedding table as parquet
EMBEDDING_PATH = os.path.join(ROOT_PATH, 'models', f'embeddings-{EMBEDDING_NAME}')
os.makedirs(EMBEDDING_PATH, exist_ok=True)
EMBEDDING_CACHE = os.path.join(EMBEDDING_PATH, 'embeddings.parquet.zstd')

emb_dim = None
token_count = None
embeddings = {}

if not os.path.exists(EMBEDDING_CACHE):
    # No cache yet: parse the text file. Its first line is a header
    # "<token count> <dimension>"; every other line is "<token> <v1> ... <vN>".
    # The encoding is explicit so that tokens with non-ASCII characters are
    # decoded the same way regardless of the platform's default locale.
    with open(EMBEDDING_FILE, 'r', encoding='utf-8') as f:
        token_count, emb_dim = map(int, f.readline().split())

        for line in f:
            # Drop the line terminator (and any trailing separator) first, so
            # the last field is always a clean number.
            emb = line.rstrip().split(' ')
            if emb == ['']:
                # Blank line (e.g. at the end of the file): nothing to parse
                continue

            token = emb[0]
            values = [float(v) for v in emb[1:]]

            # Every vector must have the dimension announced in the header
            if len(values) != emb_dim:
                raise ValueError('Inconsistent embedding length')

            embeddings[token] = values

    # Persist the parsed table so later runs can skip the text parsing
    embedding_df = pl.DataFrame({
        'token': list(embeddings.keys()),
        'embedding': list(embeddings.values())
    })
    embedding_df.write_parquet(EMBEDDING_CACHE, compression="zstd", compression_level=9)
else:
    # Cache hit: rebuild the token -> vector mapping from the parquet file
    embedding_df = pl.read_parquet(EMBEDDING_CACHE)
    token_list = embedding_df['token'].to_list()
    embedding_list = embedding_df['embedding'].to_list()
    embeddings = dict(zip(token_list, embedding_list))
    emb_dim = len(embeddings[next(iter(embeddings))])
    # The cache does not store the header's token count, so the loaded
    # vocabulary size doubles as the "expected" value in the report below.
    token_count = len(embeddings)

print(f'Embedding Length: {emb_dim}')
print(f'Embedding Vocab Size: {len(embeddings)}; Expected: {token_count}')


Embedding Length: 100
Embedding Vocab Size: 929606; Expected: 929606


## Init Model

### Loss and Optimizer

We use a Binary Cross Entropy loss, as it shows good results for binary classification tasks. We also apply a different weight to the positive class (`pos_weight`) to account for the class imbalance.

The AdamW optimizer is used, as it is a good general-purpose optimizer for training neural networks.

In [8]:
# Build the classifier and move it to the target device
model = EmbeddingModel(emb_dim, MAX_LEN)
model.to(PYTORCH_DEVICE)

# Binary cross entropy on logits, with the positive class weighted by POS_WEIGHT
positive_class_weight = torch.tensor([POS_WEIGHT], device=PYTORCH_DEVICE)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=positive_class_weight)

optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

## Data Split

In [9]:
# Seeded random split; no `stratify` argument is passed
train_df, test_df = train_test_split(df, train_size=TRAIN_RATIO, random_state=RANDOM_SEED)

# Both datasets read the same text/label columns and share the embedding settings
TEXT_COLUMN, LABEL_COLUMN = 'lemma_no_stop_words', 'off_relaxed'
dataset_options = dict(embeddings=embeddings, emb_dim=emb_dim, seq_len=MAX_LEN)

# Only the training batches are shuffled
train_dataset = EmbeddingDataset(train_df, TEXT_COLUMN, LABEL_COLUMN, **dataset_options)
train_loader = DataLoader(train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True, num_workers=0)

test_dataset = EmbeddingDataset(test_df, TEXT_COLUMN, LABEL_COLUMN, **dataset_options)
test_loader = DataLoader(test_dataset, batch_size=TEST_BATCH_SIZE, shuffle=False, num_workers=0)

## Training

In [10]:
def validate_result():
    """Evaluate the current model on the test loader and return its metrics.

    Runs `validate` over `test_loader`, binarises both the model outputs and
    the targets with a fixed 0.5 cut-off, and passes the resulting boolean
    arrays to `model_metrics`.

    Returns:
        Whatever `model_metrics` returns for (targets, predictions).
    """
    threshold = 0.5

    # NOTE(review): the loss is BCEWithLogitsLoss, so a 0.5 cut-off presumes
    # that `validate` already returns probabilities rather than raw logits
    # - confirm against its implementation.
    outputs, targets = (
        np.array(values) for values in validate(model, test_loader, PYTORCH_DEVICE)
    )

    return model_metrics(targets > threshold, outputs > threshold)

### Training Loop

In [11]:
# Every run writes to its own timestamped model and checkpoint directories
TIMESTAMP = datetime.now().strftime('%Y%m%d%H%M%S')
RUN_NAME = f'embeddings-{EMBEDDING_NAME}-rnn'

MODEL_PATH = os.path.join(ROOT_PATH, 'models', RUN_NAME, TIMESTAMP)
BEST_MODEL_PATH = os.path.join(MODEL_PATH, 'best_model.pt')
CHECKPOINT_PATH = os.path.join(ROOT_PATH, 'checkpoints', RUN_NAME, TIMESTAMP)

for run_dir in (CHECKPOINT_PATH, MODEL_PATH):
    os.makedirs(run_dir, exist_ok=True)

# Per-epoch history, appended to by the epoch callback
loss_history, metric_history = [], []

# (display name, key in the metrics mapping) of the metric used to pick the best model
target_metric = ('Weighted F2', 'weighted_f2')

# Best value of the target metric seen so far, and the epoch index it came from
best_metric, best_epoch = float('-inf'), 0

# Per-epoch hook: evaluate, record history, and keep the best model on disk
def epoch_callback(epoch, avg_loss):
    """Evaluate the model after an epoch and save it when it is the best so far.

    Appends the epoch's loss and metrics to `loss_history` / `metric_history`
    and, when the target metric strictly improves on `best_metric`, updates
    `best_metric` / `best_epoch` and writes the model to BEST_MODEL_PATH.

    Args:
        epoch: Epoch index as passed by the training loop (reported as epoch + 1).
        avg_loss: Loss value reported for the epoch.
    """
    # Only the names that are rebound need a `global` declaration; the
    # history lists are mutated in place.
    global best_metric, best_epoch

    metric_label, metric_key = target_metric
    metrics = validate_result()
    score = metrics[metric_key]

    print(f'Epoch {epoch+1}: Loss: {avg_loss:.4f}')
    print(f'{metric_label}: {score:.4f}')
    loss_history.append(avg_loss)
    metric_history.append(metrics)

    if score > best_metric:
        print('New best model found!')
        best_metric, best_epoch = score, epoch
        # Serialises the whole module object, not just its state_dict
        torch.save(model, BEST_MODEL_PATH)



# Run the training loop for EPOCHS epochs; `epoch_callback` is handed over so
# the model can be evaluated and the best one saved as training progresses.
train_epochs(
    trainer,
    EPOCHS,
    model,
    train_loader,
    loss_fn,
    optimizer,
    PYTORCH_DEVICE,
    checkpoint_path=CHECKPOINT_PATH,
    epoch_callback=epoch_callback,
)


Running training epoch 1/20


  0%|          | 0/2796 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

Epoch 1: Loss: 0.8663
Weighted F2: 0.5666
New best model found!
Finished training epoch 1/20; Average Loss: 0.8663
Running training epoch 2/20


  0%|          | 0/2796 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

Epoch 2: Loss: 0.8614
Weighted F2: 0.6205
New best model found!
Finished training epoch 2/20; Average Loss: 0.8614
Running training epoch 3/20


  0%|          | 0/2796 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

Epoch 3: Loss: 0.8468
Weighted F2: 0.6001
Finished training epoch 3/20; Average Loss: 0.8468
Running training epoch 4/20


  0%|          | 0/2796 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

Epoch 4: Loss: 0.8404
Weighted F2: 0.6719
New best model found!
Finished training epoch 4/20; Average Loss: 0.8404
Running training epoch 5/20


  0%|          | 0/2796 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

Epoch 5: Loss: 0.8430
Weighted F2: 0.6733
New best model found!
Finished training epoch 5/20; Average Loss: 0.8430
Running training epoch 6/20


  0%|          | 0/2796 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

Epoch 6: Loss: 0.8358
Weighted F2: 0.6822
New best model found!
Finished training epoch 6/20; Average Loss: 0.8358
Running training epoch 7/20


  0%|          | 0/2796 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

Epoch 7: Loss: 0.8629
Weighted F2: 0.5661
Finished training epoch 7/20; Average Loss: 0.8629
Running training epoch 8/20


  0%|          | 0/2796 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

Epoch 8: Loss: 0.8661
Weighted F2: 0.2757
Finished training epoch 8/20; Average Loss: 0.8661
Running training epoch 9/20


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  0%|          | 0/2796 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

Epoch 9: Loss: 0.8659
Weighted F2: 0.5669
Finished training epoch 9/20; Average Loss: 0.8659
Running training epoch 10/20


  0%|          | 0/2796 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

Epoch 10: Loss: 0.8655
Weighted F2: 0.2757
Finished training epoch 10/20; Average Loss: 0.8655
Running training epoch 11/20


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


  0%|          | 0/2796 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

Epoch 11: Loss: 0.8667
Weighted F2: 0.4695
Finished training epoch 11/20; Average Loss: 0.8667
Running training epoch 12/20


  0%|          | 0/2796 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

Epoch 12: Loss: 0.8657
Weighted F2: 0.5666
Finished training epoch 12/20; Average Loss: 0.8657
Running training epoch 13/20


  0%|          | 0/2796 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

Epoch 13: Loss: 0.8569
Weighted F2: 0.6733
Finished training epoch 13/20; Average Loss: 0.8569
Running training epoch 14/20


  0%|          | 0/2796 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

Epoch 14: Loss: 0.8494
Weighted F2: 0.6783
Finished training epoch 14/20; Average Loss: 0.8494
Running training epoch 15/20


  0%|          | 0/2796 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

Epoch 15: Loss: 0.8569
Weighted F2: 0.6212
Finished training epoch 15/20; Average Loss: 0.8569
Running training epoch 16/20


  0%|          | 0/2796 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

Epoch 16: Loss: 0.8626
Weighted F2: 0.5661
Finished training epoch 16/20; Average Loss: 0.8626
Running training epoch 17/20


  0%|          | 0/2796 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

Epoch 17: Loss: 0.8662
Weighted F2: 0.5661
Finished training epoch 17/20; Average Loss: 0.8662
Running training epoch 18/20


  0%|          | 0/2796 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

Epoch 18: Loss: 0.8604
Weighted F2: 0.5777
Finished training epoch 18/20; Average Loss: 0.8604
Running training epoch 19/20


  0%|          | 0/2796 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

Epoch 19: Loss: 0.8494
Weighted F2: 0.6183
Finished training epoch 19/20; Average Loss: 0.8494
Running training epoch 20/20


  0%|          | 0/2796 [00:00<?, ?it/s]

  0%|          | 0/699 [00:00<?, ?it/s]

Epoch 20: Loss: 0.8551
Weighted F2: 0.5661
Finished training epoch 20/20; Average Loss: 0.8551


In [12]:
# Build result metrics data frame per epoch
result_df = pl.DataFrame({
    'epoch': range(1, len(loss_history) + 1),
    'loss': loss_history,
}).with_columns(pl.from_dicts(metric_history))


In [13]:
# Plot Loss and Target Metric per Epoch, highlighting the peak
fig_a = px.line(result_df, x='epoch', y='loss', title='Loss per Epoch', template='plotly_dark')
fig_a.add_scatter(x=[best_epoch+1], y=[loss_history[best_epoch]], mode='markers', showlegend=False, marker={'color': 'red', 'size': 10}, name='Best Model')
fig_a.update_layout()
fig_a.show()

fig_b = px.line(result_df, x='epoch', y='weighted_f2', title='Weighted F2 per Epoch', template='plotly_dark')
fig_b.add_scatter(x=[best_epoch+1], y=[metric_history[best_epoch]['weighted_f2']], mode='markers', showlegend=False, marker={'color': 'red', 'size': 10}, name='Best Model')
fig_b.update_layout()
fig_b.show()
