In [1]:
import os
import sys
import json
import random
from datetime import datetime

import numpy as np
import polars as pl
import plotly.express as px

from sklearn.model_selection import train_test_split

import torch
from torch import nn, optim, cuda
from torch.utils.data import DataLoader

In [2]:
# Project root used to locate the data/, models/, checkpoints/ and src/ folders
# (relative path for local runs; replaced below when running on Colab)
ROOT_PATH = '../'
# Folder inside Google Drive's "My Drive" that acts as the project root on Colab
DRIVE_PATH = 'Colab/ToxicityClassification'

# When on Colab, use Google Drive as the root path to persist and load data
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    ROOT_PATH = os.path.join('/content/drive/My Drive/', DRIVE_PATH)
    # Create the Drive folder on first use and make it the working directory
    os.makedirs(ROOT_PATH, exist_ok=True)
    os.chdir(ROOT_PATH)

In [3]:

# Add `<ROOT_PATH>/src` (as an absolute path) to the module search path so the
# `toxicity` imports below resolve; presumably the package lives under src/.
# NOTE: re-running this cell appends the same path to sys.path again.
sys.path.append(os.path.abspath(os.path.join(ROOT_PATH, 'src')))

from toxicity.training import train_epochs, model_metrics
from toxicity.bow.training import trainer, validate
from toxicity.bow.model import BoWModel, BoWDataset

## Setup

In [4]:
# Target device for running the model (GPU when CUDA is available, CPU otherwise)
PYTORCH_DEVICE = 'cuda' if cuda.is_available() else 'cpu'

# Random Seed (shared by reseed(), the sample preview and the train/test split)
RANDOM_SEED = 777

# Training & Validation configs
TRAIN_RATIO = 0.8  # fraction of rows passed as train_size to train_test_split
TRAIN_BATCH_SIZE = 128
TEST_BATCH_SIZE = 128
EPOCHS = 20
LEARNING_RATE = 2e-05  # learning rate handed to the AdamW optimizer
# Passed as pos_weight to BCEWithLogitsLoss.
# NOTE(review): presumably derived from the negative/positive class ratio - confirm.
POS_WEIGHT = 1.663


# Vocabulary configs
# A token enters the vocabulary only when it occurs strictly more than this many times
OCC_TRESHOLD = 10
# Extra entry appended at the end of the vocabulary.
# NOTE(review): presumably stands in for out-of-vocabulary tokens inside BoWDataset - confirm.
OTHER_TOKEN = '[OTHER]'
# NOTE(review): not referenced in the notebook cells shown here - confirm it is still needed.
MAX_LEN = 256

print(f'Using device: {PYTORCH_DEVICE}')

Using device: cuda


In [5]:
def reseed(seed: int = RANDOM_SEED):
    """Seed every random number generator this notebook relies on.

    The same value is handed to Python's `random`, NumPy's global generator
    and PyTorch (`manual_seed` and `cuda.manual_seed_all`). cuDNN is also
    switched to deterministic mode with benchmarking turned off.

    Args:
        seed: Value used for all generators. Defaults to RANDOM_SEED.
    """
    # The cuDNN flags are independent from the seeding calls below
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

reseed()

## Data Loading

In [6]:
# Load the pre-processed corpus from the project's data folder
df = pl.read_parquet(os.path.join(ROOT_PATH, 'data', 'joint', 'pre_processed_data.parquet.zstd'))

# Convert both label columns to Int32 and wrap each value in a width-1 array column
df = df.with_columns([
    df[label_column].cast(pl.Int32).cast(pl.List(pl.Int32)).cast(pl.Array(pl.Int32, 1))
    for label_column in ('off_relaxed', 'off_strict')
])

# Preview a reproducible random sample of rows
df.sample(5, seed=RANDOM_SEED)

dataset,id,text,off_strict,off_relaxed,base_clean,base_clean_lower,tokenized,lemmatized,no_accents,lemma_no_accents,no_stop_words,lemma_no_stop_words,no_stop_words_no_accents,lemma_no_stop_words_no_accents
str,str,str,"array[i32, 1]","array[i32, 1]",str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str]
"""ToLD-Br""","""17643984771725418028""","""caralho q vergonha kkkkk""",[1],[0],"""caralho q vergonha kkkkk""","""caralho q vergonha kkkkk""","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]"
"""ToLD-Br""","""3886050625220892585""","""foda-se, vou encher o cu de po…",[1],[0],"""foda se vou encher o cu de por…","""foda se vou encher o cu de por…","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""vou"", … ""lol""]","[""foda"", ""ir"", … ""lol""]","[""foda"", ""vou"", … ""lol""]","[""foda"", ""ir"", … ""lol""]"
"""ToLD-Br""","""14936095030342170465""","""USER USER USER Vc só pensa no …",[1],[1],"""USER USER USER Vc só pensa no …","""user user user vc só pensa no …","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]"
"""ToLD-Br""","""18279259074216789411""","""família""",[0],[0],"""família""","""família""","[""família""]","[""família""]","[""familia""]","[""familia""]","[""família""]","[""família""]","[""familia""]","[""familia""]"
"""OLID-Br""","""7f36b160e8624968a32e82b1c6750f…","""RT USER: vey a juliette veio c…",[0],[0],"""RT USER vey a juliette veio co…","""rt user vey a juliette veio co…","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]"


## Setup Vocabulary for BoW

In [7]:
# Timestamped output directory so every run keeps its own vocabulary and model files
TIMESTAMP = datetime.now().strftime('%Y%m%d%H%M%S')
MODEL_PATH = os.path.join(ROOT_PATH, 'models', 'bow', TIMESTAMP)
os.makedirs(MODEL_PATH, exist_ok=True)

# Count frequency of tokens over the whole data frame
occurences = {}
for tokens in df['lemma_no_stop_words_no_accents']:
    for t in tokens:
        occurences[t] = occurences.get(t, 0) + 1

# Sort by frequency (most frequent first)
occurences = dict(sorted(occurences.items(), key=lambda item: item[1], reverse=True))

# Build vocabulary: keep tokens seen strictly more than OCC_TRESHOLD times,
# sort them alphabetically and add a token for other words at the end
vocabulary = sorted(w for w, counter in occurences.items() if counter > OCC_TRESHOLD)
vocabulary.append(OTHER_TOKEN)

# Vocabulary lookup table: token -> position in `vocabulary`
vocab_index = {token: i for i, token in enumerate(vocabulary)}

# Write vocabulary to file.
# The encoding is pinned to UTF-8: without it, open() falls back to the
# platform's locale encoding, which can fail or produce unreadable files for
# tokens that are not representable in that encoding.
with open(os.path.join(MODEL_PATH, 'vocab.txt'), 'w', encoding='utf-8') as f:
    f.write('\n'.join(vocabulary))
with open(os.path.join(MODEL_PATH, 'vocab_index.json'), 'w', encoding='utf-8') as f:
    json.dump(vocab_index, f)
with open(os.path.join(MODEL_PATH, 'vocab.json'), 'w', encoding='utf-8') as f:
    json.dump(vocabulary, f)

print(f'Vocabulary size: {len(vocabulary)}')

Vocabulary size: 2832


## Init Model

### Loss and Optimizer

We use a Binary Cross Entropy loss, as it shows good results for binary classification tasks. We also apply different weights to the positive and negative classes (through the loss's `pos_weight` argument) to account for the class imbalance.

The AdamW optimizer is also used, as it is a good general-purpose optimizer for training neural networks.

In [8]:
# Build the model from the vocabulary size (which includes the OTHER_TOKEN entry)
# and move it to the target device
model = BoWModel(len(vocabulary))
model.to(PYTORCH_DEVICE)

# Binary cross entropy on logits; pos_weight applies POS_WEIGHT to the positive class
loss_fn = nn.BCEWithLogitsLoss(pos_weight=torch.tensor([POS_WEIGHT], device=PYTORCH_DEVICE))
# AdamW over all model parameters; only the learning rate is set explicitly
optimizer = optim.AdamW(params=model.parameters(), lr=LEARNING_RATE)

## Data Split

In [9]:
def _make_loader(frame, shuffle, batch_size):
    """Wrap one split of the data in a BoWDataset and return a DataLoader over it."""
    dataset = BoWDataset(frame, 'lemma_no_stop_words_no_accents', 'off_relaxed', vocab_index=vocab_index)
    return DataLoader(dataset, shuffle=shuffle, num_workers=0, batch_size=batch_size)


# Seeded random split into train and test partitions
train_df, test_df = train_test_split(df, train_size=TRAIN_RATIO, random_state=RANDOM_SEED)

# Only the training loader shuffles its samples
train_loader = _make_loader(train_df, shuffle=True, batch_size=TRAIN_BATCH_SIZE)
test_loader = _make_loader(test_df, shuffle=False, batch_size=TEST_BATCH_SIZE)

## Training

In [10]:
def validate_result(threshold: float = 0.5):
    """Evaluate the current model on the test loader and compute its metrics.

    Args:
        threshold: Cut-off applied to the values returned by `validate`;
            values strictly greater than it count as positive predictions.
            Defaults to 0.5, the value that used to be hard-coded.

    Returns:
        The result of `model_metrics` for the binarized targets and predictions.
    """
    # Validate the results
    raw_results, raw_targets = validate(model, test_loader, PYTORCH_DEVICE)
    raw_results = np.array(raw_results)
    raw_targets = np.array(raw_targets)

    # NOTE(review): a 0.5 cut-off assumes `validate` returns probabilities
    # rather than raw logits - confirm against toxicity.bow.training.
    fixed_results = raw_results > threshold
    # Targets are always binarized at 0.5, independently of the prediction
    # threshold, so changing `threshold` never alters the ground truth.
    fixed_targets = raw_targets > 0.5

    # Compute metrics
    return model_metrics(fixed_targets, fixed_results)

### Training Loop

In [11]:
# Per-run output locations, keyed by the TIMESTAMP taken when the vocabulary was built
CHECKPOINT_PATH = os.path.join(ROOT_PATH, 'checkpoints', 'bow', TIMESTAMP)
# Same value as the MODEL_PATH assigned in the vocabulary cell, repeated here
# so this cell defines every path it uses
MODEL_PATH = os.path.join(ROOT_PATH, 'models', 'bow', TIMESTAMP)
BEST_MODEL_PATH = os.path.join(MODEL_PATH, 'best_model.pt')
os.makedirs(CHECKPOINT_PATH, exist_ok=True)
os.makedirs(MODEL_PATH, exist_ok=True)

# Per-epoch history, appended to by epoch_callback
loss_history = []
metric_history = []
# (display label, key into the metrics dict) of the metric used to pick the best model
target_metric = ('Weighted F2', 'weighted_f2')
# Best target-metric value seen so far and the zero-based epoch index where it occurred
best_metric = float('-inf')
best_epoch = 0

# Per-epoch hook: evaluate, record history, and save the model when the target metric improves
def epoch_callback(epoch, avg_loss):
    """Evaluate the model after an epoch and keep track of the best one.

    Args:
        epoch: Zero-based epoch index (printed as `epoch + 1`).
        avg_loss: Average training loss of the epoch, as reported by the caller.
    """
    # Only these two names are rebound here; the history lists are mutated in place
    global best_metric, best_epoch

    label, key = target_metric
    metrics = validate_result()
    score = metrics[key]

    print(f'Epoch {epoch+1}: Loss: {avg_loss:.4f}')
    print(f'{label}: {score:.4f}')
    loss_history.append(avg_loss)
    metric_history.append(metrics)

    # Nothing more to do unless this epoch strictly beats the best score so far
    if not (score > best_metric):
        return

    print('New best model found!')
    best_metric = score
    best_epoch = epoch
    # Persists the whole model object (not only its state_dict)
    torch.save(model, BEST_MODEL_PATH)
    
    

# Run the training loop for EPOCHS epochs. Judging by the logged output,
# epoch_callback runs once per epoch, before the "Finished training epoch" line.
# NOTE(review): how checkpoint_path is used is defined in toxicity.training - confirm.
train_epochs(
    trainer, EPOCHS, model, train_loader, loss_fn, optimizer, PYTORCH_DEVICE,
    checkpoint_path=CHECKPOINT_PATH, epoch_callback=epoch_callback)

Running training epoch 1/20


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epoch 1: Loss: 0.8642
Weighted F2: 0.2964
New best model found!
Finished training epoch 1/20; Average Loss: 0.8642
Running training epoch 2/20


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epoch 2: Loss: 0.8449
Weighted F2: 0.6480
New best model found!
Finished training epoch 2/20; Average Loss: 0.8449
Running training epoch 3/20


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epoch 3: Loss: 0.7695
Weighted F2: 0.7266
New best model found!
Finished training epoch 3/20; Average Loss: 0.7695
Running training epoch 4/20


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epoch 4: Loss: 0.6674
Weighted F2: 0.7493
New best model found!
Finished training epoch 4/20; Average Loss: 0.6674
Running training epoch 5/20


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epoch 5: Loss: 0.5972
Weighted F2: 0.7670
New best model found!
Finished training epoch 5/20; Average Loss: 0.5972
Running training epoch 6/20


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epoch 6: Loss: 0.5540
Weighted F2: 0.7718
New best model found!
Finished training epoch 6/20; Average Loss: 0.5540
Running training epoch 7/20


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epoch 7: Loss: 0.5215
Weighted F2: 0.7705
Finished training epoch 7/20; Average Loss: 0.5215
Running training epoch 8/20


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epoch 8: Loss: 0.4950
Weighted F2: 0.7716
Finished training epoch 8/20; Average Loss: 0.4950
Running training epoch 9/20


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epoch 9: Loss: 0.4713
Weighted F2: 0.7761
New best model found!
Finished training epoch 9/20; Average Loss: 0.4713
Running training epoch 10/20


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epoch 10: Loss: 0.4485
Weighted F2: 0.7718
Finished training epoch 10/20; Average Loss: 0.4485
Running training epoch 11/20


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epoch 11: Loss: 0.4275
Weighted F2: 0.7629
Finished training epoch 11/20; Average Loss: 0.4275
Running training epoch 12/20


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epoch 12: Loss: 0.4064
Weighted F2: 0.7685
Finished training epoch 12/20; Average Loss: 0.4064
Running training epoch 13/20


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epoch 13: Loss: 0.3857
Weighted F2: 0.7714
Finished training epoch 13/20; Average Loss: 0.3857
Running training epoch 14/20


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epoch 14: Loss: 0.3641
Weighted F2: 0.7655
Finished training epoch 14/20; Average Loss: 0.3641
Running training epoch 15/20


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epoch 15: Loss: 0.3441
Weighted F2: 0.7668
Finished training epoch 15/20; Average Loss: 0.3441
Running training epoch 16/20


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epoch 16: Loss: 0.3235
Weighted F2: 0.7630
Finished training epoch 16/20; Average Loss: 0.3235
Running training epoch 17/20


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epoch 17: Loss: 0.3037
Weighted F2: 0.7630
Finished training epoch 17/20; Average Loss: 0.3037
Running training epoch 18/20


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epoch 18: Loss: 0.2841
Weighted F2: 0.7672
Finished training epoch 18/20; Average Loss: 0.2841
Running training epoch 19/20


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epoch 19: Loss: 0.2648
Weighted F2: 0.7633
Finished training epoch 19/20; Average Loss: 0.2648
Running training epoch 20/20


  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/44 [00:00<?, ?it/s]

Epoch 20: Loss: 0.2475
Weighted F2: 0.7624
Finished training epoch 20/20; Average Loss: 0.2475


In [12]:
# One row per epoch: the 1-based epoch number and training loss,
# followed by one column per validation metric
epoch_numbers = range(1, len(loss_history) + 1)
metrics_df = pl.from_dicts(metric_history)
result_df = pl.DataFrame({'epoch': epoch_numbers, 'loss': loss_history}).with_columns(metrics_df)
result_df.head()

epoch,loss,weighted_f1,macro_f1,weighted_f2,macro_f2,accuracy,recall,precision
i64,f64,f64,f64,f64,f64,f64,f64,f64
1,0.864209,0.232955,0.297193,0.296354,0.389386,0.383652,0.383652,0.685429
2,0.84487,0.655248,0.645448,0.647954,0.654204,0.649437,0.649437,0.692047
3,0.76947,0.728662,0.712615,0.726569,0.71589,0.725809,0.725809,0.734616
4,0.667402,0.752525,0.740007,0.749277,0.745886,0.748882,0.748882,0.764721
5,0.597228,0.76834,0.753849,0.766954,0.75674,0.76641,0.76641,0.772127


In [14]:
def _plot_history(frame, column, title, best_x, best_y):
    """Return a line plot of `column` per epoch with a red marker at the best epoch.

    Args:
        frame: Data frame with an 'epoch' column and the column to plot.
        column: Name of the column drawn on the y axis.
        title: Figure title.
        best_x: Epoch number (as shown on the x axis) of the best model.
        best_y: Value of `column` at the best epoch.
    """
    fig = px.line(frame, x='epoch', y=column, title=title)
    fig.add_scatter(
        x=[best_x], y=[best_y], mode='markers', showlegend=False,
        marker={'color': 'red', 'size': 10}, name='Best Model',
    )
    return fig


# Plot Loss and Target Metric per Epoch, highlighting the peak.
# The metric plot follows `target_metric`, the same metric that selected
# best_epoch, so the highlighted point always lies on the plotted curve.
metric_label, metric_key = target_metric
best_epoch_number = best_epoch + 1  # best_epoch is zero-based, the x axis is 1-based

fig_a = _plot_history(result_df, 'loss', 'Loss per Epoch', best_epoch_number, loss_history[best_epoch])
fig_a.show()

fig_b = _plot_history(
    result_df, metric_key, f'{metric_label} per Epoch',
    best_epoch_number, metric_history[best_epoch][metric_key],
)
fig_b.show()