In [1]:
import os
import sys
import random
from datetime import datetime

import numpy as np
import polars as pl
import plotly.express as px

from sklearn.model_selection import train_test_split

import torch
from torch import nn, optim, cuda
from torch.utils.data import DataLoader

In [2]:
# Base directory from which later cells build data, embedding and model paths.
# Defaults to the parent of the current working directory.
ROOT_PATH = '../'
# Folder (relative to "My Drive") that replaces ROOT_PATH when running on Colab.
DRIVE_PATH = 'Colab/ToxicityClassification'

# When on Colab, use Google Drive as the root path to persist and load data
if 'google.colab' in sys.modules:
    from google.colab import drive
    drive.mount('/content/drive')
    ROOT_PATH = os.path.join('/content/drive/My Drive/', DRIVE_PATH)
    # Create the project folder if it is missing, then make it the working directory.
    os.makedirs(ROOT_PATH, exist_ok=True)
    os.chdir(ROOT_PATH)

In [3]:
# Append <ROOT_PATH>/src to the module search path so that the `toxicity`
# imports below can be resolved (presumably the package lives in that folder).
# This must run before the imports that follow.
sys.path.append(os.path.abspath(os.path.join(ROOT_PATH, 'src')))

from toxicity.training import train_epochs, model_metrics
from toxicity.embeddings.training import trainer, validate
from toxicity.embeddings.model import EmbeddingModel, EmbeddingDataset

## Setup

In [4]:
# Target device for running the model
PYTORCH_DEVICE = 'cuda' if cuda.is_available() else 'cpu'

# Random Seed (used by reseed(), the train/validation/test split and df.sample)
RANDOM_SEED = 777

# Training & Validation configs
# TRAIN_RATIO: fraction of rows used for training; the remainder is split
# evenly into validation and test sets.
TRAIN_RATIO = 0.8
TRAIN_BATCH_SIZE = 16
# TEST_BATCH_SIZE is used for both the validation and the test loader.
TEST_BATCH_SIZE = 16
EPOCHS = 20
LEARNING_RATE = 1.8e-05
# Passed as `pos_weight` to BCEWithLogitsLoss when the loss is created.
# NOTE: this name is reassigned later from the training split's class counts,
# but only after the loss has already been built with the value below.
POS_WEIGHT = 1.663


# Text file with the pre-trained word vectors; it is only parsed when no
# parquet cache of the embeddings exists yet.
EMBEDDING_FILE = os.path.join(ROOT_PATH, 'cbow_s100.txt')
# Used to name the per-embedding model/checkpoint folders.
EMBEDDING_NAME = 'cbow_s100'
# Passed to EmbeddingModel and, as `seq_len`, to EmbeddingDataset.
MAX_LEN = 128

print(f'Using device: {PYTORCH_DEVICE}')

Using device: cuda


In [5]:

def reseed(seed: int = RANDOM_SEED):
    """Seed every random number generator this notebook relies on.

    Seeds Python's `random`, NumPy's legacy global generator and PyTorch
    (CPU and all CUDA devices), then switches cuDNN to deterministic mode
    with autotuning disabled so repeated runs pick the same kernels.

    Args:
        seed: Seed value applied to every generator.
    """
    # Same order as always: stdlib, NumPy, torch CPU, torch CUDA.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

reseed()

## Data Loading

In [6]:
# Load the pre-processed dataset and turn both label columns into
# fixed-size (width 1) Int32 arrays: value -> Int32 -> [Int32] -> Array[Int32, 1].
dataset_file = os.path.join(ROOT_PATH, 'data', 'joint', 'pre_processed_data.parquet.zstd')
df = pl.read_parquet(dataset_file)

label_as_array = [
    pl.col(label_column)
    .cast(pl.Int32)
    .cast(pl.List(pl.Int32))
    .cast(pl.Array(pl.Int32, 1))
    for label_column in ('off_relaxed', 'off_strict')
]
df = df.with_columns(label_as_array)

# Seeded sample so the preview is stable across runs.
df.sample(5, seed=RANDOM_SEED)

dataset,id,text,off_strict,off_relaxed,base_clean,base_clean_lower,tokenized,lemmatized,no_accents,lemma_no_accents,no_stop_words,lemma_no_stop_words,no_stop_words_no_accents,lemma_no_stop_words_no_accents
str,str,str,"array[i32, 1]","array[i32, 1]",str,str,list[str],list[str],list[str],list[str],list[str],list[str],list[str],list[str]
"""ToLD-Br""","""17643984771725418028""","""caralho q vergonha kkkkk""",[1],[0],"""caralho q vergonha kkkkk""","""caralho q vergonha kkkkk""","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]","[""caralho"", ""q"", … ""kkkkk""]"
"""ToLD-Br""","""3886050625220892585""","""foda-se, vou encher o cu de po…",[1],[0],"""foda se vou encher o cu de por…","""foda se vou encher o cu de por…","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""se"", … ""lol""]","[""foda"", ""vou"", … ""lol""]","[""foda"", ""ir"", … ""lol""]","[""foda"", ""vou"", … ""lol""]","[""foda"", ""ir"", … ""lol""]"
"""ToLD-Br""","""14936095030342170465""","""USER USER USER Vc só pensa no …",[1],[1],"""USER USER USER Vc só pensa no …","""user user user vc só pensa no …","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]","[""user"", ""user"", … ""esperta""]","[""user"", ""user"", … ""esperto""]"
"""ToLD-Br""","""18279259074216789411""","""família""",[0],[0],"""família""","""família""","[""família""]","[""família""]","[""familia""]","[""familia""]","[""família""]","[""família""]","[""familia""]","[""familia""]"
"""OLID-Br""","""7f36b160e8624968a32e82b1c6750f…","""RT USER: vey a juliette veio c…",[0],[0],"""RT USER vey a juliette veio co…","""rt user vey a juliette veio co…","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]","[""rt"", ""user"", … ""t""]"


### Load Embeddings

In [7]:
# Folder holding the artefacts for this embedding, including a parquet cache
# of the parsed vectors so the large text file is only parsed once.
EMBEDDING_PATH = os.path.join(ROOT_PATH, 'models', f'embeddings-{EMBEDDING_NAME}')
os.makedirs(EMBEDDING_PATH, exist_ok=True)
EMBEDDING_CACHE_FILE = f'{EMBEDDING_PATH}/embeddings.parquet.zstd'

emb_dim = None       # vector length
token_count = None   # vocabulary size announced by the file header (or cache size)
embeddings = {}      # token -> list[float]

if not os.path.exists(EMBEDDING_CACHE_FILE):
    # Explicit UTF-8: the vocabulary contains accented tokens and the platform
    # default encoding is not guaranteed to be UTF-8.
    with open(EMBEDDING_FILE, 'r', encoding='utf-8') as f:
        # First line: "<token_count> <emb_dim>"
        token_count, emb_dim = map(int, f.readline().split())

        for line in f:
            # Drop the line terminator and trailing spaces so they cannot
            # produce an empty field that breaks the float conversion.
            fields = line.rstrip(' \r\n').split(' ')
            if fields == ['']:
                # Blank line (e.g. at the end of the file): nothing to parse.
                continue

            token = fields[0]
            values = [float(v) for v in fields[1:]]

            # Every vector must match the dimension announced by the header.
            if len(values) != emb_dim:
                raise ValueError('Inconsistent embedding length')

            embeddings[token] = values

    embedding_df = pl.DataFrame({
        'token': list(embeddings.keys()),
        'embedding': list(embeddings.values())
    })
    embedding_df.write_parquet(EMBEDDING_CACHE_FILE, compression="zstd", compression_level=9)
else:
    embedding_df = pl.read_parquet(EMBEDDING_CACHE_FILE)
    token_list = embedding_df['token'].to_list()
    embedding_list = embedding_df['embedding'].to_list()
    embeddings = dict(zip(token_list, embedding_list))
    emb_dim = len(embeddings[next(iter(embeddings))])
    # The cache does not store the header, so the "expected" size reported
    # below is simply the number of cached tokens.
    token_count = len(embeddings)

print(f'Embedding Length: {emb_dim}')
print(f'Embedding Vocab Size: {len(embeddings)}; Expected: {token_count}')


Embedding Length: 100
Embedding Vocab Size: 929606; Expected: 929606


## Init Model

### Loss and Optimizer

We use a binary cross-entropy loss (`BCEWithLogitsLoss`), a standard choice for binary classification tasks. To account for the class imbalance, the positive class is given a larger weight through the loss's `pos_weight` argument.

The AdamW optimizer is used, as it is a good general-purpose optimizer for training neural networks.

In [8]:
# Build the classifier and move it to the target device in one step.
model = EmbeddingModel(emb_dim, MAX_LEN).to(PYTORCH_DEVICE)

# Binary cross-entropy on logits; positive samples are scaled by POS_WEIGHT.
positive_class_weight = torch.tensor([POS_WEIGHT], device=PYTORCH_DEVICE)
loss_fn = nn.BCEWithLogitsLoss(pos_weight=positive_class_weight)

# AdamW over all model parameters.
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

## Data Split

In [9]:
# TRAIN_RATIO of the rows go to training; the remainder is halved into
# validation and test. Both splits are seeded so they are reproducible.
train_df, other_df = train_test_split(df, train_size=TRAIN_RATIO, random_state=RANDOM_SEED)
validate_df, test_df = train_test_split(other_df, train_size=0.5, random_state=RANDOM_SEED)


def _make_loader(frame, shuffle: bool, batch_size: int) -> DataLoader:
    """Wrap one split in an EmbeddingDataset and DataLoader.

    All splits read text from 'lemma_no_stop_words' and labels from
    'off_relaxed', and share the same embeddings and sequence length.
    """
    dataset = EmbeddingDataset(
        frame, 'lemma_no_stop_words', 'off_relaxed',
        embeddings=embeddings, emb_dim=emb_dim, seq_len=MAX_LEN,
    )
    return DataLoader(dataset, shuffle=shuffle, num_workers=0, batch_size=batch_size)


# Only the training loader shuffles; evaluation order stays fixed.
train_loader = _make_loader(train_df, shuffle=True, batch_size=TRAIN_BATCH_SIZE)
test_loader = _make_loader(test_df, shuffle=False, batch_size=TEST_BATCH_SIZE)
validate_loader = _make_loader(validate_df, shuffle=False, batch_size=TEST_BATCH_SIZE)

# Count classes by label value rather than by position in a frequency-sorted
# table, so "negative" always means label 0 even if the positive class were
# to become the majority.
train_labels = np.asarray(train_df['off_relaxed'].to_list()).reshape(-1)
neg_count = int((train_labels == 0).sum())
pos_count = int((train_labels == 1).sum())
if pos_count == 0:
    raise ValueError('Training split contains no positive samples')
print(f'Training distribution: {neg_count} negative, {pos_count} positive')

# NOTE(review): `loss_fn` is created in an earlier cell from the value that
# POS_WEIGHT had at that point, so the ratio computed here does not reach the
# loss unless `loss_fn` is rebuilt afterwards.
POS_WEIGHT = neg_count / pos_count
print(f'Positive weight: {POS_WEIGHT}')

Training distribution: 13932 negative, 8429 positive
Positive weight: 1.6528651085538024


## Training

In [10]:
def validate_result(loader: DataLoader, model: nn.Module, threshold: float = 0.5):
    """Evaluate `model` on `loader` and return its classification metrics.

    The model outputs and targets returned by `validate` are binarised and
    handed to `model_metrics`.

    Args:
        loader: Data to evaluate on.
        model: Model to evaluate.
        threshold: Decision threshold; an output strictly greater than this
            value counts as a positive prediction. Defaults to 0.5.

    Returns:
        Whatever `model_metrics` returns for the binarised targets and
        predictions.
    """
    # NOTE(review): the loss used for training works on logits; if `validate`
    # returns raw logits rather than probabilities, 0.5 is not the probability
    # midpoint - confirm what `validate` outputs.
    raw_results, raw_targets = validate(model, loader, PYTORCH_DEVICE)
    scores = np.asarray(raw_results)
    labels = np.asarray(raw_targets)

    predicted = scores > threshold
    # Targets are always binarised at 0.5, independently of the decision
    # threshold applied to the model outputs.
    actual = labels > 0.5

    return model_metrics(actual, predicted)

### Training Loop

In [11]:
# Each run writes to its own timestamped model and checkpoint folders.
TIMESTAMP = datetime.now().strftime('%Y%m%d%H%M%S')
MODEL_PATH = os.path.join(ROOT_PATH, 'models', f'embeddings-{EMBEDDING_NAME}', TIMESTAMP)
CHECKPOINT_PATH = os.path.join(ROOT_PATH, 'checkpoints', f'embeddings-{EMBEDDING_NAME}', TIMESTAMP)
BEST_MODEL_PATH = os.path.join(MODEL_PATH, 'best_model.pt')
for run_dir in (CHECKPOINT_PATH, MODEL_PATH):
    os.makedirs(run_dir, exist_ok=True)

# Per-epoch tracking state, filled in by epoch_callback.
loss_history, metric_history, test_metric_history = [], [], []
# (display label, key in the metrics mapping) of the model-selection metric.
target_metric = ('Weighted F2', 'weighted_f2')
best_metric, best_epoch = float('-inf'), 0


def epoch_callback(epoch, avg_loss):
    """Record metrics after an epoch and keep the best model on disk.

    Evaluates the model on the validation and test loaders, appends the loss
    and both metric results to the history lists, prints a short summary and,
    when the validation value of the target metric is strictly better than
    the best seen so far, saves the model to BEST_MODEL_PATH.

    Args:
        epoch: Epoch index; printed as `epoch + 1` and stored unchanged in
            `best_epoch`.
        avg_loss: Average training loss reported for the epoch.
    """
    # Only these two names are rebound here; the history lists are mutated in place.
    global best_metric, best_epoch

    metric_label, metric_key = target_metric

    val_metrics = validate_result(validate_loader, model)
    loss_history.append(avg_loss)
    metric_history.append(val_metrics)
    # Test metrics are tracked for reporting only; selection uses validation.
    held_out_metrics = validate_result(test_loader, model)
    test_metric_history.append(held_out_metrics)

    val_score = val_metrics[metric_key]
    print(f'Epoch {epoch+1}: Loss: {avg_loss:.4f}')
    print(f'Validation {metric_label}: {val_score:.4f}')
    print(f'Test {metric_label}: {held_out_metrics[metric_key]:.4f}')

    if not (val_score > best_metric):
        return

    print('New best model found!')
    best_metric = val_score
    best_epoch = epoch
    # The whole model object is serialised here, not just its state dict.
    torch.save(model, BEST_MODEL_PATH)


train_epochs(
    trainer, EPOCHS, model, train_loader, loss_fn, optimizer, PYTORCH_DEVICE,
    checkpoint_path=CHECKPOINT_PATH, epoch_callback=epoch_callback)

Running training epoch 1/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 1: Loss: 0.8217
Validation Weighted F2: 0.6255
Test Weighted F2: 0.6190
New best model found!
Finished training epoch 1/20; Average Loss: 0.8217
Running training epoch 2/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 2: Loss: 0.7304
Validation Weighted F2: 0.6832
Test Weighted F2: 0.6753
New best model found!
Finished training epoch 2/20; Average Loss: 0.7304
Running training epoch 3/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 3: Loss: 0.6899
Validation Weighted F2: 0.6846
Test Weighted F2: 0.6699
New best model found!
Finished training epoch 3/20; Average Loss: 0.6899
Running training epoch 4/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 4: Loss: 0.6541
Validation Weighted F2: 0.6995
Test Weighted F2: 0.6863
New best model found!
Finished training epoch 4/20; Average Loss: 0.6541
Running training epoch 5/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 5: Loss: 0.6136
Validation Weighted F2: 0.6933
Test Weighted F2: 0.6792
Finished training epoch 5/20; Average Loss: 0.6136
Running training epoch 6/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 6: Loss: 0.5669
Validation Weighted F2: 0.6923
Test Weighted F2: 0.6831
Finished training epoch 6/20; Average Loss: 0.5669
Running training epoch 7/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 7: Loss: 0.5122
Validation Weighted F2: 0.6846
Test Weighted F2: 0.6877
Finished training epoch 7/20; Average Loss: 0.5122
Running training epoch 8/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 8: Loss: 0.4535
Validation Weighted F2: 0.6952
Test Weighted F2: 0.7009
Finished training epoch 8/20; Average Loss: 0.4535
Running training epoch 9/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 9: Loss: 0.3976
Validation Weighted F2: 0.6911
Test Weighted F2: 0.6890
Finished training epoch 9/20; Average Loss: 0.3976
Running training epoch 10/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 10: Loss: 0.3458
Validation Weighted F2: 0.6864
Test Weighted F2: 0.6921
Finished training epoch 10/20; Average Loss: 0.3458
Running training epoch 11/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 11: Loss: 0.2979
Validation Weighted F2: 0.6878
Test Weighted F2: 0.7034
Finished training epoch 11/20; Average Loss: 0.2979
Running training epoch 12/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 12: Loss: 0.2617
Validation Weighted F2: 0.6893
Test Weighted F2: 0.6935
Finished training epoch 12/20; Average Loss: 0.2617
Running training epoch 13/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 13: Loss: 0.2258
Validation Weighted F2: 0.6814
Test Weighted F2: 0.6874
Finished training epoch 13/20; Average Loss: 0.2258
Running training epoch 14/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 14: Loss: 0.1985
Validation Weighted F2: 0.6799
Test Weighted F2: 0.6782
Finished training epoch 14/20; Average Loss: 0.1985
Running training epoch 15/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 15: Loss: 0.1768
Validation Weighted F2: 0.6777
Test Weighted F2: 0.6731
Finished training epoch 15/20; Average Loss: 0.1768
Running training epoch 16/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 16: Loss: 0.1624
Validation Weighted F2: 0.6715
Test Weighted F2: 0.6725
Finished training epoch 16/20; Average Loss: 0.1624
Running training epoch 17/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 17: Loss: 0.1426
Validation Weighted F2: 0.6741
Test Weighted F2: 0.6737
Finished training epoch 17/20; Average Loss: 0.1426
Running training epoch 18/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 18: Loss: 0.1319
Validation Weighted F2: 0.6742
Test Weighted F2: 0.6636
Finished training epoch 18/20; Average Loss: 0.1319
Running training epoch 19/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 19: Loss: 0.1180
Validation Weighted F2: 0.6815
Test Weighted F2: 0.6797
Finished training epoch 19/20; Average Loss: 0.1180
Running training epoch 20/20


  0%|          | 0/1398 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

  0%|          | 0/175 [00:00<?, ?it/s]

Epoch 20: Loss: 0.1131
Validation Weighted F2: 0.6793
Test Weighted F2: 0.6800
Finished training epoch 20/20; Average Loss: 0.1131


In [12]:
# One row per epoch (numbered from 1): the validation frame carries the loss
# plus every validation metric, the test frame carries every test metric.
epoch_numbers = range(1, len(loss_history) + 1)

validation_metrics_df = pl.from_dicts(metric_history)
result_df = pl.DataFrame({
    'epoch': epoch_numbers,
    'loss': loss_history,
}).with_columns(validation_metrics_df)

test_metrics_df = pl.from_dicts(test_metric_history)
test_result_df = pl.DataFrame({
    'epoch': epoch_numbers,
}).with_columns(test_metrics_df)

result_df.head()


epoch,loss,weighted_f1,macro_f1,weighted_f2,macro_f2,accuracy,recall,precision
i64,f64,f64,f64,f64,f64,f64,f64,f64
1,0.821739,0.633112,0.626195,0.625527,0.635294,0.628265,0.628265,0.676534
2,0.730445,0.686137,0.670944,0.68317,0.674477,0.68229,0.68229,0.695327
3,0.689854,0.687161,0.67122,0.684619,0.674221,0.683721,0.683721,0.694494
4,0.65406,0.69952,0.679777,0.699486,0.679818,0.699463,0.699463,0.699578
5,0.61356,0.694634,0.676868,0.693325,0.678434,0.692665,0.692665,0.697648


In [13]:
# Plot Loss and Target Metric per Epoch, highlighting the peak
def _plot_with_best_epoch(frame, column, title, best_index, best_value):
    """Draw `column` per epoch as a line and mark the best model's epoch.

    Args:
        frame: Data frame with an 'epoch' column and the column to plot.
        column: Name of the column drawn on the y axis.
        title: Figure title.
        best_index: Zero-based index of the best epoch (plotted at index + 1,
            matching the 1-based 'epoch' column).
        best_value: Value of `column` at the best epoch.

    Returns:
        The plotly figure.
    """
    fig = px.line(frame, x='epoch', y=column, title=title, template='plotly_dark')
    fig.add_scatter(
        x=[best_index + 1], y=[best_value], mode='markers', showlegend=False,
        marker={'color': 'red', 'size': 10}, name='Best Model',
    )
    return fig


fig_a = _plot_with_best_epoch(
    result_df, 'loss', 'Loss per Epoch',
    best_epoch, loss_history[best_epoch])
fig_a.show()

fig_b = _plot_with_best_epoch(
    result_df, 'weighted_f2', 'Validation Weighted F2 per Epoch',
    best_epoch, metric_history[best_epoch]['weighted_f2'])
fig_b.show()

fig_c = _plot_with_best_epoch(
    test_result_df, 'weighted_f2', 'Test Weighted F2 per Epoch',
    best_epoch, test_metric_history[best_epoch]['weighted_f2'])
fig_c.show()