# Neural Solution - Transformers: XLM-RoBERTa (Large)

In [11]:
import os
import sys
import random
from datetime import datetime

import numpy as np
import polars as pl
from sklearn.model_selection import train_test_split

import torch
from torch import nn, optim, cuda
from torch.utils.data import DataLoader

In [12]:
# Folder inside "My Drive" that holds the project when running on Colab
DRIVE_PATH = 'Colab/ToxicityClassification'

# Outside Colab the project root is simply the parent directory.
# On Colab, Google Drive is mounted and used as the root instead, so that
# data and checkpoints persist between sessions.
if 'google.colab' not in sys.modules:
    ROOT_PATH = '../'
else:
    from google.colab import drive
    drive.mount('/content/drive')
    ROOT_PATH = os.path.join('/content/drive/My Drive/', DRIVE_PATH)
    # Create the project folder on first use, then make it the working directory
    os.makedirs(ROOT_PATH, exist_ok=True)
    os.chdir(ROOT_PATH)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# Make the project's `src` directory (resolved to an absolute path under
# ROOT_PATH) importable, so the `toxicity` package can be imported below
sys.path.append(
    os.path.abspath(os.path.join(ROOT_PATH, 'src'))
)

from toxicity.transformers.xlmroberta_large import xlm_roberta_tokenizer, XLMRobertaModule, XLMRobertaDataset
from toxicity.transformers.training import train_epochs, validate

## Setup

In [14]:
# Run on the GPU when PyTorch reports one as available, otherwise on the CPU
if cuda.is_available():
    PYTORCH_DEVICE = 'cuda'
else:
    PYTORCH_DEVICE = 'cpu'

# Seed shared by every random number generator seeded in this notebook
RANDOM_SEED = 777

# Data split and tokenization
TRAIN_RATIO = 0.8  # fraction of rows assigned to the training split
MAX_LEN = 256      # handed to the dataset as `max_len`

# Batching
TRAIN_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32

# Optimization
EPOCHS = 20
LEARNING_RATE = 3e-5
POS_WEIGHT = 1.663  # `pos_weight` given to the BCE-with-logits loss

print(f'Using device: {PYTORCH_DEVICE}')

Using device: cuda


In [15]:
# Seed PyTorch (CPU generator and every CUDA device) ...
torch.manual_seed(RANDOM_SEED)
cuda.manual_seed_all(RANDOM_SEED)

# ... as well as Python's and NumPy's global generators
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)

# Ask cuDNN for deterministic kernels and turn off its auto-tuner
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

## Data Loading

In [16]:
# Read the joint dataset and re-encode both label columns as fixed-width,
# one-element Int32 arrays (Int32 -> List[Int32] -> Array[Int32, 1])
df = (
    pl.read_parquet(os.path.join(ROOT_PATH, 'data', 'joint', 'data.parquet.zstd'))
    .with_columns(
        pl.col('off_relaxed', 'off_strict')
        .cast(pl.Int32)
        .cast(pl.List(pl.Int32))
        .cast(pl.Array(pl.Int32, 1))
    )
)

# Preview a few random rows (fixed seed so the preview is stable)
df.sample(5, seed=42)

dataset,id,text,off_strict,off_relaxed
str,str,str,"array[i32, 1]","array[i32, 1]"
"""ToLD-Br""","""55087272852267…","""medo de ir pra…",[0],[0]
"""ToLD-Br""","""16827841903506…","""https://t.co/2…",[0],[0]
"""ToLD-Br""","""76416288800248…","""rt USER bruno …",[0],[0]
"""ToLD-Br""","""16866242508514…","""tinha que ter …",[1],[1]
"""ToLD-Br""","""30682712524038…","""eu sou a pesso…",[0],[0]


In [17]:
# Split the rows into train/test partitions; the fixed random state keeps
# the partition identical across runs
train_df, test_df = train_test_split(
    df,
    random_state=RANDOM_SEED,
    train_size=TRAIN_RATIO,
)

# Show the first rows of each partition
display(train_df.head(5), test_df.head(5))

dataset,id,text,off_strict,off_relaxed
str,str,str,"array[i32, 1]","array[i32, 1]"
"""OLID-Br""","""3d85473d1c4b4f…","""USER merda, ri…",[1],[1]
"""OLID-Br""","""b344c5518f0d44…","""USER espero qu…",[1],[1]
"""ToLD-Br""","""43355433174616…","""eu tenho essas…",[0],[0]
"""OLID-Br""","""7ada9be164434f…","""USER USER é US…",[0],[0]
"""ToLD-Br""","""16784738693255…","""meu pai me deu…",[1],[0]


dataset,id,text,off_strict,off_relaxed
str,str,str,"array[i32, 1]","array[i32, 1]"
"""ToLD-Br""","""10657414299548…","""rt USER mano t…",[1],[0]
"""ToLD-Br""","""11088205621966…","""USER horrível!…",[1],[0]
"""ToLD-Br""","""11546370057009…","""gnt como pode …",[0],[0]
"""ToLD-Br""","""94504692628727…","""Que foda o USE…",[0],[0]
"""ToLD-Br""","""16835911729407…","""sapatão é foda…",[1],[1]


## Setup Model

In [18]:
# Tokenizer and classifier come from the project's XLM-RoBERTa helpers.
# NOTE(review): feature_count=1 presumably yields a single output logit,
# which is what the BCE-with-logits loss used later expects - confirm.
tokenizer = xlm_roberta_tokenizer()

model = XLMRobertaModule(feature_count=1)
model.to(PYTORCH_DEVICE)

# Training batches are reshuffled every epoch
train_loader = DataLoader(
    XLMRobertaDataset(data_frame=train_df, tokenizer=tokenizer, max_len=MAX_LEN, target_col='off_relaxed'),
    batch_size=TRAIN_BATCH_SIZE,
    shuffle=True,
    num_workers=0,
)

# Evaluation data is iterated in its original order: shuffling brings no
# benefit at inference time and would make the predictions impossible to
# line up with the rows of `test_df`
test_loader = DataLoader(
    XLMRobertaDataset(data_frame=test_df, tokenizer=tokenizer, max_len=MAX_LEN, target_col='off_relaxed'),
    batch_size=TEST_BATCH_SIZE,
    shuffle=False,
    num_workers=0,
)

### Loss and Optimizer

We use a Binary Cross Entropy loss (computed on the raw logits), as it shows good results for binary classification tasks. To account for the class imbalance, the positive class is given a different weight than the negative class.

The AdamW optimizer (Adam with decoupled weight decay) is used, as it is a good general-purpose optimizer for training neural networks, with good known results for BERT-style models.

In [19]:
# Binary cross-entropy on logits, with the positive class re-weighted by
# POS_WEIGHT; the weight tensor lives on the same device as the model
loss_function = nn.BCEWithLogitsLoss(
    pos_weight=torch.tensor([POS_WEIGHT], device=PYTORCH_DEVICE),
)

# AdamW over every model parameter
optimizer = optim.AdamW(
    lr=LEARNING_RATE,
    params=model.parameters(),
)

### Training

In [20]:
# Identify this run by its start time (YYYYmmddHHMMSS) and keep its
# checkpoints in a directory of their own
TIMESTAMP = f'{datetime.now():%Y%m%d%H%M%S}'
CHECKPOINT_PATH = os.path.join(
    ROOT_PATH, 'checkpoints', 'xlm-roberta-large', TIMESTAMP,
)

train_epochs(
    EPOCHS,
    model,
    train_loader,
    loss_function,
    optimizer,
    PYTORCH_DEVICE,
    autocast=False,  # Autocast didn't work well for this model
    checkpoint_path=CHECKPOINT_PATH,
)

Running training epoch 1/20


  0%|          | 0/699 [00:00<?, ?it/s]

Finished training epoch 1/20; Average Loss: 0.8755
Running training epoch 2/20


  0%|          | 0/699 [00:00<?, ?it/s]

Finished training epoch 2/20; Average Loss: 0.8742
Running training epoch 3/20


  0%|          | 0/699 [00:00<?, ?it/s]

Finished training epoch 3/20; Average Loss: 0.8727
Running training epoch 4/20


  0%|          | 0/699 [00:00<?, ?it/s]

Finished training epoch 4/20; Average Loss: 0.8734
Running training epoch 5/20


  0%|          | 0/699 [00:00<?, ?it/s]

Finished training epoch 5/20; Average Loss: 0.8754
Running training epoch 6/20


  0%|          | 0/699 [00:00<?, ?it/s]

Finished training epoch 6/20; Average Loss: 0.8711
Running training epoch 7/20


  0%|          | 0/699 [00:00<?, ?it/s]

Finished training epoch 7/20; Average Loss: 0.8701
Running training epoch 8/20


  0%|          | 0/699 [00:00<?, ?it/s]

Finished training epoch 8/20; Average Loss: 0.8694
Running training epoch 9/20


  0%|          | 0/699 [00:00<?, ?it/s]

Finished training epoch 9/20; Average Loss: 0.8694
Running training epoch 10/20


  0%|          | 0/699 [00:00<?, ?it/s]

Finished training epoch 10/20; Average Loss: 0.8686
Running training epoch 11/20


  0%|          | 0/699 [00:00<?, ?it/s]

Finished training epoch 11/20; Average Loss: 0.8695
Running training epoch 12/20


  0%|          | 0/699 [00:00<?, ?it/s]

Finished training epoch 12/20; Average Loss: 0.8672
Running training epoch 13/20


  0%|          | 0/699 [00:00<?, ?it/s]

Finished training epoch 13/20; Average Loss: 0.8680
Running training epoch 14/20


  0%|          | 0/699 [00:00<?, ?it/s]

Finished training epoch 14/20; Average Loss: 0.8680
Running training epoch 15/20


  0%|          | 0/699 [00:00<?, ?it/s]

Finished training epoch 15/20; Average Loss: 0.8671
Running training epoch 16/20


  0%|          | 0/699 [00:00<?, ?it/s]

Finished training epoch 16/20; Average Loss: 0.8678
Running training epoch 17/20


  0%|          | 0/699 [00:00<?, ?it/s]

Finished training epoch 17/20; Average Loss: 0.8663
Running training epoch 18/20


  0%|          | 0/699 [00:00<?, ?it/s]

Finished training epoch 18/20; Average Loss: 0.8669
Running training epoch 19/20


  0%|          | 0/699 [00:00<?, ?it/s]

Finished training epoch 19/20; Average Loss: 0.8658
Running training epoch 20/20


  0%|          | 0/699 [00:00<?, ?it/s]

Finished training epoch 20/20; Average Loss: 0.8664


### Save the model and tokenizer

In [21]:
# Output directory for this run's final artifacts. The name must be an
# f-string: without the `f` prefix the directory is literally called
# "trained-xlm-roberta-large-{TIMESTAMP}" and every run overwrites the last.
target_dir = os.path.join(ROOT_PATH, 'models', f'trained-xlm-roberta-large-{TIMESTAMP}')

os.makedirs(target_dir, exist_ok=True)

# Persist the tokenizer vocabulary next to the model
tokenizer.save_vocabulary(target_dir)
# NOTE(review): this pickles the whole module object, so loading it later
# requires the same `toxicity` package to be importable.
torch.save(model, os.path.join(target_dir, 'model.pth'))

### Run the model with the test data

In [22]:
# Run the model over the test loader and turn both returned sequences
# (model outputs and their targets) into NumPy arrays
raw_results, raw_targets = map(
    np.array,
    validate(model, test_loader, PYTORCH_DEVICE),
)

  0%|          | 0/175 [00:00<?, ?it/s]

### Check results

In [23]:
# Cut-off used to binarize both the model outputs and the targets.
# NOTE(review): assumes `validate` returns scores on a 0-1 scale - confirm.
FIXED_THRESHOLD = 0.75
fixed_results = np.greater(raw_results, FIXED_THRESHOLD)
fixed_targets = np.greater(raw_targets, FIXED_THRESHOLD)

In [24]:
from sklearn.metrics import (
    f1_score, fbeta_score, accuracy_score, recall_score, precision_score)

# F1 / F2 in both weighted and macro averaging; recall and precision are
# reported with weighted averaging only.
fixed_weighted_f1 = f1_score(fixed_targets, fixed_results, average='weighted')
fixed_macro_f1 = f1_score(fixed_targets, fixed_results, average='macro')
fixed_weighted_f2 = fbeta_score(fixed_targets, fixed_results, beta=2, average='weighted')
fixed_macro_f2 = fbeta_score(fixed_targets, fixed_results, beta=2, average='macro')
fixed_accuracy = accuracy_score(fixed_targets, fixed_results)
fixed_recall = recall_score(fixed_targets, fixed_results, average='weighted')
fixed_precision = precision_score(fixed_targets, fixed_results, average='weighted')

# Report the values computed above (the F2 scores used to be recomputed
# inside the print calls, leaving the variables unused)
print("Model Metrics:")
print(f"Weighted F1 = {fixed_weighted_f1:.6f}")
print(f"Macro F1 = {fixed_macro_f1:.6f}")
print(f"Weighted F2 Score = {fixed_weighted_f2:.6f}")
print(f"Macro F2 Score = {fixed_macro_f2:.6f}")
print(f"Accuracy = {fixed_accuracy:.6f}")
print(f"Recall = {fixed_recall:.6f}")
print(f"Precision = {fixed_precision:.6f}")

Model Metrics:
Weighted F1 = 0.487366
Macro F1 = 0.386615
Weighted F2 Score = 0.564122
Macro F2 Score = 0.447503
Accuracy = 0.630299
Recall = 0.630299
Precision = 0.397276


  _warn_prf(average, modifier, msg_start, len(result))
