# Neural Solution - Transformers: BERT

In [1]:
import os
import sys
import random
from datetime import datetime

import numpy as np
import polars as pl
from sklearn.model_selection import train_test_split

import torch
from torch import nn, optim, cuda
from torch.utils.data import DataLoader

In [2]:
# Project root relative to this notebook when running locally.
ROOT_PATH = '../'
# Folder (inside "My Drive") used as the project root when running on Colab.
DRIVE_PATH = 'Colab/ToxicityClassification'

# The `google.colab` module is only loaded inside a Colab runtime; in that case
# Google Drive becomes the root path so data and checkpoints persist across sessions.
RUNNING_ON_COLAB = 'google.colab' in sys.modules
if RUNNING_ON_COLAB:
    from google.colab import drive

    drive.mount('/content/drive')
    ROOT_PATH = os.path.join('/content/drive/My Drive/', DRIVE_PATH)
    # Create the project folder on first use, then make it the working directory.
    os.makedirs(ROOT_PATH, exist_ok=True)
    os.chdir(ROOT_PATH)

In [3]:
# Put `<ROOT_PATH>/src` on the module search path so that the project's
# `toxicity` package can be imported from this notebook.
SRC_PATH = os.path.abspath(os.path.join(ROOT_PATH, 'src'))
sys.path.append(SRC_PATH)

from toxicity.transformers.bertimbau_large import (
    bert_tokenizer,
    BertDatasetBF16,
    BertModuleBF16,
)
from toxicity.transformers.training import train_epochs, validate

## Setup

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if cuda.is_available():
    PYTORCH_DEVICE = 'cuda'
else:
    PYTORCH_DEVICE = 'cpu'

# Seed shared by every random number generator used in this notebook.
RANDOM_SEED = 777

# --- Training & validation configuration ---
TRAIN_RATIO = 0.8        # fraction of the rows used for training
MAX_LEN = 256            # `max_len` handed to the dataset wrapper
TRAIN_BATCH_SIZE = 32
TEST_BATCH_SIZE = 32
EPOCHS = 5
LEARNING_RATE = 3e-5
POS_WEIGHT = 1.663       # positive-class weight for the loss; presumably the negative/positive ratio - TODO confirm

print(f'Using device: {PYTORCH_DEVICE}')

Using device: cuda


In [5]:
# Seed every RNG in play (Python, NumPy, PyTorch CPU, PyTorch CUDA) with the same value.
for seed_rng in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_rng(RANDOM_SEED)

# Ask cuDNN for deterministic kernels and disable its auto-tuner.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

## Data Loading

In [6]:
def to_label_array(labels):
    """Cast a label Series to Int32 and wrap each value in a fixed-size array of width 1."""
    return labels.cast(pl.Int32).cast(pl.List(pl.Int32)).cast(pl.Array(pl.Int32, 1))


# Read the joint dataset and convert both label columns to `array[i32, 1]`.
dataset_file = os.path.join(ROOT_PATH, 'data', 'joint', 'data.parquet.zstd')
df = pl.read_parquet(dataset_file)
df = df.with_columns(
    to_label_array(df['off_relaxed']),
    to_label_array(df['off_strict']),
)

# Preview a few random rows.
df.sample(5, seed=42)

dataset,id,text,off_strict,off_relaxed
str,str,str,"array[i32, 1]","array[i32, 1]"
"""ToLD-Br""","""5508727285226739644""","""medo de ir pra um rolê de novo…",[0],[0]
"""ToLD-Br""","""16827841903506270139""","""https://t.co/2bs6oD330q Ele a…",[0],[0]
"""ToLD-Br""","""7641628880024884135""","""rt USER bruno fernandes assina…",[0],[0]
"""ToLD-Br""","""16866242508514532033""","""tinha que ter jogado esse bran…",[1],[1]
"""ToLD-Br""","""3068271252403811869""","""eu sou a pessoa certa no bairr…",[0],[0]


In [7]:
# Seeded random split: TRAIN_RATIO of the rows go to training, the remainder to testing.
train_df, test_df = train_test_split(
    df,
    train_size=TRAIN_RATIO,
    random_state=RANDOM_SEED,
)

# Show the first rows of each split.
display(train_df.head(5), test_df.head(5))

dataset,id,text,off_strict,off_relaxed
str,str,str,"array[i32, 1]","array[i32, 1]"
"""OLID-Br""","""3d85473d1c4b4f86a78159f23d7746…","""USER merda, ridículo essa impo…",[1],[1]
"""OLID-Br""","""b344c5518f0d44688ed45cc4a3183e…","""USER espero que eles sejam mor…",[1],[1]
"""ToLD-Br""","""4335543317461660187""","""eu tenho essas paran贸ias de ac…",[0],[0]
"""OLID-Br""","""7ada9be164434f0e925f50616b637c…","""USER USER é USER USER""",[0],[0]
"""ToLD-Br""","""16784738693255454158""","""meu pai me deu esse perfume eu…",[1],[0]


dataset,id,text,off_strict,off_relaxed
str,str,str,"array[i32, 1]","array[i32, 1]"
"""ToLD-Br""","""10657414299548058873""","""rt USER mano tá tudo me irrita…",[1],[0]
"""ToLD-Br""","""11088205621966361413""","""USER horrível!""",[1],[0]
"""ToLD-Br""","""11546370057009176494""","""gnt como pode falar q esse hom…",[0],[0]
"""ToLD-Br""","""9450469262872738701""","""Que foda o USER PUTA QUE PARIU…",[0],[0]
"""ToLD-Br""","""16835911729407698751""","""sapatão é foda, não pode beber…",[1],[1]


## Setup Model

In [8]:
# Label column the model is trained and evaluated on.
TARGET_COL = 'off_relaxed'

tokenizer = bert_tokenizer()

# One output feature for the binary target; the model is moved to the selected device.
model = BertModuleBF16(feature_count=1)
model.to(PYTORCH_DEVICE)


def make_loader(data_frame, batch_size, shuffle):
    """Wrap a data frame in a `BertDatasetBF16` and return a `DataLoader` over it.

    Args:
        data_frame: Frame holding the texts and the `TARGET_COL` label column.
        batch_size: Number of samples per batch.
        shuffle: Whether the loader reshuffles the samples on every epoch.
    """
    dataset = BertDatasetBF16(
        data_frame=data_frame, tokenizer=tokenizer, max_len=MAX_LEN, target_col=TARGET_COL)
    return DataLoader(dataset, shuffle=shuffle, num_workers=0, batch_size=batch_size)


# Training batches are reshuffled every epoch.
train_loader = make_loader(train_df, TRAIN_BATCH_SIZE, shuffle=True)
# Evaluation data is NOT shuffled: order does not affect the metrics, and keeping it
# fixed leaves the predictions aligned with the rows of `test_df`.
test_loader = make_loader(test_df, TEST_BATCH_SIZE, shuffle=False)

tokenizer_config.json:   0%|          | 0.00/155 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/210k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/648 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

### Loss and Optimizer

We use a binary cross-entropy loss (`BCEWithLogitsLoss`), which works well for binary classification tasks. To account for the class imbalance, the positive class is given a different weight (`POS_WEIGHT`) than the negative class.

The AdamW optimizer (Adam with decoupled weight decay) is used, as it is a good general-purpose optimizer for training neural networks and has well-known good results for BERT models.

In [9]:
# Binary cross-entropy on logits; `pos_weight` scales the loss of positive samples.
positive_class_weight = torch.tensor([POS_WEIGHT], device=PYTORCH_DEVICE)
loss_function = nn.BCEWithLogitsLoss(pos_weight=positive_class_weight)

# AdamW over all model parameters.
optimizer = optim.AdamW(model.parameters(), lr=LEARNING_RATE)

### Training

In [10]:
# Run identifier (YYYYmmddHHMMSS) used to keep the checkpoints of each run apart.
TIMESTAMP = f'{datetime.now():%Y%m%d%H%M%S}'
CHECKPOINT_PATH = os.path.join(ROOT_PATH, 'checkpoints', 'bertimbau', TIMESTAMP)

train_epochs(
    EPOCHS,
    model,
    train_loader,
    loss_function,
    optimizer,
    PYTORCH_DEVICE,
    checkpoint_path=CHECKPOINT_PATH,
)

Running training epoch 1/5


  0%|          | 0/699 [00:00<?, ?it/s]

OutOfMemoryError: CUDA out of memory. Tried to allocate 20.00 MiB. GPU 0 has a total capacity of 11.73 GiB of which 47.19 MiB is free. Process 293642 has 5.41 GiB memory in use. Including non-PyTorch memory, this process has 5.55 GiB memory in use. Of the allocated memory 5.18 GiB is allocated by PyTorch, and 75.30 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

### Save the model and tokenizer

In [None]:
# Per-run output folder. This must be an f-string: without the `f` prefix the folder
# is literally named "trained-bertimbau-{TIMESTAMP}" and every run overwrites it.
target_dir = os.path.join(ROOT_PATH, f'models/trained-bertimbau-{TIMESTAMP}')

os.makedirs(target_dir, exist_ok=True)

# Persist the tokenizer vocabulary next to the model weights.
tokenizer.save_vocabulary(target_dir)
# NOTE: this pickles the whole module object, so the model class must be importable
# when the file is loaded back with `torch.load`.
torch.save(model, os.path.join(target_dir, 'model.pth'))

### Run the model with the test data

In [None]:
# Validate the results
raw_results, raw_targets = validate(model, test_loader, PYTORCH_DEVICE)
raw_results = np.array(raw_results)
raw_targets = np.array(raw_targets)

  0%|          | 0/175 [00:00<?, ?it/s]

### Check results

In [None]:
# Decision threshold: values strictly above it count as the positive class.
# NOTE(review): assumes `validate` returns scores in [0, 1] - confirm against its implementation.
FIXED_THRESHOLD = 0.75

# The same cut-off is applied to the targets to turn them into booleans.
fixed_results = np.greater(raw_results, FIXED_THRESHOLD)
fixed_targets = np.greater(raw_targets, FIXED_THRESHOLD)

In [None]:
from sklearn.metrics import (
    f1_score, fbeta_score, accuracy_score, recall_score, precision_score)

# Each metric is computed once on the thresholded targets/predictions; the values
# are kept in variables so they can be reused after this cell.
fixed_weighted_f1 = f1_score(fixed_targets, fixed_results, average='weighted')
fixed_macro_f1 = f1_score(fixed_targets, fixed_results, average='macro')
fixed_weighted_f2 = fbeta_score(fixed_targets, fixed_results, beta=2, average='weighted')
fixed_macro_f2 = fbeta_score(fixed_targets, fixed_results, beta=2, average='macro')
fixed_accuracy = accuracy_score(fixed_targets, fixed_results)
# Recall and precision are support-weighted averages over both classes
# (weighted recall therefore coincides with accuracy).
fixed_recall = recall_score(fixed_targets, fixed_results, average='weighted')
fixed_precision = precision_score(fixed_targets, fixed_results, average='weighted')

print("Model Metrics:")
print(f"Weighted F1 = {fixed_weighted_f1:.6f}")
print(f"Macro F1 = {fixed_macro_f1:.6f}")
# Print the stored F2 values instead of recomputing them.
print(f"Weighted F2 Score = {fixed_weighted_f2:.6f}")
print(f"Macro F2 Score = {fixed_macro_f2:.6f}")
print(f"Accuracy = {fixed_accuracy:.6f}")
print(f"Recall = {fixed_recall:.6f}")
print(f"Precision = {fixed_precision:.6f}")

Model Metrics:
Weighted F1 = 0.809839
Macro F1 = 0.793777
Weighted F2 Score = 0.810959
Macro F2 Score = 0.789865
Accuracy = 0.812198
Recall = 0.812198
Precision = 0.810035
