In [1]:
import sys
from pathlib import Path

if str(Path(".").absolute().parent) not in sys.path:
    sys.path.append(str(Path(".").absolute().parent))

In [2]:
from dotenv import load_dotenv

# Initialize the env vars
load_dotenv("../.env")

True

In [3]:
import pandas as pd
from src.utils import (
    compute_pos_weight,
    download_dataset,
    flatten
)

In [4]:
dataset = download_dataset(["train.csv", "test.csv"])

train_df = dataset["train.csv"]
test_df = dataset["test.csv"]

del dataset

print(f"Train shape: {train_df.shape}")
train_df.head()

Downloading OLID-BR from Kaggle.
Train shape: (4765, 17)


Unnamed: 0,id,text,is_offensive,is_targeted,targeted_type,toxic_spans,health,ideology,insult,lgbtqphobia,other_lifestyle,physical_aspects,profanity_obscene,racism,religious_intolerance,sexism,xenophobia
0,430b13705cf34e13b74bc999425187c3,USER USER é muito bom. USER ^^ E claro a equip...,NOT,UNT,,,False,False,False,False,False,False,False,False,False,False,False
1,c779826dc43f460cb18e8429ca443477,Pior do que adolescentezinhas de merda...são p...,OFF,UNT,,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",False,False,True,False,False,False,True,False,False,True,False
2,e64148caa4474fc79298e01d0dda8f5e,USER Toma no cu é vitamina como tu e tua prima.,OFF,TIN,GRP,"[5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17...",False,False,True,False,False,False,True,False,False,False,False
3,cc66b54eeec24607a67e2259134a1cdd,"Muito bom, pena a circunstâncias serem ruins, ...",OFF,UNT,,"[119, 120, 121, 122, 123, 124, 125, 126, 127, ...",False,False,True,False,False,False,False,False,False,False,False
4,a3d7839456ae4258a70298fcf637952e,"Podia ter beijo também, pra ver se o homofóbic...",OFF,UNT,,"[24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 3...",False,False,True,False,False,False,False,False,False,False,False


In [5]:
print(f"Test shape: {test_df.shape}")
test_df.head()

Test shape: (1589, 17)


Unnamed: 0,id,text,is_offensive,is_targeted,targeted_type,toxic_spans,health,ideology,insult,lgbtqphobia,other_lifestyle,physical_aspects,profanity_obscene,racism,religious_intolerance,sexism,xenophobia
0,da19df36730945f08df3d09efa354876,USER Adorei o comercial também Jesus. Só achei...,OFF,UNT,,"[52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 6...",False,False,True,False,False,False,True,False,False,False,False
1,80f1a8c981864887b13963fed1261acc,Cara isso foi muito babaca geral USER conhece ...,OFF,TIN,GRP,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",False,False,True,False,False,False,False,False,False,False,False
2,2f67025f913e4a6292e3d000d9e2b5a8,"Se vc for porco, folgado e relaxado, você não ...",OFF,UNT,,"[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,...",False,False,True,False,False,False,False,False,False,False,False
3,738ccd4476784f47af3a5a6cfdda4695,Se fosse um sniper ia ser louco,OFF,UNT,,"[26, 27, 28, 29, 30]",False,False,True,False,False,True,False,False,False,False,False
4,e0064da693bd4c9e90ce8e6db8bd3bbb,USER é o meu saco USER USER USER,OFF,UNT,,"[13, 14, 15, 16]",False,False,True,False,False,False,True,False,False,False,False


In [6]:
def preprocessing(df: pd.DataFrame) -> pd.DataFrame:
    """Preprocess the dataset.

    Args:
    - df: The dataset to be preprocessed.

    Returns:
    - The preprocessed dataset.
    """
    import warnings

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        
        # Filter only offensive comments
        df = df[df["is_offensive"] == "OFF"]

        # Remove religious_intolerance that has only one  sample
        if "religious_intolerance" in df.columns:
            df.drop("religious_intolerance", axis=1, inplace=True)

        # Filter only offensive comments with at least one toxicity label
        df = df.loc[df.select_dtypes("bool").sum(axis=1).ge(1)]

        return df.reset_index(drop=True)

toxicity_labels = [
    "health",
    "ideology",
    "insult",
    "lgbtqphobia",
    "other_lifestyle",
    "physical_aspects",
    "profanity_obscene",
    "racism",
    "sexism",
    "xenophobia"
]

train_df = preprocessing(train_df)
test_df = preprocessing(test_df)

print(f"Train shape: {train_df.shape}")
print(f"Test shape: {test_df.shape}")

Train shape: (4272, 16)
Test shape: (1438, 16)


In [7]:
X_train = train_df["text"].values
y_train = train_df[toxicity_labels].astype(int).values

X_test = test_df["text"].values
y_test = test_df[toxicity_labels].astype(int).values

In [20]:
from dataclasses import dataclass

@dataclass
class Parameters:
    max_seq_length: int = len(max(X_train, key=len))
    model_name: str = "neuralmind/bert-base-portuguese-cased"
    model_type: str = "bert"
    num_train_epochs: int = 1
    num_train_epochs_per_child: int = 3
    batch_size: int = 1
    validation_split: float = 0.2
    learning_rate: float = 2e-5
    seed: int = 1993

params = Parameters()
params

Parameters(max_seq_length=1084, model_name='neuralmind/bert-base-portuguese-cased', model_type='bert', num_train_epochs=1, num_train_epochs_per_child=3, batch_size=1, validation_split=0.2, learning_rate=2e-05, seed=1993)

## Transformers

In [21]:
import numpy as np
import tensorflow as tf
from typing import Any, Union
from datasets import Dataset
from transformers import (
    BertTokenizer,
    TFBertForSequenceClassification,
    DataCollatorWithPadding,
    create_optimizer
)
    
model = TFBertForSequenceClassification.from_pretrained(
    params.model_name,
    num_labels=len(toxicity_labels),
    problem_type="multi_label_classification",
    id2label={k:v for k, v in enumerate(toxicity_labels)},
    label2id={v:k for k, v in enumerate(toxicity_labels)}
)

tokenizer = BertTokenizer.from_pretrained(
    params.model_name
)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier', 'bert/pooler/dense/kernel:0', 'bert/pooler/dense/bias:0']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [30]:
def preprocess_fn(examples, tokenizer, max_seq_length):
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_seq_length
    )

dataset = Dataset.from_dict(
    {
        "text": X_train,
        "labels": y_train
    }
)

dataset = dataset.map(
    preprocess_fn,
    batched=True,
    fn_kwargs={
        "tokenizer": tokenizer,
        "max_seq_length": params.max_seq_length
    }
)

dataset = dataset.train_test_split(
    test_size=params.validation_split,
    shuffle=True,
    seed=params.seed
)

dataset

  0%|          | 0/5 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 3417
    })
    test: Dataset({
        features: ['text', 'labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 855
    })
})

In [31]:
data_collator = DataCollatorWithPadding(
    tokenizer=tokenizer,
    padding="max_length",
    max_length=params.max_seq_length,
    return_tensors="tf"
)

tf_train_set = model.prepare_tf_dataset(
    dataset["train"],
    shuffle=True,
    batch_size=params.batch_size,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    dataset["test"],
    shuffle=False,
    batch_size=params.batch_size,
    collate_fn=data_collator,
)

In [32]:
batches_per_epoch = len(tf_train_set) // params.batch_size
total_train_steps = int(batches_per_epoch * params.num_train_epochs)

optimizer, lr_scheduler = create_optimizer(
    init_lr=params.learning_rate,
    num_train_steps=total_train_steps,
    num_warmup_steps=0
)

# Define loss and metrics
loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

model.compile(
    optimizer=optimizer,
    loss=loss,
    metrics=[
        tf.keras.metrics.BinaryAccuracy()
    ]
)

In [33]:
model.fit(
    tf_train_set,
    validation_data=tf_validation_set,
    epochs=params.num_train_epochs,
    batch_size=params.batch_size,
    # callbacks=[lr_scheduler]
)

ResourceExhaustedError: in user code:

    File "c:\Python310\lib\site-packages\keras\engine\training.py", line 1160, in train_function  *
        return step_function(self, iterator)
    File "c:\Python310\lib\site-packages\keras\engine\training.py", line 1146, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "c:\Python310\lib\site-packages\keras\engine\training.py", line 1135, in run_step  **
        outputs = model.train_step(data)
    File "c:\Python310\lib\site-packages\transformers\modeling_tf_utils.py", line 1532, in train_step
        self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    File "c:\Python310\lib\site-packages\keras\optimizers\optimizer_v2\optimizer_v2.py", line 579, in minimize
        return self.apply_gradients(grads_and_vars, name=name)
    File "c:\Python310\lib\site-packages\keras\optimizers\optimizer_v2\optimizer_v2.py", line 695, in apply_gradients
        self._create_all_weights(var_list)
    File "c:\Python310\lib\site-packages\keras\optimizers\optimizer_v2\optimizer_v2.py", line 959, in _create_all_weights
        self._create_slots(var_list)
    File "c:\Python310\lib\site-packages\keras\optimizers\optimizer_v2\adam.py", line 128, in _create_slots
        self.add_slot(var, "v")
    File "c:\Python310\lib\site-packages\keras\optimizers\optimizer_v2\optimizer_v2.py", line 1050, in add_slot
        weight = tf.Variable(
    File "c:\Python310\lib\site-packages\keras\initializers\initializers_v2.py", line 171, in __call__
        return tf.zeros(shape, dtype)

    ResourceExhaustedError: {{function_node __wrapped__Fill_device_/job:localhost/replica:0/task:0/device:GPU:0}} OOM when allocating tensor with shape[29794,768] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Fill]


## ToxicityTypeDetector

In [None]:
import numpy as np
from typing import Any, Dict, List, Union
from sklearn.base import BaseEstimator
import tensorflow as tf
from datasets import Dataset
from transformers import (
    BertTokenizer,
    TFBertForSequenceClassification,
    DataCollatorWithPadding,
    create_optimizer
)


def format_scores(scores: Dict[str, Dict[str, float]]) -> Dict[str, float | int]:
    """Format scores to be logged in mlflow.

    Args:
    - scores: classification report scores

    Returns:
    - formatted_scores: formatted scores
    """
    # Flatten the score dict
    scores = flatten(scores)

    # Remove whitespaces and "-" from keys
    scores = {k.replace(" ", "_").replace("-", "_"): v for k, v in scores.items()}

    # Remove "_support" items
    scores = {k: v for k, v in scores.items() if not k.endswith("_support")}

    return scores
    

class ToxicityTypeDetector(BaseEstimator):
    def __init__(self,
                 model_type: str = "bert",
                 model_name: str = "neuralmind/bert-base-portuguese-cased",
                 labels: List[str] = None,
                 max_seq_length: int = 512,
                 num_train_epochs: int = 30,
                 batch_size: int = 8,
                 validation_split: float = 0.2,
                 learning_rate: float = 2e-5,
                 use_cuda: bool = True,
                 **kwargs):

        self.model_type = model_type
        self.model_name = model_name
        self.labels = labels
        self.max_seq_length = max_seq_length
        self.num_train_epochs = num_train_epochs
        self.batch_size = batch_size
        self.validation_split = validation_split
        self.learning_rate = learning_rate
        self.use_cuda = use_cuda
        
        self.model = self.init_model()
        self.tokenizer = self.init_tokenizer()

    def init_tokenizer(self) -> Any:
        return BertTokenizer.from_pretrained(
            self.model_name,
            model_max_length=self.max_seq_length
        )

    def init_model(self) -> Any:
        return TFBertForSequenceClassification.from_pretrained(
            self.model_name,
            num_labels=len(self.labels),
            problem_type="multi_label_classification",
            id2label={k:v for k, v in enumerate(self.labels)},
            label2id={v:k for k, v in enumerate(self.labels)}
        )

    def init_optimizer(self, total_train_steps: int) -> Any:
        """Initialize the optimizer.

        Args:
        - total_train_steps: The total number of training steps.

        Returns:
        - The optimizer.
        - The learning rate scheduler.
        """
        return create_optimizer(
            init_lr=self.learning_rate,
            num_train_steps=total_train_steps,
            num_warmup_steps=0
        )

    def _tokenize(self, X: str) -> Any:
        return self.tokenizer.encode_plus(
            X,
            add_special_tokens=True,
            max_length=self.max_seq_length,
            padding="max_length",
            return_tensors="tf"
        )

    def _predict(self, X: str) -> np.ndarray:
        """Predicts the toxicity type of a given text.

        Args:
        - X: The text to be predicted.
        
        Returns:
        - Probabilities of each toxicity type.
        """
        inputs = self._tokenize(X)

        logits = self.model(**inputs).logits
        probs = tf.nn.sigmoid(logits).numpy()[0]
        return probs

    def predict_proba(self, X: Union[str, List[str], np.ndarray]) -> List[List[float]]:
        """Predicts the toxicity types of a given text.

        Args:
        - X: The text or texts to be predicted.

        Returns:
        - A list with a dictionary of probabilities for each toxicity type.
        """
        if isinstance(X, np.ndarray):
            X = X.tolist()
        elif isinstance(X, str):
            X = [X]

        response = []
        for x in X:
            probs = self._predict(x)
            response.append(
                {k: v for k, v in zip(self.labels, probs)}
            )
            
        return response

    def predict(self, X: Union[str, List[str], np.ndarray]) -> List[List[float]]:
        """Predicts the toxicity types of a given text.

        Args:
        - X: The text or texts to be predicted.

        Returns:
        - A list with a dictionary of predicted toxicity types.
        """
        preds = self.predict_proba(X)
        preds = [{k: 1 if v > 0.5 else 0 for k, v in pred.items()} for pred in preds]
        return preds

    def prepare_data(self, X: np.ndarray, y: Union[list, np.ndarray]) -> Any:
        """Prepares the data to be used in the model.

        Args:
        - X: The texts to be used in the model.
        - y: The labels to be used in the model.

        Returns:
        - train_dataset: The train dataset.
        - val_dataset: The validation dataset.
        """
        def preprocess_fn(examples, tokenizer):
            return tokenizer(examples["text"], truncation=True)

        # Convert to numpy array
        if isinstance(y, list):
            y = np.array(y)

        dataset = Dataset.from_dict(
            {
                "text": X,
                "labels": y
            }
        )

        dataset = dataset.map(
            preprocess_fn,
            batched=True,
            fn_kwargs={"tokenizer": self.tokenizer}
        )

        data_collator = DataCollatorWithPadding(
            tokenizer=self.tokenizer,
            return_tensors="tf")

        dataset = dataset.train_test_split(
            test_size=self.validation_split,
            shuffle=True
        )

        # print(dataset)
        
        tf_train_set = self.model.prepare_tf_dataset(
            dataset["train"],
            shuffle=True,
            batch_size=self.batch_size,
            collate_fn=data_collator,
        )

        tf_validation_set = self.model.prepare_tf_dataset(
            dataset["test"],
            shuffle=False,
            batch_size=self.batch_size,
            collate_fn=data_collator,
        )

        return tf_train_set, tf_validation_set

    def fit(self,
            X: List[str],
            y: List[List[int]] | np.ndarray) -> None:
        """Fits the model.

        Args:
        - X: The texts to be used for training.
        - y: The labels to be used for training.
        """
        self.model = self.init_model()
        
        train_set, val_set = self.prepare_data(X, y)

        batches_per_epoch = len(train_set) // self.batch_size
        total_train_steps = int(batches_per_epoch * self.num_train_epochs)

        optimizer, lr_scheduler = self.init_optimizer(total_train_steps)

        # Define loss and metrics
        loss = tf.keras.losses.BinaryCrossentropy(from_logits=True)

        self.model.compile(
            optimizer=optimizer,
            loss=loss,
            metrics=[
                tf.keras.metrics.BinaryAccuracy()
            ]
        )

        self.model.fit(
            train_set,
            validation_data=val_set,
            epochs=self.num_train_epochs,
            batch_size=self.batch_size,
            # callbacks=[lr_scheduler]
        )

    def score(self, X, y):
        pass

In [None]:
detector = ToxicityTypeDetector(
    labels=toxicity_labels,
    **params
)

In [None]:
detector.fit(X_train, y_train)

In [None]:
import tensorflow as tf
from transformers import (
    BertTokenizer,
    TFBertForSequenceClassification
)

tokenizer = BertTokenizer.from_pretrained(params["model_name"])

model = TFBertForSequenceClassification.from_pretrained(
    params["model_name"],
    num_labels=len(toxicity_labels),
    problem_type="multi_label_classification",
    id2label={k:v for k, v in enumerate(toxicity_labels)},
    label2id={v:k for k, v in enumerate(toxicity_labels)}
)

inputs = tokenizer("Olá, meu cachorro é tão fofo", return_tensors="tf")

logits = model(**inputs).logits

tf.nn.sigmoid(logits).numpy()[0]