# 3. BERT

Inicialmente se va a realizar un preprocesado de los datos, eliminando las palabras sin significado útil (*stop words*), las URL y los signos de puntuación.

In [1]:
import pandas as pd
import numpy as np

# Load the training data; later cells read its 'text' and 'label' columns.
df = pd.read_csv('./data/train.csv')

In [2]:
import spacy

# spaCy pipeline used by the cleaning step below to tokenize each text and
# read the per-token is_stop / is_punct / like_url flags.
nlp = spacy.load('en_core_web_lg')

In [3]:
# Default stop-word collection of the loaded spaCy language.
# NOTE(review): not referenced in the visible cells (the filter below relies
# on token.is_stop instead) -- confirm nothing else uses it before removing.
en_stopwords = nlp.Defaults.stop_words

def remove_stop_words(text):
    """Drop stop words, punctuation and URL-like tokens from ``text``.

    The text is tokenized with the module-level spaCy pipeline ``nlp``; any
    token flagged ``is_stop``, ``is_punct`` or ``like_url`` is discarded.

    Args:
        text: Raw input string.

    Returns:
        The text of the remaining tokens joined with a single space as
        separator (no leading separator); ``''`` when no token survives.
    """
    # str.join fixes the leading space that the previous ``' ' + token.text``
    # accumulation always produced, and avoids rebuilding the string on
    # every kept token.
    return ' '.join(
        token.text
        for token in nlp(text)
        if not (token.is_stop or token.is_punct or token.like_url)
    )

# Clean every row of the 'text' column and keep the result in a new column,
# leaving the raw text untouched.
df['text_cleaned'] = df['text'].apply(remove_stop_words)

In [4]:
# Visual sanity check of the cleaned column.
df['text_cleaned']

0        Hi Roy hope ok Trans people gay thing s ramme...
1                                     fuckin hell biology
2                                  nice looking clergyman
3                           AIDS WAY SIN CONSEQUENCES BAD
4                                                   learn
                              ...                        
8143     Yeah alive time election happen fairly soon U...
8144                                  fundamentally wrong
8145     confused homosexuality big deal proud normal ...
8146                                           disgusting
8147     Peter Sørensen note Peter poor maths 13 27 eq...
Name: text_cleaned, Length: 8148, dtype: object

In [5]:
from sklearn.model_selection import train_test_split

# Inputs are the cleaned texts, targets are the labels.
X, y = df['text_cleaned'], df['label']

# Hold out 33% of the rows for evaluation; the fixed random_state keeps the
# split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    random_state=42,
)



Se va a utilizar BERT para realizar la clasificación de lenguaje ofensivo.

In [6]:
import torch
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments

# Seed PyTorch before building the model: the checkpoint does not contain a
# classification head, so 'classifier.weight' / 'classifier.bias' are randomly
# initialised at load time. Without a seed every run starts from different
# weights and the reported metrics cannot be reproduced.
torch.manual_seed(42)

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# num_labels=2: binary classification head on top of the pretrained encoder.
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

2024-05-16 12:55:26.616066: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
class Dataset(torch.utils.data.Dataset):
    """Dataset backed by a mapping of column name -> indexable sequence.

    Each example is materialised on access as a dict holding one tensor per
    column.
    """

    def __init__(self, encodings):
        # Stored as given; conversion to tensors happens in __getitem__.
        self.encodings = encodings

    def __len__(self):
        # The 'input_ids' column defines how many examples there are.
        input_ids = self.encodings['input_ids']
        return len(input_ids)

    def __getitem__(self, idx):
        """Return example ``idx`` as ``{column name: tensor}``."""
        sample = {}
        for name, column in self.encodings.items():
            sample[name] = torch.tensor(column[idx])
        return sample

In [8]:
def _encode_with_labels(texts, labels):
    """Tokenize ``texts`` and attach ``labels`` under the 'labels' key.

    Every text is truncated or padded to exactly 128 tokens.
    """
    encodings = tokenizer(texts.tolist(), truncation=True, padding='max_length', max_length=128)
    encodings['labels'] = labels.tolist()
    return encodings


# Training split.
train_encodings = _encode_with_labels(X_train, y_train)
train_DS = Dataset(train_encodings)

# Held-out split (used for both per-epoch evaluation and the final score).
val_encodings = _encode_with_labels(X_test, y_test)
test_DS = Dataset(val_encodings)

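El tokenizador de BERT convierte cada texto en una secuencia de identificadores de longitud fija (128 tokens, truncando o rellenando según haga falta), preparando así los datos para el modelo.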

In [9]:
# Fine-tuning configuration: one epoch, with an evaluation on the held-out
# set at the end of the epoch.
args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=1,
    evaluation_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    learning_rate=2e-5,
    weight_decay=0.01,
)

trainer = Trainer(model=model, args=args, train_dataset=train_DS, eval_dataset=test_DS)

# Kept as the last expression of the cell so the returned TrainOutput is shown.
trainer.train()

Epoch,Training Loss,Validation Loss
1,0.5694,0.502094


TrainOutput(global_step=683, training_loss=0.5576010239071937, metrics={'train_runtime': 1907.1145, 'train_samples_per_second': 2.862, 'train_steps_per_second': 0.358, 'total_flos': 359080812802560.0, 'train_loss': 0.5576010239071937, 'epoch': 1.0})

In [14]:
from datasets import load_metric

# Evaluation pass over the held-out set. The Trainer was built without a
# compute_metrics callback, so accuracy is derived from the predictions below.
eval_results = trainer.evaluate(eval_dataset=test_DS)

predictions = trainer.predict(test_DS)

# Predicted class = index of the highest logit; compare with the references.
preds = predictions.predictions.argmax(-1)
labels = predictions.label_ids

# Accuracy is computed directly with NumPy instead of `datasets.load_metric`,
# which is deprecated and fetches a remote metric script (the source of the
# `trust_remote_code` notice). The `load_metric` import above is therefore no
# longer needed by this cell. The dict shape of `accuracy` is kept so any code
# reading accuracy['accuracy'] keeps working.
accuracy = {'accuracy': float(np.mean(preds == labels))}
print(f"Accuracy: {accuracy['accuracy']}")

You can avoid this message in future by passing the argument `trust_remote_code=True`.
Passing `trust_remote_code=True` will be mandatory to load this metric from the next major release of `datasets`.


Accuracy: 0.7575306805503905


Tras el entrenamiento y la evaluación del clasificador se observa una exactitud (*accuracy*) del 75,75 % sobre el conjunto de prueba, la más alta obtenida por el grupo y bastante buena para el reto.