# Sexism Detection Using BERT

In [3]:
from datasets import load_dataset, Dataset

edos = load_dataset('csv', data_files={'train': 'train.csv', 'test': 'test.csv'}, delimiter=',', column_names=['text', 'label'])

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [5]:
def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True)

In [6]:
tokenized_edos = edos.map(preprocess_function, batched=True)

In [7]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

## Evaluate

In [8]:
import evaluate

accuracy = evaluate.load("accuracy")

In [9]:
import numpy as np


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

## Train

In [10]:
from transformers import create_optimizer
import tensorflow as tf

batch_size = 8
num_epochs = 2
batches_per_epoch = len(tokenized_edos["train"]) // batch_size
total_train_steps = int(batches_per_epoch * num_epochs)
optimizer, schedule = create_optimizer(init_lr=1e-5, num_warmup_steps=0, num_train_steps=total_train_steps)

In [11]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained(
    "distilbert-base-uncased", num_labels=len(label2id.keys()), id2label=id2label, label2id=label2id
)

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertForSequenceClassification: ['vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight']
- This IS expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForSequenceClassification from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
Some weights or buffers of the TF 2.0 model TFDistilBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight', 'classifier.bias']
You should 

In [12]:
tf_train_set = model.prepare_tf_dataset(
    tokenized_edos["train"],
    shuffle=True,
    batch_size=batch_size,
    collate_fn=data_collator,
)

tf_validation_set = model.prepare_tf_dataset(
    tokenized_edos["test"],
    shuffle=False,
    batch_size=batch_size,
    collate_fn=data_collator,
)

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [23]:
np.array(x for x in tf_validation_set.as_numpy_iterator())

array(<generator object <genexpr> at 0x000001F282FB1620>, dtype=object)

In [13]:
import tensorflow as tf

model.compile(optimizer=optimizer)

In [14]:
from tensorflow.keras.callbacks import EarlyStopping
from transformers.keras_callbacks import KerasMetricCallback

callbacks = [
    KerasMetricCallback(metric_fn=compute_metrics, eval_dataset=tf_validation_set),
    EarlyStopping(patience=5, monitor='val_loss'),
]

In [15]:
%%time
model.fit(x=tf_train_set, validation_data=tf_validation_set, epochs=num_epochs, callbacks=callbacks)

Epoch 1/2
Epoch 2/2
CPU times: total: 40.4 s
Wall time: 1min 15s


<keras.callbacks.History at 0x259221b5ba0>

In [24]:
predictions = model.predict(tf_validation_set)[0]



In [30]:
y_pred = np.argmax(predictions, 1)
y_true = edos['test']['label']

In [33]:
from sklearn.metrics import classification_report

print(classification_report(y_true, y_pred, digits=4))

              precision    recall  f1-score   support

           0     0.6056    0.2529    0.3568       170
           1     0.5531    0.8075    0.6565       213
           3     0.5283    0.4828    0.5045        58
           4     0.6275    0.7111    0.6667        45

    accuracy                         0.5658       486
   macro avg     0.5786    0.5636    0.5461       486
weighted avg     0.5754    0.5658    0.5345       486



In [36]:
model.save('category', save_format='tf')





INFO:tensorflow:Assets written to: category\assets


INFO:tensorflow:Assets written to: category\assets
