In [1]:
from datasets import load_dataset

dataset = load_dataset("glue", "cola")
dataset

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 8551
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1043
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1063
    })
})

In [2]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased")

def tokenize_dataset(data):
    # Keys of the returned dictionary will be added to the dataset as columns
    return tokenizer(data["sentence"])

train_dataset = dataset["train"].map(tokenize_dataset)
validation_dataset = dataset["validation"].map(tokenize_dataset)
test_dataset = dataset["test"].map(tokenize_dataset)

In [3]:
from transformers import TFAutoModelForSequenceClassification

model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased")

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
tf_train_dataset = model.prepare_tf_dataset(train_dataset, batch_size=16, shuffle=True, tokenizer=tokenizer)
tf_validation_dataset = model.prepare_tf_dataset(validation_dataset, batch_size=16, shuffle=True, tokenizer=tokenizer)
tf_test_dataset = model.prepare_tf_dataset(test_dataset, batch_size=16, shuffle=True, tokenizer=tokenizer)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [5]:
from tensorflow.keras.optimizers.schedules import PolynomialDecay

num_epochs = 3
num_train_steps = len(tf_train_dataset) * num_epochs
lr_scheduler = PolynomialDecay(
    initial_learning_rate= 5e-5, end_learning_rate = 0.0,
    decay_steps = num_train_steps
)

In [6]:
from tensorflow.keras.optimizers import Adam
import tensorflow as tf

opt = Adam(learning_rate = lr_scheduler)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer= opt , loss= loss)  # No need for loss argument! , you can also use metrics=["accuracy"] if you want.

In [7]:
model.fit(tf_train_dataset,
          epochs = 4)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.src.callbacks.History at 0x1d87ee4e170>

In [44]:
preds = []
labels = []
for x, y in tf_validation_dataset:
    preds.append(model(x)) #--> prediction
    labels.append(y)

In [52]:
len(preds)

65

In [49]:
len(labels)

65

In [12]:
import numpy as np

preds = model.predict(tf_validation_dataset)["logits"]
probabilities = tf.nn.softmax(preds)
class_preds = np.argmax(probabilities, axis=1)

real_valids = np.array(validation_dataset['label'])



In [50]:
from datasets import load_metric

metric = load_metric("glue", "mrpc")
metric.compute(predictions = preds , references = labels )

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'dict'

In [19]:
tf_validation_dataset.label

AttributeError: '_PrefetchDataset' object has no attribute 'label'

In [25]:
validation_dataset

Dataset({
    features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
    num_rows: 1043
})