In [9]:
from datasets import load_dataset
#load dataset
raw_datasets = load_dataset('glue', 'sst2')
raw_datasets



  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx'],
        num_rows: 1821
    })
})

In [10]:
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification

checkpoint = "bert-base-uncased"
#instantiate tokenizer and model objects
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = TFAutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=2)

All model checkpoint layers were used when initializing TFBertForSequenceClassification.

Some layers of TFBertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
def tokenizer_function(example):

  return tokenizer(example['sentence'], truncation=True)

# map the tokenizer_function to the sentence column of the raw dataset
tokenized_datasets = raw_datasets.map(tokenizer_function, batched= True)
tokenized_datasets



  0%|          | 0/1 [00:00<?, ?ba/s]



DatasetDict({
    train: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 67349
    })
    validation: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 872
    })
    test: Dataset({
        features: ['sentence', 'label', 'idx', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 1821
    })
})

In [12]:
from transformers import DataCollatorWithPadding

data_collator = DataCollatorWithPadding(tokenizer=tokenizer, return_tensors="tf")

In [13]:
tf_train_dataset = tokenized_datasets["train"].to_tf_dataset(
    columns=["attention_mask", "input_ids","token_type_ids"],
    label_cols=['labels'],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=30,
)

tf_validation_dataset = tokenized_datasets["validation"].to_tf_dataset(
    columns=["attention_mask", "input_ids","token_type_ids"],
    label_cols=['labels'],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=30,
)

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [14]:
import tensorflow as tf
from tensorflow.keras.optimizers.schedules import PolynomialDecay
from tensorflow.keras.optimizers import Adam

num_epochs = 2
num_train_steps = len(tf_train_dataset) * num_epochs
lr_scheduler = PolynomialDecay(
    initial_learning_rate=5e-5, end_learning_rate=0.0, decay_steps=num_train_steps
)
opt = Adam(learning_rate=lr_scheduler)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(loss=loss, optimizer=opt, metrics= ['accuracy'])

In [15]:
model.fit(tf_train_dataset, validation_data=tf_validation_dataset, epochs=3)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fb561e45ca0>

In [36]:
preds = model.predict(tf_validation_dataset)["logits"]
preds[0]



array([-4.1110034,  3.8389833], dtype=float32)

In [51]:
import numpy as np
probabilities = tf.nn.softmax(preds)
class_preds = np.argmax(probabilities, axis=1)

In [34]:
label_names = raw_datasets['train'].features['label'].names
label_names

['negative', 'positive']

In [46]:
model.config.id2label = {i: lbl for i, lbl in enumerate(label_names)}
model.config.label2id = {lbl: i for i, lbl in enumerate(label_names)}

In [50]:
labels = [model.config.id2label[label_id] for label_id in class_preds.tolist()]
labels[:5]

['positive', 'negative', 'positive', 'positive', 'negative']

In [23]:
import evaluate

metric = evaluate.load("glue", "sst2")
metric.compute(predictions=class_preds, references=raw_datasets["validation"]["label"])

{'accuracy': 0.9311926605504587}