<a href="https://colab.research.google.com/github/AndreassOlsson/HuggingFace/blob/main/ag-news-bert.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Working with Hugging Face

1.   Download the English BERT model from the Hugging Face Model Hub
2.   Download the AG News dataset
3.   Prepare the dataset: tokenize the text and convert it to TensorFlow datasets
4.   Train the BERT model to do “News Topic Classification” using the training data.
  1. Try to get the best possible test score!


In [None]:
!pip install datasets transformers

In [None]:
from datasets import load_dataset

# Load the AG News dataset by name; the result is indexed by split name below.
dataset = load_dataset(path="ag_news")

# Show the first training example as the cell output.
dataset["train"][0]

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer that matches the "bert-base-uncased" checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
)

In [None]:
# Prepare the dataset: tokenize the text of every split, padding (and truncating)
# each example so that variable-length texts end up with one fixed sequence length.

def tokenize_function(examples):
  """Tokenize the ``text`` field of a batch of examples.

  Args:
    examples: A batch from the dataset, indexed by column name; only
      ``examples['text']`` is read.

  Returns:
    The tokenizer output for the batch, padded with ``padding='max_length'``
    and truncated with ``truncation=True``.
  """
  encoded_batch = tokenizer(
      examples['text'],
      truncation=True,
      padding='max_length',
  )
  return encoded_batch

# batched=True hands the function many rows per call instead of one row at a time.
tokenized_dataset = dataset.map(tokenize_function, batched=True)

In [None]:
# Transform datasets into TF dataset
from transformers import DefaultDataCollator

# Collator that assembles each batch as TensorFlow tensors.
data_collator = DefaultDataCollator(return_tensors="tf")


def _shuffled_subset(split, max_rows, seed=42):
  """Shuffle ``split`` with ``seed`` and keep at most ``max_rows`` rows.

  The requested size is clamped to ``len(split)``: asking for more rows than
  the split holds returns the whole (shuffled) split instead of raising an
  out-of-range error from ``select``.

  Args:
    split: A dataset split supporting ``len()``, ``shuffle`` and ``select``.
    max_rows: Upper bound on the number of rows to keep.
    seed: Seed passed to ``shuffle`` so the subset is reproducible.

  Returns:
    The shuffled split restricted to its first ``min(max_rows, len(split))`` rows.
  """
  row_count = min(max_rows, len(split))
  return split.shuffle(seed=seed).select(range(row_count))


# AG News ships 120,000 training rows but only 7,600 test rows, so the
# 10,000-row request for the test split is clamped to the full split.
training_set = _shuffled_subset(tokenized_dataset['train'], 50000)
test_set = _shuffled_subset(tokenized_dataset['test'], 10000)

# Training batches are reshuffled by the TF dataset.
tf_train_dataset = training_set.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=True,
    collate_fn=data_collator,
    batch_size=8,
)

# Evaluation batches keep a fixed order (shuffle=False).
tf_validation_dataset = test_set.to_tf_dataset(
    columns=["attention_mask", "input_ids", "token_type_ids"],
    label_cols=["labels"],
    shuffle=False,
    collate_fn=data_collator,
    batch_size=8,
)

In [None]:
import tensorflow as tf
from transformers import TFAutoModelForSequenceClassification

# Load "bert-base-uncased" with a sequence-classification head sized for 4 labels.
model = TFAutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="bert-base-uncased",
    num_labels=4,
)

In [None]:
# Labels are integer class ids and the loss is told the model outputs raw logits
# (from_logits=True), hence the sparse categorical loss and accuracy metric.
sparse_ce_loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
sparse_accuracy = tf.metrics.SparseCategoricalAccuracy()
adam_optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)

model.compile(
    optimizer=adam_optimizer,
    loss=sparse_ce_loss,
    metrics=sparse_accuracy,
)

In [None]:
import os
# Location for the saved weights. The path is relative, so it resolves against
# the kernel's working directory.
# NOTE(review): no visible cell mounts Google Drive - confirm the "drive/MyDrive"
# folder exists before training, otherwise the weights land on local disk.
checkpoint_path = "drive/MyDrive/Andreas Olsson/Huggingface/checkpoints/Large/"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Callback that writes only the model's weights (not the full model) to
# checkpoint_path and logs each save (verbose=1).
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
)

# Train for 10 epochs, evaluating on the validation dataset and passing the
# checkpoint callback to training.
model.fit(
    tf_train_dataset,
    epochs=10,
    validation_data=tf_validation_dataset,
    callbacks=[cp_callback],
)