In [1]:
!pip install -qqq seaborn # for evaluation visualization
!pip install -qqq wandb   # for logging
!pip install -qqq datasets # huggingface's lib.
!pip install -qqq transformers==4.39.2
!pip install -qqq accelerate==0.28.0
!pip install -qqq shortuuid

!pip install -U accelerate
!pip install tensorboard

Collecting accelerate
  Using cached accelerate-0.31.0-py3-none-any.whl (309 kB)
Installing collected packages: accelerate
  Attempting uninstall: accelerate
    Found existing installation: accelerate 0.28.0
    Uninstalling accelerate-0.28.0:
      Successfully uninstalled accelerate-0.28.0
Successfully installed accelerate-0.31.0


In [2]:
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from datasets import load_dataset
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
import wandb
import torch
import random
import os

# Reproducibility helper: seeds every random number generator this notebook uses
def set_seed(seed_value=42):
    """Seed Python, NumPy and PyTorch RNGs and request deterministic cuDNN.

    Args:
        seed_value: Integer seed applied to every generator (default 42).

    Side effects:
        Writes ``PYTHONHASHSEED`` into ``os.environ`` and flips the two
        ``torch.backends.cudnn`` switches below.
    """
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    # Python and NumPy generators
    random.seed(seed_value)
    np.random.seed(seed_value)

    # PyTorch: CPU generator, current CUDA device, then all CUDA devices
    torch.manual_seed(seed_value)
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)

    # Ask cuDNN for deterministic kernels and turn off its auto-tuner
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Apply the default seed for the rest of the notebook
set_seed()

In [3]:
# Load the AG News corpus and show its splits, features and row counts
dataset = load_dataset(path="ag_news")
print(dataset)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


In [4]:
# `dataset` already holds the AG News splits loaded by the previous cell;
# reuse it here instead of calling load_dataset a second time.
print(dataset)

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


In [5]:
from pprint import pprint # Python's pretty-printer: wraps long records instead of printing one very wide line

print(type(dataset)) # Data type
print(dataset) # Data structure and count
print("\n"*2+ "Train dataset:")
# Pretty-print one training record; sort_dicts=False keeps the record's own key order
pprint(dataset["train"][1000], sort_dicts=False)

# Explanation of labels - 4 classes, stored as 0-based integer ids # https://huggingface.co/datasets/ag_news
# label 0: World news
# label 1: Sports news
# label 2: Business news
# label 3: Science/Technology news

<class 'datasets.dataset_dict.DatasetDict'>
DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 120000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 7600
    })
})


Train dataset:
{'text': 'European Union Extends Microsoft-Time Warner Review BRUSSELS, Belgium (AP) -- European antitrust regulators said Monday they have extended their review of a deal between Microsoft Corp. (MSFT) and Time Warner Inc...', 'label': 3}


In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

def tokenize_function(examples):
    """Tokenize a batch of dataset rows.

    Args:
        examples: Batch mapping passed by ``dataset.map(..., batched=True)``;
            only its ``'text'`` entry is read.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length.

    Padding is intentionally not applied here. The notebook passes a
    DataCollatorWithPadding to the Trainer, which pads each batch to that
    batch's longest sequence. Padding inside ``map`` would instead pad every
    row to the longest row of its map batch and waste compute on pad tokens.
    """
    return tokenizer(examples['text'], truncation=True)

tokenized_datasets = dataset.map(tokenize_function, batched=True)



In [7]:
# Batch collator built from the tokenizer; it is handed to the Trainer below
# as `data_collator` and is responsible for padding the examples of a batch.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

In [8]:
# BERT encoder with a 4-way classification head (one output per AG News class)
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=4,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir='./results-bert-topic-cls',
    logging_dir='./logs',

    # Optimisation schedule
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,

    # Batch sizes (per device)
    per_device_train_batch_size=32,
    per_device_eval_batch_size=8,

    # Evaluation and logging cadence
    evaluation_strategy='epoch',  # run evaluation once per epoch
    logging_steps=10,
    report_to="tensorboard",
)

In [10]:
def compute_metrics(p):
    """Compute accuracy and weighted precision/recall/F1 for an evaluation pass.

    Args:
        p: Pair that unpacks into ``(predictions, labels)``. The predictions
            are reduced with an argmax over axis 1 to obtain class ids.

    Returns:
        Dict with the keys ``'accuracy'``, ``'f1'``, ``'precision'`` and
        ``'recall'``.
    """
    scores, references = p
    predicted_classes = np.argmax(scores, axis=1)

    precision, recall, f1, _support = precision_recall_fscore_support(
        references,
        predicted_classes,
        average='weighted',
    )
    acc = accuracy_score(references, predicted_classes)

    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

In [11]:
# Train on only the first N_TRAIN_SAMPLES rows of the tokenized training split
# so the demo finishes quickly. 100 rows is a tiny fraction of the 120,000-row
# split; increase N_TRAIN_SAMPLES for a meaningful fine-tuning run.
N_TRAIN_SAMPLES = 100
subset_train_dataset = tokenized_datasets['train'].select(range(N_TRAIN_SAMPLES))

# The Trainer evaluates on the full test split and reports the metrics
# produced by compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=subset_train_dataset,
    eval_dataset=tokenized_datasets['test'],
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)


In [None]:
# Run fine-tuning with the Trainer configured above. The call is left as the
# cell's last expression so the notebook displays the value train() returns.
trainer.train()

In [None]:
# Directory that receives the fine-tuned model and the tokenizer files.
# The text-classification pipeline cell further down loads from this same path.
output_dir = './bert-topic-cls'

# Save the model
model.save_pretrained(output_dir)
# Save the tokenizer alongside it so the directory can be loaded on its own
tokenizer.save_pretrained(output_dir)

In [None]:

# Evaluate the model on the eval_dataset given to the Trainer (the test split)
# and keep the returned metrics; they are printed in the next cell.
results = trainer.evaluate()

In [None]:
# Show the evaluation metrics collected above
print(results)


In [None]:
# Run the fine-tuned model over the test split; the raw scores and the true
# label ids are both needed for the confusion matrix below.
predictions = trainer.predict(tokenized_datasets['test'])
# Highest-scoring class per example
preds = predictions.predictions.argmax(axis=-1)

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Map the generic 'LABEL_<id>' names to human-readable class names
label_map = {
    f'LABEL_{class_id}': class_name
    for class_id, class_name in enumerate(['World', 'Sports', 'Business', 'Sci/Tech'])
}

# Display names in class-id order, used for both heatmap axes
labels = list(label_map.values())

# First argument is the true label ids, second the predicted class ids
cm = confusion_matrix(predictions.label_ids, preds)

# Annotated heatmap of the confusion matrix
ax = sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=labels,
    yticklabels=labels,
)
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix with Label Names')
plt.show()

In [None]:
# One sample headline to classify
sentence = "The stock market is reaching new heights."

# Encode it as PyTorch tensors, truncated to at most 512 tokens
inputs = tokenizer(
    sentence,
    return_tensors="pt",
    max_length=512,
    truncation=True,
    padding=True,
)

In [None]:
# Move inputs to the same device as the model
inputs = {k: v.to(model.device) for k, v in inputs.items()}

# Make prediction
model.eval()  # Set the model to evaluation mode
with torch.no_grad():
    outputs = model(**inputs)
    # Index of the highest-scoring class. Stored under its own name so the
    # `predictions` object used by the confusion-matrix cell is not
    # overwritten and that cell can still be re-run afterwards.
    predicted_index = outputs.logits.argmax(-1).item()

# Map the predicted class index to its class name
simple_label_map = {0: "World", 1: "Sports", 2: "Business", 3: "Sci/Tech"}
predicted_label = simple_label_map[predicted_index]

print(f"Sentence: '{sentence}'")
print(f"Predicted Label: '{predicted_label}'")

In [None]:
from transformers import pipeline

# Location of the fine-tuned model; replace with another local path or a
# Hugging Face model name to classify with a different model
model_path = './bert-topic-cls'

# Text-classification pipeline using the model and tokenizer stored at model_path
classifier = pipeline(
    task="text-classification",
    model=model_path,
    tokenizer=model_path,
)

In [None]:
# Example sentences
sentences = [
    "The stock market is reaching new heights.",
    "The new sports car has been unveiled at the auto show.",
    "The tech company announced its latest gadget yesterday."
]

# Make predictions. The result gets its own name so the `predictions` object
# used by the confusion-matrix cell is not overwritten.
pipeline_predictions = classifier(sentences)

# Print the predictions using the label map
for text, prediction in zip(sentences, pipeline_predictions):
    # Map the pipeline's 'LABEL_<id>' name to the actual class name
    class_name = label_map[prediction['label']]
    print(f"Sentence: '{text}'")
    print(f"Predicted Label: '{class_name}' with score {prediction['score']:.4f}\n")